diff options
author | Yang, Rong R <rong.r.yang@intel.com> | 2016-02-14 14:42:16 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-03-02 13:56:48 +0800 |
commit | 4e7d5a0c7a269b2c0b70e37e4e7fcb254065c042 (patch) | |
tree | fe1ec5523e5c35ed70580061bf6acd130a37fca3 /backend | |
parent | 49f040c9b34cf2d3e6af89af3cb77f82c2882b1e (diff) | |
download | beignet-4e7d5a0c7a269b2c0b70e37e4e7fcb254065c042.tar.gz |
GBE: remove stacksize 64KB limitation.
If the stack size is larger than 64KB, the formula used to calculate the
stackptr must change, from "threadId * perThreadSize + laneId*perLaneSize" to
"(threadId * simdWidth + laneId)*perLaneSize", to avoid a Dword * Dword multiply.
V2: Only UD * UW is supported, and the UD operand must be src0; but an IMM
register can't be src0, so move the immediate into a temporary register first.
Signed-off-by: Yang Rong <rong.r.yang@intel.com>
Reviewed-by: Ruiling Song <ruiling.song@intel.com>
Diffstat (limited to 'backend')
-rw-r--r-- | backend/src/backend/context.cpp | 2 | ||||
-rw-r--r-- | backend/src/backend/gen75_context.cpp | 34 | ||||
-rw-r--r-- | backend/src/backend/gen_context.cpp | 42 | ||||
-rw-r--r-- | backend/src/backend/gen_context.hpp | 3 |
4 files changed, 41 insertions, 40 deletions
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 5adeabc4..09917868 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -398,7 +398,7 @@ namespace gbe uint32_t stackSize = 128; while (stackSize < fn.getStackSize()) { stackSize *= 3; - GBE_ASSERT(stackSize <= 64*KB); + //GBE_ASSERT(stackSize <= 64*KB); } this->kernel->stackSize = stackSize; } diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index fa8b0295..43767349 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -66,37 +66,39 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); - const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); const GenRegister selStatckPtr = this->simdWidth == 8 ? GenRegister::ud8grf(ir::ocl::stackptr) : GenRegister::ud16grf(ir::ocl::stackptr); const GenRegister stackptr = ra->genReg(selStatckPtr); - - loadLaneID(stackptr); + // borrow block ip as temporary register as we will + // initialize block ip latter. 
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); + const GenRegister tmpReg_ud = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UD); // We compute the per-lane stack pointer here - // private address start from zero + // threadId * perThreadSize + laneId*perLaneSize or + // (threadId * simdWidth + laneId)*perLaneSize p->push(); p->curr.execWidth = 1; p->curr.predicate = GEN_PREDICATE_NONE; //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); - p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); - p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); + p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); + p->SHR(stackptr, stackptr, GenRegister::immud(7)); + p->SHL(tmpReg, tmpReg, GenRegister::immud(2)); + p->ADD(tmpReg, tmpReg, stackptr); //threadId + + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth p->curr.execWidth = this->simdWidth; - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K + loadLaneID(stackptr); + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K p->curr.execWidth = 1; - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2)); - p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); - if(perThreadSize > 0xffff) { - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize)); - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K - } else - p->MUL(GenRegister::ud1grf(126,0), 
GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize)); + p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize)); p->curr.execWidth = this->simdWidth; - p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); + p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize + p->pop(); } diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 0ea0dd09..70912ef1 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -148,33 +148,33 @@ namespace gbe } /* Get proper block ip register according to current label width. */ - static GenRegister getBlockIP(GenContext &ctx) { + GenRegister GenContext::getBlockIP(void) { GenRegister blockip; - if (!ctx.isDWLabel()) - blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); + if (!isDWLabel()) + blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); else - blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip)); + blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip)); return blockip; } /* Set current block ip register to a specified constant label value. 
*/ - static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) { - if (!ctx.isDWLabel()) - ctx.p->MOV(blockip, GenRegister::immuw(label)); + void GenContext::setBlockIP(GenRegister blockip, uint32_t label) { + if (!isDWLabel()) + p->MOV(blockip, GenRegister::immuw(label)); else - ctx.p->MOV(blockip, GenRegister::immud(label)); + p->MOV(blockip, GenRegister::immud(label)); } void GenContext::clearFlagRegister(void) { // when group size not aligned to simdWidth, flag register need clear to // make prediction(any8/16h) work correctly - const GenRegister blockip = getBlockIP(*this); + const GenRegister blockip = getBlockIP(); p->push(); p->curr.noMask = 1; p->curr.predicate = GEN_PREDICATE_NONE; - setBlockIP(*this, blockip, getMaxLabel()); + setBlockIP(blockip, getMaxLabel()); p->curr.noMask = 0; - setBlockIP(*this, blockip, 0); + setBlockIP(blockip, 0); p->curr.execWidth = 1; if (ra->isAllocated(ir::ocl::zero)) p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), GenRegister::immuw(0)); @@ -219,7 +219,6 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); - const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); const GenRegister selStatckPtr = this->simdWidth == 8 ? @@ -228,28 +227,27 @@ namespace gbe const GenRegister stackptr = ra->genReg(selStatckPtr); // borrow block ip as temporary register as we will // initialize block ip latter. 
- const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD); + const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); + const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD); loadLaneID(stackptr); // We compute the per-lane stack pointer here - // threadId * perThreadSize + laneId*perLaneSize + // threadId * perThreadSize + laneId*perLaneSize or + // (threadId * simdWidth + laneId)*perLaneSize // let private address start from zero //p->MOV(stackptr, GenRegister::immud(0)); p->push(); p->curr.execWidth = 1; p->curr.predicate = GEN_PREDICATE_NONE; - p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth p->curr.execWidth = this->simdWidth; - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K p->curr.execWidth = 1; - if(perThreadSize > 0xffff) { - p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize)); - p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K - } else - p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize)); + p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize)); p->curr.execWidth = this->simdWidth; - p->ADD(stackptr, stackptr, tmpReg); + p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize p->pop(); } diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 22ec0eac..25cce85b 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -110,7 +110,8 @@ namespace gbe } void loadLaneID(GenRegister dst); - + GenRegister getBlockIP(void); + void setBlockIP(GenRegister blockip, uint32_t label); void 
collectShifter(GenRegister dest, GenRegister src); void loadTopHalf(GenRegister dest, GenRegister src); void storeTopHalf(GenRegister dest, GenRegister src); |