author    Yang, Rong R <rong.r.yang@intel.com>  2016-02-14 14:42:16 +0800
committer Yang Rong <rong.r.yang@intel.com>     2016-03-02 13:56:48 +0800
commit    4e7d5a0c7a269b2c0b70e37e4e7fcb254065c042 (patch)
tree      fe1ec5523e5c35ed70580061bf6acd130a37fca3 /backend
parent    49f040c9b34cf2d3e6af89af3cb77f82c2882b1e (diff)
download  beignet-4e7d5a0c7a269b2c0b70e37e4e7fcb254065c042.tar.gz
GBE: remove stacksize 64KB limitation.
If the stack size is larger than 64KB, the formula used to compute the stackptr must change from "threadId * perThreadSize + laneId * perLaneSize" to "(threadId * simdWidth + laneId) * perLaneSize", to avoid a Dword * Dword multiply.
V2: Only UD * UW is supported, and the UD operand (here the immediate) must be src0, but an immediate cannot be src0, so move it into a temporary register first.
Signed-off-by: Yang Rong <rong.r.yang@intel.com>
Reviewed-by: Ruiling Song <ruiling.song@intel.com>
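For illustration, a minimal host-side model of the reassociated addressing (plain C++, not the Gen ISA; the function name is hypothetical, the operand names mirror the commit): the point is that "threadId * simdWidth + laneId" stays within 16 bits, so the hardware can do a single UD * UW multiply even when perLaneSize * simdWidth no longer fits in 16 bits.

  #include <cstdint>
  #include <cassert>

  // Hypothetical model of the per-lane stack pointer; the real computation
  // is emitted as Gen EU instructions in the hunks below.
  static uint32_t stackOffset(uint32_t threadId, uint32_t laneId,
                              uint32_t simdWidth, uint32_t perLaneSize) {
    // old form: threadId * perThreadSize + laneId * perLaneSize, where
    // perThreadSize = perLaneSize * simdWidth may exceed 0xffff (UD * UD)
    uint32_t index = threadId * simdWidth + laneId;
    assert(index <= 0xffff);       // index is consumed as a UW operand
    return perLaneSize * index;    // one UD * UW multiply, any perLaneSize
  }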
Diffstat (limited to 'backend')
-rw-r--r--   backend/src/backend/context.cpp        2
-rw-r--r--   backend/src/backend/gen75_context.cpp  34
-rw-r--r--   backend/src/backend/gen_context.cpp    42
-rw-r--r--   backend/src/backend/gen_context.hpp     3
4 files changed, 41 insertions(+), 40 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5adeabc4..09917868 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -398,7 +398,7 @@ namespace gbe
uint32_t stackSize = 128;
while (stackSize < fn.getStackSize()) {
stackSize *= 3;
- GBE_ASSERT(stackSize <= 64*KB);
+ //GBE_ASSERT(stackSize <= 64*KB);
}
this->kernel->stackSize = stackSize;
}
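As a hedged restatement of the hunk above: the per-lane stack size reported by the kernel is rounded up from a 128-byte floor, and with the assertion commented out the result may now exceed 64KB. Something like:

  #include <cstdint>

  // Sketch of the rounding above; assumes 'needed' is fn.getStackSize().
  static uint32_t roundStackSize(uint32_t needed) {
    uint32_t stackSize = 128;
    while (stackSize < needed)
      stackSize *= 3;              // grows as in the loop; no 64KB cap now
    return stackSize;
  }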
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index fa8b0295..43767349 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -66,37 +66,39 @@ namespace gbe
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
- const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
-
- loadLaneID(stackptr);
+ // borrow block ip as a temporary register, as we will
+ // initialize block ip later.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UD);
// We compute the per-lane stack pointer here
- // private address start from zero
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
//p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
- p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
- p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
- p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+ p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+ p->SHR(stackptr, stackptr, GenRegister::immud(7));
+ p->SHL(tmpReg, tmpReg, GenRegister::immud(2));
+ p->ADD(tmpReg, tmpReg, stackptr); //threadId
+
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
p->curr.execWidth = this->simdWidth;
- p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
+ loadLaneID(stackptr);
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
p->curr.execWidth = 1;
- p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
- p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
- if(perThreadSize > 0xffff) {
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
- } else
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+ p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize
+
p->pop();
}
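The Gen75 sequence above first reassembles the hardware thread id, which this chip splits across two bit fields of r0.5 (the layout is inferred from the masks in the patch, not from the PRM), then applies the reassociated formula. A hedged host-side model, with a hypothetical function name:

  static uint32_t gen75StackPtr(uint32_t r0_dw5, uint32_t laneId,
                                uint32_t simdWidth, uint32_t perLaneSize) {
    uint32_t threadId = ((r0_dw5 & 0x7f) << 2)      // low field, shifted
                      + ((r0_dw5 & 0x180) >> 7);    // high field
    uint32_t index = threadId * simdWidth + laneId; // must stay below 64K
    return perLaneSize * index;                     // UD * UW; the immediate
                                                    // is staged in a register
  }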
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0ea0dd09..70912ef1 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -148,33 +148,33 @@ namespace gbe
}
/* Get proper block ip register according to current label width. */
- static GenRegister getBlockIP(GenContext &ctx) {
+ GenRegister GenContext::getBlockIP(void) {
GenRegister blockip;
- if (!ctx.isDWLabel())
- blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+ if (!isDWLabel())
+ blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
else
- blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
+ blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
return blockip;
}
/* Set current block ip register to a specified constant label value. */
- static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) {
- if (!ctx.isDWLabel())
- ctx.p->MOV(blockip, GenRegister::immuw(label));
+ void GenContext::setBlockIP(GenRegister blockip, uint32_t label) {
+ if (!isDWLabel())
+ p->MOV(blockip, GenRegister::immuw(label));
else
- ctx.p->MOV(blockip, GenRegister::immud(label));
+ p->MOV(blockip, GenRegister::immud(label));
}
void GenContext::clearFlagRegister(void) {
// when the group size is not aligned to simdWidth, the flag register needs
// clearing to make predication (any8h/16h) work correctly
- const GenRegister blockip = getBlockIP(*this);
+ const GenRegister blockip = getBlockIP();
p->push();
p->curr.noMask = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- setBlockIP(*this, blockip, getMaxLabel());
+ setBlockIP(blockip, getMaxLabel());
p->curr.noMask = 0;
- setBlockIP(*this, blockip, 0);
+ setBlockIP(blockip, 0);
p->curr.execWidth = 1;
if (ra->isAllocated(ir::ocl::zero))
p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), GenRegister::immuw(0));
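The two setBlockIP calls above implement a small trick: with noMask set, every lane's block ip is first written with getMaxLabel(), an out-of-range label; the second, masked write then resets only the live lanes to 0. Dead lanes therefore never compare equal to a real label, which keeps any8h/any16h predication sound for group sizes not aligned to simdWidth. A hypothetical per-lane view:

  // Per-lane model of what clearFlagRegister establishes; 'laneActive'
  // stands in for the dispatch mask, which the hardware supplies.
  static uint16_t initialBlockIP(bool laneActive, uint16_t maxLabel) {
    return laneActive ? 0 : maxLabel;  // inactive lanes match no real label
  }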
@@ -219,7 +219,6 @@ namespace gbe
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
- const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
@@ -228,28 +227,27 @@ namespace gbe
const GenRegister stackptr = ra->genReg(selStatckPtr);
// borrow block ip as a temporary register, as we will
// initialize block ip later.
- const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD);
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
loadLaneID(stackptr);
// We compute the per-lane stack pointer here
- // threadId * perThreadSize + laneId*perLaneSize
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
// let private address start from zero
//p->MOV(stackptr, GenRegister::immud(0));
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
p->curr.execWidth = this->simdWidth;
- p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
p->curr.execWidth = 1;
- if(perThreadSize > 0xffff) {
- p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize));
- p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
- } else
- p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize));
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, tmpReg);
+ p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize
p->pop();
}
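The generic path mirrors the Gen75 one but reads the thread id directly from the low nine bits of r0.5; the MOV of perLaneSize into the borrowed block-ip register exists only because, as the commit message notes, the UD source of the MUL must be src0 and an immediate cannot be. A hedged model under the same assumptions as before:

  static uint32_t genStackPtr(uint32_t r0_dw5, uint32_t laneId,
                              uint32_t simdWidth, uint32_t perLaneSize) {
    uint32_t threadId = r0_dw5 & 0x1ff;             // single contiguous field
    uint32_t index = threadId * simdWidth + laneId; // kept within a UW
    uint32_t src0 = perLaneSize;                    // stands in for tmpReg_ud
    return src0 * index;                            // single UD * UW multiply
  }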
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 22ec0eac..25cce85b 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -110,7 +110,8 @@ namespace gbe
}
void loadLaneID(GenRegister dst);
-
+ GenRegister getBlockIP(void);
+ void setBlockIP(GenRegister blockip, uint32_t label);
void collectShifter(GenRegister dest, GenRegister src);
void loadTopHalf(GenRegister dest, GenRegister src);
void storeTopHalf(GenRegister dest, GenRegister src);