diff options
author | Yang, Rong R <rong.r.yang@intel.com> | 2016-02-14 14:42:16 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-03-02 13:56:48 +0800 |
commit | 4e7d5a0c7a269b2c0b70e37e4e7fcb254065c042 (patch) | |
tree | fe1ec5523e5c35ed70580061bf6acd130a37fca3 /backend | |
parent | 49f040c9b34cf2d3e6af89af3cb77f82c2882b1e (diff) | |
download | beignet-4e7d5a0c7a269b2c0b70e37e4e7fcb254065c042.tar.gz |
GBE: remove stacksize 64KB limitation.
If the stack size is larger than 64KB, the formula used to calculate the
stackptr must change, from "threadId * perThreadSize + laneId*perLaneSize" to
"(threadId * simdWidth + laneId)*perLaneSize", to avoid a Dword * Dword multiply.
V2: Only UD * UW is supported, and the UD operand must be src0; but an IMM
register can't be src0, so move the immediate into a temporary register first.
Signed-off-by: Yang Rong <rong.r.yang@intel.com>
Reviewed-by: Ruiling Song <ruiling.song@intel.com>
Diffstat (limited to 'backend')
-rw-r--r-- | backend/src/backend/context.cpp | 2 | ||||
-rw-r--r-- | backend/src/backend/gen75_context.cpp | 34 | ||||
-rw-r--r-- | backend/src/backend/gen_context.cpp | 42 | ||||
-rw-r--r-- | backend/src/backend/gen_context.hpp | 3 |
4 files changed, 41 insertions, 40 deletions
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 5adeabc4..09917868 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -398,7 +398,7 @@ namespace gbe uint32_t stackSize = 128; while (stackSize < fn.getStackSize()) { stackSize *= 3; - GBE_ASSERT(stackSize <= 64*KB); + //GBE_ASSERT(stackSize <= 64*KB); } this->kernel->stackSize = stackSize; } diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index fa8b0295..43767349 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -66,37 +66,39 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); - const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); const GenRegister selStatckPtr = this->simdWidth == 8 ? GenRegister::ud8grf(ir::ocl::stackptr) : GenRegister::ud16grf(ir::ocl::stackptr); const GenRegister stackptr = ra->genReg(selStatckPtr); - - loadLaneID(stackptr); + // borrow block ip as temporary register as we will + // initialize block ip latter. 
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); + const GenRegister tmpReg_ud = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UD); // We compute the per-lane stack pointer here - // private address start from zero + // threadId * perThreadSize + laneId*perLaneSize or + // (threadId * simdWidth + laneId)*perLaneSize p->push(); p->curr.execWidth = 1; p->curr.predicate = GEN_PREDICATE_NONE; //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); - p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); - p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); + p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); + p->SHR(stackptr, stackptr, GenRegister::immud(7)); + p->SHL(tmpReg, tmpReg, GenRegister::immud(2)); + p->ADD(tmpReg, tmpReg, stackptr); //threadId + + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth p->curr.execWidth = this->simdWidth; - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K + loadLaneID(stackptr); + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K p->curr.execWidth = 1; - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2)); - p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); - if(perThreadSize > 0xffff) { - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize)); - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K - } else - p->MUL(GenRegister::ud1grf(126,0), 
GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize)); + p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize)); p->curr.execWidth = this->simdWidth; - p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); + p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize + p->pop(); } diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 0ea0dd09..70912ef1 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -148,33 +148,33 @@ namespace gbe } /* Get proper block ip register according to current label width. */ - static GenRegister getBlockIP(GenContext &ctx) { + GenRegister GenContext::getBlockIP(void) { GenRegister blockip; - if (!ctx.isDWLabel()) - blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); + if (!isDWLabel()) + blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); else - blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip)); + blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip)); return blockip; } /* Set current block ip register to a specified constant label value. 
*/ - static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) { - if (!ctx.isDWLabel()) - ctx.p->MOV(blockip, GenRegister::immuw(label)); + void GenContext::setBlockIP(GenRegister blockip, uint32_t label) { + if (!isDWLabel()) + p->MOV(blockip, GenRegister::immuw(label)); else - ctx.p->MOV(blockip, GenRegister::immud(label)); + p->MOV(blockip, GenRegister::immud(label)); } void GenContext::clearFlagRegister(void) { // when group size not aligned to simdWidth, flag register need clear to // make prediction(any8/16h) work correctly - const GenRegister blockip = getBlockIP(*this); + const GenRegister blockip = getBlockIP(); p->push(); p->curr.noMask = 1; p->curr.predicate = GEN_PREDICATE_NONE; - setBlockIP(*this, blockip, getMaxLabel()); + setBlockIP(blockip, getMaxLabel()); p->curr.noMask = 0; - setBlockIP(*this, blockip, 0); + setBlockIP(blockip, 0); p->curr.execWidth = 1; if (ra->isAllocated(ir::ocl::zero)) p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), GenRegister::immuw(0)); @@ -219,7 +219,6 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); - const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); const GenRegister selStatckPtr = this->simdWidth == 8 ? @@ -228,28 +227,27 @@ namespace gbe const GenRegister stackptr = ra->genReg(selStatckPtr); // borrow block ip as temporary register as we will // initialize block ip latter. 
- const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD); + const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); + const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD); loadLaneID(stackptr); // We compute the per-lane stack pointer here - // threadId * perThreadSize + laneId*perLaneSize + // threadId * perThreadSize + laneId*perLaneSize or + // (threadId * simdWidth + laneId)*perLaneSize // let private address start from zero //p->MOV(stackptr, GenRegister::immud(0)); p->push(); p->curr.execWidth = 1; p->curr.predicate = GEN_PREDICATE_NONE; - p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth p->curr.execWidth = this->simdWidth; - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K p->curr.execWidth = 1; - if(perThreadSize > 0xffff) { - p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize)); - p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K - } else - p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize)); + p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize)); p->curr.execWidth = this->simdWidth; - p->ADD(stackptr, stackptr, tmpReg); + p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize p->pop(); } diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 22ec0eac..25cce85b 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -110,7 +110,8 @@ namespace gbe } void loadLaneID(GenRegister dst); - + GenRegister getBlockIP(void); + void setBlockIP(GenRegister blockip, uint32_t label); void 
collectShifter(GenRegister dest, GenRegister src); void loadTopHalf(GenRegister dest, GenRegister src); void storeTopHalf(GenRegister dest, GenRegister src); |