diff options
author | Grigore Lupescu <grigore.lupescu at intel.com> | 2016-04-11 17:34:21 +0300 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-05-18 15:10:36 +0800 |
commit | d7c0c436663bd2f4ebf8557e937d331f91d98b77 (patch) | |
tree | 58fd2f4dda6f20b7a0df98bb263e83f11cdc93bb /backend | |
parent | a58415e7fe0efab87f1c2e3f393065f538299b69 (diff) | |
download | beignet-d7c0c436663bd2f4ebf8557e937d331f91d98b77.tar.gz |
Backend: Support for workgroup op predicate, scan inclusive/exclusive
Extended workgroup reduce functionality to support predicate and scan inclusive/exclusive
Most of the functionality is defined in the gen context
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
Diffstat (limited to 'backend')
-rw-r--r-- | backend/src/backend/gen_context.cpp | 486 | ||||
-rw-r--r-- | backend/src/backend/gen_insn_selection.cpp | 19 | ||||
-rw-r--r-- | backend/src/ir/instruction.cpp | 4 | ||||
-rw-r--r-- | backend/src/libocl/src/ocl_work_group.cl | 4 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_backend.cpp | 18 |
5 files changed, 356 insertions, 175 deletions
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index c6b3b589..e0416cee 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2845,123 +2845,262 @@ namespace gbe } } p->pop(); } - static void workgroupOpBetweenThread(GenRegister partialRes, - GenRegister value, - uint32_t wg_op, - GenEncoder *p) + + static void workgroupOp(GenRegister dst, + GenRegister src1, + GenRegister src2, + uint32_t wg_op, + GenEncoder *p) { - if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN - || wg_op == ir::WORKGROUP_OP_REDUCE_MAX) { - if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) - cond = GEN_CONDITIONAL_LE; - else - cond = GEN_CONDITIONAL_GE; - p->SEL_CMP(cond, partialRes, partialRes, value); - } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) { - p->ADD(partialRes, partialRes, value); - } - } + // REDUCE + if (wg_op == ir::WORKGROUP_OP_ANY) + p->OR(dst, src1, src2); + else if (wg_op == ir::WORKGROUP_OP_ALL) + p->AND(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD) + p->ADD(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); + + // INCLUSIVE + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD) + p->ADD(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); + + // EXCLUSIVE + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD) + p->ADD(dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2); + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2); - static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op) { - if (dataReg.type == GEN_TYPE_UD) { - if (wg_op == 
ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN - || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) { - p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF)); - } else { - GBE_ASSERT(wg_op == ir::WORKGROUP_OP_REDUCE_MAX || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX - || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX || wg_op == ir::WORKGROUP_OP_REDUCE_ADD - || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD); - p->MOV(dataReg, GenRegister::immud(0)); - } - } else if (dataReg.type == GEN_TYPE_F) { - if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN - || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) { - p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000)); // inf - } else if (wg_op == ir::WORKGROUP_OP_REDUCE_MAX || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX - || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) { - p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000)); // -inf - } else { - GBE_ASSERT(wg_op == ir::WORKGROUP_OP_REDUCE_ADD || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD - || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD); - p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x0)); - } - } else { + else GBE_ASSERT(0); - } } - static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData, - GenRegister tmp, uint32_t simd, uint32_t wg_op, GenEncoder *p) { - p->push(); - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->curr.execWidth = 1; - - /* Setting the init value here. */ - threadData = GenRegister::retype(threadData, theVal.type); - initValue(p, threadData, wg_op); + static void initValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op) + { - if (theVal.hstride != GEN_HORIZONTAL_STRIDE_0) { - /* We need to set the value out of dispatch mask to MAX. 
*/ - tmp = GenRegister::retype(tmp, theVal.type); - p->push(); - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - p->curr.execWidth = simd; - initValue(p, tmp, wg_op); - p->curr.noMask = 0; - p->MOV(tmp, theVal); - p->pop(); + if (wg_op == ir::WORKGROUP_OP_ALL) + { + if (dataReg.type == GEN_TYPE_D + || dataReg.type == GEN_TYPE_UD + || dataReg.type == GEN_TYPE_F) + p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF)); + else if(dataReg.type == GEN_TYPE_L || + dataReg.type == GEN_TYPE_UL) + p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL)); + else + GBE_ASSERT(0); /* unsupported data-type */ } - if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN || wg_op == ir::WORKGROUP_OP_REDUCE_MAX) { - uint32_t cond; - if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) - cond = GEN_CONDITIONAL_LE; + else if(wg_op == ir::WORKGROUP_OP_ANY + || wg_op == ir::WORKGROUP_OP_REDUCE_ADD + || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD) + { + if (dataReg.type == GEN_TYPE_D) + p->MOV(dataReg, GenRegister::immd(0x0)); + else if (dataReg.type == GEN_TYPE_UD) + p->MOV(dataReg, GenRegister::immud(0x0)); + else if (dataReg.type == GEN_TYPE_F) + p->MOV(dataReg, GenRegister::immf(0x0)); + else if (dataReg.type == GEN_TYPE_L) + p->MOV(dataReg, GenRegister::immint64(0x0)); + else if (dataReg.type == GEN_TYPE_UL) + p->MOV(dataReg, GenRegister::immuint64(0x0)); else - cond = GEN_CONDITIONAL_GE; + GBE_ASSERT(0); /* unsupported data-type */ + } - if (theVal.hstride == GEN_HORIZONTAL_STRIDE_0) { // an uniform value. 
- p->SEL_CMP(cond, threadData, threadData, theVal); - } else { - GBE_ASSERT(tmp.type == theVal.type); - GenRegister v = GenRegister::toUniform(tmp, theVal.type); - for (uint32_t i = 0; i < simd; i++) { - p->SEL_CMP(cond, threadData, threadData, v); - v.subnr += typeSize(theVal.type); - if (v.subnr == 32) { - v.subnr = 0; - v.nr++; - } - } - } + else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN + || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) + { + if (dataReg.type == GEN_TYPE_D) + p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF)); + else if (dataReg.type == GEN_TYPE_UD) + p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF)); + else if (dataReg.type == GEN_TYPE_F) + p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000)); + else if (dataReg.type == GEN_TYPE_L) + p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL)); + else if (dataReg.type == GEN_TYPE_UL) + p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL)); + else + GBE_ASSERT(0); /* unsupported data-type */ } - else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){ - tmp.hstride = GEN_HORIZONTAL_STRIDE_1; - tmp.vstride = GEN_VERTICAL_STRIDE_4; - tmp.width = 2; - - GBE_ASSERT(tmp.type == theVal.type); - GenRegister partialSum = tmp; - - /* adjust offset, compute add with ADD4/ADD */ - for (uint32_t i = 1; i < simd/4; i++){ - p->push(); { - tmp = tmp.suboffset(tmp, 4); - p->curr.execWidth = 4; - p->ADD(partialSum, partialSum, tmp); - } p->pop(); - } - for (uint32_t i = 0; i < 4; i++){ - p->push(); { - p->ADD(threadData, threadData, partialSum); - partialSum = partialSum.suboffset(partialSum, 1); - } p->pop(); - } + else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX + || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + { + if (dataReg.type == GEN_TYPE_D) + p->MOV(dataReg, GenRegister::immd(0x80000000)); + else if (dataReg.type == GEN_TYPE_UD) + p->MOV(dataReg, GenRegister::immud(0x0)); + else if (dataReg.type == 
GEN_TYPE_F) + p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000)); + else if (dataReg.type == GEN_TYPE_L) + p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L)); + else if (dataReg.type == GEN_TYPE_UL) + p->MOV(dataReg, GenRegister::immuint64(0x0)); + else + GBE_ASSERT(0); /* unsupported data-type */ } - p->pop(); -} + + /* unsupported operation */ + else + GBE_ASSERT(0); + } + + static void workgroupOpInThread(GenRegister threadDst, GenRegister inputVal, GenRegister threadExchangeData, + GenRegister resultVal, uint32_t simd, uint32_t wg_op, GenEncoder *p) { + p->push(); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + + /* setting the type */ + resultVal = GenRegister::retype(resultVal, inputVal.type); + threadDst = GenRegister::retype(threadDst, inputVal.type); + threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type); + + if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) { + p->MOV(threadExchangeData, inputVal); + p->pop(); + return; + } + + /* init thread data to min/max/null values */ + p->push(); { + p->curr.execWidth = simd; + initValue(p, threadExchangeData, wg_op); + p->MOV(resultVal, inputVal); + } p->pop(); + + GenRegister resultValSingle = resultVal; + resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0; + resultValSingle.vstride = GEN_VERTICAL_STRIDE_0; + resultValSingle.width = GEN_WIDTH_1; + + GenRegister inputValSingle = inputVal; + inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0; + inputValSingle.vstride = GEN_VERTICAL_STRIDE_0; + inputValSingle.width = GEN_WIDTH_1; + + vector<GenRegister> input; + vector<GenRegister> result; + + /* make an array of registers for easy accesing */ + for(uint32_t i = 0; i < simd; i++){ + /* add all resultVal offset reg positions from list */ + result.push_back(resultValSingle); + input.push_back(inputValSingle); + + /* move to next position */ + resultValSingle.subnr += typeSize(resultValSingle.type); + if 
(resultValSingle.subnr == 32) { + resultValSingle.subnr = 0; + resultValSingle.nr++; + } + /* move to next position */ + inputValSingle.subnr += typeSize(inputValSingle.type); + if (inputValSingle.subnr == 32) { + inputValSingle.subnr = 0; + inputValSingle.nr++; + } + } + + uint32_t start_i = 0; + if(wg_op == ir::WORKGROUP_OP_ANY || + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) { + p->MOV(result[0], input[0]); + start_i = 1; + } + + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) { + p->MOV(result[1], input[0]); + start_i = 2; + } + + /* algorithm workgroup */ + for (uint32_t i = start_i; i < simd; i++) + { + if(wg_op == ir::WORKGROUP_OP_ANY || + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + workgroupOp(result[0], result[0], input[i], wg_op, p); + + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) + workgroupOp(result[i], result[i - 1], input[i], wg_op, p); + + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + workgroupOp(result[i], result[i - 1], input[i - 1], wg_op, p); + + else + GBE_ASSERT(0); + } + + if(wg_op == ir::WORKGROUP_OP_ANY || + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + { + p->curr.execWidth = 16; + /* value exchanged with other threads */ + p->MOV(threadExchangeData, result[0]); + /* partial result thread */ + 
p->MOV(threadDst, result[0]); + } + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) + { + p->curr.execWidth = 16; + /* value exchanged with other threads */ + p->MOV(threadExchangeData, result[simd - 1]); + /* partial result thread */ + p->MOV(threadDst, resultVal); + } + else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + { + p->curr.execWidth = 1; + /* set result[0] to min/max/null */ + initValue(p, result[0], wg_op); + + p->curr.execWidth = 16; + /* value exchanged with other threads */ + workgroupOp(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p); + /* partial result thread */ + p->MOV(threadDst, resultVal); + } + + p->pop(); + } /** * Basic idea: @@ -2976,22 +3115,17 @@ namespace gbe void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){ const GenRegister dst = ra->genReg(insn.dst(0)); const GenRegister tmp = ra->genReg(insn.dst(1)); - - const GenRegister theVal = ra->genReg(insn.src(0)); - GenRegister threadData = ra->genReg(insn.src(1)); + const GenRegister theVal = ra->genReg(insn.src(2)); + GenRegister threadData = ra->genReg(insn.src(3)); GenRegister partialData = GenRegister::toUniform(threadData, dst.type); - GenRegister threadId = ra->genReg(insn.src(2)); - GenRegister threadNum = ra->genReg(insn.src(3)); - - threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD); - threadNum = GenRegister::toUniform(threadNum, GEN_TYPE_UD); + GenRegister threadId = ra->genReg(insn.src(0)); + GenRegister threadLoop = ra->genReg(insn.src(1)); uint32_t wg_op = insn.extra.workgroupOp; uint32_t simd = p->curr.execWidth; int32_t jip0, jip1; - GenRegister result = GenRegister::offset(dst, 0, 16); - result = GenRegister::toUniform(result, dst.type); + threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD); /* Use of continuous GRF allocation from insn 
selection */ GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type); @@ -3000,12 +3134,44 @@ namespace gbe GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type); /* Do some calculation within each thread */ - workgroupOpInThread(msg, theVal, threadData, tmp, simd, wg_op, p); + workgroupOpInThread(dst, theVal, threadData, tmp, simd, wg_op, p); + + p->curr.execWidth = 16; + p->MOV(theVal, dst); threadData = GenRegister::toUniform(threadData, dst.type); /* Store thread count for future use on read/write to SLM */ - GenRegister threadN = GenRegister::retype(tmp, GEN_TYPE_UD); - p->MOV(threadN, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn))); + if (wg_op == ir::WORKGROUP_OP_ANY || + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + { + //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn))); + threadLoop = GenRegister::retype(tmp, GEN_TYPE_D); + p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn))); + } + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + { + //p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid))); + threadLoop = GenRegister::retype(tmp, GEN_TYPE_D); + p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid))); + } + + /* TODO implement communication for DW types */ + if(dst.type == GEN_TYPE_UL || + dst.type == GEN_TYPE_L || + dst.type == GEN_TYPE_DF_IMM) + { + p->curr.execWidth = 16; + p->MOV(dst, threadData); + return; + } /* All threads write the partial results to SLM memory */ p->curr.execWidth = 8; @@ -3017,30 +3183,10 @@ namespace gbe /* Init partialData register, it will hold the final result */ initValue(p, 
partialData, wg_op); - /* Thread 0 will write extra elements for future reads in chunks of 4 */ - p->push();{ - p->curr.noMask = 1; - p->curr.flag = 0; - p->curr.subFlag = 1; - p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0)); - p->curr.predicate = GEN_PREDICATE_NORMAL; - p->curr.execWidth = 8; - p->MOV(msgData.offset(msgData, 0), partialData); - p->MOV(msgData.offset(msgData, 1), partialData); - p->MOV(msgData.offset(msgData, 2), partialData); - p->MUL(msgAddr, threadN, GenRegister::immd(0x4)); - p->ADD(msgAddr, msgAddr, msgSlmOff); - p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 3); - } p->pop(); - - /* Round threadN to nearest upper number divisible with 4 required for - * reading in chunks of 4 elements from SLM */ - p->ADD(threadN, threadN, GenRegister::immd(0x3)); - p->SHR(threadN, threadN, GenRegister::immd(0x2)); - p->SHL(threadN, threadN, GenRegister::immd(0x2)); - - /* Wait for all writes to complete in work-group */ - p->FENCE(tmp); + p->FENCE(msgData); + p->MOV(msgData, msgData); + p->FENCE(msgData); + p->MOV(msgData, msgData); /* Perform a loop, based on thread count (which is now multiple of 4) */ p->push();{ @@ -3050,31 +3196,67 @@ namespace gbe p->curr.predicate = GEN_PREDICATE_NONE; /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */ - p->ADD(threadN, threadN, GenRegister::immd(-4)); - p->MUL(msgAddr, threadN, GenRegister::immd(0x4)); + p->ADD(threadLoop, threadLoop, GenRegister::immd(-1)); + p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4)); p->ADD(msgAddr, msgAddr, msgSlmOff); - p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 4); + p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1); /* Perform operation, process 4 elements, partialData will hold result */ - workgroupOpBetweenThread(partialData, msgData.offset(msgData, 0), wg_op, p); - workgroupOpBetweenThread(partialData, msgData.offset(msgData, 1), wg_op, p); - workgroupOpBetweenThread(partialData, msgData.offset(msgData, 2), wg_op, p); - 
workgroupOpBetweenThread(partialData, msgData.offset(msgData, 3), wg_op, p); + workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p); /* While threadN is not 0, cycle read SLM / update value */ p->curr.noMask = 1; p->curr.flag = 0; p->curr.subFlag = 1; - p->CMP(GEN_CONDITIONAL_G, threadN, GenRegister::immd(0x0)); + p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0)); p->curr.predicate = GEN_PREDICATE_NORMAL; jip1 = p->n_instruction(); p->JMPI(GenRegister::immud(0)); p->patchJMPI(jip1, jip0 - jip1, 0); } p->pop(); - /* Save result to final register location dst */ - p->curr.execWidth = 16; - p->MOV(dst, partialData); + if(wg_op == ir::WORKGROUP_OP_ANY || + wg_op == ir::WORKGROUP_OP_ALL || + wg_op == ir::WORKGROUP_OP_REDUCE_ADD || + wg_op == ir::WORKGROUP_OP_REDUCE_MIN || + wg_op == ir::WORKGROUP_OP_REDUCE_MAX) + { + /* Save result to final register location dst */ + p->curr.execWidth = 16; + p->MOV(dst, partialData); + } + else { + /* Save result to final register location dst */ + p->curr.execWidth = 16; + if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD) + p->ADD(dst, dst, partialData); + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN) + p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData); + else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX + || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData); + } + + /* corner cases for threads 0 */ + if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN || + wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) + { + p->push();{ + p->curr.flag = 0; + p->curr.subFlag = 1; + p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0)); + p->curr.predicate = GEN_PREDICATE_NORMAL; + + p->curr.execWidth = 16; + 
p->MOV(dst, theVal); + } p->pop(); + } } void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 00691bb5..eea8739b 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -1984,10 +1984,10 @@ namespace gbe for(uint32_t i = 0; i < msg.size(); i++) insn->dst(2 + i) = msg[i]; - insn->src(0) = src; - insn->src(1) = data; - insn->src(2) = threadId; - insn->src(3) = threadN; + insn->src(0) = threadId; + insn->src(1) = threadN; + insn->src(2) = src; + insn->src(3) = data; insn->src(4) = slmOff; } @@ -6216,7 +6216,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn); GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid); GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), type); - GenRegister data = sel.selReg(sel.reg(FAMILY_WORD), type); + GenRegister data = sel.selReg(sel.reg(FAMILY_DWORD), type); GenRegister slmOff = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); vector<GenRegister> msg; @@ -6225,8 +6225,11 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp /* compute individual slice of workitems, (e.g. 
0->16 workitems) */ sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr())); - sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, - threadN, tmp, slmOff, msg); + + /* barrier for syn prior to workgroup */ + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); + sel.WORKGROUP_OP(workGroupOp, dst, src, data, threadId, threadN, tmp, slmOff, msg); + return true; } @@ -6304,7 +6307,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp if (workGroupOp == WORKGROUP_OP_BROADCAST){ return emitWGBroadcast(sel, insn); } - else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX){ + else if (workGroupOp >= WORKGROUP_OP_ANY && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX){ return emitWGReduce(sel, insn); } else diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 652c1fbf..d9051aba 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1461,13 +1461,11 @@ namespace ir { INLINE bool WorkGroupInstruction::wellFormed(const Function &fn, std::string &whyNot) const { const RegisterFamily family = getFamily(this->type); + if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false)) return false; if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false)) return false; - const Register src0 = fn.getRegister(src, 0); - if (UNLIKELY(checkRegisterData(family, src0, fn, whyNot) == false)) - return false; switch (this->workGroupOp) { case WORKGROUP_OP_ANY: diff --git a/backend/src/libocl/src/ocl_work_group.cl b/backend/src/libocl/src/ocl_work_group.cl index 5ad5a8f4..2c43d6d6 100644 --- a/backend/src/libocl/src/ocl_work_group.cl +++ b/backend/src/libocl/src/ocl_work_group.cl @@ -17,12 +17,12 @@ */ #include "ocl_work_group.h" -int __gen_ocl_work_group_all(int predicate); +int __gen_ocl_work_group_all(int); int work_group_all(int predicate) { return __gen_ocl_work_group_all(predicate); } -int 
__gen_ocl_work_group_any(int predicate); +int __gen_ocl_work_group_any(int); int work_group_any(int predicate) { return __gen_ocl_work_group_any(predicate); } diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 5eaca9cd..59a58c5e 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -3817,18 +3817,12 @@ namespace gbe GBE_ASSERT(f.getwgBroadcastSLM() >= 0); } - if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_REDUCE_ADD && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) { + else if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_ANY && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) { /* 1. For thread SLM based communication (default): * Threads will use SLM to write partial results computed individually and then read the whole set. Because the read is done in chunks of 4 extra padding is required. - * 2. For thread message based communication: - * Because we can not know the thread ID and the EUID for every physical - thead which the work items execute on before the run time. We need to - sync the thread execution order when using work group functions. We - create the workitems/threadID map table in slm. - When we come to here, the global thread local vars should have all been allocated, so it's safe for us to steal a piece of SLM for this usage. 
*/ @@ -3847,9 +3841,12 @@ namespace gbe if (opcode == ir::WORKGROUP_OP_ALL || opcode == ir::WORKGROUP_OP_ANY) { GBE_ASSERT(getType(ctx, (*AI)->getType()) == ir::TYPE_S32); - const ir::Register src = this->getRegister(*(AI++)); - const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1); - ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1, ir::TYPE_S32); + ir::Register src[3]; + src[0] = ir::ocl::threadn; + src[1] = ir::ocl::threadid; + src[2] = this->getRegister(*(AI++)); + const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3); + ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), getRegister(&I), srcTuple, 3, ir::TYPE_S32); } else if (opcode == ir::WORKGROUP_OP_BROADCAST) { int argNum = CS.arg_size(); std::vector<ir::Register> src(argNum); @@ -3867,6 +3864,7 @@ namespace gbe ir::Type ty; if (isSign) { ty = getType(ctx, (*AI)->getType()); + } else { ty = getUnsignedType(ctx, (*AI)->getType()); } |