summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHomer Hsing <homer.xing@intel.com>2013-06-21 12:26:31 +0800
committerZhigang Gong <zhigang.gong@linux.intel.com>2013-06-21 14:27:57 +0800
commit675086fabb6f32e38c5912931c8566ed8819eb79 (patch)
tree214c52f73ef9efe064c80765a2d889b56ba42b9a
parent104839f4c631dad188791c6584c9a114bc551a7b (diff)
downloadbeignet-675086fabb6f32e38c5912931c8566ed8819eb79.tar.gz
Support 64-bit float
support: arithmetic(+ - *) store load immediate_value if else select other change: add "nib control" field in machine instruction format support "nib control" fix "directly store float-64 after load float-64". change hard coded store size (4) to flexible size (4 or 8) when using float-64 load(store), change to SIMD8 example: /* support arithmetic store load immediate_value */ kernel void f(global double *src, global double *dst) { int i = get_global_id(0); double d = 1.234567890123456789; dst[i] = d * (src[i] + d); } /* support if else */ kernel void f(global float *src, global double *dst) { int i = get_global_id(0); float d = 1.234567890123456789f; if (i < 14) dst[i] = d * (d + src[i]); else dst[i] = 14; } /* support select */ kernel void f(global float *src, global double *dst) { int i = get_global_id(0); float d = 1.234567890123456789f; dst[i] = i < 14 ? d : 14; } Signed-off-by: Homer Hsing <homer.xing@intel.com> Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com> Tested-by: Yang Rong <rong.r.yang@intel.com>
-rw-r--r--backend/src/backend/gen_context.cpp17
-rw-r--r--backend/src/backend/gen_context.hpp2
-rw-r--r--backend/src/backend/gen_defs.hpp10
-rw-r--r--backend/src/backend/gen_encoder.cpp203
-rw-r--r--backend/src/backend/gen_encoder.hpp6
-rw-r--r--backend/src/backend/gen_insn_gen7_schedule_info.hxx2
-rw-r--r--backend/src/backend/gen_insn_selection.cpp148
-rw-r--r--backend/src/backend/gen_insn_selection.hxx4
-rw-r--r--backend/src/backend/gen_reg_allocation.cpp1
-rw-r--r--backend/src/backend/gen_register.hpp102
-rw-r--r--backend/src/llvm/llvm_gen_backend.cpp3
11 files changed, 476 insertions, 22 deletions
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 70c5bcf1..53ba73c1 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -153,6 +153,8 @@ namespace gbe
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
switch (insn.opcode) {
+ case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
+ case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
case SEL_OP_AND: p->AND(dst, src0, src1); break;
case SEL_OP_OR: p->OR (dst, src0, src1); break;
@@ -269,6 +271,14 @@ namespace gbe
p->pop();
}
+ void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.extra.function;
+ const uint32_t elemNum = insn.extra.elem;
+ p->READ_FLOAT64(dst, src, bti, elemNum);
+ }
+
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
@@ -277,6 +287,13 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, elemNum);
}
+ void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.extra.function;
+ const uint32_t elemNum = insn.extra.elem;
+ p->WRITE_FLOAT64(src, bti, elemNum);
+ }
+
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 1566cbb0..804384d8 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -87,6 +87,8 @@ namespace gbe
void emitBarrierInstruction(const SelectionInstruction &insn);
void emitFenceInstruction(const SelectionInstruction &insn);
void emitMathInstruction(const SelectionInstruction &insn);
+ void emitReadFloat64Instruction(const SelectionInstruction &insn);
+ void emitWriteFloat64Instruction(const SelectionInstruction &insn);
void emitUntypedReadInstruction(const SelectionInstruction &insn);
void emitUntypedWriteInstruction(const SelectionInstruction &insn);
void emitByteGatherInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index f4e4938d..9d8db5bf 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -215,6 +215,7 @@ enum GenMessageTarget {
#define GEN_TYPE_VF 5 /* packed float vector, immediates only? */
#define GEN_TYPE_HF 6
#define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF 6
#define GEN_TYPE_F 7
#define GEN_ARF_NULL 0x00
@@ -303,6 +304,7 @@ enum GenMessageTarget {
#define GEN_BYTE_SCATTER_BYTE 0
#define GEN_BYTE_SCATTER_WORD 1
#define GEN_BYTE_SCATTER_DWORD 2
+#define GEN_BYTE_SCATTER_QWORD 3
#define GEN_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define GEN_SAMPLER_RETURN_FORMAT_UINT32 2
@@ -418,7 +420,7 @@ struct GenInstruction
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2;
uint32_t src1_reg_type:3;
- uint32_t pad:1;
+ uint32_t nib_ctrl:1;
uint32_t dest_subreg_nr:5;
uint32_t dest_reg_nr:8;
uint32_t dest_horiz_stride:2;
@@ -432,7 +434,7 @@ struct GenInstruction
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2; /* 0x00000c00 */
uint32_t src1_reg_type:3; /* 0x00007000 */
- uint32_t pad:1;
+ uint32_t nib_ctrl:1;
int dest_indirect_offset:10; /* offset against the deref'd address reg */
uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
uint32_t dest_horiz_stride:2;
@@ -446,7 +448,7 @@ struct GenInstruction
uint32_t src0_reg_type:3;
uint32_t src1_reg_file:2;
uint32_t src1_reg_type:3;
- uint32_t pad:1;
+ uint32_t nib_ctrl:1;
uint32_t dest_writemask:4;
uint32_t dest_subreg_nr:1;
uint32_t dest_reg_nr:8;
@@ -459,7 +461,7 @@ struct GenInstruction
uint32_t dest_reg_type:3;
uint32_t src0_reg_file:2;
uint32_t src0_reg_type:3;
- uint32_t pad0:6;
+ uint32_t nib_ctrl:1;
uint32_t dest_writemask:4;
int dest_indirect_offset:6;
uint32_t dest_subreg_nr:3;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 859a1b98..3d8afe89 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -235,6 +235,7 @@ namespace gbe
NOT_IMPLEMENTED;
insn->header.acc_wr_control = this->curr.accWrEnable;
insn->header.quarter_control = this->curr.quarterControl;
+ insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
insn->header.mask_control = this->curr.noMask;
insn->bits2.ia1.flag_reg_nr = this->curr.flag;
insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
@@ -355,6 +356,105 @@ namespace gbe
0
};
+ static int dst_type(int exec_width) {
+ if (exec_width == 8)
+ return GEN_TYPE_UD;
+ if (exec_width == 16)
+ return GEN_TYPE_UW;
+ NOT_IMPLEMENTED;
+ return 0;
+ }
+
+ void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ int w = curr.execWidth;
+ dst = GenRegister::h2(dst);
+ dst.type = GEN_TYPE_UD;
+ src.type = GEN_TYPE_UD;
+ GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
+ GenRegister imm4 = GenRegister::immud(4);
+ GenInstruction *insn;
+ insn = next(GEN_OPCODE_SEND);
+ setHeader(insn);
+ setDst(insn, GenRegister::uw16grf(r.nr, 0));
+ setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+ push();
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(dst, r);
+ if (w == 8)
+ curr.nibControl = 1;
+ else
+ curr.quarterControl = 1;
+ MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
+ pop();
+ ADD(src, src, imm4);
+ insn = next(GEN_OPCODE_SEND);
+ setHeader(insn);
+ setDst(insn, GenRegister::uw16grf(r.nr, 0));
+ setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+ push();
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(GenRegister::suboffset(dst, 1), r);
+ if (w == 8)
+ curr.nibControl = 1;
+ else
+ curr.quarterControl = 1;
+ MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+ pop();
+ }
+
+ void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ int w = curr.execWidth;
+ GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
+ r.type = GEN_TYPE_UD;
+ GenRegister hdr = GenRegister::h2(r);
+ GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
+ src.hstride = GEN_HORIZONTAL_STRIDE_2;
+ GenRegister data = GenRegister::offset(r, w / 8);
+ GenRegister imm4 = GenRegister::immud(4);
+ MOV(r, GenRegister::ud8grf(msg.nr, 0));
+ push();
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(data, src);
+ if (w == 8)
+ curr.nibControl = 1;
+ else
+ curr.quarterControl = 1;
+ MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
+ pop();
+ GenInstruction *insn;
+ insn = next(GEN_OPCODE_SEND);
+ setHeader(insn);
+ setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
+ setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
+ setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
+
+ ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
+ push();
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(data, GenRegister::suboffset(src, 1));
+ if (w == 8)
+ curr.nibControl = 1;
+ else
+ curr.quarterControl = 1;
+ MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+ pop();
+ insn = next(GEN_OPCODE_SEND);
+ setHeader(insn);
+ setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
+ setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
+ setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
+ }
+
void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 || elemNum <= 4);
@@ -467,7 +567,25 @@ namespace gbe
}
INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
- if (needToSplitAlu1(p, dst, src) == false) {
+ if (dst.isdf() && src.isdf()) {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.quarterControl = 0;
+ p->curr.nibControl = 0;
+ GenInstruction *insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src);
+ if (w == 8)
+ p->curr.nibControl = 1; // second 1/8 mask
+ else // w == 16
+ p->curr.quarterControl = 1; // second 1/4 mask
+ insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+ p->pop();
+ } else if (needToSplitAlu1(p, dst, src) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
@@ -499,7 +617,27 @@ namespace gbe
GenRegister src0,
GenRegister src1)
{
- if (needToSplitAlu2(p, dst, src0, src1) == false) {
+ if (dst.isdf() && src0.isdf() && src1.isdf()) {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.quarterControl = 0;
+ p->curr.nibControl = 0;
+ GenInstruction *insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src0);
+ p->setSrc1(insn, src1);
+ if (w == 8)
+ p->curr.nibControl = 1; // second 1/8 mask
+ else // w == 16
+ p->curr.quarterControl = 1; // second 1/4 mask
+ insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+ p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+ p->pop();
+ } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
@@ -620,6 +758,67 @@ namespace gbe
alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
}
+ void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+ union { double d; unsigned u[2]; } u;
+ u.d = value;
+ GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+ push();
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.execWidth = 1;
+ MOV(r, GenRegister::immud(u.u[1]));
+ MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+ pop();
+ r.type = GEN_TYPE_DF;
+ r.vstride = GEN_VERTICAL_STRIDE_0;
+ r.width = GEN_WIDTH_1;
+ r.hstride = GEN_HORIZONTAL_STRIDE_0;
+ push();
+ MOV(dest, r);
+ pop();
+ }
+
+ void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+ int w = curr.execWidth;
+ if (src0.isdf()) {
+ push();
+ curr.execWidth = 16;
+ MOV(dest, src0);
+ if (w == 16) {
+ curr.quarterControl = 1;
+ MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
+ }
+ pop();
+ } else {
+ GenRegister r0 = GenRegister::h2(r);
+ push();
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ MOV(r0, src0);
+ MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
+ curr.predicate = GEN_PREDICATE_NORMAL;
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(dest, r);
+ curr.nibControl = 1;
+ MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8));
+ pop();
+ if (w == 16) {
+ push();
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ MOV(r0, GenRegister::suboffset(src0, 8));
+ MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
+ curr.predicate = GEN_PREDICATE_NORMAL;
+ curr.quarterControl = 1;
+ curr.nibControl = 0;
+ MOV(GenRegister::suboffset(dest, 8), r);
+ curr.nibControl = 1;
+ MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8));
+ pop();
+ }
+ }
+ }
+
ALU1(MOV)
ALU1(RNDZ)
ALU1(RNDE)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index c98774f8..1a5dcf9d 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -113,9 +113,11 @@ namespace gbe
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
+ ALU2(MOV_DF);
#undef ALU1
#undef ALU2
#undef ALU3
+ void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
/*! Barrier message (to synchronize threads of a workgroup) */
void BARRIER(GenRegister src);
/*! Memory fence message (to order loads and stores between threads) */
@@ -132,6 +134,10 @@ namespace gbe
void NOP(void);
/*! Wait instruction (used for the barrier) */
void WAIT(void);
+ /*! Read 64-bits float arrays */
+ void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+ /*! Write 64-bits float arrays */
+ void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped read (upto 4 channels) */
void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 098d9ecc..a3b4621a 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait, 20, 2, 2)
DECL_GEN7_SCHEDULE(Math, 20, 4, 2)
DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1)
DECL_GEN7_SCHEDULE(Fence, 80, 1, 1)
+DECL_GEN7_SCHEDULE(ReadFloat64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(WriteFloat64, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedRead, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 4e7cebd5..59014191 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -129,6 +129,7 @@ namespace gbe
case TYPE_S32: return GEN_TYPE_D;
case TYPE_U32: return GEN_TYPE_UD;
case TYPE_FLOAT: return GEN_TYPE_F;
+ case TYPE_DOUBLE: return GEN_TYPE_DF;
default: NOT_SUPPORTED; return GEN_TYPE_F;
}
}
@@ -166,11 +167,13 @@ namespace gbe
bool SelectionInstruction::isRead(void) const {
return this->opcode == SEL_OP_UNTYPED_READ ||
+ this->opcode == SEL_OP_READ_FLOAT64 ||
this->opcode == SEL_OP_BYTE_GATHER;
}
bool SelectionInstruction::isWrite(void) const {
return this->opcode == SEL_OP_UNTYPED_WRITE ||
+ this->opcode == SEL_OP_WRITE_FLOAT64 ||
this->opcode == SEL_OP_BYTE_SCATTER;
}
@@ -406,6 +409,8 @@ namespace gbe
#define ALU3(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
ALU1(MOV)
+ ALU2(MOV_DF)
+ ALU2(LOAD_DF_IMM)
ALU1(RNDZ)
ALU1(RNDE)
ALU2(SEL)
@@ -449,6 +454,10 @@ namespace gbe
void NOP(void);
/*! Wait instruction (used for the barrier) */
void WAIT(void);
+ /*! Read 64 bits float array */
+ void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+ /*! Write 64 bits float array */
+ void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */
@@ -610,20 +619,23 @@ namespace gbe
ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
SelectionBlock *block = insn->parent;
- const uint32_t simdWidth = ctx.getSimdWidth();
+ uint32_t simdWidth = ctx.getSimdWidth();
ir::Register tmp;
+ ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
+ int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
+ GenRegister gr;
// This will append the temporary register in the instruction block
this->block = block;
- tmp = this->reg(ir::FAMILY_DWORD);
+ tmp = this->reg(f);
// Generate the MOV instruction and replace the register in the instruction
SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
- mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F);
+ mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
mov->state = GenInstructionState(simdWidth);
- insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp);
+ gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp);
+ insn->dst(regID) = mov->src(0) = gr;
insn->append(*mov);
-
return tmp;
}
@@ -657,6 +669,7 @@ namespace gbe
case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+ case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
default: NOT_SUPPORTED;
}
GBE_ASSERT(false);
@@ -719,6 +732,33 @@ namespace gbe
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+ void Selection::Opaque::READ_FLOAT64(Reg addr,
+ const GenRegister *dst,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+
+ // Regular instruction to encode
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ insn->src(0) = addr;
+ insn->extra.function = bti;
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation
+ dstVector->regNum = elemNum;
+ dstVector->isSrc = 0;
+ dstVector->reg = &insn->dst(0);
+
+ // Source cannot be scalar (yet)
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
void Selection::Opaque::UNTYPED_READ(Reg addr,
const GenRegister *dst,
uint32_t elemNum,
@@ -746,6 +786,27 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
+ void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+ const GenRegister *src,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+ SelectionVector *vector = this->appendVector();
+
+ // Regular instruction to encode
+ insn->src(0) = addr;
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->src(elemID+1) = src[elemID];
+ insn->extra.function = bti;
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation for the sources
+ vector->regNum = elemNum+1;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+ }
+
void Selection::Opaque::UNTYPED_WRITE(Reg addr,
const GenRegister *src,
uint32_t elemNum,
@@ -1092,6 +1153,15 @@ namespace gbe
// Implementation of all patterns
///////////////////////////////////////////////////////////////////////////
+ bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+ using namespace ir;
+ const auto &childInsn = cast<LoadImmInstruction>(insn);
+ const auto &imm = childInsn.getImmediate();
+ if(imm.type != TYPE_DOUBLE)
+ return true;
+ return false;
+ }
+
GenRegister getRegisterFromImmediate(ir::Immediate imm)
{
using namespace ir;
@@ -1103,6 +1173,7 @@ namespace gbe
case TYPE_S16: return GenRegister::immw(imm.data.s16);
case TYPE_U8: return GenRegister::immuw(imm.data.u8);
case TYPE_S8: return GenRegister::immw(imm.data.s8);
+ case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
default: NOT_SUPPORTED; return GenRegister::immuw(0);
}
}
@@ -1146,7 +1217,13 @@ namespace gbe
const GenRegister src = sel.selReg(insn.getSrc(0));
switch (opcode) {
case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
- case ir::OP_MOV: sel.MOV(dst, src); break;
+ case ir::OP_MOV:
+ if (dst.isdf()) {
+ ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+ sel.MOV_DF(dst, src, sel.selReg(r));
+ } else
+ sel.MOV(dst, src);
+ break;
case ir::OP_RNDD: sel.RNDD(dst, src); break;
case ir::OP_RNDE: sel.RNDE(dst, src); break;
case ir::OP_RNDU: sel.RNDU(dst, src); break;
@@ -1225,14 +1302,14 @@ namespace gbe
SelectionDAG *dag1 = dag.child[1];
// Right source can always be an immediate
- if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+ if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
src0 = sel.selReg(insn.getSrc(0), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
if (dag0) dag0->isRoot = 1;
}
// Left source cannot be immediate but it is OK if we can commute
- else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
+ else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
src0 = sel.selReg(insn.getSrc(1), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1268,7 +1345,7 @@ namespace gbe
case OP_SHR: sel.SHR(dst, src0, src1); break;
case OP_ASR: sel.ASR(dst, src0, src1); break;
case OP_MUL:
- if (type == TYPE_FLOAT)
+ if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
sel.MUL(dst, src0, src1);
else if (type == TYPE_U32 || type == TYPE_S32) {
sel.pop();
@@ -1599,6 +1676,7 @@ namespace gbe
case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
+ case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break;
default: NOT_SUPPORTED;
}
sel.pop();
@@ -1650,6 +1728,8 @@ namespace gbe
INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
using namespace ir;
switch (type) {
+ case TYPE_DOUBLE:
+ return GEN_BYTE_SCATTER_QWORD;
case TYPE_FLOAT:
case TYPE_U32:
case TYPE_S32:
@@ -1681,6 +1761,22 @@ namespace gbe
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
+ void emitReadFloat64(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ vector<GenRegister> dst(valueNum);
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
+ dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ if (sel.ctx.getSimdWidth() == 16)
+ dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+ }
+
void emitByteGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
const uint32_t elemSize,
@@ -1732,6 +1828,8 @@ namespace gbe
const uint32_t elemSize = getByteScatterGatherSize(type);
if (insn.getAddressSpace() == MEM_CONSTANT)
this->emitIndirectMove(sel, insn, address);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else {
@@ -1762,6 +1860,25 @@ namespace gbe
sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
}
+ void emitWriteFloat64(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t addrID = ir::StoreInstruction::addressIndex;
+ GenRegister addr;
+ vector<GenRegister> value(valueNum);
+
+ addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+ value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+ value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ if (sel.ctx.getSimdWidth() == 16)
+ value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+ }
+
void emitByteScatter(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
const uint32_t elemSize,
@@ -1791,7 +1908,9 @@ namespace gbe
const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWriteFloat64(sel, insn, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedWrite(sel, insn, bti);
else {
const GenRegister address = sel.selReg(insn.getAddress());
@@ -1839,7 +1958,7 @@ namespace gbe
SelectionDAG *dag1 = dag.child[1];
// Right source can always be an immediate
- if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+ if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
src0 = sel.selReg(insn.getSrc(0), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1873,7 +1992,7 @@ namespace gbe
const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
// We need two instructions to make the conversion
- if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
+ if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
GenRegister unpacked;
if (dstFamily == FAMILY_WORD) {
const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -1886,6 +2005,9 @@ namespace gbe
}
sel.MOV(unpacked, src);
sel.MOV(dst, unpacked);
+ } else if (dst.isdf()) {
+ ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+ sel.MOV_DF(dst, src, sel.selReg(r));
} else
sel.MOV(dst, src);
return true;
@@ -1919,7 +2041,7 @@ namespace gbe
SelectionDAG *dag2 = dag.child[2];
// Right source can always be an immediate
- if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI) {
+ if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 789c81ca..4b5525b4 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,7 @@
DECL_SELECTION_IR(LABEL, LabelInstruction)
DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
DECL_SELECTION_IR(NOT, UnaryInstruction)
DECL_SELECTION_IR(LZD, UnaryInstruction)
DECL_SELECTION_IR(RNDZ, UnaryInstruction)
@@ -32,6 +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
DECL_SELECTION_IR(FENCE, FenceInstruction)
DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
+DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 9765b025..e7c96ac4 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -458,7 +458,6 @@ namespace gbe
}
bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
-
// Perform the linear scan allocator
const uint32_t regNum = ctx.sel->getRegNum();
for (uint32_t startID = 0; startID < regNum; ++startID) {
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d772b0d6..fedb743a 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -69,11 +69,12 @@ namespace gbe
/*! Type size in bytes for each Gen type */
INLINE int typeSize(uint32_t type) {
switch(type) {
+ case GEN_TYPE_DF:
+ return 8;
case GEN_TYPE_UD:
case GEN_TYPE_D:
case GEN_TYPE_F:
return 4;
- case GEN_TYPE_HF:
case GEN_TYPE_UW:
case GEN_TYPE_W:
return 2;
@@ -110,6 +111,7 @@ namespace gbe
INLINE GenInstructionState(uint32_t simdWidth = 8) {
this->execWidth = simdWidth;
this->quarterControl = GEN_COMPRESSION_Q1;
+ this->nibControl = 0;
this->accWrEnable = 0;
this->noMask = 0;
this->flag = 0;
@@ -126,6 +128,7 @@ namespace gbe
uint32_t flagIndex:16; //!< Only if virtual flag (index of the register)
uint32_t execWidth:5;
uint32_t quarterControl:1;
+ uint32_t nibControl:1;
uint32_t accWrEnable:1;
uint32_t noMask:1;
uint32_t predicate:4;
@@ -192,6 +195,7 @@ namespace gbe
/*! For immediates or virtual register */
union {
+ double df;
float f;
int32_t d;
uint32_t ud;
@@ -211,6 +215,31 @@ namespace gbe
uint32_t quarter:1; //!< To choose which part we want (Q1 / Q2)
uint32_t address_mode:1; //!< direct or indirect
+ static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+ GenRegister r = reg;
+ r.nr += nr;
+ r.subnr += subnr;
+ return r;
+ }
+
+ INLINE bool isimmdf(void) const {
+ if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+ return true;
+ return false;
+ }
+
+ INLINE bool isdf(void) const {
+ if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
+ static INLINE GenRegister h2(GenRegister reg) {
+ GenRegister r = reg;
+ r.hstride = GEN_HORIZONTAL_STRIDE_2;
+ return r;
+ }
+
static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
GBE_ASSERT(reg.physical == 0);
if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
@@ -293,6 +322,18 @@ namespace gbe
return reg;
}
+ static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+ return retype(vec16(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+ return retype(vec8(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+ return retype(vec1(file, reg), GEN_TYPE_DF);
+ }
+
static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
return retype(vec16(file, reg), GEN_TYPE_UD);
}
@@ -371,6 +412,12 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_0);
}
+ static INLINE GenRegister immdf(double df) {
+ GenRegister immediate = imm(GEN_TYPE_DF);
+ immediate.value.df = df;
+ return immediate;
+ }
+
static INLINE GenRegister immf(float f) {
GenRegister immediate = imm(GEN_TYPE_F);
immediate.value.f = f;
@@ -448,6 +495,18 @@ namespace gbe
return vec16(GEN_GENERAL_REGISTER_FILE, reg);
}
+ static INLINE GenRegister df1grf(ir::Register reg) {
+ return df1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df8grf(ir::Register reg) {
+ return df8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df16grf(ir::Register reg) {
+ return df16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
static INLINE GenRegister ud16grf(ir::Register reg) {
return ud16(GEN_GENERAL_REGISTER_FILE, reg);
}
@@ -608,11 +667,37 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_0);
}
+ static INLINE int hstride_size(GenRegister reg) {
+ switch (reg.hstride) {
+ case GEN_HORIZONTAL_STRIDE_0: return 0;
+ case GEN_HORIZONTAL_STRIDE_1: return 1;
+ case GEN_HORIZONTAL_STRIDE_2: return 2;
+ case GEN_HORIZONTAL_STRIDE_4: return 4;
+ default: NOT_IMPLEMENTED; return 0;
+ }
+ }
+
static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
- reg.subnr += delta * typeSize(reg.type);
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ reg.subnr += delta * typeSize(reg.type);
+ reg.nr += reg.subnr / 32;
+ reg.subnr %= 32;
+ }
return reg;
}
+ static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+ }
+
static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
}
@@ -685,6 +770,18 @@ namespace gbe
return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
}
+ static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+ return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+ return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+ return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
}
@@ -790,6 +887,7 @@ namespace gbe
return SIMD1(values...); \
} \
}
+ DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3a59da38..5b7754cc 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2164,6 +2164,7 @@ namespace gbe
}
void GenWriter::regAllocateStoreInst(StoreInst &I) {}
+ extern int OCL_SIMD_WIDTH;
template <bool isLoad, typename T>
INLINE void GenWriter::emitLoadOrStore(T &I)
{
@@ -2178,6 +2179,8 @@ namespace gbe
// Scalar is easy. We neednot build register tuples
if (isScalarType(llvmType) == true) {
const ir::Type type = getType(ctx, llvmType);
+ if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
+ OCL_SIMD_WIDTH = 8;
const ir::Register values = this->getRegister(llvmValues);
if (isLoad)
ctx.LOAD(type, ptr, addrSpace, dwAligned, values);