author     Timur Kristóf <timur.kristof@gmail.com>   2019-11-27 11:04:47 +0100
committer  Daniel Schürmann <daniel@schuermann.dev>  2019-12-04 10:36:01 +0000
commit     e0bcefc3a0a15a8c7ec00cfa53fd8fffcc07342a (patch)
tree       5ec0a9aef34d60726058b553bc6ffca03e409513
parent     b4efe179ede6ea7d53bc8074048c96d2aa146701 (diff)
download   mesa-e0bcefc3a0a15a8c7ec00cfa53fd8fffcc07342a.tar.gz
aco/wave32: Use lane mask regclass for exec/vcc.
Currently all usages of exec and vcc are hardcoded to use s2 regclass.
This commit makes it possible to use s1 in wave32 mode and s2 in wave64 mode.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
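The gist of the change can be shown with a small self-contained C++ sketch. The real RegClass, Program and Builder types are far richer than this; the definitions below are simplified stand-ins, and only the names lane_mask, wave_size and lm mirror the diff. The program records a lane-mask register class chosen from its wave size, and the builder caches it, so exec/vcc definitions are sized for either mode instead of being hardcoded to s2.

/* Illustrative sketch only: simplified stand-ins for the ACO RegClass/Program/
 * Builder types. Only the names lane_mask, wave_size and lm mirror the diff. */
#include <cassert>
#include <cstdio>

enum class RegClass { s1, s2 };      /* 1 or 2 SGPRs */

struct Program {
    unsigned wave_size;              /* 32 or 64 */
    RegClass lane_mask;              /* regclass used for exec/vcc lane masks */
};

struct Builder {
    Program *program;
    RegClass lm;                     /* cached lane-mask regclass */
    explicit Builder(Program *pgm) : program(pgm), lm(pgm->lane_mask) {}
};

int main() {
    Program p{};
    p.wave_size = 32;
    /* Mirrors setup_isel_context(): s1 in wave32 mode, s2 in wave64 mode. */
    p.lane_mask = p.wave_size == 32 ? RegClass::s1 : RegClass::s2;

    Builder bld(&p);
    /* Code that previously hardcoded s2 (e.g. bld.def(s2, exec)) now asks
     * for bld.lm, so the mask is one SGPR wide in wave32 mode. */
    assert(bld.lm == RegClass::s1);
    std::printf("lane mask occupies %d SGPR(s)\n", bld.lm == RegClass::s1 ? 1 : 2);
    return 0;
}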
-rw-r--r--  src/amd/compiler/aco_builder_h.py                     21
-rw-r--r--  src/amd/compiler/aco_insert_exec_mask.cpp             72
-rw-r--r--  src/amd/compiler/aco_instruction_selection.cpp        249
-rw-r--r--  src/amd/compiler/aco_instruction_selection_setup.cpp  26
-rw-r--r--  src/amd/compiler/aco_ir.h                             1
-rw-r--r--  src/amd/compiler/aco_live_var_analysis.cpp            12
-rw-r--r--  src/amd/compiler/aco_lower_bool_phis.cpp              30
-rw-r--r--  src/amd/compiler/aco_lower_to_hw_instr.cpp            42
-rw-r--r--  src/amd/compiler/aco_reduce_assign.cpp                2
-rw-r--r--  src/amd/compiler/aco_register_allocation.cpp          1
-rw-r--r--  src/amd/compiler/aco_ssa_elimination.cpp              1
-rw-r--r--  src/amd/compiler/aco_validate.cpp                     2
12 files changed, 250 insertions, 209 deletions
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e70d9317b3f..ada0806f6a9 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -135,13 +135,14 @@ public:
Program *program;
bool use_iterator;
bool start; // only when use_iterator == false
+ RegClass lm;
std::vector<aco_ptr<Instruction>> *instructions;
std::vector<aco_ptr<Instruction>>::iterator it;
- Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), instructions(NULL) {}
- Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), instructions(&block->instructions) {}
- Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), instructions(instrs) {}
+ Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {}
+ Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {}
+ Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(instrs) {}
void moveEnd(Block *block) {
instructions = &block->instructions;
@@ -265,17 +266,26 @@ public:
% for fixed in ['m0', 'vcc', 'exec', 'scc']:
Operand ${fixed}(Temp tmp) {
+ % if fixed == 'vcc' or fixed == 'exec':
+ assert(tmp.regClass() == lm);
+ % endif
Operand op(tmp);
op.setFixed(aco::${fixed});
return op;
}
Definition ${fixed}(Definition def) {
+ % if fixed == 'vcc' or fixed == 'exec':
+ assert(def.regClass() == lm);
+ % endif
def.setFixed(aco::${fixed});
return def;
}
Definition hint_${fixed}(Definition def) {
+ % if fixed == 'vcc' or fixed == 'exec':
+ assert(def.regClass() == lm);
+ % endif
def.setHint(aco::${fixed});
return def;
}
@@ -350,11 +360,11 @@ public:
assert((post_ra || b.op.hasRegClass()) && b.op.regClass().type() == RegType::vgpr);
if (!carry_in.op.isUndefined())
- return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(s2)), a, b, carry_in);
+ return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(lm)), a, b, carry_in);
else if (program->chip_class >= GFX10 && carry_out)
return vop3(aco_opcode::v_add_co_u32_e64, Definition(dst), def(s2), a, b);
else if (program->chip_class < GFX9 || carry_out)
- return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(s2)), a, b);
+ return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(lm)), a, b);
else
return vop2(aco_opcode::v_add_u32, Definition(dst), a, b);
}
@@ -407,6 +417,7 @@ public:
}
return insert(std::move(sub));
}
+
<%
import itertools
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 31ae5ca658c..cbc0698096b 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -302,14 +302,15 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
return;
if (ctx.info[idx].exec.back().second & mask_type_global) {
Temp exec_mask = ctx.info[idx].exec.back().first;
- exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
+ exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
return;
}
/* otherwise, the WQM mask should be one below the current mask */
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
- ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+ assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+ ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
}
@@ -324,14 +325,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
!(ctx.info[idx].exec.back().second & mask_type_loop)) {
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_exact);
- ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+ assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+ ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
return;
}
/* otherwise, we create an exact mask and push to the stack */
Temp wqm = ctx.info[idx].exec.back().first;
- Temp exact = bld.tmp(s2);
- wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+ Temp exact = bld.tmp(bld.lm);
+ wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
ctx.info[idx].exec.back().first = wqm;
ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
@@ -359,7 +361,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
uint8_t mask = mask_type_global;
if (ctx.program->needs_wqm) {
- exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
+ exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
mask |= mask_type_wqm;
} else {
mask |= mask_type_exact;
@@ -383,7 +385,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Pseudo_instruction> phi;
for (int i = 0; i < info.num_exec_masks - 1; i++) {
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
- phi->definitions[0] = bld.def(s2);
+ phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
}
@@ -393,7 +395,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (info.has_divergent_break) {
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
- phi->definitions[0] = bld.def(s2);
+ phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
}
@@ -401,9 +403,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
/* create ssa name for loop active mask */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
if (info.has_divergent_continue)
- phi->definitions[0] = bld.def(s2);
+ phi->definitions[0] = bld.def(bld.lm);
else
- phi->definitions[0] = bld.def(s2, exec);
+ phi->definitions[0] = bld.def(bld.lm, exec);
phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
Temp loop_active = bld.insert(std::move(phi));
@@ -423,7 +425,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
i++;
}
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
- ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+ assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+ ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first), mask_type);
}
@@ -480,7 +483,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
/* create phi for loop footer */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
- phi->definitions[0] = bld.def(s2);
+ phi->definitions[0] = bld.def(bld.lm);
for (unsigned i = 0; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -510,7 +513,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
transition_to_Exact(ctx, bld, idx);
}
- ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+ assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+ ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
ctx.loop.pop_back();
@@ -536,7 +540,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
continue;
}
- Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
+ Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
ctx.info[preds[0]].exec[i].first,
ctx.info[preds[1]].exec[i].first);
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@@ -578,7 +582,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (block->kind & block_kind_merge) {
Temp restore = ctx.info[idx].exec.back().first;
- ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
+ assert(restore.size() == bld.lm.size());
+ ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
}
return i;
@@ -589,7 +594,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instructi
Operand offset = instr->operands[1];
if (need_check) {
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
- Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
+ Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
if (offset.isLiteral())
offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
@@ -665,7 +670,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(num);
Operand cond = instr->operands[0];
for (int i = num - 1; i >= 0; i--) {
- Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+ Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == num - 1) {
andn2->operands[0].setFixed(exec);
@@ -689,8 +694,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
Definition dst = instr->definitions[0];
+ assert(dst.size() == bld.lm.size());
if (state == Exact) {
- instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
+ instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
instr->operands[0] = Operand(0u);
instr->definitions[0] = dst;
} else {
@@ -710,7 +716,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
assert(exact_mask.second & mask_type_exact);
- instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
+ instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
instr->operands[1] = Operand(exact_mask.first);
instr->definitions[0] = dst;
@@ -726,8 +732,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
if (instr->operands.empty()) {
/* transition to exact and set exec to zero */
Temp old_exec = ctx.info[block->index].exec.back().first;
- Temp new_exec = bld.tmp(s2);
- cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+ Temp new_exec = bld.tmp(bld.lm);
+ cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
if (ctx.info[block->index].exec.back().second & mask_type_exact) {
ctx.info[block->index].exec.back().first = new_exec;
@@ -746,7 +752,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
num += ctx.info[block->index].exec.size() - 1;
for (int i = num - 1; i >= 0; i--) {
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
- Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+ Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == num - 1) {
andn2->operands[0].setFixed(exec);
@@ -878,13 +884,13 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
Temp old_exec = ctx.info[idx].exec.back().first;
- Temp new_exec = bld.tmp(s2);
- Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+ Temp new_exec = bld.tmp(bld.lm);
+ Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
ctx.info[idx].exec.back().first = new_exec;
for (int i = num - 1; i >= 0; i--) {
- Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+ Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[idx].exec.size() - 1)
andn2->definitions[0].setFixed(exec);
@@ -912,7 +918,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Temp cond = Temp();
for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) {
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) {
- cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
+ cond = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
break;
}
}
@@ -957,8 +963,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Temp current_exec = ctx.info[idx].exec.back().first;
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
- Temp then_mask = bld.tmp(s2);
- Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+ Temp then_mask = bld.tmp(bld.lm);
+ Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
ctx.info[idx].exec.back().first = old_exec;
@@ -978,7 +984,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
uint8_t mask_type = ctx.info[idx].exec.back().second;
ctx.info[idx].exec.pop_back();
Temp orig_exec = ctx.info[idx].exec.back().first;
- Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec),
+ Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
bld.def(s1, scc), orig_exec, bld.exec(then_mask));
/* add next current exec to the stack */
@@ -998,7 +1004,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
- exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+ exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, current_exec);
ctx.info[idx].exec[exec_idx].first = exec_mask;
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
@@ -1010,7 +1016,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
- ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+ ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
@@ -1028,7 +1034,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
break;
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
- exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+ exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, bld.exec(current_exec));
ctx.info[idx].exec[exec_idx].first = exec_mask;
}
@@ -1039,7 +1045,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
- ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+ ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index a2b2c21170c..9de9d5dec14 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -351,12 +351,12 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2
{
Builder bld(ctx->program, ctx->block);
if (!dst.id())
- dst = bld.tmp(s2);
+ dst = bld.tmp(bld.lm);
assert(val.regClass() == s1);
- assert(dst.regClass() == s2);
+ assert(dst.regClass() == bld.lm);
- return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
+ return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
}
Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
@@ -365,12 +365,12 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1
if (!dst.id())
dst = bld.tmp(s1);
- assert(val.regClass() == s2);
+ assert(val.regClass() == bld.lm);
assert(dst.regClass() == s1);
/* if we're currently in WQM mode, ensure that the source is also computed in WQM */
Temp tmp = bld.tmp(s1);
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2));
+ bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
return emit_wqm(ctx, tmp, dst);
}
@@ -489,6 +489,8 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
{
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
+ assert(src0.size() == src1.size());
+
aco_ptr<Instruction> vopc;
if (src1.type() == RegType::sgpr) {
if (src0.type() == RegType::vgpr) {
@@ -549,12 +551,13 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
{
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
+ Builder bld(ctx->program, ctx->block);
- assert(dst.regClass() == s2);
+ assert(dst.regClass() == bld.lm);
assert(src0.type() == RegType::sgpr);
assert(src1.type() == RegType::sgpr);
+ assert(src0.regClass() == src1.regClass());
- Builder bld(ctx->program, ctx->block);
/* Emit the SALU comparison instruction */
Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
/* Turn the result into a per-lane bool */
@@ -580,17 +583,17 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
emit_sopc_instruction(ctx, instr, op, dst);
}
-void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op64, Temp dst)
+void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
{
Builder bld(ctx->program, ctx->block);
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
- assert(dst.regClass() == s2);
- assert(src0.regClass() == s2);
- assert(src1.regClass() == s2);
+ assert(dst.regClass() == bld.lm);
+ assert(src0.regClass() == bld.lm);
+ assert(src1.regClass() == bld.lm);
- bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1);
+ bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}
void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
@@ -600,7 +603,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
Temp then = get_alu_src(ctx, instr->src[1]);
Temp els = get_alu_src(ctx, instr->src[2]);
- assert(cond.regClass() == s2);
+ assert(cond.regClass() == bld.lm);
if (dst.type() == RegType::vgpr) {
aco_ptr<Instruction> bcsel;
@@ -628,14 +631,15 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
}
if (instr->dest.dest.ssa.bit_size == 1) {
- assert(dst.regClass() == s2);
- assert(then.regClass() == s2);
- assert(els.regClass() == s2);
+ assert(dst.regClass() == bld.lm);
+ assert(then.regClass() == bld.lm);
+ assert(els.regClass() == bld.lm);
}
if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
if (dst.regClass() == s1 || dst.regClass() == s2) {
assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
+ assert(dst.size() == then.size());
aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
} else {
@@ -652,20 +656,20 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
assert(instr->dest.dest.ssa.bit_size == 1);
if (cond.id() != then.id())
- then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
+ then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
if (cond.id() == els.id())
- bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
+ bld.sop1(Builder::s_mov, Definition(dst), then);
else
- bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
- bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
+ bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
+ bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}
void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
aco_opcode op, uint32_t undo)
{
/* multiply by 16777216 to handle denormals */
- Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)),
+ Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
scaled = bld.vop1(op, bld.def(v1), scaled);
@@ -766,9 +770,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
case nir_op_inot: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->dest.dest.ssa.bit_size == 1) {
- assert(src.regClass() == s2);
- assert(dst.regClass() == s2);
- bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src);
+ assert(src.regClass() == bld.lm);
+ assert(dst.regClass() == bld.lm);
+ bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
} else if (dst.type() == RegType::sgpr) {
@@ -835,12 +839,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
} else if (dst.regClass() == v1) {
Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
- Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+ Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
} else if (dst.regClass() == v2) {
Temp upper = emit_extract_vector(ctx, src, 1, v1);
Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
- Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+ Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
@@ -901,7 +905,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_ior: {
if (instr->dest.dest.ssa.bit_size == 1) {
- emit_boolean_logic(ctx, instr, aco_opcode::s_or_b64, dst);
+ emit_boolean_logic(ctx, instr, Builder::s_or, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
} else if (dst.regClass() == s1) {
@@ -917,7 +921,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_iand: {
if (instr->dest.dest.ssa.bit_size == 1) {
- emit_boolean_logic(ctx, instr, aco_opcode::s_and_b64, dst);
+ emit_boolean_logic(ctx, instr, Builder::s_and, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
} else if (dst.regClass() == s1) {
@@ -933,7 +937,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_ixor: {
if (instr->dest.dest.ssa.bit_size == 1) {
- emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst);
+ emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
} else if (dst.regClass() == v1) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
} else if (dst.regClass() == s1) {
@@ -1709,16 +1713,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
case nir_op_fsign: {
Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
if (dst.size() == 1) {
- Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+ Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
- cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+ cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
} else if (dst.size() == 2) {
- Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+ Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
- cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
+ cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
@@ -1922,7 +1926,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
- Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
+ Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
@@ -1986,7 +1990,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_b2f32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- assert(src.regClass() == s2);
+ assert(src.regClass() == bld.lm);
if (dst.regClass() == s1) {
src = bool_to_scalar_condition(ctx, src);
@@ -2000,7 +2004,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_b2f64: {
Temp src = get_alu_src(ctx, instr->src[0]);
- assert(src.regClass() == s2);
+ assert(src.regClass() == bld.lm);
if (dst.regClass() == s2) {
src = bool_to_scalar_condition(ctx, src);
@@ -2073,7 +2077,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_b2i32: {
Temp src = get_alu_src(ctx, instr->src[0]);
- assert(src.regClass() == s2);
+ assert(src.regClass() == bld.lm);
if (dst.regClass() == s1) {
// TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
@@ -2087,7 +2091,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_i2b1: {
Temp src = get_alu_src(ctx, instr->src[0]);
- assert(dst.regClass() == s2);
+ assert(dst.regClass() == bld.lm);
if (src.type() == RegType::vgpr) {
assert(src.regClass() == v1 || src.regClass() == v2);
@@ -2164,7 +2168,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
- Temp cmp_res = bld.tmp(s2);
+ Temp cmp_res = bld.tmp(bld.lm);
bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
@@ -2338,14 +2342,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_ieq: {
if (instr->src[0].src.ssa->bit_size == 1)
- emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b64, dst);
+ emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
else
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
break;
}
case nir_op_ine: {
if (instr->src[0].src.ssa->bit_size == 1)
- emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst);
+ emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
else
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
break;
@@ -2405,8 +2409,10 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
Builder bld(ctx->program, ctx->block);
if (instr->def.bit_size == 1) {
- assert(dst.regClass() == s2);
- bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0)));
+ assert(dst.regClass() == bld.lm);
+ int val = instr->value[0].b ? -1 : 0;
+ Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
+ bld.sop1(Builder::s_mov, Definition(dst), op);
} else if (dst.size() == 1) {
bld.copy(Definition(dst), Operand(instr->value[0].u32));
} else {
@@ -3033,7 +3039,7 @@ Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alph
/* Convert back to the right type. */
if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
- Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
+ Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
} else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
@@ -3599,8 +3605,8 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
// TODO: optimize uniform conditions
Builder bld(ctx->program, ctx->block);
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- assert(src.regClass() == s2);
- src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ assert(src.regClass() == bld.lm);
+ src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_discard_if, src);
ctx->block->kind |= block_kind_uses_discard_if;
return;
@@ -3663,7 +3669,7 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
ctx->program->needs_exact = true;
/* save exec somewhere temporarily so that it doesn't get
* overwritten before the discard from outer exec masks */
- Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
+ Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_discard_if, cond);
ctx->block->kind |= block_kind_uses_discard_if;
return;
@@ -3950,7 +3956,7 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo
/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
* resource descriptor is 0 (invalid),
*/
- Temp compare = bld.tmp(s2);
+ Temp compare = bld.tmp(bld.lm);
bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
@@ -4739,12 +4745,12 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
if (offset > 0 && ctx->options->chip_class < GFX9) {
Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
- Temp carry = bld.tmp(s2);
+ Temp carry = bld.tmp(bld.lm);
bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
Operand(offset), addr0);
- bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
+ bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
Operand(0u), addr1,
carry).def(1).setHint(vcc);
@@ -5219,25 +5225,25 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te
return src;
} if (op == nir_op_iand && cluster_size == 4) {
//subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
- Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
- return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
- bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
+ Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
+ return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
+ bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
} else if (op == nir_op_ior && cluster_size == 4) {
//subgroupClusteredOr(val, 4) -> wqm(val & exec)
- return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
+ return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
+ bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
} else if (op == nir_op_iand && cluster_size == 64) {
//subgroupAnd(val) -> (exec & ~val) == 0
- Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
- return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
+ Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
+ return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
} else if (op == nir_op_ior && cluster_size == 64) {
//subgroupOr(val) -> (val & exec) != 0
- Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
+ Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
return bool_to_vector_condition(ctx, tmp);
} else if (op == nir_op_ixor && cluster_size == 64) {
//subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
- Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
- tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s1), bld.def(s1, scc), tmp);
+ Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
+ tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
return bool_to_vector_condition(ctx, tmp);
} else {
@@ -5256,25 +5262,28 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te
Temp tmp;
if (op == nir_op_iand)
- tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
else
- tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
- tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
+ if (ctx->program->wave_size == 64)
+ tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
+ else
+ tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
tmp = emit_extract_vector(ctx, tmp, 0, v1);
if (cluster_mask != 0xffffffff)
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
Definition cmp_def = Definition();
if (op == nir_op_iand) {
- cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
+ cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
} else if (op == nir_op_ior) {
- cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
} else if (op == nir_op_ixor) {
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
- cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
}
cmp_def.setHint(vcc);
return cmp_def.getTemp();
@@ -5290,9 +5299,9 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
//subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
Temp tmp;
if (op == nir_op_iand)
- tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
+ tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
else
- tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm));
Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
Temp lo = lohi.def(0).getTemp();
@@ -5301,11 +5310,11 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
Definition cmp_def = Definition();
if (op == nir_op_iand)
- cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+ cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
else if (op == nir_op_ior)
- cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
else if (op == nir_op_ixor)
- cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
cmp_def.setHint(vcc);
return cmp_def.getTemp();
@@ -5320,11 +5329,11 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
//subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
if (op == nir_op_iand)
- return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+ return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
else if (op == nir_op_ior)
- return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+ return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
else if (op == nir_op_ixor)
- return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
+ return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
assert(false);
return Temp();
@@ -5453,7 +5462,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
Temp pck0 = bld.tmp(v1);
Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
tmp1 = as_vgpr(ctx, tmp1);
- Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
+ Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
/* sample_pos = flat_load_dwordx2 addr */
@@ -5685,11 +5694,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_ballot: {
- Definition tmp = bld.def(s2);
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ Definition tmp = bld.def(dst.regClass());
if (instr->src[0].ssa->bit_size == 1) {
- assert(src.regClass() == s2);
- bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
+ assert(src.regClass() == bld.lm);
+ bld.sop2(Builder::s_and, tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
} else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
@@ -5699,7 +5709,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
nir_print_instr(&instr->instr, stderr);
fprintf(stderr, "\n");
}
- emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
+ emit_wqm(ctx, tmp.getTemp(), dst);
break;
}
case nir_intrinsic_shuffle:
@@ -5722,15 +5732,19 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
- assert(src.regClass() == s2);
- Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid);
+ assert(src.regClass() == bld.lm);
+ Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
} else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
- assert(src.regClass() == s2);
- Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
+ assert(src.regClass() == bld.lm);
+ Temp tmp;
+ if (ctx->program->wave_size == 64)
+ tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
+ else
+ tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
tmp = emit_extract_vector(ctx, tmp, 0, v1);
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
- emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
+ emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
@@ -5763,9 +5777,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else if (instr->dest.ssa.bit_size == 1) {
- assert(src.regClass() == s2);
- Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
- bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)));
+ assert(src.regClass() == bld.lm);
+ Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
+ bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
} else if (src.regClass() == s1) {
bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
@@ -5781,22 +5795,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
case nir_intrinsic_vote_all: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- assert(src.regClass() == s2);
- assert(dst.regClass() == s2);
+ assert(src.regClass() == bld.lm);
+ assert(dst.regClass() == bld.lm);
- Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
- Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp));
+ Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
+ Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
emit_wqm(ctx, val, dst);
break;
}
case nir_intrinsic_vote_any: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- assert(src.regClass() == s2);
- assert(dst.regClass() == s2);
+ assert(src.regClass() == bld.lm);
+ assert(dst.regClass() == bld.lm);
- Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
- Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp));
+ Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
+ Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp));
emit_wqm(ctx, val, dst);
break;
}
@@ -5879,7 +5893,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
Temp tmp_dst = bld.tmp(dst.regClass());
reduce->definitions[0] = Definition(tmp_dst);
- reduce->definitions[1] = bld.def(s2); // used internally
+ reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
reduce->definitions[2] = Definition();
reduce->definitions[3] = Definition(scc, s1);
reduce->definitions[4] = Definition();
@@ -5899,13 +5913,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
if (instr->dest.ssa.bit_size == 1) {
- assert(src.regClass() == s2);
+ assert(src.regClass() == bld.lm);
+ assert(dst.regClass() == bld.lm);
uint32_t half_mask = 0x11111111u << lane;
Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
- Temp tmp = bld.tmp(s2);
- bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
- bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
+ Temp tmp = bld.tmp(bld.lm);
+ bld.sop1(Builder::s_wqm, Definition(tmp),
+ bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
+ bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
emit_wqm(ctx, tmp, dst);
} else if (instr->dest.ssa.bit_size == 32) {
emit_wqm(ctx,
@@ -5957,10 +5972,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
if (instr->dest.ssa.bit_size == 1) {
- assert(src.regClass() == s2);
+ assert(src.regClass() == bld.lm);
src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
- Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
+ Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
emit_wqm(ctx, tmp, dst);
} else if (instr->dest.ssa.bit_size == 32) {
Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
@@ -6060,15 +6075,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_demote_if: {
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
- assert(src.regClass() == s2);
- Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ assert(src.regClass() == bld.lm);
+ Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
bld.pseudo(aco_opcode::p_demote_to_helper, cond);
ctx->block->kind |= block_kind_uses_demote;
ctx->program->needs_exact = true;
break;
}
case nir_intrinsic_first_invocation: {
- emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
+ emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
get_ssa_temp(ctx, &instr->dest.ssa));
break;
}
@@ -6180,14 +6195,14 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
Operand two(0x40000000u);
Operand four(0x40800000u);
- Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
+ Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
- Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
+ Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
- is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
- Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
+ is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
+ Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
// select sc
Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
@@ -6667,7 +6682,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
bld.scc(compare_cube_wa));
}
- tg4_compare_cube_wa64 = bld.tmp(s2);
+ tg4_compare_cube_wa64 = bld.tmp(bld.lm);
bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
@@ -6800,7 +6815,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
assert(dmask == 1 && dst.regClass() == v1);
assert(dst.id() != tmp_dst.id());
- Temp tmp = bld.tmp(s2);
+ Temp tmp = bld.tmp(bld.lm);
bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
@@ -6921,7 +6936,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)
{
aco_ptr<Pseudo_instruction> phi;
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
- assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2);
+ assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
logical |= ctx->block->kind & block_kind_merge;
@@ -7295,7 +7310,7 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond
ctx->block->kind |= block_kind_branch;
/* branch to linear then block */
- assert(cond.regClass() == s2);
+ assert(cond.regClass() == ctx->program->lane_mask);
aco_ptr<Pseudo_branch_instruction> branch;
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
branch->operands[0] = Operand(cond);
@@ -7439,7 +7454,7 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt)
ctx->block->kind |= block_kind_uniform;
/* emit branch */
- assert(cond.regClass() == s2);
+ assert(cond.regClass() == bld.lm);
// TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
cond = bool_to_scalar_condition(ctx, cond);
@@ -7825,7 +7840,7 @@ void handle_bc_optimize(isel_context *ctx)
ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
if (uses_center && uses_centroid) {
- Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)),
+ Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
@@ -7934,7 +7949,7 @@ void select_program(Program *program,
Builder bld(ctx.program, ctx.block);
Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
- Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
+ Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
begin_divergent_if_then(&ctx, &ic, cond);
}
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index ab96a4507cf..a7446c6c058 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -126,6 +126,7 @@ unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
void init_context(isel_context *ctx, nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+ unsigned lane_mask_size = ctx->program->lane_mask.size();
ctx->shader = shader;
ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
@@ -207,7 +208,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
case nir_op_ieq:
case nir_op_ine:
case nir_op_i2b1:
- size = 2;
+ size = lane_mask_size;
break;
case nir_op_f2i64:
case nir_op_f2u64:
@@ -219,7 +220,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
break;
case nir_op_bcsel:
if (alu_instr->dest.dest.ssa.bit_size == 1) {
- size = 2;
+ size = lane_mask_size;
} else {
if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) {
type = RegType::vgpr;
@@ -237,14 +238,14 @@ void init_context(isel_context *ctx, nir_shader *shader)
break;
case nir_op_mov:
if (alu_instr->dest.dest.ssa.bit_size == 1) {
- size = 2;
+ size = lane_mask_size;
} else {
type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr;
}
break;
default:
if (alu_instr->dest.dest.ssa.bit_size == 1) {
- size = 2;
+ size = lane_mask_size;
} else {
for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
@@ -261,7 +262,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
if (nir_instr_as_load_const(instr)->def.bit_size == 64)
size *= 2;
else if (nir_instr_as_load_const(instr)->def.bit_size == 1)
- size *= 2;
+ size *= lane_mask_size;
allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size));
break;
}
@@ -289,11 +290,11 @@ void init_context(isel_context *ctx, nir_shader *shader)
case nir_intrinsic_first_invocation:
type = RegType::sgpr;
if (intrinsic->dest.ssa.bit_size == 1)
- size = 2;
+ size = lane_mask_size;
break;
case nir_intrinsic_ballot:
type = RegType::sgpr;
- size = 2;
+ size = lane_mask_size;
break;
case nir_intrinsic_load_sample_id:
case nir_intrinsic_load_sample_mask_in:
@@ -369,7 +370,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_exclusive_scan:
if (intrinsic->dest.ssa.bit_size == 1) {
- size = 2;
+ size = lane_mask_size;
type = RegType::sgpr;
} else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
type = RegType::sgpr;
@@ -384,11 +385,11 @@ void init_context(isel_context *ctx, nir_shader *shader)
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_is_helper_invocation:
type = RegType::sgpr;
- size = 2;
+ size = lane_mask_size;
break;
case nir_intrinsic_reduce:
if (intrinsic->dest.ssa.bit_size == 1) {
- size = 2;
+ size = lane_mask_size;
type = RegType::sgpr;
} else if (nir_intrinsic_cluster_size(intrinsic) == 0 ||
!ctx->divergent_vals[intrinsic->dest.ssa.index]) {
@@ -489,7 +490,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
if (phi->dest.ssa.bit_size == 1) {
assert(size == 1 && "multiple components not yet supported on boolean phis.");
type = RegType::sgpr;
- size *= 2;
+ size *= lane_mask_size;
allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size));
break;
}
@@ -590,7 +591,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx)
startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
arg++;
}
- startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, s2};
+ startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, ctx->program->lane_mask};
Pseudo_instruction *instr = startpgm.get();
ctx->block->instructions.push_back(std::move(startpgm));
@@ -796,6 +797,7 @@ setup_isel_context(Program* program,
program->chip_class = args->options->chip_class;
program->family = args->options->family;
program->wave_size = args->shader_info->wave_size;
+ program->lane_mask = program->wave_size == 32 ? s1 : s2;
program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256;
program->lds_limit = args->options->chip_class >= GFX7 ? 65536 : 32768;
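Editor's note: a minimal standalone sketch (not ACO code) of the rule the added line above encodes — a lane mask needs one 32-bit SGPR per 32 lanes, so wave32 fits in a single SGPR (s1) while wave64 needs a pair (s2). The helper name is hypothetical.

#include <cassert>
#include <cstdio>

/* Hypothetical helper for illustration only. */
static unsigned lane_mask_sgprs(unsigned wave_size)
{
   assert(wave_size == 32 || wave_size == 64);
   return wave_size / 32; /* 1 SGPR for wave32, 2 SGPRs for wave64 */
}

int main()
{
   printf("wave32 lane mask: %u SGPR(s)\n", lane_mask_sgprs(32));
   printf("wave64 lane mask: %u SGPR(s)\n", lane_mask_sgprs(64));
   return 0;
}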
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 4073086662a..1f4721f5ffd 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1149,6 +1149,7 @@ public:
enum chip_class chip_class;
enum radeon_family family;
unsigned wave_size;
+ RegClass lane_mask;
Stage stage; /* Stage */
bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
bool needs_wqm = false; /* there exists a p_wqm instruction */
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 05ddb7bc68a..4255d56173b 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -54,7 +54,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
bool exec_live = false;
if (block->live_out_exec != Temp()) {
live_sgprs.insert(block->live_out_exec);
- new_demand.sgpr += 2;
+ new_demand.sgpr += program->lane_mask.size();
exec_live = true;
}
@@ -77,10 +77,10 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (is_phi(insn))
break;
- /* substract the 2 sgprs from exec */
+ /* subtract the 1 or 2 sgprs from exec */
if (exec_live)
- assert(new_demand.sgpr >= 2);
- register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? 2 : 0));
+ assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
+ register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));
/* KILL */
for (Definition& definition : insn->definitions) {
@@ -144,8 +144,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
/* update block's register demand for a last time */
if (exec_live)
- assert(new_demand.sgpr >= 2);
- new_demand.sgpr -= exec_live ? 2 : 0;
+ assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
+ new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
block->register_demand.update(new_demand);
/* handle phi definitions */
diff --git a/src/amd/compiler/aco_lower_bool_phis.cpp b/src/amd/compiler/aco_lower_bool_phis.cpp
index dc64f0133b5..988f753c82d 100644
--- a/src/amd/compiler/aco_lower_bool_phis.cpp
+++ b/src/amd/compiler/aco_lower_bool_phis.cpp
@@ -54,12 +54,12 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state)
while (true) {
auto pos = state->latest.find(block_idx);
if (pos != state->latest.end())
- return Operand({pos->second, s2});
+ return Operand({pos->second, program->lane_mask});
Block& block = program->blocks[block_idx];
size_t pred = block.linear_preds.size();
if (pred == 0) {
- return Operand(s2);
+ return Operand(program->lane_mask);
} else if (pred == 1) {
block_idx = block.linear_preds[0];
continue;
@@ -75,10 +75,10 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state)
state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i;
}
}
- phi->definitions[0] = Definition(Temp{res, s2});
+ phi->definitions[0] = Definition(Temp{res, program->lane_mask});
block.instructions.emplace(block.instructions.begin(), std::move(phi));
- return Operand({res, s2});
+ return Operand({res, program->lane_mask});
}
}
}
@@ -118,7 +118,7 @@ Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previo
update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second);
}
- return {id, s2};
+ return {id, program->lane_mask};
}
void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
@@ -150,23 +150,25 @@ void lower_divergent_bool_phi(Program *program, Block *block, aco_ptr<Instructio
assert(phi->operands[i].isTemp());
Temp phi_src = phi->operands[i].getTemp();
- assert(phi_src.regClass() == s2);
+ assert(phi_src.regClass() == bld.lm);
Operand cur = get_ssa(program, pred->index, &state);
+ assert(cur.regClass() == bld.lm);
Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0);
+ assert(new_cur.regClass() == bld.lm);
if (cur.isUndefined()) {
insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr());
} else {
- Temp tmp1 = bld.tmp(s2), tmp2 = bld.tmp(s2);
+ Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm);
insert_before_logical_end(pred,
- bld.sop2(aco_opcode::s_andn2_b64, Definition(tmp1), bld.def(s1, scc),
- cur, Operand(exec, s2)).get_ptr());
+ bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc),
+ cur, Operand(exec, bld.lm)).get_ptr());
insert_before_logical_end(pred,
- bld.sop2(aco_opcode::s_and_b64, Definition(tmp2), bld.def(s1, scc),
- phi_src, Operand(exec, s2)).get_ptr());
+ bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc),
+ phi_src, Operand(exec, bld.lm)).get_ptr());
insert_before_logical_end(pred,
- bld.sop2(aco_opcode::s_or_b64, Definition(new_cur), bld.def(s1, scc),
+ bld.sop2(Builder::s_or, Definition(new_cur), bld.def(s1, scc),
tmp1, tmp2).get_ptr());
}
}
@@ -192,8 +194,8 @@ void lower_bool_phis(Program* program)
for (Block& block : program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode == aco_opcode::p_phi) {
- assert(phi->definitions[0].regClass() != s1);
- if (phi->definitions[0].regClass() == s2)
+ assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2);
+ if (phi->definitions[0].regClass() == program->lane_mask)
lower_divergent_bool_phi(program, &block, phi);
} else if (!is_phi(phi)) {
break;
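Editor's note: the hunks above swap hardcoded *_b64 SALU opcodes for Builder aliases (Builder::s_and, Builder::s_or, Builder::s_andn2, Builder::s_mov). A hedged sketch of the idea follows, assuming the alias simply selects the _b32 or _b64 form from the lane mask width; the real dispatch is generated by aco_builder_h.py and may differ in detail.

#include <cstdio>

/* Simplified stand-ins for illustration; not the real aco_opcode values. */
enum class op { s_and_b32, s_and_b64, s_or_b32, s_or_b64 };

/* Hypothetical selector: wave64 lane masks take the 64-bit SALU form. */
static op wave_opcode(bool wave64, op b32_form, op b64_form)
{
   return wave64 ? b64_form : b32_form;
}

int main()
{
   op o = wave_opcode(/*wave64=*/false, op::s_and_b32, op::s_and_b64);
   printf("selected %s\n", o == op::s_and_b32 ? "s_and_b32" : "s_and_b64");
   return 0;
}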
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index cbb3b55179c..e9c2d66d823 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -76,8 +76,10 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
{
Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
- if (instr->definitions.size() >= 2)
+ if (instr->definitions.size() >= 2) {
+ assert(instr->definitions[1].regClass() == bld.lm);
instr->definitions[1].setFixed(vcc);
+ }
}
void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
@@ -99,12 +101,12 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph
bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
dpp_ctrl, row_mask, bank_mask, bound_ctrl);
- bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), vtmp_op[0], src1[0]);
+ bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
} else {
- bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0],
+ bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
dpp_ctrl, row_mask, bank_mask, bound_ctrl);
}
- bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2),
+ bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm),
dpp_ctrl, row_mask, bank_mask, bound_ctrl);
} else if (op == iand64) {
bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0],
@@ -149,9 +151,9 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph
bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1],
dpp_ctrl, row_mask, bank_mask, bound_ctrl);
- bld.vopc(cmp, bld.def(s2, vcc), vtmp_op64, src1_64);
- bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, s2));
- bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, s2));
+ bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
+ bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
+ bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
} else if (op == imul64) {
/* t4 = dpp(x_hi)
* t1 = umul_lo(t4, y_lo)
@@ -216,11 +218,11 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
if (op == iadd64) {
if (ctx->program->chip_class >= GFX10) {
- bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), src0[0], src1[0]);
+ bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
} else {
- bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0]);
+ bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
}
- bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2));
+ bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm));
} else if (op == iand64) {
bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
@@ -249,9 +251,9 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
break;
}
- bld.vopc(cmp, bld.def(s2, vcc), src0_64, src1_64);
- bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, s2));
- bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, s2));
+ bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
+ bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
+ bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
} else if (op == imul64) {
if (src1_reg == dst_reg) {
/* it's fine if src0==dst but not if src1==dst */
@@ -298,7 +300,7 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg
if (!vop3) {
if (opcode == aco_opcode::v_add_co_u32)
- bld.vop2_dpp(opcode, dst, bld.def(s2, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+ bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
else
bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
return;
@@ -342,7 +344,7 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1
if (vop3) {
bld.vop3(opcode, dst, src0, src1);
} else if (opcode == aco_opcode::v_add_co_u32) {
- bld.vop2(opcode, dst, bld.def(s2, vcc), src0, src1);
+ bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
} else {
bld.vop2(opcode, dst, src0, src1);
}
@@ -420,7 +422,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
Operand vcndmask_identity[2] = {identity[0], identity[1]};
/* First, copy the source to tmp and set inactive lanes to the identity */
- bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
+ bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm));
for (unsigned i = 0; i < src.size(); i++) {
/* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
@@ -440,7 +442,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
for (unsigned i = 0; i < src.size(); i++) {
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
- Operand(stmp, s2));
+ Operand(stmp, bld.lm));
}
bool exec_restored = false;
@@ -463,7 +465,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (cluster_size == 32) {
for (unsigned i = 0; i < src.size(); i++)
bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10));
- bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+ bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
exec_restored = true;
emit_op(ctx, dst.physReg(), vtmp, tmp, PhysReg{0}, reduce_op, src.size());
dst_written = true;
@@ -500,7 +502,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
Operand(0xffffffffu), Operand(0xffffffffu)).instr;
static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
}
- bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+ bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));
/* fill in the gap in row 2 */
for (unsigned i = 0; i < src.size(); i++) {
@@ -559,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
if (!exec_restored)
- bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+ bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
if (op == aco_opcode::p_reduce && cluster_size == 64) {
for (unsigned k = 0; k < src.size(); k++) {
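Editor's note: the reduction lowering above saves and restores exec through wave-size-agnostic Builder aliases. For illustration only, a self-contained sketch of the corresponding "all lanes active" exec values, under the assumption that wave32 uses 0xffffffff and wave64 uses all 64 bits set; the helper is hypothetical.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

/* Hypothetical helper for illustration only. */
static uint64_t full_exec_mask(unsigned wave_size)
{
   return wave_size == 64 ? ~UINT64_C(0) : UINT64_C(0xffffffff);
}

int main()
{
   printf("wave32 exec = 0x%" PRIx64 "\n", full_exec_mask(32));
   printf("wave64 exec = 0x%" PRIx64 "\n", full_exec_mask(64));
   return 0;
}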
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 28a779580a2..68a0dc15761 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -172,7 +172,7 @@ void setup_reduce_temp(Program* program)
clobber_vcc = true;
if (clobber_vcc)
- instr->definitions[4] = Definition(vcc, s2);
+ instr->definitions[4] = Definition(vcc, bld.lm);
}
}
}
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index c4144cc42f0..504ad015746 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1719,6 +1719,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
pc->operands[i] = parallelcopy[i].first;
pc->definitions[i] = parallelcopy[i].second;
+ assert(pc->operands[i].size() == pc->definitions[i].size());
/* it might happen that the operand is already renamed. we have to restore the original name. */
std::map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp
index 3d76dcd8867..54e691ba476 100644
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@@ -58,6 +58,7 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info;
const auto result = info.emplace(preds[i], std::vector<std::pair<Definition, Operand>>());
+ assert(phi->definitions[0].size() == phi->operands[i].size());
result.first->second.emplace_back(phi->definitions[0], phi->operands[i]);
ctx.empty_blocks[preds[i]] = false;
}
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 8d2bf8449db..8282d7e27e3 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -190,7 +190,7 @@ void validate(Program* program, FILE * output)
}
} else if (instr->opcode == aco_opcode::p_phi) {
check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
- check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == s2, "Logical Phi Definition must be vgpr or divergent boolean", instr.get());
+ check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == program->lane_mask, "Logical Phi Definition must be vgpr or divergent boolean", instr.get());
} else if (instr->opcode == aco_opcode::p_linear_phi) {
for (const Operand& op : instr->operands)
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get());