/* * Copyright © 2018 Valve Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * Authors: * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de) * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl) * */ #include #include #include #include #include "aco_ir.h" #include "sid.h" namespace aco { namespace { struct ra_ctx { std::bitset<512> war_hint; Program* program; std::unordered_map> assignments; std::map orig_names; unsigned max_used_sgpr = 0; unsigned max_used_vgpr = 0; std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ ra_ctx(Program* program) : program(program) {} }; /* helper function for debugging */ #if 0 void print_regs(ra_ctx& ctx, bool vgprs, std::array& reg_file) { unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr; unsigned lb = vgprs ? 256 : 0; unsigned ub = lb + max; char reg_char = vgprs ? 'v' : 's'; /* print markers */ printf(" "); for (unsigned i = lb; i < ub; i += 3) { printf("%.2u ", i - lb); } printf("\n"); /* print usage */ printf("%cgprs: ", reg_char); unsigned free_regs = 0; unsigned prev = 0; bool char_select = false; for (unsigned i = lb; i < ub; i++) { if (reg_file[i] == 0xFFFF) { printf("~"); } else if (reg_file[i]) { if (reg_file[i] != prev) { prev = reg_file[i]; char_select = !char_select; } printf(char_select ? "#" : "@"); } else { free_regs++; printf("."); } } printf("\n"); printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max); /* print assignments */ prev = 0; unsigned size = 0; for (unsigned i = lb; i < ub; i++) { if (reg_file[i] != prev) { if (prev && size > 1) printf("-%d]\n", i - 1 - lb); else if (prev) printf("]\n"); prev = reg_file[i]; if (prev && prev != 0xFFFF) { if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i]) printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb); else printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb); } size = 1; } else { size++; } } if (prev && size > 1) printf("-%d]\n", ub - lb - 1); else if (prev) printf("]\n"); } #endif void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) { unsigned max_addressible_sgpr = ctx.program->sgpr_limit; unsigned size = rc.size(); if (rc.type() == RegType::vgpr) { assert(reg >= 256); unsigned hi = reg - 256 + size - 1; ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); } else if (reg + rc.size() <= max_addressible_sgpr) { unsigned hi = reg + size - 1; ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); } } void update_renames(ra_ctx& ctx, std::array& reg_file, std::vector>& parallelcopies, aco_ptr& instr) { /* allocate id's and rename operands: this is done transparently here */ for (std::pair& copy : parallelcopies) { /* the definitions with id are not from this function and already handled */ if (copy.second.isTemp()) continue; // FIXME: if a definition got moved, change the target location and remove the parallelcopy copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass())); ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()}; for (unsigned i = copy.second.physReg().reg; i < copy.second.physReg() + copy.second.size(); i++) reg_file[i] = copy.second.tempId(); /* check if we moved an operand */ for (Operand& op : instr->operands) { if (!op.isTemp()) continue; if (op.tempId() == copy.first.tempId()) { bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKill(); for (std::pair& pc : parallelcopies) { PhysReg def_reg = pc.second.physReg(); omit_renaming &= def_reg > copy.first.physReg() ? (copy.first.physReg() + copy.first.size() <= def_reg.reg) : (def_reg + pc.second.size() <= copy.first.physReg().reg); } if (omit_renaming) continue; op.setTemp(copy.second.getTemp()); op.setFixed(copy.second.physReg()); } } } } std::pair get_reg_simple(ra_ctx& ctx, std::array& reg_file, uint32_t lb, uint32_t ub, uint32_t size, uint32_t stride, RegClass rc) { /* best fit algorithm: find the smallest gap to fit in the variable */ if (stride == 1) { unsigned best_pos = 0xFFFF; unsigned gap_size = 0xFFFF; unsigned next_pos = 0xFFFF; for (unsigned current_reg = lb; current_reg < ub; current_reg++) { if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) { if (next_pos == 0xFFFF) continue; /* check if the variable fits */ if (next_pos + size > current_reg) { next_pos = 0xFFFF; continue; } /* check if the tested gap is smaller */ if (current_reg - next_pos < gap_size) { best_pos = next_pos; gap_size = current_reg - next_pos; } next_pos = 0xFFFF; continue; } if (next_pos == 0xFFFF) next_pos = current_reg; } /* final check */ if (next_pos != 0xFFFF && next_pos + size <= ub && ub - next_pos < gap_size) { best_pos = next_pos; gap_size = ub - next_pos; } if (best_pos != 0xFFFF) { adjust_max_used_regs(ctx, rc, best_pos); return {PhysReg{best_pos}, true}; } return {{}, false}; } bool found = false; unsigned reg_lo = lb; unsigned reg_hi = lb + size - 1; while (!found && reg_lo + size <= ub) { if (reg_file[reg_lo] != 0) { reg_lo += stride; continue; } reg_hi = reg_lo + size - 1; found = true; for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) { if (reg_file[reg] != 0 || ctx.war_hint[reg]) found = false; } if (found) { adjust_max_used_regs(ctx, rc, reg_lo); return {PhysReg{reg_lo}, true}; } reg_lo += stride; } return {{}, false}; } bool get_regs_for_copies(ra_ctx& ctx, std::array& reg_file, std::vector>& parallelcopies, std::set> vars, uint32_t lb, uint32_t ub, aco_ptr& instr, uint32_t def_reg_lo, uint32_t def_reg_hi) { /* variables are sorted from small sized to large */ /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. */ for (std::set>::reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) { unsigned id = it->second; std::pair var = ctx.assignments[id]; uint32_t size = it->first; uint32_t stride = 1; if (var.second.type() == RegType::sgpr) { if (size == 2) stride = 2; if (size > 3) stride = 4; } /* check if this is a dead operand, then we can re-use the space from the definition */ bool is_dead_operand = false; for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && i < instr->operands.size(); i++) { if (instr->operands[i].isTemp() && instr->operands[i].isKill() && instr->operands[i].tempId() == id) is_dead_operand = true; } std::pair res; if (is_dead_operand) { if (instr->opcode == aco_opcode::p_create_vector) { for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { for (unsigned j = 0; j < size; j++) assert(reg_file[def_reg_lo + offset + j] == 0); res = {PhysReg{def_reg_lo + offset}, true}; break; } } } else { res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.second); } } else { res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.second); if (!res.second) { unsigned lb = (def_reg_hi + stride) & ~(stride - 1); res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.second); } } if (res.second) { /* mark the area as blocked */ for (unsigned i = res.first.reg; i < res.first + size; i++) reg_file[i] = 0xFFFFFFFF; /* create parallelcopy pair (without definition id) */ Temp tmp = Temp(id, var.second); Operand pc_op = Operand(tmp); pc_op.setFixed(var.first); Definition pc_def = Definition(res.first, pc_op.regClass()); parallelcopies.emplace_back(pc_op, pc_def); continue; } unsigned best_pos = lb; unsigned num_moves = 0xFF; unsigned num_vars = 0; /* we use a sliding window to find potential positions */ unsigned reg_lo = lb; unsigned reg_hi = lb + size - 1; for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) || (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi))) continue; /* second, check that we have at most k=num_moves elements in the window * and no element is larger than the currently processed one */ unsigned k = 0; unsigned n = 0; unsigned last_var = 0; bool found = true; for (unsigned j = reg_lo; found && j <= reg_hi; j++) { if (reg_file[j] == 0 || reg_file[j] == last_var) continue; /* 0xFFFF signals that this area is already blocked! */ if (reg_file[j] == 0xFFFFFFFF || k > num_moves) { found = false; break; } /* we cannot split live ranges of linear vgprs */ if (ctx.assignments[reg_file[j]].second & (1 << 6)) { found = false; break; } bool is_kill = false; for (const Operand& op : instr->operands) { if (op.isTemp() && op.isKill() && op.tempId() == reg_file[j]) { is_kill = true; break; } } if (!is_kill && ctx.assignments[reg_file[j]].second.size() >= size) { found = false; break; } k += ctx.assignments[reg_file[j]].second.size(); last_var = reg_file[j]; n++; if (k > num_moves || (k == num_moves && n <= num_vars)) { found = false; break; } } if (found) { best_pos = reg_lo; num_moves = k; num_vars = n; } } /* FIXME: we messed up and couldn't find space for the variables to be copied */ if (num_moves == 0xFF) return false; reg_lo = best_pos; reg_hi = best_pos + size - 1; /* collect variables and block reg file */ std::set> new_vars; for (unsigned j = reg_lo; j <= reg_hi; j++) { if (reg_file[j] != 0) { unsigned size = ctx.assignments[reg_file[j]].second.size(); unsigned id = reg_file[j]; new_vars.emplace(size, id); for (unsigned k = 0; k < size; k++) reg_file[ctx.assignments[id].first + k] = 0; } } /* mark the area as blocked */ for (unsigned i = reg_lo; i <= reg_hi; i++) reg_file[i] = 0xFFFFFFFF; if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi)) return false; adjust_max_used_regs(ctx, var.second, reg_lo); /* create parallelcopy pair (without definition id) */ Temp tmp = Temp(id, var.second); Operand pc_op = Operand(tmp); pc_op.setFixed(var.first); Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass()); parallelcopies.emplace_back(pc_op, pc_def); } return true; } std::pair get_reg_impl(ra_ctx& ctx, std::array& reg_file, std::vector>& parallelcopies, uint32_t lb, uint32_t ub, uint32_t size, uint32_t stride, RegClass rc, aco_ptr& instr) { unsigned regs_free = 0; /* check how many free regs we have */ for (unsigned j = lb; j < ub; j++) { if (reg_file[j] == 0) regs_free++; } /* mark and count killed operands */ unsigned killed_ops = 0; for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill() && instr->operands[j].physReg() >= lb && instr->operands[j].physReg() < ub) { assert(instr->operands[j].isFixed()); assert(reg_file[instr->operands[j].physReg().reg] == 0); for (unsigned k = 0; k < instr->operands[j].size(); k++) reg_file[instr->operands[j].physReg() + k] = 0xFFFFFFFF; killed_ops += instr->operands[j].getTemp().size(); } } assert(regs_free >= size); /* we might have to move dead operands to dst in order to make space */ unsigned op_moves = 0; if (size > (regs_free - killed_ops)) op_moves = size - (regs_free - killed_ops); /* find the best position to place the definition */ unsigned best_pos = lb; unsigned num_moves = 0xFF; unsigned num_vars = 0; /* we use a sliding window to check potential positions */ unsigned reg_lo = lb; unsigned reg_hi = lb + size - 1; for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { /* first check the edges: this is what we have to fix to allow for num_moves > size */ if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) continue; if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) continue; /* second, check that we have at most k=num_moves elements in the window * and no element is larger than the currently processed one */ unsigned k = op_moves; unsigned n = 0; unsigned remaining_op_moves = op_moves; unsigned last_var = 0; bool found = true; bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0; for (unsigned j = reg_lo; found && j <= reg_hi; j++) { if (reg_file[j] == 0 || reg_file[j] == last_var) continue; /* dead operands effectively reduce the number of estimated moves */ if (remaining_op_moves && reg_file[j] == 0xFFFFFFFF) { k--; remaining_op_moves--; continue; } if (ctx.assignments[reg_file[j]].second.size() >= size) { found = false; break; } /* we cannot split live ranges of linear vgprs */ if (ctx.assignments[reg_file[j]].second & (1 << 6)) { found = false; break; } k += ctx.assignments[reg_file[j]].second.size(); n++; last_var = reg_file[j]; } if (!found || k > num_moves) continue; if (k == num_moves && n < num_vars) continue; if (!aligned && k == num_moves && n == num_vars) continue; if (found) { best_pos = reg_lo; num_moves = k; num_vars = n; } } if (num_moves == 0xFF) { /* remove killed operands from reg_file once again */ for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill()) { for (unsigned k = 0; k < instr->operands[i].getTemp().size(); k++) reg_file[instr->operands[i].physReg() + k] = 0; } } for (unsigned i = 0; i < instr->definitions.size(); i++) { Definition def = instr->definitions[i]; if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { for (unsigned k = 0; k < def.getTemp().size(); k++) reg_file[def.physReg() + k] = def.tempId(); } } return {{}, false}; } std::array register_file = reg_file; /* now, we figured the placement for our definition */ std::set> vars; for (unsigned j = best_pos; j < best_pos + size; j++) { if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0) vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]); reg_file[j] = 0; } if (instr->opcode == aco_opcode::p_create_vector) { /* move killed operands which aren't yet at the correct position */ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && instr->operands[i].getTemp().type() == rc.type()) { if (instr->operands[i].physReg() != best_pos + offset) { vars.emplace(instr->operands[i].size(), instr->operands[i].tempId()); for (unsigned j = 0; j < instr->operands[i].size(); j++) reg_file[instr->operands[i].physReg() + j] = 0; } else { for (unsigned j = 0; j < instr->operands[i].size(); j++) reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId(); } } } } else { /* re-enable the killed operands */ for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill()) { for (unsigned k = 0; k < instr->operands[j].getTemp().size(); k++) reg_file[instr->operands[j].physReg() + k] = instr->operands[j].tempId(); } } } std::vector> pc; if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) { reg_file = std::move(register_file); /* remove killed operands from reg_file once again */ if (!is_phi(instr)) { for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) { for (unsigned k = 0; k < op.getTemp().size(); k++) reg_file[op.physReg() + k] = 0; } } } for (unsigned i = 0; i < instr->definitions.size(); i++) { Definition& def = instr->definitions[i]; if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { for (unsigned k = 0; k < def.getTemp().size(); k++) reg_file[def.physReg() + k] = def.tempId(); } } return {{}, false}; } parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); /* we set the definition regs == 0. the actual caller is responsible for correct setting */ for (unsigned i = 0; i < size; i++) reg_file[best_pos + i] = 0; update_renames(ctx, reg_file, parallelcopies, instr); /* remove killed operands from reg_file once again */ for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed()) continue; assert(!instr->operands[i].isUndefined()); if (instr->operands[i].isFirstKill()) { for (unsigned j = 0; j < instr->operands[i].getTemp().size(); j++) reg_file[instr->operands[i].physReg() + j] = 0; } } for (unsigned i = 0; i < instr->definitions.size(); i++) { Definition def = instr->definitions[i]; if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { for (unsigned k = 0; k < def.getTemp().size(); k++) reg_file[def.physReg() + k] = def.tempId(); } } adjust_max_used_regs(ctx, rc, best_pos); return {PhysReg{best_pos}, true}; } PhysReg get_reg(ra_ctx& ctx, std::array& reg_file, RegClass rc, std::vector>& parallelcopies, aco_ptr& instr) { uint32_t size = rc.size(); uint32_t stride = 1; uint32_t lb, ub; if (rc.type() == RegType::vgpr) { lb = 256; ub = 256 + ctx.program->max_reg_demand.vgpr; } else { lb = 0; ub = ctx.program->max_reg_demand.sgpr; if (size == 2) stride = 2; else if (size >= 4) stride = 4; } std::pair res = {{}, false}; /* try to find space without live-range splits */ if (rc.type() == RegType::vgpr && (size == 4 || size == 8)) res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc); if (!res.second) res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc); if (res.second) return res.first; /* try to find space with live-range splits */ res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr); if (res.second) return res.first; unsigned regs_free = 0; for (unsigned i = lb; i < ub; i++) { if (!reg_file[i]) regs_free++; } /* We should only fail here because keeping under the limit would require * too many moves. */ assert(regs_free >= size); /* try using more registers */ uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < 256) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); return get_reg(ctx, reg_file, rc, parallelcopies, instr); } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); return get_reg(ctx, reg_file, rc, parallelcopies, instr); } //FIXME: if nothing helps, shift-rotate the registers to make space unreachable("did not find a register"); } std::pair get_reg_vec(ra_ctx& ctx, std::array& reg_file, RegClass rc) { uint32_t size = rc.size(); uint32_t stride = 1; uint32_t lb, ub; if (rc.type() == RegType::vgpr) { lb = 256; ub = 256 + ctx.program->max_reg_demand.vgpr; } else { lb = 0; ub = ctx.program->max_reg_demand.sgpr; if (size == 2) stride = 2; else if (size >= 4) stride = 4; } return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc); } PhysReg get_reg_create_vector(ra_ctx& ctx, std::array& reg_file, RegClass rc, std::vector>& parallelcopies, aco_ptr& instr) { /* create_vector instructions have different costs w.r.t. register coalescing */ uint32_t size = rc.size(); uint32_t stride = 1; uint32_t lb, ub; if (rc.type() == RegType::vgpr) { lb = 256; ub = 256 + ctx.program->max_reg_demand.vgpr; } else { lb = 0; ub = ctx.program->max_reg_demand.sgpr; if (size == 2) stride = 2; else if (size >= 4) stride = 4; } unsigned best_pos = -1; unsigned num_moves = 0xFF; bool best_war_hint = true; /* test for each operand which definition placement causes the least shuffle instructions */ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { // TODO: think about, if we can alias live operands on the same register if (!instr->operands[i].isTemp() || !instr->operands[i].isKill() || instr->operands[i].getTemp().type() != rc.type()) continue; if (offset > instr->operands[i].physReg()) continue; unsigned reg_lo = instr->operands[i].physReg() - offset; unsigned reg_hi = reg_lo + size - 1; unsigned k = 0; /* no need to check multiple times */ if (reg_lo == best_pos) continue; /* check borders */ // TODO: this can be improved */ if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0) continue; if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) continue; if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) continue; /* count variables to be moved and check war_hint */ bool war_hint = false; for (unsigned j = reg_lo; j <= reg_hi; j++) { if (reg_file[j] != 0) k++; war_hint |= ctx.war_hint[j]; } /* count operands in wrong positions */ for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) { if (j == i || !instr->operands[j].isTemp() || instr->operands[j].getTemp().type() != rc.type()) continue; if (instr->operands[j].physReg() != reg_lo + offset) k += instr->operands[j].size(); } bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0; if (k > num_moves || (!aligned && k == num_moves) || (war_hint && !best_war_hint)) continue; best_pos = reg_lo; num_moves = k; best_war_hint = war_hint; } if (num_moves >= size) return get_reg(ctx, reg_file, rc, parallelcopies, instr); /* collect variables to be moved */ std::set> vars; for (unsigned i = best_pos; i < best_pos + size; i++) { if (reg_file[i] != 0) vars.emplace(ctx.assignments[reg_file[i]].second.size(), reg_file[i]); reg_file[i] = 0; } /* move killed operands which aren't yet at the correct position */ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && instr->operands[i].getTemp().type() == rc.type()) { if (instr->operands[i].physReg() != best_pos + offset) { vars.emplace(instr->operands[i].size(), instr->operands[i].tempId()); } else { for (unsigned j = 0; j < instr->operands[i].size(); j++) reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId(); } } } ASSERTED bool success = false; success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1); assert(success); update_renames(ctx, reg_file, parallelcopies, instr); adjust_max_used_regs(ctx, rc, best_pos); return PhysReg{best_pos}; } bool get_reg_specified(ra_ctx& ctx, std::array& reg_file, RegClass rc, std::vector>& parallelcopies, aco_ptr& instr, PhysReg reg) { uint32_t size = rc.size(); uint32_t stride = 1; uint32_t lb, ub; if (rc.type() == RegType::vgpr) { lb = 256; ub = 256 + ctx.program->max_reg_demand.vgpr; } else { if (size == 2) stride = 2; else if (size >= 4) stride = 4; if (reg % stride != 0) return false; lb = 0; ub = ctx.program->max_reg_demand.sgpr; } uint32_t reg_lo = reg.reg; uint32_t reg_hi = reg + (size - 1); if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi) return false; for (unsigned i = reg_lo; i <= reg_hi; i++) { if (reg_file[i] != 0) return false; } adjust_max_used_regs(ctx, rc, reg_lo); return true; } void handle_pseudo(ra_ctx& ctx, const std::array& reg_file, Instruction* instr) { if (instr->format != Format::PSEUDO) return; /* all instructions which use handle_operands() need this information */ switch (instr->opcode) { case aco_opcode::p_extract_vector: case aco_opcode::p_create_vector: case aco_opcode::p_split_vector: case aco_opcode::p_parallelcopy: case aco_opcode::p_wqm: break; default: return; } /* if all definitions are vgpr, no need to care for SCC */ bool writes_sgpr = false; for (Definition& def : instr->definitions) { if (def.getTemp().type() == RegType::sgpr) { writes_sgpr = true; break; } } if (!writes_sgpr) return; Pseudo_instruction *pi = (Pseudo_instruction *)instr; if (reg_file[scc.reg]) { pi->tmp_in_scc = true; int reg = ctx.max_used_sgpr; for (; reg >= 0 && reg_file[reg]; reg--) ; if (reg < 0) { reg = ctx.max_used_sgpr + 1; for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++) ; assert(reg < ctx.program->max_reg_demand.sgpr); } adjust_max_used_regs(ctx, s1, reg); pi->scratch_sgpr = PhysReg{(unsigned)reg}; } else { pi->tmp_in_scc = false; } } bool operand_can_use_reg(aco_ptr& instr, unsigned idx, PhysReg reg) { switch (instr->format) { case Format::SMEM: return reg != scc && reg != exec && (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */ default: // TODO: there are more instructions with restrictions on registers return true; } } } /* end namespace */ void register_allocation(Program *program, std::vector> live_out_per_block) { ra_ctx ctx(program); std::vector> renames(program->blocks.size()); struct phi_info { Instruction* phi; unsigned block_idx; std::set uses; }; bool filled[program->blocks.size()]; bool sealed[program->blocks.size()]; memset(filled, 0, sizeof filled); memset(sealed, 0, sizeof sealed); std::vector> incomplete_phis(program->blocks.size()); std::map phi_map; std::map affinities; std::function read_variable; std::function handle_live_in; std::function::iterator)> try_remove_trivial_phi; read_variable = [&](Temp val, unsigned block_idx) -> Temp { std::unordered_map::iterator it = renames[block_idx].find(val.id()); assert(it != renames[block_idx].end()); return it->second; }; handle_live_in = [&](Temp val, Block *block) -> Temp { std::vector& preds = val.is_linear() ? block->linear_preds : block->logical_preds; if (preds.size() == 0 && block->index != 0) { renames[block->index][val.id()] = val; return val; } assert(preds.size() > 0); Temp new_val; if (!sealed[block->index]) { /* consider rename from already processed predecessor */ Temp tmp = read_variable(val, preds[0]); /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */ new_val = Temp{program->allocateId(), val.regClass()}; aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; phi->definitions[0] = Definition(new_val); for (unsigned i = 0; i < preds.size(); i++) phi->operands[i] = Operand(val); if (tmp.regClass() == new_val.regClass()) affinities[new_val.id()] = tmp.id(); phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index}); incomplete_phis[block->index].emplace_back(phi.get()); block->instructions.insert(block->instructions.begin(), std::move(phi)); } else if (preds.size() == 1) { /* if the block has only one predecessor, just look there for the name */ new_val = read_variable(val, preds[0]); } else { /* there are multiple predecessors and the block is sealed */ Temp ops[preds.size()]; /* we start assuming that the name is the same from all predecessors */ renames[block->index][val.id()] = val; bool needs_phi = false; /* get the rename from each predecessor and check if they are the same */ for (unsigned i = 0; i < preds.size(); i++) { ops[i] = read_variable(val, preds[i]); if (i == 0) new_val = ops[i]; else needs_phi |= !(new_val == ops[i]); } if (needs_phi) { /* the variable has been renamed differently in the predecessors: we need to insert a phi */ aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; aco_ptr phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_val = Temp{program->allocateId(), val.regClass()}; phi->definitions[0] = Definition(new_val); for (unsigned i = 0; i < preds.size(); i++) { phi->operands[i] = Operand(ops[i]); phi->operands[i].setFixed(ctx.assignments[ops[i].id()].first); if (ops[i].regClass() == new_val.regClass()) affinities[new_val.id()] = ops[i].id(); } phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index}); block->instructions.insert(block->instructions.begin(), std::move(phi)); } } renames[block->index][val.id()] = new_val; renames[block->index][new_val.id()] = new_val; ctx.orig_names[new_val.id()] = val; return new_val; }; try_remove_trivial_phi = [&] (std::map::iterator info) -> Temp { assert(info->second.block_idx != 0); Instruction* phi = info->second.phi; Temp same = Temp(); Definition def = phi->definitions[0]; /* a phi node is trivial if all operands are the same as the definition of the phi */ for (const Operand& op : phi->operands) { const Temp t = op.getTemp(); if (t == same || t == def.getTemp()) continue; if (!(same == Temp()) || !(op.physReg() == def.physReg())) { /* phi is not trivial */ return def.getTemp(); } same = t; } assert(!(same == Temp() || same == def.getTemp())); /* reroute all uses to same and remove phi */ std::vector::iterator> phi_users; std::map::iterator same_phi_info = phi_map.find(same.id()); for (Instruction* instr : info->second.uses) { assert(phi != instr); /* recursively try to remove trivial phis */ if (is_phi(instr)) { /* ignore if the phi was already flagged trivial */ if (instr->definitions.empty()) continue; std::map::iterator it = phi_map.find(instr->definitions[0].tempId()); if (it != phi_map.end() && it != info) phi_users.emplace_back(it); } for (Operand& op : instr->operands) { if (op.isTemp() && op.tempId() == def.tempId()) { op.setTemp(same); if (same_phi_info != phi_map.end()) same_phi_info->second.uses.emplace(instr); } } } auto it = ctx.orig_names.find(same.id()); unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id(); for (unsigned i = 0; i < program->blocks.size(); i++) { auto it = renames[i].find(orig_var); if (it != renames[i].end() && it->second == def.getTemp()) renames[i][orig_var] = same; } unsigned block_idx = info->second.block_idx; phi->definitions.clear(); /* this indicates that the phi can be removed */ phi_map.erase(info); for (auto it : phi_users) { if (sealed[it->second.block_idx]) try_remove_trivial_phi(it); } /* due to the removal of other phis, the name might have changed once again! */ return renames[block_idx][orig_var]; }; std::map vectors; std::vector> phi_ressources; std::map temp_to_phi_ressources; for (std::vector::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) { Block& block = *it; /* first, compute the death points of all live vars within the block */ std::set& live = live_out_per_block[block.index]; std::vector>::reverse_iterator rit; for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { aco_ptr& instr = *rit; if (!is_phi(instr)) { for (const Operand& op : instr->operands) { if (op.isTemp()) live.emplace(op.getTemp()); } if (instr->opcode == aco_opcode::p_create_vector) { for (const Operand& op : instr->operands) { if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type()) vectors[op.tempId()] = instr.get(); } } } else if (!instr->definitions[0].isKill() && !instr->definitions[0].isFixed()) { /* collect information about affinity-related temporaries */ std::vector affinity_related; /* affinity_related[0] is the last seen affinity-related temp */ affinity_related.emplace_back(instr->definitions[0].getTemp()); affinity_related.emplace_back(instr->definitions[0].getTemp()); for (const Operand& op : instr->operands) { if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) { affinity_related.emplace_back(op.getTemp()); temp_to_phi_ressources[op.tempId()] = phi_ressources.size(); } } phi_ressources.emplace_back(std::move(affinity_related)); } /* erase from live */ for (const Definition& def : instr->definitions) { if (def.isTemp()) { live.erase(def.getTemp()); std::map::iterator it = temp_to_phi_ressources.find(def.tempId()); if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) phi_ressources[it->second][0] = def.getTemp(); } } } } /* create affinities */ for (std::vector& vec : phi_ressources) { assert(vec.size() > 1); for (unsigned i = 1; i < vec.size(); i++) if (vec[i].id() != vec[0].id()) affinities[vec[i].id()] = vec[0].id(); } /* state of register file after phis */ std::vector> sgpr_live_in(program->blocks.size()); for (Block& block : program->blocks) { std::set& live = live_out_per_block[block.index]; /* initialize register file */ assert(block.index != 0 || live.empty()); std::array register_file = {0}; ctx.war_hint.reset(); for (Temp t : live) { Temp renamed = handle_live_in(t, &block); if (ctx.assignments.find(renamed.id()) != ctx.assignments.end()) { for (unsigned i = 0; i < t.size(); i++) register_file[ctx.assignments[renamed.id()].first + i] = renamed.id(); } } std::vector> instructions; std::vector>::iterator it; /* this is a slight adjustment from the paper as we already have phi nodes: * We consider them incomplete phis and only handle the definition. */ /* handle fixed phi definitions */ for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { aco_ptr& phi = *it; if (!is_phi(phi)) break; Definition& definition = phi->definitions[0]; if (!definition.isFixed()) continue; /* check if a dead exec mask phi is needed */ if (definition.isKill()) { for (Operand& op : phi->operands) { assert(op.isTemp()); if (ctx.assignments.find(op.tempId()) == ctx.assignments.end() || ctx.assignments[op.tempId()].first != exec) { definition.setKill(false); break; } } } if (definition.isKill()) continue; assert(definition.physReg() == exec); for (unsigned i = 0; i < definition.size(); i++) { assert(register_file[definition.physReg() + i] == 0); register_file[definition.physReg() + i] = definition.tempId(); } ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; } /* look up the affinities */ for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { aco_ptr& phi = *it; if (!is_phi(phi)) break; Definition& definition = phi->definitions[0]; if (definition.isKill() || definition.isFixed()) continue; if (affinities.find(definition.tempId()) != affinities.end() && ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) { assert(ctx.assignments[affinities[definition.tempId()]].second == definition.regClass()); PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first; bool try_use_special_reg = reg == scc || reg == exec; if (try_use_special_reg) { for (const Operand& op : phi->operands) { if (!op.isTemp() || ctx.assignments.find(op.tempId()) == ctx.assignments.end() || !(ctx.assignments[op.tempId()].first == reg)) { try_use_special_reg = false; break; } } if (!try_use_special_reg) continue; } bool reg_free = true; for (unsigned i = reg.reg; reg_free && i < reg + definition.size(); i++) { if (register_file[i] != 0) reg_free = false; } /* only assign if register is still free */ if (reg_free) { definition.setFixed(reg); for (unsigned i = 0; i < definition.size(); i++) register_file[definition.physReg() + i] = definition.tempId(); ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; } } } /* find registers for phis without affinity or where the register was blocked */ for (it = block.instructions.begin();it != block.instructions.end(); ++it) { aco_ptr& phi = *it; if (!is_phi(phi)) break; Definition& definition = phi->definitions[0]; if (definition.isKill()) continue; renames[block.index][definition.tempId()] = definition.getTemp(); if (!definition.isFixed()) { std::vector> parallelcopy; /* try to find a register that is used by at least one operand */ for (const Operand& op : phi->operands) { if (!op.isTemp() || ctx.assignments.find(op.tempId()) == ctx.assignments.end()) continue; PhysReg reg = ctx.assignments[op.tempId()].first; /* we tried this already on the previous loop */ if (reg == scc || reg == exec) continue; if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) { definition.setFixed(reg); break; } } if (!definition.isFixed()) definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi)); /* process parallelcopy */ for (std::pair pc : parallelcopy) { /* rename */ std::map::iterator orig_it = ctx.orig_names.find(pc.first.tempId()); Temp orig = pc.first.getTemp(); if (orig_it != ctx.orig_names.end()) orig = orig_it->second; else ctx.orig_names[pc.second.tempId()] = orig; renames[block.index][orig.id()] = pc.second.getTemp(); renames[block.index][pc.second.tempId()] = pc.second.getTemp(); /* see if it's a copy from a previous phi */ //TODO: prefer moving some previous phis over live-ins //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec) Instruction *prev_phi = NULL; for (auto it2 = instructions.begin(); it2 != instructions.end(); ++it2) { if ((*it2)->definitions[0].tempId() == pc.first.tempId()) prev_phi = it2->get(); } if (prev_phi) { /* if so, just update that phi */ prev_phi->definitions[0] = pc.second; continue; } /* otherwise, this is a live-in and we need to create a new phi * to move it in this block's predecessors */ aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; std::vector& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_phi->definitions[0] = pc.second; for (unsigned i = 0; i < preds.size(); i++) new_phi->operands[i] = Operand(pc.first); instructions.emplace_back(std::move(new_phi)); } for (unsigned i = 0; i < definition.size(); i++) register_file[definition.physReg() + i] = definition.tempId(); ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; } live.emplace(definition.getTemp()); /* update phi affinities */ for (const Operand& op : phi->operands) { if (op.isTemp() && op.regClass() == phi->definitions[0].regClass()) affinities[op.tempId()] = definition.tempId(); } instructions.emplace_back(std::move(*it)); } /* fill in sgpr_live_in */ for (unsigned i = 0; i < ctx.max_used_sgpr; i++) sgpr_live_in[block.index][i] = register_file[i]; sgpr_live_in[block.index][127] = register_file[scc.reg]; /* Handle all other instructions of the block */ for (; it != block.instructions.end(); ++it) { aco_ptr& instr = *it; /* parallelcopies from p_phi are inserted here which means * live ranges of killed operands end here as well */ if (instr->opcode == aco_opcode::p_logical_end) { /* no need to process this instruction any further */ if (block.logical_succs.size() != 1) { instructions.emplace_back(std::move(instr)); continue; } Block& succ = program->blocks[block.logical_succs[0]]; unsigned idx = 0; for (; idx < succ.logical_preds.size(); idx++) { if (succ.logical_preds[idx] == block.index) break; } for (aco_ptr& phi : succ.instructions) { if (phi->opcode == aco_opcode::p_phi) { if (phi->operands[idx].isTemp() && phi->operands[idx].getTemp().type() == RegType::sgpr && phi->operands[idx].isFirstKill()) { Temp phi_op = read_variable(phi->operands[idx].getTemp(), block.index); PhysReg reg = ctx.assignments[phi_op.id()].first; assert(register_file[reg] == phi_op.id()); register_file[reg] = 0; } } else if (phi->opcode != aco_opcode::p_linear_phi) { break; } } instructions.emplace_back(std::move(instr)); continue; } std::vector> parallelcopy; assert(!is_phi(instr)); /* handle operands */ for (unsigned i = 0; i < instr->operands.size(); ++i) { auto& operand = instr->operands[i]; if (!operand.isTemp()) continue; /* rename operands */ operand.setTemp(read_variable(operand.getTemp(), block.index)); /* check if the operand is fixed */ if (operand.isFixed()) { if (operand.physReg() == ctx.assignments[operand.tempId()].first) { /* we are fine: the operand is already assigned the correct reg */ } else { /* check if target reg is blocked, and move away the blocking var */ if (register_file[operand.physReg().reg]) { uint32_t blocking_id = register_file[operand.physReg().reg]; RegClass rc = ctx.assignments[blocking_id].second; Operand pc_op = Operand(Temp{blocking_id, rc}); pc_op.setFixed(operand.physReg()); Definition pc_def = Definition(Temp{program->allocateId(), pc_op.regClass()}); /* find free reg */ PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr); pc_def.setFixed(reg); ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; for (unsigned i = 0; i < operand.size(); i++) { register_file[pc_op.physReg() + i] = 0; register_file[pc_def.physReg() + i] = pc_def.tempId(); } parallelcopy.emplace_back(pc_op, pc_def); /* handle renames of previous operands */ for (unsigned j = 0; j < i; j++) { Operand& op = instr->operands[j]; if (op.isTemp() && op.tempId() == blocking_id) { op = Operand(pc_def.getTemp()); op.setFixed(reg); } } } /* move operand to fixed reg and create parallelcopy pair */ Operand pc_op = operand; Temp tmp = Temp{program->allocateId(), operand.regClass()}; Definition pc_def = Definition(tmp); pc_def.setFixed(operand.physReg()); pc_op.setFixed(ctx.assignments[operand.tempId()].first); operand.setTemp(tmp); ctx.assignments[tmp.id()] = {pc_def.physReg(), pc_def.regClass()}; operand.setFixed(pc_def.physReg()); for (unsigned i = 0; i < operand.size(); i++) { register_file[pc_op.physReg() + i] = 0; register_file[pc_def.physReg() + i] = tmp.id(); } parallelcopy.emplace_back(pc_op, pc_def); } } else { assert(ctx.assignments.find(operand.tempId()) != ctx.assignments.end()); PhysReg reg = ctx.assignments[operand.tempId()].first; if (operand_can_use_reg(instr, i, reg)) { operand.setFixed(ctx.assignments[operand.tempId()].first); } else { Operand pc_op = operand; pc_op.setFixed(reg); PhysReg new_reg = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr); Definition pc_def = Definition(program->allocateId(), new_reg, pc_op.regClass()); ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; for (unsigned i = 0; i < operand.size(); i++) { register_file[pc_op.physReg() + i] = 0; register_file[pc_def.physReg() + i] = pc_def.tempId(); } parallelcopy.emplace_back(pc_op, pc_def); operand.setFixed(new_reg); } if (instr->format == Format::EXP || (instr->isVMEM() && i == 3 && program->chip_class == GFX6) || (instr->format == Format::DS && static_cast(instr.get())->gds)) { for (unsigned j = 0; j < operand.size(); j++) ctx.war_hint.set(operand.physReg().reg + j); } } std::map::iterator phi = phi_map.find(operand.getTemp().id()); if (phi != phi_map.end()) phi->second.uses.emplace(instr.get()); } /* remove dead vars from register file */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0; } /* try to optimize v_mad_f32 -> v_mac_f32 */ if (instr->opcode == aco_opcode::v_mad_f32 && instr->operands[2].isTemp() && instr->operands[2].isKill() && instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */ VOP3A_instruction* vop3 = static_cast(instr.get()); bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] || vop3->opsel[0] || vop3->opsel[1] || vop3->opsel[2] || vop3->neg[0] || vop3->neg[1] || vop3->neg[2] || vop3->clamp || vop3->omod); if (can_use_mac) { instr->format = Format::VOP2; instr->opcode = aco_opcode::v_mac_f32; } } /* handle definitions which must have the same register as an operand */ if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_writelane_b32) { instr->definitions[0].setFixed(instr->operands[2].physReg()); } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) { instr->definitions[0].setFixed(instr->operands[0].physReg()); } else if ((instr->format == Format::MUBUF || instr->format == Format::MIMG) && instr->definitions.size() == 1 && instr->operands.size() == 4) { instr->definitions[0].setFixed(instr->operands[3].physReg()); } ctx.defs_done.reset(); /* handle fixed definitions first */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { auto& definition = instr->definitions[i]; if (!definition.isFixed()) continue; adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); /* check if the target register is blocked */ if (register_file[definition.physReg().reg] != 0) { /* create parallelcopy pair to move blocking var */ Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second}; Operand pc_op = Operand(tmp); pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg]].first); RegClass rc = pc_op.regClass(); tmp = Temp{program->allocateId(), rc}; Definition pc_def = Definition(tmp); /* re-enable the killed operands, so that we don't move the blocking var there */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0xFFFF; } /* find a new register for the blocking variable */ PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr); /* once again, disable killed operands */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0; } for (unsigned k = 0; k < i; k++) { if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill()) for (unsigned j = 0; j < instr->definitions[k].size(); j++) register_file[instr->definitions[k].physReg() + j] = instr->definitions[k].tempId(); } pc_def.setFixed(reg); /* finish assignment of parallelcopy */ ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; parallelcopy.emplace_back(pc_op, pc_def); /* add changes to reg_file */ for (unsigned i = 0; i < pc_op.size(); i++) { register_file[pc_op.physReg() + i] = 0x0; register_file[pc_def.physReg() + i] = pc_def.tempId(); } } ctx.defs_done.set(i); if (!definition.isTemp()) continue; /* set live if it has a kill point */ if (!definition.isKill()) live.emplace(definition.getTemp()); ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; renames[block.index][definition.tempId()] = definition.getTemp(); for (unsigned j = 0; j < definition.size(); j++) register_file[definition.physReg() + j] = definition.tempId(); } /* handle all other definitions */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { auto& definition = instr->definitions[i]; if (definition.isFixed() || !definition.isTemp()) continue; /* find free reg */ if (definition.hasHint() && register_file[definition.physReg().reg] == 0) definition.setFixed(definition.physReg()); else if (instr->opcode == aco_opcode::p_split_vector) { PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()}; if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); definition.setFixed(reg); } else if (instr->opcode == aco_opcode::p_wqm) { PhysReg reg; if (instr->operands[0].isKill() && instr->operands[0].getTemp().type() == definition.getTemp().type()) { reg = instr->operands[0].physReg(); assert(register_file[reg.reg] == 0); } else { reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); } definition.setFixed(reg); } else if (instr->opcode == aco_opcode::p_extract_vector) { PhysReg reg; if (instr->operands[0].isKill() && instr->operands[0].getTemp().type() == definition.getTemp().type()) { reg = instr->operands[0].physReg(); reg.reg += definition.size() * instr->operands[1].constantValue(); assert(register_file[reg.reg] == 0); } else { reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); } definition.setFixed(reg); } else if (instr->opcode == aco_opcode::p_create_vector) { PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(), parallelcopy, instr); definition.setFixed(reg); } else if (affinities.find(definition.tempId()) != affinities.end() && ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) { PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first; if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) definition.setFixed(reg); else definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr)); } else if (vectors.find(definition.tempId()) != vectors.end()) { Instruction* vec = vectors[definition.tempId()]; unsigned offset = 0; for (const Operand& op : vec->operands) { if (op.isTemp() && op.tempId() == definition.tempId()) break; else offset += op.size(); } unsigned k = 0; for (const Operand& op : vec->operands) { if (op.isTemp() && op.tempId() != definition.tempId() && op.getTemp().type() == definition.getTemp().type() && ctx.assignments.find(op.tempId()) != ctx.assignments.end()) { PhysReg reg = ctx.assignments[op.tempId()].first; reg.reg = reg - k + offset; if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) { definition.setFixed(reg); break; } } k += op.size(); } if (!definition.isFixed()) { std::pair res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass()); PhysReg reg = res.first; if (res.second) { reg.reg += offset; } else { reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); } definition.setFixed(reg); } } else definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr)); assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) || (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256))); ctx.defs_done.set(i); /* set live if it has a kill point */ if (!definition.isKill()) live.emplace(definition.getTemp()); ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; renames[block.index][definition.tempId()] = definition.getTemp(); for (unsigned j = 0; j < definition.size(); j++) register_file[definition.physReg() + j] = definition.tempId(); } handle_pseudo(ctx, register_file, instr.get()); /* kill definitions */ for (const Definition& def : instr->definitions) { if (def.isTemp() && def.isKill()) { for (unsigned j = 0; j < def.size(); j++) { register_file[def.physReg() + j] = 0; } } } /* emit parallelcopy */ if (!parallelcopy.empty()) { aco_ptr pc; pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size())); bool temp_in_scc = register_file[scc.reg]; bool sgpr_operands_alias_defs = false; uint64_t sgpr_operands[4] = {0, 0, 0, 0}; for (unsigned i = 0; i < parallelcopy.size(); i++) { if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) { if (!sgpr_operands_alias_defs) { unsigned reg = parallelcopy[i].first.physReg().reg; unsigned size = parallelcopy[i].first.getTemp().size(); sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u); reg = parallelcopy[i].second.physReg().reg; size = parallelcopy[i].second.getTemp().size(); if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u)) sgpr_operands_alias_defs = true; } } pc->operands[i] = parallelcopy[i].first; pc->definitions[i] = parallelcopy[i].second; /* it might happen that the operand is already renamed. we have to restore the original name. */ std::map::iterator it = ctx.orig_names.find(pc->operands[i].tempId()); if (it != ctx.orig_names.end()) pc->operands[i].setTemp(it->second); unsigned orig_id = pc->operands[i].tempId(); ctx.orig_names[pc->definitions[i].tempId()] = pc->operands[i].getTemp(); pc->operands[i].setTemp(read_variable(pc->operands[i].getTemp(), block.index)); renames[block.index][orig_id] = pc->definitions[i].getTemp(); renames[block.index][pc->definitions[i].tempId()] = pc->definitions[i].getTemp(); std::map::iterator phi = phi_map.find(pc->operands[i].tempId()); if (phi != phi_map.end()) phi->second.uses.emplace(pc.get()); } if (temp_in_scc && sgpr_operands_alias_defs) { /* disable definitions and re-enable operands */ for (const Definition& def : instr->definitions) { if (def.isTemp() && !def.isKill()) { for (unsigned j = 0; j < def.size(); j++) { register_file[def.physReg() + j] = 0x0; } } } for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) { for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0xFFFF; } } handle_pseudo(ctx, register_file, pc.get()); /* re-enable live vars */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0x0; } for (const Definition& def : instr->definitions) { if (def.isTemp() && !def.isKill()) { for (unsigned j = 0; j < def.size(); j++) { register_file[def.physReg() + j] = def.tempId(); } } } } else { pc->tmp_in_scc = false; } instructions.emplace_back(std::move(pc)); } /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ bool instr_needs_vop3 = !instr->isVOP3() && ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) || ((instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_addc_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_subb_co_u32 || instr->opcode == aco_opcode::v_subrev_co_u32 || instr->opcode == aco_opcode::v_subbrev_co_u32) && !(instr->definitions[1].physReg() == vcc)) || ((instr->opcode == aco_opcode::v_addc_co_u32 || instr->opcode == aco_opcode::v_subb_co_u32 || instr->opcode == aco_opcode::v_subbrev_co_u32) && !(instr->operands[2].physReg() == vcc))); if (instr_needs_vop3) { /* if the first operand is a literal, we have to move it to a reg */ if (instr->operands.size() && instr->operands[0].isLiteral()) { bool can_sgpr = true; /* check, if we have to move to vgpr */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { can_sgpr = false; break; } } aco_ptr mov; if (can_sgpr) mov.reset(create_instruction(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)); else mov.reset(create_instruction(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)); mov->operands[0] = instr->operands[0]; Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1}; mov->definitions[0] = Definition(tmp); /* disable definitions and re-enable operands */ for (const Definition& def : instr->definitions) { for (unsigned j = 0; j < def.size(); j++) { register_file[def.physReg() + j] = 0x0; } } for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) { for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0xFFFF; } } mov->definitions[0].setFixed(get_reg(ctx, register_file, tmp.regClass(), parallelcopy, mov)); instr->operands[0] = Operand(tmp); instr->operands[0].setFixed(mov->definitions[0].physReg()); instructions.emplace_back(std::move(mov)); /* re-enable live vars */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKill()) for (unsigned j = 0; j < op.size(); j++) register_file[op.physReg() + j] = 0x0; } for (const Definition& def : instr->definitions) { if (def.isTemp() && !def.isKill()) { for (unsigned j = 0; j < def.size(); j++) { register_file[def.physReg() + j] = def.tempId(); } } } } /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ aco_ptr tmp = std::move(instr); Format format = asVOP3(tmp->format); instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); for (unsigned i = 0; i < instr->operands.size(); i++) { Operand& operand = tmp->operands[i]; instr->operands[i] = operand; /* keep phi_map up to date */ if (operand.isTemp()) { std::map::iterator phi = phi_map.find(operand.tempId()); if (phi != phi_map.end()) { phi->second.uses.erase(tmp.get()); phi->second.uses.emplace(instr.get()); } } } std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); } instructions.emplace_back(std::move(*it)); } /* end for Instr */ block.instructions = std::move(instructions); filled[block.index] = true; for (unsigned succ_idx : block.linear_succs) { Block& succ = program->blocks[succ_idx]; /* seal block if all predecessors are filled */ bool all_filled = true; for (unsigned pred_idx : succ.linear_preds) { if (!filled[pred_idx]) { all_filled = false; break; } } if (all_filled) { /* finish incomplete phis and check if they became trivial */ for (Instruction* phi : incomplete_phis[succ_idx]) { std::vector preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds; for (unsigned i = 0; i < phi->operands.size(); i++) { phi->operands[i].setTemp(read_variable(phi->operands[i].getTemp(), preds[i])); phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].first); } try_remove_trivial_phi(phi_map.find(phi->definitions[0].tempId())); } /* complete the original phi nodes, but no need to check triviality */ for (aco_ptr& instr : succ.instructions) { if (!is_phi(instr)) break; std::vector preds = instr->opcode == aco_opcode::p_phi ? succ.logical_preds : succ.linear_preds; for (unsigned i = 0; i < instr->operands.size(); i++) { auto& operand = instr->operands[i]; if (!operand.isTemp()) continue; operand.setTemp(read_variable(operand.getTemp(), preds[i])); operand.setFixed(ctx.assignments[operand.tempId()].first); std::map::iterator phi = phi_map.find(operand.getTemp().id()); if (phi != phi_map.end()) phi->second.uses.emplace(instr.get()); } } sealed[succ_idx] = true; } } } /* end for BB */ /* remove trivial phis */ for (Block& block : program->blocks) { auto end = std::find_if(block.instructions.begin(), block.instructions.end(), [](aco_ptr& instr) { return !is_phi(instr);}); auto middle = std::remove_if(block.instructions.begin(), end, [](const aco_ptr& instr) { return instr->definitions.empty();}); block.instructions.erase(middle, end); } /* find scc spill registers which may be needed for parallelcopies created by phis */ for (Block& block : program->blocks) { if (block.linear_preds.size() <= 1) continue; std::bitset<128> regs = sgpr_live_in[block.index]; if (!regs[127]) continue; /* choose a register */ int16_t reg = 0; for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++) ; assert(reg < ctx.program->max_reg_demand.sgpr); adjust_max_used_regs(ctx, s1, reg); /* update predecessors */ for (unsigned& pred_index : block.linear_preds) { Block& pred = program->blocks[pred_index]; pred.scc_live_out = true; pred.scratch_sgpr = PhysReg{(uint16_t)reg}; } } /* num_gpr = rnd_up(max_used_gpr + 1) */ program->config->num_vgprs = (ctx.max_used_vgpr + 1 + 3) & ~3; if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) { assert(ctx.max_used_sgpr <= 93); ctx.max_used_sgpr = 93; /* workaround hardware bug */ } program->config->num_sgprs = (ctx.max_used_sgpr + 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */ } }