From b4c4d2826306541ba1dd2145cb30c9fcd3e213a5 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Thu, 11 May 2023 11:46:42 +0200 Subject: aco: emit_wqm on MIMG dst, not operands Now p_wqm always kills its operand, so no movs will be created for it. Long term we want to remove p_wqm in favor of a Definition flag, so this is also a step in that direction. Foz-DB Navi21: Totals from 45351 (33.63% of 134864) affected shaders: VGPRs: 2099552 -> 2116192 (+0.79%); split: -0.14%, +0.93% CodeSize: 179530772 -> 179072104 (-0.26%); split: -0.29%, +0.03% MaxWaves: 1054740 -> 1052262 (-0.23%); split: +0.10%, -0.33% Instrs: 33238535 -> 33188347 (-0.15%); split: -0.17%, +0.02% Latency: 451000471 -> 450869384 (-0.03%); split: -0.11%, +0.08% InvThroughput: 86026785 -> 86286288 (+0.30%); split: -0.11%, +0.41% VClause: 633291 -> 623920 (-1.48%); split: -1.91%, +0.43% SClause: 1436708 -> 1431395 (-0.37%); split: -0.60%, +0.23% Copies: 2166563 -> 2122592 (-2.03%); split: -2.29%, +0.26% Branches: 706846 -> 706838 (-0.00%); split: -0.00%, +0.00% PreSGPRs: 1976162 -> 1976592 (+0.02%) PreVGPRs: 1797409 -> 1794704 (-0.15%) The MaxWaves regressions in Detroit: Become Human seem to be due to the scheduler choosing to schedule more aggressively. 
Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 59 +++++++++++--------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 7298e9b5ec9..b5d419535b7 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5932,8 +5932,8 @@ image_type_to_components_count(enum glsl_sampler_dim dim, bool array) } static MIMG_instruction* -emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, - std::vector coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1)) +emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector coords, + bool needs_wqm = false, Operand vdata = Operand(v1)) { /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. * On GFX11 the first 4 vaddr are single registers and the last contains the remaining @@ -5947,8 +5947,6 @@ emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) { coords[i] = as_vgpr(bld, coords[i]); - if (wqm_mask & (1u << i)) - coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true); } if (nsa_size < coords.size()) { @@ -5970,21 +5968,18 @@ emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, coord = as_vgpr(bld, coord); } - if (wqm_mask >> nsa_size) { - /* We don't need the bias, sample index, compare value or offset to be - * computed in WQM but if the p_create_vector copies the coordinates, then it - * needs to be in WQM. */ - coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true); - } - coords[nsa_size] = coord; coords.resize(nsa_size + 1); } + bool has_dst = dst.id() != 0; + assert(!needs_wqm || has_dst); + Temp tmp_dst = needs_wqm ? 
bld.tmp(dst.regClass()) : dst; + aco_ptr mimg{ - create_instruction(op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; - if (dst.isTemp()) - mimg->definitions[0] = dst; + create_instruction(op, Format::MIMG, 3 + coords.size(), has_dst)}; + if (has_dst) + mimg->definitions[0] = Definition(tmp_dst); mimg->operands[0] = Operand(rsrc); mimg->operands[1] = samp; mimg->operands[2] = vdata; @@ -5993,6 +5988,8 @@ emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, MIMG_instruction* res = mimg.get(); bld.insert(std::move(mimg)); + if (needs_wqm) + emit_wqm(bld, tmp_dst, dst, true); return res; } @@ -6026,8 +6023,8 @@ visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) args = std::move(scalar_args); } - MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst), - resource, Operand(s4), args); + MIMG_instruction* mimg = + emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args); mimg->dim = ac_image_1d; mimg->dmask = 0xf; mimg->unrm = true; @@ -6245,7 +6242,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1); MIMG_instruction* load = - emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata); + emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, false, vdata); load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 
1 : 0; load->dlc = load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3); @@ -6373,7 +6370,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) } MIMG_instruction* store = - emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data)); + emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, false, Operand(data)); store->glc = glc; store->dlc = false; store->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); @@ -6529,10 +6526,9 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) std::vector coords = get_image_coords(ctx, instr); Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); - Definition def = - return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition(); + Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1); MIMG_instruction* mimg = - emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data)); + emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, false, Operand(data)); mimg->glc = return_previous; mimg->dlc = false; /* Not needed for atomics */ mimg->dim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); @@ -6544,7 +6540,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) mimg->sync = sync; ctx->program->needs_exact = true; if (return_previous && cmpswap) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero()); + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero()); return; } @@ -9371,7 +9367,6 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) offset = pack; } - unsigned wqm_coord_count = 0; std::vector unpacked_coord; if (ctx->options->gfx_level == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->coord_components) { @@ -9388,10 +9383,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) else coord2d = instr->op == nir_texop_txf ? 
Operand::c32(0) : Operand::c32(0x3f000000); unpacked_coord.insert(std::next(unpacked_coord.begin()), bld.copy(bld.def(rc), coord2d)); - wqm_coord_count = a16 ? DIV_ROUND_UP(unpacked_coord.size(), 2) : unpacked_coord.size(); } else if (coord != Temp()) { unpacked_coord.push_back(coord); - wqm_coord_count = DIV_ROUND_UP(coord.bytes(), 4); } if (has_sample_index) @@ -9466,8 +9459,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) if (tg4_integer_workarounds) { Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero()); Temp size = bld.tmp(v2); - MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size), - resource, Operand(s4), std::vector{tg4_lod}); + MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource, + Operand(s4), std::vector{tg4_lod}); tex->dim = dim; tex->dmask = 0x3; tex->da = da; @@ -9600,11 +9593,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) /* gather MIMG address components */ std::vector args; - unsigned wqm_mask = 0; - if (has_offset) { - wqm_mask |= u_bit_consecutive(args.size(), 1); + if (has_offset) args.emplace_back(offset); - } if (has_bias) args.emplace_back(emit_pack_v1(ctx, {bias})[0]); if (has_compare) @@ -9612,7 +9602,6 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) if (has_derivs) args.insert(args.end(), derivs.begin(), derivs.end()); - wqm_mask |= u_bit_consecutive(args.size(), wqm_coord_count); args.insert(args.end(), coords.begin(), coords.end()); if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd || @@ -9623,7 +9612,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) : aco_opcode::image_load_mip; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); MIMG_instruction* tex = - emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata); + emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, false, vdata); if (instr->op == nir_texop_fragment_mask_fetch_amd) tex->dim = da ? 
ac_image_2darray : ac_image_2d; else @@ -9803,8 +9792,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); - MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler), - args, implicit_derivs ? wqm_mask : 0, vdata); + MIMG_instruction* tex = + emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, implicit_derivs, vdata); tex->dim = dim; tex->dmask = dmask & 0xf; tex->da = da; -- cgit v1.2.1