diff options
author | Rhys Perry <pendingchaos02@gmail.com> | 2020-09-01 11:55:58 +0100 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-01-13 18:54:18 +0000 |
commit | dfe429eb414511170f3dfc960d247c4aa295f924 (patch) | |
tree | 42b2af52228489f694c8efa8fb18a2d1bdba5728 | |
parent | 74748f16c969c59096a0bf4ce9f86fc92c797905 (diff) | |
download | mesa-dfe429eb414511170f3dfc960d247c4aa295f924.tar.gz |
nir/loop_unroll: unroll more aggressively if it can improve load scheduling
Significantly improves the performance of a compute shader in the game
Control. It also seems to increase FPS at the very start of the game by ~5%
(RX 580, 1080p, medium settings, no MSAA).
fossil-db (Sienna):
Totals from 81 (0.06% of 139391) affected shaders:
SGPRs: 3848 -> 4362 (+13.36%); split: -0.99%, +14.35%
VGPRs: 4132 -> 4648 (+12.49%)
CodeSize: 275532 -> 659188 (+139.24%)
MaxWaves: 986 -> 906 (-8.11%)
Instrs: 54422 -> 126865 (+133.11%)
Cycles: 1057240 -> 750464 (-29.02%); split: -42.61%, +13.60%
VMEM: 26507 -> 61829 (+133.26%); split: +135.56%, -2.30%
SMEM: 4748 -> 5895 (+24.16%); split: +31.47%, -7.31%
VClause: 1933 -> 6802 (+251.89%); split: -0.72%, +252.61%
SClause: 1179 -> 1810 (+53.52%); split: -3.14%, +56.66%
Branches: 1174 -> 1157 (-1.45%); split: -23.94%, +22.49%
PreVGPRs: 3219 -> 3387 (+5.22%); split: -0.96%, +6.18%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6538>
-rw-r--r-- | src/amd/vulkan/radv_shader.c | 1 | ||||
-rw-r--r-- | src/compiler/nir/nir.h | 1 | ||||
-rw-r--r-- | src/compiler/nir/nir_opt_loop_unroll.c | 88 |
3 files changed, 83 insertions, 7 deletions
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 846913875dd..e25ac04c29e 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -83,6 +83,7 @@ static const struct nir_shader_compiler_options nir_options = { .has_isub = true, .use_scoped_barrier = true, .max_unroll_iterations = 32, + .max_unroll_iterations_aggressive = 128, .use_interpolated_input_intrinsics = true, .vectorize_vec2_16bit = true, /* nir_lower_int64() isn't actually called for the LLVM backend, but diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 5d43d6aaf72..a60b1c8928b 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3390,6 +3390,7 @@ typedef struct nir_shader_compiler_options { bool support_16bit_alu; unsigned max_unroll_iterations; + unsigned max_unroll_iterations_aggressive; /* For the non-zero value of the enum corresponds multiplier when * calling lower_uniforms_to_ubo */ diff --git a/src/compiler/nir/nir_opt_loop_unroll.c b/src/compiler/nir/nir_opt_loop_unroll.c index 7dc27a10387..86095e45e03 100644 --- a/src/compiler/nir/nir_opt_loop_unroll.c +++ b/src/compiler/nir/nir_opt_loop_unroll.c @@ -750,6 +750,77 @@ partial_unroll(nir_shader *shader, nir_loop *loop, unsigned trip_count) _mesa_hash_table_destroy(remap_table, NULL); } +static bool +is_indirect_load(nir_instr *instr) +{ + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if ((intrin->intrinsic == nir_intrinsic_load_ubo || + intrin->intrinsic == nir_intrinsic_load_ssbo || + intrin->intrinsic == nir_intrinsic_load_global) && + !nir_src_is_const(intrin->src[1])) { + return true; + } + + if (intrin->intrinsic == nir_intrinsic_load_deref || + intrin->intrinsic == nir_intrinsic_store_deref) { + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable_mode mem_modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_global; + if (!nir_deref_mode_may_be(deref, mem_modes)) + 
return false; + while (deref) { + if ((deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_ptr_as_array) && + !nir_src_is_const(deref->arr.index)) { + return true; + } + deref = nir_deref_instr_parent(deref); + } + } + } else if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (!nir_src_is_const(tex->src[i].src)) + return true; + } + } + + return false; +} + +static bool +can_pipeline_loads(nir_loop *loop) +{ + if (!loop->info->exact_trip_count_known) + return false; + + bool interesting_loads = false; + + foreach_list_typed(nir_cf_node, cf_node, node, &loop->body) { + if (cf_node == &loop->info->limiting_terminator->nif->cf_node) + continue; + + /* Control flow usually prevents useful scheduling */ + if (cf_node->type != nir_cf_node_block) + return false; + + if (interesting_loads) + continue; + + nir_block *block = nir_cf_node_as_block(cf_node); + nir_foreach_instr(instr, block) { + if (is_indirect_load(instr)) { + interesting_loads = true; + break; + } + } + } + + return interesting_loads; +} + /* * Returns true if we should unroll the loop, otherwise false. */ @@ -764,19 +835,22 @@ check_unrolling_restrictions(nir_shader *shader, nir_loop *loop) nir_loop_info *li = loop->info; unsigned max_iter = shader->options->max_unroll_iterations; + /* Unroll much more aggressively if it can hide load latency. */ + if (shader->options->max_unroll_iterations_aggressive && can_pipeline_loads(loop)) + max_iter = shader->options->max_unroll_iterations_aggressive; unsigned trip_count = li->max_trip_count ? 
li->max_trip_count : li->guessed_trip_count; - if (trip_count > max_iter) - return false; - - if (li->force_unroll && !li->guessed_trip_count) + if (li->force_unroll && !li->guessed_trip_count && trip_count <= max_iter) return true; - bool loop_not_too_large = - li->instr_cost * trip_count <= max_iter * LOOP_UNROLL_LIMIT; + unsigned cost_limit = max_iter * LOOP_UNROLL_LIMIT; + unsigned cost = li->instr_cost * trip_count; + + if (cost <= cost_limit && trip_count <= max_iter) + return true; - return loop_not_too_large; + return false; } static bool |