diff options
author | Marcin Ślusarz <marcin.slusarz@intel.com> | 2022-11-30 13:47:19 +0100 |
---|---|---|
committer | Eric Engestrom <eric@engestrom.ch> | 2022-12-14 20:47:01 +0000 |
commit | 5f387adc0262bb6b13127fe6a674180fdd9c93ee (patch) | |
tree | 1a327c4e58695b68f645050d829013acf783096b | |
parent | 91c565df53043f4f474f377647bc0e1e05556942 (diff) | |
download | mesa-5f387adc0262bb6b13127fe6a674180fdd9c93ee.tar.gz |
nir/lower_task_shader: fix task payload corruption when shared memory workaround is enabled
We were not taking into account that when all invocations within a workgroup
are active, we'll copy more data than needed, corrupting the task payload
of other workgroups.
Fixes: 8aff8d3dd42 ("nir: Add common task shader lowering to make the backend's job easier.")
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20080>
(cherry picked from commit ffefa386fda5aec8f66b4499d93b41a846a0b86c)
-rw-r--r-- | .pick_status.json | 2 | ||||
-rw-r--r-- | src/compiler/nir/nir_lower_task_shader.c | 94 |
2 files changed, 76 insertions, 20 deletions
diff --git a/.pick_status.json b/.pick_status.json index bf0a1988d32..568589cfa98 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -3289,7 +3289,7 @@ "description": "nir/lower_task_shader: fix task payload corruption when shared memory workaround is enabled", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "8aff8d3dd42ecc399f1d7d23ddd641e7e3fce777" }, diff --git a/src/compiler/nir/nir_lower_task_shader.c b/src/compiler/nir/nir_lower_task_shader.c index 07dec312d07..c4013a021d2 100644 --- a/src/compiler/nir/nir_lower_task_shader.c +++ b/src/compiler/nir/nir_lower_task_shader.c @@ -192,21 +192,49 @@ lower_task_payload_to_shared(nir_builder *b, } static void +copy_shared_to_payload(nir_builder *b, + unsigned num_components, + nir_ssa_def *addr, + unsigned shared_base, + unsigned off) +{ + /* Read from shared memory. */ + nir_ssa_def *copy = nir_load_shared(b, num_components, 32, addr, + .align_mul = 16, + .base = shared_base + off); + + /* Write to task payload memory. */ + nir_store_task_payload(b, copy, addr, .base = off); +} + +static void emit_shared_to_payload_copy(nir_builder *b, uint32_t payload_addr, uint32_t payload_size, lower_task_state *s) { + /* Copy from shared memory to task payload using as much parallelism + * as possible. 
This is achieved by splitting the work into max 3 phases: + * 1) copy maximum number of vec4s using all invocations within workgroup + * 2) copy maximum number of vec4s using some invocations + * 3) copy remaining dwords (< 4) using only the first invocation + */ const unsigned invocations = b->shader->info.workgroup_size[0] * - b->shader->info.workgroup_size[1] * - b->shader->info.workgroup_size[2]; - const unsigned bytes_per_copy = 16; - const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy); - const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations); + b->shader->info.workgroup_size[1] * + b->shader->info.workgroup_size[2]; + const unsigned vec4size = 16; + const unsigned whole_wg_vec4_copies = payload_size / vec4size; + const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations; + const unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations; + const unsigned remaining_dwords = + DIV_ROUND_UP(payload_size + - vec4size * vec4_copies_per_invocation * invocations + - vec4size * remaining_vec4_copies, + 4); const unsigned base_shared_addr = s->payload_shared_addr + payload_addr; nir_ssa_def *invocation_index = nir_load_local_invocation_index(b); - nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy); + nir_ssa_def *addr = nir_imul_imm(b, invocation_index, vec4size); /* Wait for all previous shared stores to finish. * This is necessary because we placed the payload in shared memory. @@ -216,22 +244,50 @@ emit_shared_to_payload_copy(nir_builder *b, .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared); - for (unsigned i = 0; i < copies_per_invocation; ++i) { - /* Payload_size is a size of user-accessible payload, but on some - * hardware (e.g. Intel) payload has a private header, which we have - * to offset (payload_offset_in_bytes). 
- */ - unsigned const_off = - bytes_per_copy * invocations * i + s->payload_offset_in_bytes; + /* Payload_size is a size of user-accessible payload, but on some + * hardware (e.g. Intel) payload has a private header, which we have + * to offset (payload_offset_in_bytes). + */ + unsigned off = s->payload_offset_in_bytes; - /* Read from shared memory. */ - nir_ssa_def *copy = - nir_load_shared(b, 4, 32, addr, .align_mul = 16, - .base = base_shared_addr + const_off); + /* Technically dword-alignment is not necessary for correctness + * of the code below, but even if backend implements unaligned + * load/stores, they will very likely be slow(er). + */ + assert(off % 4 == 0); - /* Write to task payload memory. */ - nir_store_task_payload(b, copy, addr, .base = const_off); + /* Copy full vec4s using all invocations in workgroup. */ + for (unsigned i = 0; i < vec4_copies_per_invocation; ++i) { + copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off); + off += vec4size * invocations; } + + /* Copy full vec4s using only the invocations needed to not overflow. */ + if (remaining_vec4_copies > 0) { + assert(remaining_vec4_copies < invocations); + + nir_ssa_def *cmp = nir_ilt(b, invocation_index, nir_imm_int(b, remaining_vec4_copies)); + nir_if *if_stmt = nir_push_if(b, cmp); + { + copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off); + } + nir_pop_if(b, if_stmt); + off += vec4size * remaining_vec4_copies; + } + + /* Copy the last few dwords not forming full vec4. */ + if (remaining_dwords > 0) { + assert(remaining_dwords < 4); + nir_ssa_def *cmp = nir_ieq(b, invocation_index, nir_imm_int(b, 0)); + nir_if *if_stmt = nir_push_if(b, cmp); + { + copy_shared_to_payload(b, remaining_dwords, addr, base_shared_addr, off); + } + nir_pop_if(b, if_stmt); + off += remaining_dwords * 4; + } + + assert(s->payload_offset_in_bytes + ALIGN(payload_size, 4) == off); } static bool |