author    Marcin Ślusarz <marcin.slusarz@intel.com>  2022-11-30 13:47:19 +0100
committer Eric Engestrom <eric@engestrom.ch>         2022-12-14 20:47:01 +0000
commit    5f387adc0262bb6b13127fe6a674180fdd9c93ee (patch)
tree      1a327c4e58695b68f645050d829013acf783096b
parent    91c565df53043f4f474f377647bc0e1e05556942 (diff)
nir/lower_task_shader: fix task payload corruption when shared memory workaround is enabled
We were not taking into account that when all invocations within a
workgroup are active, we'll copy more data than needed, corrupting the
task payload of other workgroups.

Fixes: 8aff8d3dd42 ("nir: Add common task shader lowering to make the backend's job easier.")
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20080>
(cherry picked from commit ffefa386fda5aec8f66b4499d93b41a846a0b86c)
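The arithmetic behind the bug and the fix can be seen in a minimal
standalone sketch (not part of the commit; the 36-byte payload and
32-invocation workgroup are hypothetical values chosen to expose the
rounding, and DIV_ROUND_UP is defined locally to keep it self-contained):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   const unsigned payload_size = 36; /* hypothetical user payload, bytes */
   const unsigned invocations = 32;  /* hypothetical workgroup size */
   const unsigned vec4size = 16;

   /* Old scheme: round the per-invocation vec4 count up, so the
    * workgroup always writes a multiple of invocations * 16 bytes.
    */
   const unsigned copies_needed = DIV_ROUND_UP(payload_size, vec4size);
   const unsigned copies_per_invocation =
      DIV_ROUND_UP(copies_needed, invocations);
   printf("old: %u bytes written for a %u-byte payload\n",
          copies_per_invocation * invocations * vec4size, payload_size);
   /* -> "old: 512 bytes written for a 36-byte payload"; the extra bytes
    *    spill past this workgroup's payload region.
    */

   /* New scheme: whole-workgroup vec4 copies, then a partial round of
    * vec4 copies, then trailing dwords, exactly as in the patch below.
    */
   const unsigned whole_wg_vec4_copies = payload_size / vec4size;       /* 2 */
   const unsigned vec4_copies_per_invocation =
      whole_wg_vec4_copies / invocations;                               /* 0 */
   const unsigned remaining_vec4_copies =
      whole_wg_vec4_copies % invocations;                               /* 2 */
   const unsigned remaining_dwords =
      DIV_ROUND_UP(payload_size
                   - vec4size * vec4_copies_per_invocation * invocations
                   - vec4size * remaining_vec4_copies,
                   4);                                                  /* 1 */
   printf("new: %u bytes written\n",
          vec4size * (vec4_copies_per_invocation * invocations +
                      remaining_vec4_copies) +
          4 * remaining_dwords);                                        /* 36 */
   return 0;
}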
-rw-r--r--  .pick_status.json                          |  2
-rw-r--r--  src/compiler/nir/nir_lower_task_shader.c   | 94
2 files changed, 76 insertions, 20 deletions
diff --git a/.pick_status.json b/.pick_status.json
index bf0a1988d32..568589cfa98 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -3289,7 +3289,7 @@
"description": "nir/lower_task_shader: fix task payload corruption when shared memory workaround is enabled",
"nominated": true,
"nomination_type": 1,
- "resolution": 0,
+ "resolution": 1,
"main_sha": null,
"because_sha": "8aff8d3dd42ecc399f1d7d23ddd641e7e3fce777"
},
diff --git a/src/compiler/nir/nir_lower_task_shader.c b/src/compiler/nir/nir_lower_task_shader.c
index 07dec312d07..c4013a021d2 100644
--- a/src/compiler/nir/nir_lower_task_shader.c
+++ b/src/compiler/nir/nir_lower_task_shader.c
@@ -192,21 +192,49 @@ lower_task_payload_to_shared(nir_builder *b,
}

static void
+copy_shared_to_payload(nir_builder *b,
+ unsigned num_components,
+ nir_ssa_def *addr,
+ unsigned shared_base,
+ unsigned off)
+{
+ /* Read from shared memory. */
+ nir_ssa_def *copy = nir_load_shared(b, num_components, 32, addr,
+ .align_mul = 16,
+ .base = shared_base + off);
+
+ /* Write to task payload memory. */
+ nir_store_task_payload(b, copy, addr, .base = off);
+}
+
+static void
emit_shared_to_payload_copy(nir_builder *b,
uint32_t payload_addr,
uint32_t payload_size,
lower_task_state *s)
{
+ /* Copy from shared memory to the task payload using as much parallelism
+ * as possible. This is achieved by splitting the work into at most
+ * 3 phases:
+ * 1) copy the maximum number of vec4s using all invocations in the
+ *    workgroup
+ * 2) copy the maximum number of vec4s using some invocations
+ * 3) copy the remaining dwords (< 4) using only the first invocation
+ */
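+ /* Worked example with hypothetical numbers (not taken from any real
+ * workload): with a 36-byte payload and 32 invocations, phase 1 copies
+ * nothing (36 / 16 = 2 vec4s, fewer than 32 invocations), phase 2
+ * copies two vec4s using invocations 0 and 1, and phase 3 copies the
+ * final dword using invocation 0 only.
+ */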
const unsigned invocations = b->shader->info.workgroup_size[0] *
- b->shader->info.workgroup_size[1] *
- b->shader->info.workgroup_size[2];
- const unsigned bytes_per_copy = 16;
- const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
- const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
+ b->shader->info.workgroup_size[1] *
+ b->shader->info.workgroup_size[2];
+ const unsigned vec4size = 16;
+ const unsigned whole_wg_vec4_copies = payload_size / vec4size;
+ const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations;
+ const unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations;
+ const unsigned remaining_dwords =
+ DIV_ROUND_UP(payload_size
+ - vec4size * vec4_copies_per_invocation * invocations
+ - vec4size * remaining_vec4_copies,
+ 4);
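+ /* Note: the two subtracted terms sum to
+ * vec4size * (payload_size / vec4size), so remaining_dwords is simply
+ * DIV_ROUND_UP(payload_size % vec4size, 4).
+ */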
const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;
nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
- nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);
+ nir_ssa_def *addr = nir_imul_imm(b, invocation_index, vec4size);
/* Wait for all previous shared stores to finish.
* This is necessary because we placed the payload in shared memory.
@@ -216,22 +244,50 @@ emit_shared_to_payload_copy(nir_builder *b,
.memory_semantics = NIR_MEMORY_ACQ_REL,
.memory_modes = nir_var_mem_shared);
- for (unsigned i = 0; i < copies_per_invocation; ++i) {
- /* Payload_size is a size of user-accessible payload, but on some
- * hardware (e.g. Intel) payload has a private header, which we have
- * to offset (payload_offset_in_bytes).
- */
- unsigned const_off =
- bytes_per_copy * invocations * i + s->payload_offset_in_bytes;
+ /* Payload_size is the size of the user-accessible payload, but on some
+ * hardware (e.g. Intel) the payload has a private header, which we have
+ * to skip over (payload_offset_in_bytes).
+ */
+ unsigned off = s->payload_offset_in_bytes;
- /* Read from shared memory. */
- nir_ssa_def *copy =
- nir_load_shared(b, 4, 32, addr, .align_mul = 16,
- .base = base_shared_addr + const_off);
+ /* Technically, dword alignment is not necessary for the correctness
+ * of the code below, but even if the backend implements unaligned
+ * loads/stores, they will very likely be slower.
+ */
+ assert(off % 4 == 0);
- /* Write to task payload memory. */
- nir_store_task_payload(b, copy, addr, .base = const_off);
+ /* Copy full vec4s using all invocations in workgroup. */
+ for (unsigned i = 0; i < vec4_copies_per_invocation; ++i) {
+ copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+ off += vec4size * invocations;
}
+
+ /* Copy full vec4s using only the invocations needed to not overflow. */
+ if (remaining_vec4_copies > 0) {
+ assert(remaining_vec4_copies < invocations);
+
+ nir_ssa_def *cmp = nir_ilt(b, invocation_index, nir_imm_int(b, remaining_vec4_copies));
+ nir_if *if_stmt = nir_push_if(b, cmp);
+ {
+ copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+ }
+ nir_pop_if(b, if_stmt);
+ off += vec4size * remaining_vec4_copies;
+ }
+
+ /* Copy the last few dwords that do not form a full vec4. */
+ if (remaining_dwords > 0) {
+ assert(remaining_dwords < 4);
+ nir_ssa_def *cmp = nir_ieq(b, invocation_index, nir_imm_int(b, 0));
+ nir_if *if_stmt = nir_push_if(b, cmp);
+ {
+ copy_shared_to_payload(b, remaining_dwords, addr, base_shared_addr, off);
+ }
+ nir_pop_if(b, if_stmt);
+ off += remaining_dwords * 4;
+ }
+
+ assert(s->payload_offset_in_bytes + ALIGN(payload_size, 4) == off);
}

static bool