From d59f702e268004fd43a0b781f39671be66728d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Sat, 21 Sep 2019 18:03:56 +0200 Subject: aco: Implement subgroup shuffle in GFX10 wave64 mode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously subgroup shuffle was implemented using the bpermute instruction, which only works accross half-waves, so by itself it's not suitable for implementing subgroup shuffle when the shader is running in wave64 mode. This commit adds a trick using shared VGPRs that allows to implement subgroup shuffle still relatively effectively in this mode. Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann --- src/amd/compiler/aco_reduce_assign.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/amd/compiler/aco_reduce_assign.cpp') diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 66a3ec64c04..d9c762a65db 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -118,10 +118,12 @@ void setup_reduce_temp(Program* program) unsigned cluster_size = static_cast(instr)->cluster_size; bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 || op == fmax64; - if (program->chip_class >= GFX10 && cluster_size == 64) + + if (program->chip_class >= GFX10 && cluster_size == 64 && op != gfx10_wave64_bpermute) need_vtmp = true; need_vtmp |= cluster_size == 32; + vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { vtmp = {program->allocateId(), vtmp.regClass()}; -- cgit v1.2.1