summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRhys Perry <pendingchaos02@gmail.com>2022-12-01 15:05:49 +0000
committerEric Engestrom <eric@engestrom.ch>2022-12-14 20:47:01 +0000
commit91c565df53043f4f474f377647bc0e1e05556942 (patch)
treeea358705426316d0cdc098df1520ccb624502f5c
parentd8d1b09474ab208484bef7ddaa2ee90c5ff4bd1e (diff)
downloadmesa-91c565df53043f4f474f377647bc0e1e05556942.tar.gz
aco: more carefully apply constant offsets into scratch accesses
Death stranding does scratch_arr[80-idx]. This doesn't seem to work if we try to combine the subtraction into the access. fossil-db (navi21): Totals from 52 (0.04% of 135636) affected shaders: Instrs: 78560 -> 79036 (+0.61%) CodeSize: 427940 -> 431188 (+0.76%) Latency: 1313809 -> 1318142 (+0.33%) InvThroughput: 292833 -> 293842 (+0.34%) VClause: 2361 -> 2555 (+8.22%); split: -0.51%, +8.73% Copies: 8767 -> 8746 (-0.24%); split: -0.35%, +0.11% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Fixes: 0e783d687a3 ("aco: use scratch_* for scratch load/store on GFX9+") Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7735 Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20117> (cherry picked from commit 381de3c809fce5427308c696bbd313360194eff4)
-rw-r--r--.pick_status.json2
-rw-r--r--src/amd/compiler/aco_optimizer.cpp23
2 files changed, 20 insertions, 5 deletions
diff --git a/.pick_status.json b/.pick_status.json
index 454a214bff0..bf0a1988d32 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -3307,7 +3307,7 @@
"description": "aco: more carefully apply constant offsets into scratch accesses",
"nominated": true,
"nomination_type": 1,
- "resolution": 0,
+ "resolution": 1,
"main_sha": null,
"because_sha": "0e783d687a3e13284eeae4081f16ee22033ff89b"
},
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index adb959a64e9..68144868c8c 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1255,12 +1255,14 @@ is_op_canonicalized(opt_ctx& ctx, Operand op)
}
bool
-is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int32_t offset)
+is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64_t offset1)
{
bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10;
int32_t min = ctx.program->dev.scratch_global_offset_min;
int32_t max = ctx.program->dev.scratch_global_offset_max;
+ int64_t offset = offset0 + offset1;
+
bool has_vgpr_offset = instr && !instr->operands[0].isUndefined();
if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4)
return false;
@@ -1467,15 +1469,28 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
while (info.is_temp())
info = ctx.info[info.temp.id()];
- if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
+ /* The hardware probably does: 'scratch_base + u2u64(saddr) + i2i64(offset)'. This means
+ * we can't combine the addition if the unsigned addition overflows and offset is
+ * positive. In theory, there is also issues if
+ * 'ilt(offset, 0) && ige(saddr, 0) && ilt(saddr + offset, 0)', but that just
+ * replaces an already out-of-bounds access with a larger one since 'saddr + offset'
+ * would be larger than INT32_MAX.
+ */
+ if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
base.regClass() == instr->operands[i].regClass() &&
- is_scratch_offset_valid(ctx, instr.get(), scratch.offset + (int32_t)offset)) {
+ is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
+ instr->operands[i].setTemp(base);
+ scratch.offset += (int32_t)offset;
+ continue;
+ } else if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
+ base.regClass() == instr->operands[i].regClass() && (int32_t)offset < 0 &&
+ is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
instr->operands[i].setTemp(base);
scratch.offset += (int32_t)offset;
continue;
} else if (i <= 1 && info.is_constant_or_literal(32) &&
ctx.program->gfx_level >= GFX10_3 &&
- is_scratch_offset_valid(ctx, NULL, scratch.offset + (int32_t)info.val)) {
+ is_scratch_offset_valid(ctx, NULL, scratch.offset, (int32_t)info.val)) {
/* GFX10.3+ can disable both SADDR and ADDR. */
instr->operands[i] = Operand(instr->operands[i].regClass());
scratch.offset += (int32_t)info.val;