diff options
author | Georg Lehmann <dadschoorse@gmail.com> | 2023-04-23 14:55:17 +0200 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2023-05-12 13:31:16 +0000 |
commit | 151bcc1e8bbc9b012616ae418a59c215b0e6e807 (patch) | |
tree | 0017a7c83b0adc578592961bf6510f50308aebaa | |
parent | 41b0eafc4b0b4ed1083ab00e5fee2a6e0fdfd900 (diff) | |
download | mesa-151bcc1e8bbc9b012616ae418a59c215b0e6e807.tar.gz |
aco: use VOP3+DPP
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22698>
-rw-r--r-- | src/amd/compiler/aco_ir.cpp | 212 | ||||
-rw-r--r-- | src/amd/compiler/aco_ir.h | 8 | ||||
-rw-r--r-- | src/amd/compiler/aco_optimizer.cpp | 37 | ||||
-rw-r--r-- | src/amd/compiler/aco_optimizer_postRA.cpp | 41 | ||||
-rw-r--r-- | src/amd/compiler/aco_validate.cpp | 11 |
5 files changed, 242 insertions, 67 deletions
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 15436575dec..c6cdec446a4 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -336,38 +336,52 @@ convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr) } bool -can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8) +can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8) { assert(instr->isVALU() && !instr->operands.empty()); if (instr->isDPP()) return instr->isDPP8() == dpp8; - if (instr->operands.size() && instr->operands[0].isLiteral()) + if (instr->isSDWA() || instr->isVINTERP_INREG()) return false; - if (instr->isSDWA() || instr->isVINTERP_INREG() || instr->isVOP3P()) + if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11) return false; - if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) && - instr->definitions.back().physReg() != vcc) + if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() && + instr->definitions.back().physReg() != vcc && gfx_level < GFX11) return false; - if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc) + if (instr->operands.size() >= 3 && instr->operands[2].isFixed() && + instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc && + gfx_level < GFX11) return false; - if (instr->isVOP3()) { + if (instr->isVOP3() && gfx_level < GFX11) { const VALU_instruction* vop3 = &instr->valu(); - if (vop3->clamp || vop3->omod || vop3->opsel) + if (vop3->clamp || vop3->omod) return false; if (dpp8) return false; - if (instr->format == Format::VOP3) + } + + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isLiteral()) return false; - if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr)) + if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2) return false; } + /* simpler than listing all VOP3P opcodes which do not support DPP */ + if (instr->isVOP3P()) { + return instr->opcode == aco_opcode::v_fma_mix_f32 || + instr->opcode == aco_opcode::v_fma_mixlo_f16 || + instr->opcode == aco_opcode::v_fma_mixhi_f16 || + instr->opcode == aco_opcode::v_dot2_f32_f16 || + instr->opcode == aco_opcode::v_dot2_f32_bf16; + } + /* there are more cases but those all take 64-bit inputs */ return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && @@ -375,18 +389,31 @@ can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8) instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 && instr->opcode != aco_opcode::v_readfirstlane_b32 && instr->opcode != aco_opcode::v_cvt_f64_i32 && - instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32; + instr->opcode != aco_opcode::v_cvt_f64_f32 && + instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 && + instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 && + instr->opcode != aco_opcode::v_mul_hi_i32 && + instr->opcode != aco_opcode::v_qsad_pk_u16_u8 && + instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 && + instr->opcode != aco_opcode::v_mqsad_u32_u8 && + instr->opcode != aco_opcode::v_mad_u64_u32 && + instr->opcode != aco_opcode::v_mad_i64_i32 && + instr->opcode != aco_opcode::v_permlane16_b32 && + instr->opcode != aco_opcode::v_permlanex16_b32 && + instr->opcode != aco_opcode::v_permlane64_b32 && + instr->opcode != aco_opcode::v_readlane_b32_e64 && + instr->opcode != aco_opcode::v_writelane_b32_e64; } aco_ptr<Instruction> -convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8) +convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8) { if (instr->isDPP()) return NULL; aco_ptr<Instruction> tmp = std::move(instr); - Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | - (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16)); + Format format = + (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16)); if (dpp8) instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); @@ -394,8 +421,7 @@ convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8) instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); - for (unsigned i = 0; i < instr->definitions.size(); i++) - instr->definitions[i] = tmp->definitions[i]; + std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin()); if (dpp8) { DPP8_instruction* dpp = &instr->dpp8(); @@ -410,16 +436,37 @@ convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8) instr->valu().neg = tmp->valu().neg; instr->valu().abs = tmp->valu().abs; + instr->valu().omod = tmp->valu().omod; + instr->valu().clamp = tmp->valu().clamp; instr->valu().opsel = tmp->valu().opsel; + instr->valu().opsel_lo = tmp->valu().opsel_lo; + instr->valu().opsel_hi = tmp->valu().opsel_hi; - if (instr->isVOPC() || instr->definitions.size() > 1) + if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11) instr->definitions.back().setFixed(vcc); - if (instr->operands.size() >= 3) + if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) && + gfx_level < GFX11) instr->operands[2].setFixed(vcc); instr->pass_flags = tmp->pass_flags; + /* DPP16 supports input modifiers, so we might no longer need VOP3. */ + bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp && + (instr->isVOP1() || instr->isVOP2() || instr->isVOPC()); + + /* VOPC/add_co/sub_co definition needs VCC without VOP3. */ + remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr || + !instr->definitions.back().isFixed() || + instr->definitions.back().physReg() == vcc; + + /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */ + remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() || + instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc; + + if (remove_vop3) + instr->format = (Format)((uint32_t)instr->format & ~(uint32_t)Format::VOP3); + return tmp; } @@ -931,27 +978,77 @@ is_cmpx(aco_opcode op) } bool -can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op) +can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1) { + if (idx0 == idx1) { + *new_op = instr->opcode; + return true; + } + + if (idx0 > idx1) + std::swap(idx0, idx1); + if (instr->isDPP()) return false; - if (instr->operands[0].isConstant() || - (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) + if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr)) return false; + if (instr->isVOPC()) { + CmpInfo info; + if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) { + *new_op = info.swapped; + return true; + } + } + + /* opcodes not relevant for DPP or SGPRs optimizations are not included. */ switch (instr->opcode) { + case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */ case aco_opcode::v_add_u32: case aco_opcode::v_add_co_u32: case aco_opcode::v_add_co_u32_e64: case aco_opcode::v_add_i32: + case aco_opcode::v_add_i16: + case aco_opcode::v_add_u16_e64: + case aco_opcode::v_add3_u32: case aco_opcode::v_add_f16: case aco_opcode::v_add_f32: + case aco_opcode::v_mul_i32_i24: + case aco_opcode::v_mul_hi_i32_i24: + case aco_opcode::v_mul_u32_u24: + case aco_opcode::v_mul_hi_u32_u24: + case aco_opcode::v_mul_lo_u16: + case aco_opcode::v_mul_lo_u16_e64: case aco_opcode::v_mul_f16: case aco_opcode::v_mul_f32: + case aco_opcode::v_mul_legacy_f32: case aco_opcode::v_or_b32: case aco_opcode::v_and_b32: case aco_opcode::v_xor_b32: + case aco_opcode::v_xnor_b32: + case aco_opcode::v_xor3_b32: + case aco_opcode::v_or3_b32: + case aco_opcode::v_and_b16: + case aco_opcode::v_or_b16: + case aco_opcode::v_xor_b16: + case aco_opcode::v_max3_f32: + case aco_opcode::v_min3_f32: + case aco_opcode::v_max3_f16: + case aco_opcode::v_min3_f16: + case aco_opcode::v_med3_f16: + case aco_opcode::v_max3_u32: + case aco_opcode::v_min3_u32: + case aco_opcode::v_med3_u32: + case aco_opcode::v_max3_i32: + case aco_opcode::v_min3_i32: + case aco_opcode::v_med3_i32: + case aco_opcode::v_max3_u16: + case aco_opcode::v_min3_u16: + case aco_opcode::v_med3_u16: + case aco_opcode::v_max3_i16: + case aco_opcode::v_min3_i16: + case aco_opcode::v_med3_i16: case aco_opcode::v_max_f16: case aco_opcode::v_max_f32: case aco_opcode::v_min_f16: @@ -973,14 +1070,73 @@ can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op) case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true; case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true; case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true; - default: { - CmpInfo info; - if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) { - *new_op = info.swapped; - return true; - } - return false; + case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true; + case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true; + case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true; + case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true; + case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true; + case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true; + case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true; + case aco_opcode::v_addc_co_u32: + case aco_opcode::v_mad_i32_i24: + case aco_opcode::v_mad_u32_u24: + case aco_opcode::v_lerp_u8: + case aco_opcode::v_sad_u8: + case aco_opcode::v_sad_hi_u8: + case aco_opcode::v_sad_u16: + case aco_opcode::v_sad_u32: + case aco_opcode::v_xad_u32: + case aco_opcode::v_add_lshl_u32: + case aco_opcode::v_and_or_b32: + case aco_opcode::v_mad_u16: + case aco_opcode::v_mad_i16: + case aco_opcode::v_mad_u32_u16: + case aco_opcode::v_mad_i32_i16: + case aco_opcode::v_maxmin_f32: + case aco_opcode::v_minmax_f32: + case aco_opcode::v_maxmin_f16: + case aco_opcode::v_minmax_f16: + case aco_opcode::v_maxmin_u32: + case aco_opcode::v_minmax_u32: + case aco_opcode::v_maxmin_i32: + case aco_opcode::v_minmax_i32: + case aco_opcode::v_fma_f32: + case aco_opcode::v_fma_legacy_f32: + case aco_opcode::v_fmac_f32: + case aco_opcode::v_fmac_legacy_f32: + case aco_opcode::v_mac_f32: + case aco_opcode::v_mac_legacy_f32: + case aco_opcode::v_fma_f16: + case aco_opcode::v_fmac_f16: + case aco_opcode::v_mac_f16: + case aco_opcode::v_dot4c_i32_i8: + case aco_opcode::v_dot2c_f32_f16: + case aco_opcode::v_dot2_f32_f16: + case aco_opcode::v_dot2_f32_bf16: + case aco_opcode::v_dot2_f16_f16: + case aco_opcode::v_dot2_bf16_bf16: + case aco_opcode::v_fma_mix_f32: + case aco_opcode::v_fma_mixlo_f16: + case aco_opcode::v_fma_mixhi_f16: + case aco_opcode::v_pk_fmac_f16: { + if (idx1 == 2) + return false; + *new_op = instr->opcode; + return true; + } + case aco_opcode::v_subb_co_u32: { + if (idx1 == 2) + return false; + *new_op = aco_opcode::v_subbrev_co_u32; + return true; } + case aco_opcode::v_subbrev_co_u32: { + if (idx1 == 2) + return false; + *new_op = aco_opcode::v_subb_co_u32; + return true; + } + default: return false; } } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 7a95b00b0e9..04b93420f16 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1803,11 +1803,12 @@ bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx); bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op); uint8_t get_gfx11_true16_mask(aco_opcode op); bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra); -bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8); +bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8); bool can_write_m0(const aco_ptr<Instruction>& instr); /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr); -aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8); +aco_ptr<Instruction> convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, + bool dpp8); bool needs_exec_mask(const Instruction* instr); aco_opcode get_ordered(aco_opcode op); @@ -1820,7 +1821,8 @@ unsigned get_cmp_bitsize(aco_opcode op); bool is_fp_cmp(aco_opcode op); bool is_cmpx(aco_opcode op); -bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op); +bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0 = 0, + unsigned idx1 = 1); uint32_t get_reduction_identity(ReduceOp op, unsigned idx); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index c5d238afcc8..a6d1a60415a 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -4810,7 +4810,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) } /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */ - if (instr->isVALU()) { + if (instr->isVALU() && !instr->isDPP()) { for (unsigned i = 0; i < instr->operands.size(); i++) { if (!instr->operands[i].isTemp()) continue; @@ -4819,41 +4819,44 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) if (!info.is_dpp() || info.instr->pass_flags != instr->pass_flags) continue; - aco_opcode swapped_op; - if (i != 0 && !can_swap_operands(instr, &swapped_op)) - continue; + if (i != 0) { + if (!can_swap_operands(instr, &instr->opcode, 0, i)) + continue; + std::swap(instr->operands[0], instr->operands[i]); + instr->valu().neg[0].swap(instr->valu().neg[i]); + instr->valu().abs[0].swap(instr->valu().abs[i]); + instr->valu().opsel[0].swap(instr->valu().opsel[i]); + instr->valu().opsel_lo[0].swap(instr->valu().opsel_lo[i]); + instr->valu().opsel_hi[0].swap(instr->valu().opsel_hi[i]); + } - if (instr->isDPP() || !can_use_DPP(instr, true, info.is_dpp8())) + if (!can_use_DPP(ctx.program->gfx_level, instr, info.is_dpp8())) continue; bool dpp8 = info.is_dpp8(); bool input_mods = instr_info.can_use_input_modifiers[(int)instr->opcode] && instr_info.operand_size[(int)instr->opcode] == 32; - if (!dpp8 && (info.instr->dpp16().neg[0] || info.instr->dpp16().abs[0]) && !input_mods) + bool mov_uses_mods = info.instr->valu().neg[0] || info.instr->valu().abs[0]; + if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods) continue; - convert_to_DPP(instr, dpp8); - - if (i != 0) { - instr->opcode = swapped_op; - std::swap(instr->operands[0], instr->operands[1]); - instr->valu().neg[0].swap(instr->valu().neg[1]); - instr->valu().abs[0].swap(instr->valu().abs[1]); - instr->valu().opsel[0].swap(instr->valu().opsel[1]); - } + convert_to_DPP(ctx.program->gfx_level, instr, dpp8); if (dpp8) { DPP8_instruction* dpp = &instr->dpp8(); for (unsigned j = 0; j < 8; ++j) dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j]; + if (mov_uses_mods) + instr->format = asVOP3(instr->format); } else { DPP16_instruction* dpp = &instr->dpp16(); dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl; dpp->bound_ctrl = info.instr->dpp16().bound_ctrl; - dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0]; - dpp->abs[0] |= info.instr->dpp16().abs[0]; } + instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0]; + instr->valu().abs[0] |= info.instr->valu().abs[0]; + if (--ctx.uses[info.instr->definitions[0].tempId()]) ctx.uses[info.instr->operands[0].tempId()]++; instr->operands[0].setTemp(info.instr->operands[0].getTemp()); diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index a21bc335559..e8d593915f2 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -485,7 +485,7 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) if (!instr->isVALU() || instr->isDPP()) return; - for (unsigned i = 0; i < MIN2(2, instr->operands.size()); i++) { + for (unsigned i = 0; i < instr->operands.size(); i++) { Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]); if (!op_instr_idx.found()) continue; @@ -493,9 +493,6 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) const Instruction* mov = ctx.get(op_instr_idx); if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP()) continue; - bool dpp8 = mov->isDPP8(); - if (!can_use_DPP(instr, false, dpp8)) - return; /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite * it's own operand before we use it. @@ -508,12 +505,25 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx)) continue; - if (i && !can_swap_operands(instr, &instr->opcode)) - continue; - + bool dpp8 = mov->isDPP8(); bool input_mods = instr_info.can_use_input_modifiers[(int)instr->opcode] && instr_info.operand_size[(int)instr->opcode] == 32; - if (!dpp8 && (mov->dpp16().neg[0] || mov->dpp16().abs[0]) && !input_mods) + bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0]; + if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods) + continue; + + if (i != 0) { + if (!can_swap_operands(instr, &instr->opcode, 0, i)) + continue; + std::swap(instr->operands[0], instr->operands[i]); + instr->valu().neg[0].swap(instr->valu().neg[i]); + instr->valu().abs[0].swap(instr->valu().abs[i]); + instr->valu().opsel[0].swap(instr->valu().opsel[i]); + instr->valu().opsel_lo[0].swap(instr->valu().opsel_lo[i]); + instr->valu().opsel_hi[0].swap(instr->valu().opsel_hi[i]); + } + + if (!can_use_DPP(ctx.program->gfx_level, instr, dpp8)) continue; if (!dpp8) /* anything else doesn't make sense in SSA */ @@ -522,27 +532,22 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) if (--ctx.uses[mov->definitions[0].tempId()]) ctx.uses[mov->operands[0].tempId()]++; - convert_to_DPP(instr, dpp8); - - if (i) { - std::swap(instr->operands[0], instr->operands[1]); - instr->valu().neg[0].swap(instr->valu().neg[1]); - instr->valu().abs[0].swap(instr->valu().abs[1]); - instr->valu().opsel[0].swap(instr->valu().opsel[1]); - } + convert_to_DPP(ctx.program->gfx_level, instr, dpp8); instr->operands[0] = mov->operands[0]; if (dpp8) { DPP8_instruction* dpp = &instr->dpp8(); memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel)); + if (mov_uses_mods) + instr->format = asVOP3(instr->format); } else { DPP16_instruction* dpp = &instr->dpp16(); dpp->dpp_ctrl = mov->dpp16().dpp_ctrl; dpp->bound_ctrl = true; - dpp->neg[0] ^= mov->dpp16().neg[0] && !dpp->abs[0]; - dpp->abs[0] |= mov->dpp16().abs[0]; } + instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0]; + instr->valu().abs[0] |= mov->valu().abs[0]; return; } } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index d4073296ae9..b95e60be0b3 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -142,12 +142,21 @@ validate_ir(Program* program) "Wrong base format for instruction", instr.get()); /* check VOP3 modifiers */ - if (instr->isVOP3() && instr->format != Format::VOP3) { + if (instr->isVOP3() && withoutDPP(instr->format) != Format::VOP3) { check(base_format == Format::VOP2 || base_format == Format::VOP1 || base_format == Format::VOPC || base_format == Format::VINTRP, "Format cannot have VOP3/VOP3B applied", instr.get()); } + if (instr->isDPP()) { + check(base_format == Format::VOP2 || base_format == Format::VOP1 || + base_format == Format::VOPC || base_format == Format::VOP3 || + base_format == Format::VOP3P, + "Format cannot have DPP applied", instr.get()); + check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11, + "VOP3+DPP is GFX11+ only", instr.get()); + } + /* check SDWA */ if (instr->isSDWA()) { check(base_format == Format::VOP2 || base_format == Format::VOP1 || |