aco: use VOP3+DPP

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22698>
author: Georg Lehmann <dadschoorse@gmail.com> 2023-04-23 14:55:17 +0200
committer: Marge Bot <emma+marge@anholt.net> 2023-05-12 13:31:16 +0000
commit: 151bcc1e8bbc9b012616ae418a59c215b0e6e807 (patch)
tree: 0017a7c83b0adc578592961bf6510f50308aebaa
parent: 41b0eafc4b0b4ed1083ab00e5fee2a6e0fdfd900 (diff)
download: mesa-151bcc1e8bbc9b012616ae418a59c215b0e6e807.tar.gz
5 files changed, 242 insertions, 67 deletions
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 15436575dec..c6cdec446a4 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -336,38 +336,52 @@ convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
 }
 
 bool
-can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
+can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
 {
    assert(instr->isVALU() && !instr->operands.empty());
 
    if (instr->isDPP())
       return instr->isDPP8() == dpp8;
 
-   if (instr->operands.size() && instr->operands[0].isLiteral())
+   if (instr->isSDWA() || instr->isVINTERP_INREG())
       return false;
 
-   if (instr->isSDWA() || instr->isVINTERP_INREG() || instr->isVOP3P())
+   if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
       return false;
 
-   if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) &&
-       instr->definitions.back().physReg() != vcc)
+   if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
+       instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
       return false;
 
-   if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc)
+   if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
+       instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
+       gfx_level < GFX11)
       return false;
 
-   if (instr->isVOP3()) {
+   if (instr->isVOP3() && gfx_level < GFX11) {
       const VALU_instruction* vop3 = &instr->valu();
-      if (vop3->clamp || vop3->omod || vop3->opsel)
+      if (vop3->clamp || vop3->omod)
          return false;
       if (dpp8)
          return false;
-      if (instr->format == Format::VOP3)
+   }
+
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      if (instr->operands[i].isLiteral())
          return false;
-      if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr))
+      if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
          return false;
    }
 
+   /* simpler than listing all VOP3P opcodes which do not support DPP */
+   if (instr->isVOP3P()) {
+      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
+             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
+             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
+             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
+             instr->opcode == aco_opcode::v_dot2_f32_bf16;
+   }
+
    /* there are more cases but those all take 64-bit inputs */
    return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
           instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
@@ -375,18 +389,31 @@ can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8)
           instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
           instr->opcode != aco_opcode::v_readfirstlane_b32 &&
           instr->opcode != aco_opcode::v_cvt_f64_i32 &&
-          instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32;
+          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
+          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
+          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
+          instr->opcode != aco_opcode::v_mul_hi_i32 &&
+          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
+          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
+          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
+          instr->opcode != aco_opcode::v_mad_u64_u32 &&
+          instr->opcode != aco_opcode::v_mad_i64_i32 &&
+          instr->opcode != aco_opcode::v_permlane16_b32 &&
+          instr->opcode != aco_opcode::v_permlanex16_b32 &&
+          instr->opcode != aco_opcode::v_permlane64_b32 &&
+          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
+          instr->opcode != aco_opcode::v_writelane_b32_e64;
 }
 
 aco_ptr<Instruction>
-convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
+convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
 {
    if (instr->isDPP())
       return NULL;
 
    aco_ptr<Instruction> tmp = std::move(instr);
-   Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) |
-                            (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16));
+   Format format =
+      (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
    if (dpp8)
       instr.reset(create_instruction<DPP8_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                        tmp->definitions.size()));
@@ -394,8 +421,7 @@ convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
       instr.reset(create_instruction<DPP16_instruction>(tmp->opcode, format, tmp->operands.size(),
                                                         tmp->definitions.size()));
    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
-   for (unsigned i = 0; i < instr->definitions.size(); i++)
-      instr->definitions[i] = tmp->definitions[i];
+   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
 
    if (dpp8) {
       DPP8_instruction* dpp = &instr->dpp8();
@@ -410,16 +436,37 @@ convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8)
 
    instr->valu().neg = tmp->valu().neg;
    instr->valu().abs = tmp->valu().abs;
+   instr->valu().omod = tmp->valu().omod;
+   instr->valu().clamp = tmp->valu().clamp;
    instr->valu().opsel = tmp->valu().opsel;
+   instr->valu().opsel_lo = tmp->valu().opsel_lo;
+   instr->valu().opsel_hi = tmp->valu().opsel_hi;
 
-   if (instr->isVOPC() || instr->definitions.size() > 1)
+   if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
       instr->definitions.back().setFixed(vcc);
 
-   if (instr->operands.size() >= 3)
+   if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
+       gfx_level < GFX11)
       instr->operands[2].setFixed(vcc);
 
    instr->pass_flags = tmp->pass_flags;
 
+   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
+   bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
+                      (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());
+
+   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
+   remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
+                  !instr->definitions.back().isFixed() ||
+                  instr->definitions.back().physReg() == vcc;
+
+   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
+   remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
+                  instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;
+
+   if (remove_vop3)
+      instr->format = (Format)((uint32_t)instr->format & ~(uint32_t)Format::VOP3);
+
    return tmp;
 }
 
@@ -931,27 +978,77 @@ is_cmpx(aco_opcode op)
 }
 
 bool
-can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
+can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
 {
+   if (idx0 == idx1) {
+      *new_op = instr->opcode;
+      return true;
+   }
+
+   if (idx0 > idx1)
+      std::swap(idx0, idx1);
+
    if (instr->isDPP())
       return false;
 
-   if (instr->operands[0].isConstant() ||
-       (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
+   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
       return false;
 
+   if (instr->isVOPC()) {
+      CmpInfo info;
+      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
+         *new_op = info.swapped;
+         return true;
+      }
+   }
+
+   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
    switch (instr->opcode) {
+   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
    case aco_opcode::v_add_u32:
    case aco_opcode::v_add_co_u32:
    case aco_opcode::v_add_co_u32_e64:
    case aco_opcode::v_add_i32:
+   case aco_opcode::v_add_i16:
+   case aco_opcode::v_add_u16_e64:
+   case aco_opcode::v_add3_u32:
    case aco_opcode::v_add_f16:
    case aco_opcode::v_add_f32:
+   case aco_opcode::v_mul_i32_i24:
+   case aco_opcode::v_mul_hi_i32_i24:
+   case aco_opcode::v_mul_u32_u24:
+   case aco_opcode::v_mul_hi_u32_u24:
+   case aco_opcode::v_mul_lo_u16:
+   case aco_opcode::v_mul_lo_u16_e64:
    case aco_opcode::v_mul_f16:
    case aco_opcode::v_mul_f32:
+   case aco_opcode::v_mul_legacy_f32:
    case aco_opcode::v_or_b32:
    case aco_opcode::v_and_b32:
    case aco_opcode::v_xor_b32:
+   case aco_opcode::v_xnor_b32:
+   case aco_opcode::v_xor3_b32:
+   case aco_opcode::v_or3_b32:
+   case aco_opcode::v_and_b16:
+   case aco_opcode::v_or_b16:
+   case aco_opcode::v_xor_b16:
+   case aco_opcode::v_max3_f32:
+   case aco_opcode::v_min3_f32:
+   case aco_opcode::v_max3_f16:
+   case aco_opcode::v_min3_f16:
+   case aco_opcode::v_med3_f16:
+   case aco_opcode::v_max3_u32:
+   case aco_opcode::v_min3_u32:
+   case aco_opcode::v_med3_u32:
+   case aco_opcode::v_max3_i32:
+   case aco_opcode::v_min3_i32:
+   case aco_opcode::v_med3_i32:
+   case aco_opcode::v_max3_u16:
+   case aco_opcode::v_min3_u16:
+   case aco_opcode::v_med3_u16:
+   case aco_opcode::v_max3_i16:
+   case aco_opcode::v_min3_i16:
+   case aco_opcode::v_med3_i16:
    case aco_opcode::v_max_f16:
    case aco_opcode::v_max_f32:
    case aco_opcode::v_min_f16:
@@ -973,14 +1070,73 @@ can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op)
    case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
    case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
    case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
-   default: {
-      CmpInfo info;
-      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
-         *new_op = info.swapped;
-         return true;
-      }
-      return false;
+   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
+   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
+   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
+   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
+   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
+   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
+   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
+   case aco_opcode::v_addc_co_u32:
+   case aco_opcode::v_mad_i32_i24:
+   case aco_opcode::v_mad_u32_u24:
+   case aco_opcode::v_lerp_u8:
+   case aco_opcode::v_sad_u8:
+   case aco_opcode::v_sad_hi_u8:
+   case aco_opcode::v_sad_u16:
+   case aco_opcode::v_sad_u32:
+   case aco_opcode::v_xad_u32:
+   case aco_opcode::v_add_lshl_u32:
+   case aco_opcode::v_and_or_b32:
+   case aco_opcode::v_mad_u16:
+   case aco_opcode::v_mad_i16:
+   case aco_opcode::v_mad_u32_u16:
+   case aco_opcode::v_mad_i32_i16:
+   case aco_opcode::v_maxmin_f32:
+   case aco_opcode::v_minmax_f32:
+   case aco_opcode::v_maxmin_f16:
+   case aco_opcode::v_minmax_f16:
+   case aco_opcode::v_maxmin_u32:
+   case aco_opcode::v_minmax_u32:
+   case aco_opcode::v_maxmin_i32:
+   case aco_opcode::v_minmax_i32:
+   case aco_opcode::v_fma_f32:
+   case aco_opcode::v_fma_legacy_f32:
+   case aco_opcode::v_fmac_f32:
+   case aco_opcode::v_fmac_legacy_f32:
+   case aco_opcode::v_mac_f32:
+   case aco_opcode::v_mac_legacy_f32:
+   case aco_opcode::v_fma_f16:
+   case aco_opcode::v_fmac_f16:
+   case aco_opcode::v_mac_f16:
+   case aco_opcode::v_dot4c_i32_i8:
+   case aco_opcode::v_dot2c_f32_f16:
+   case aco_opcode::v_dot2_f32_f16:
+   case aco_opcode::v_dot2_f32_bf16:
+   case aco_opcode::v_dot2_f16_f16:
+   case aco_opcode::v_dot2_bf16_bf16:
+   case aco_opcode::v_fma_mix_f32:
+   case aco_opcode::v_fma_mixlo_f16:
+   case aco_opcode::v_fma_mixhi_f16:
+   case aco_opcode::v_pk_fmac_f16: {
+      if (idx1 == 2)
+         return false;
+      *new_op = instr->opcode;
+      return true;
+   }
+   case aco_opcode::v_subb_co_u32: {
+      if (idx1 == 2)
+         return false;
+      *new_op = aco_opcode::v_subbrev_co_u32;
+      return true;
    }
+   case aco_opcode::v_subbrev_co_u32: {
+      if (idx1 == 2)
+         return false;
+      *new_op = aco_opcode::v_subb_co_u32;
+      return true;
+   }
+   default: return false;
    }
 }
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 7a95b00b0e9..04b93420f16 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1803,11 +1803,12 @@ bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
 uint8_t get_gfx11_true16_mask(aco_opcode op);
 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
-bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
+bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
 bool can_write_m0(const aco_ptr<Instruction>& instr);
 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
 aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
-aco_ptr<Instruction> convert_to_DPP(aco_ptr<Instruction>& instr, bool dpp8);
+aco_ptr<Instruction> convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr,
+                                    bool dpp8);
 bool needs_exec_mask(const Instruction* instr);
 
 aco_opcode get_ordered(aco_opcode op);
@@ -1820,7 +1821,8 @@ unsigned get_cmp_bitsize(aco_opcode op);
 bool is_fp_cmp(aco_opcode op);
 bool is_cmpx(aco_opcode op);
 
-bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op);
+bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0 = 0,
+                       unsigned idx1 = 1);
 
 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
 
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index c5d238afcc8..a6d1a60415a 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -4810,7 +4810,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    }
 
    /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
-   if (instr->isVALU()) {
+   if (instr->isVALU() && !instr->isDPP()) {
       for (unsigned i = 0; i < instr->operands.size(); i++) {
          if (!instr->operands[i].isTemp())
             continue;
@@ -4819,41 +4819,44 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
          if (!info.is_dpp() || info.instr->pass_flags != instr->pass_flags)
             continue;
 
-         aco_opcode swapped_op;
-         if (i != 0 && !can_swap_operands(instr, &swapped_op))
-            continue;
+         if (i != 0) {
+            if (!can_swap_operands(instr, &instr->opcode, 0, i))
+               continue;
+            std::swap(instr->operands[0], instr->operands[i]);
+            instr->valu().neg[0].swap(instr->valu().neg[i]);
+            instr->valu().abs[0].swap(instr->valu().abs[i]);
+            instr->valu().opsel[0].swap(instr->valu().opsel[i]);
+            instr->valu().opsel_lo[0].swap(instr->valu().opsel_lo[i]);
+            instr->valu().opsel_hi[0].swap(instr->valu().opsel_hi[i]);
+         }
 
-         if (instr->isDPP() || !can_use_DPP(instr, true, info.is_dpp8()))
+         if (!can_use_DPP(ctx.program->gfx_level, instr, info.is_dpp8()))
             continue;
 
          bool dpp8 = info.is_dpp8();
          bool input_mods = instr_info.can_use_input_modifiers[(int)instr->opcode] &&
                            instr_info.operand_size[(int)instr->opcode] == 32;
-         if (!dpp8 && (info.instr->dpp16().neg[0] || info.instr->dpp16().abs[0]) && !input_mods)
+         bool mov_uses_mods = info.instr->valu().neg[0] || info.instr->valu().abs[0];
+         if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
             continue;
 
-         convert_to_DPP(instr, dpp8);
-
-         if (i != 0) {
-            instr->opcode = swapped_op;
-            std::swap(instr->operands[0], instr->operands[1]);
-            instr->valu().neg[0].swap(instr->valu().neg[1]);
-            instr->valu().abs[0].swap(instr->valu().abs[1]);
-            instr->valu().opsel[0].swap(instr->valu().opsel[1]);
-         }
+         convert_to_DPP(ctx.program->gfx_level, instr, dpp8);
 
          if (dpp8) {
             DPP8_instruction* dpp = &instr->dpp8();
             for (unsigned j = 0; j < 8; ++j)
                dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
+            if (mov_uses_mods)
+               instr->format = asVOP3(instr->format);
          } else {
             DPP16_instruction* dpp = &instr->dpp16();
             dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
             dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
-            dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0];
-            dpp->abs[0] |= info.instr->dpp16().abs[0];
          }
 
+         instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];
+         instr->valu().abs[0] |= info.instr->valu().abs[0];
+
          if (--ctx.uses[info.instr->definitions[0].tempId()])
             ctx.uses[info.instr->operands[0].tempId()]++;
          instr->operands[0].setTemp(info.instr->operands[0].getTemp());
diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index a21bc335559..e8d593915f2 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -485,7 +485,7 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
    if (!instr->isVALU() || instr->isDPP())
       return;
 
-   for (unsigned i = 0; i < MIN2(2, instr->operands.size()); i++) {
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
       Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]);
       if (!op_instr_idx.found())
          continue;
@@ -493,9 +493,6 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       const Instruction* mov = ctx.get(op_instr_idx);
       if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP())
          continue;
-      bool dpp8 = mov->isDPP8();
-      if (!can_use_DPP(instr, false, dpp8))
-         return;
 
       /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite
        * it's own operand before we use it.
@@ -508,12 +505,25 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
          continue;
 
-      if (i && !can_swap_operands(instr, &instr->opcode))
-         continue;
-
+      bool dpp8 = mov->isDPP8();
       bool input_mods = instr_info.can_use_input_modifiers[(int)instr->opcode] &&
                         instr_info.operand_size[(int)instr->opcode] == 32;
-      if (!dpp8 && (mov->dpp16().neg[0] || mov->dpp16().abs[0]) && !input_mods)
+      bool mov_uses_mods = mov->valu().neg[0] || mov->valu().abs[0];
+      if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
+         continue;
+
+      if (i != 0) {
+         if (!can_swap_operands(instr, &instr->opcode, 0, i))
+            continue;
+         std::swap(instr->operands[0], instr->operands[i]);
+         instr->valu().neg[0].swap(instr->valu().neg[i]);
+         instr->valu().abs[0].swap(instr->valu().abs[i]);
+         instr->valu().opsel[0].swap(instr->valu().opsel[i]);
+         instr->valu().opsel_lo[0].swap(instr->valu().opsel_lo[i]);
+         instr->valu().opsel_hi[0].swap(instr->valu().opsel_hi[i]);
+      }
+
+      if (!can_use_DPP(ctx.program->gfx_level, instr, dpp8))
          continue;
 
       if (!dpp8) /* anything else doesn't make sense in SSA */
@@ -522,27 +532,22 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (--ctx.uses[mov->definitions[0].tempId()])
          ctx.uses[mov->operands[0].tempId()]++;
 
-      convert_to_DPP(instr, dpp8);
-
-      if (i) {
-         std::swap(instr->operands[0], instr->operands[1]);
-         instr->valu().neg[0].swap(instr->valu().neg[1]);
-         instr->valu().abs[0].swap(instr->valu().abs[1]);
-         instr->valu().opsel[0].swap(instr->valu().opsel[1]);
-      }
+      convert_to_DPP(ctx.program->gfx_level, instr, dpp8);
 
       instr->operands[0] = mov->operands[0];
 
       if (dpp8) {
          DPP8_instruction* dpp = &instr->dpp8();
          memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel));
+         if (mov_uses_mods)
+            instr->format = asVOP3(instr->format);
       } else {
          DPP16_instruction* dpp = &instr->dpp16();
          dpp->dpp_ctrl = mov->dpp16().dpp_ctrl;
          dpp->bound_ctrl = true;
-         dpp->neg[0] ^= mov->dpp16().neg[0] && !dpp->abs[0];
-         dpp->abs[0] |= mov->dpp16().abs[0];
       }
+      instr->valu().neg[0] ^= mov->valu().neg[0] && !instr->valu().abs[0];
+      instr->valu().abs[0] |= mov->valu().abs[0];
       return;
    }
 }
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index d4073296ae9..b95e60be0b3 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -142,12 +142,21 @@ validate_ir(Program* program)
                "Wrong base format for instruction", instr.get());
 
          /* check VOP3 modifiers */
-         if (instr->isVOP3() && instr->format != Format::VOP3) {
+         if (instr->isVOP3() && withoutDPP(instr->format) != Format::VOP3) {
             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
                      base_format == Format::VOPC || base_format == Format::VINTRP,
                   "Format cannot have VOP3/VOP3B applied", instr.get());
          }
 
+         if (instr->isDPP()) {
+            check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
+                     base_format == Format::VOPC || base_format == Format::VOP3 ||
+                     base_format == Format::VOP3P,
+                  "Format cannot have DPP applied", instr.get());
+            check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
+                  "VOP3+DPP is GFX11+ only", instr.get());
+         }
+
          /* check SDWA */
          if (instr->isSDWA()) {
             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
author	Georg Lehmann <dadschoorse@gmail.com>	2023-04-23 14:55:17 +0200
committer	Marge Bot <emma+marge@anholt.net>	2023-05-12 13:31:16 +0000
commit	151bcc1e8bbc9b012616ae418a59c215b0e6e807 (patch)
tree	0017a7c83b0adc578592961bf6510f50308aebaa
parent	41b0eafc4b0b4ed1083ab00e5fee2a6e0fdfd900 (diff)
download	mesa-151bcc1e8bbc9b012616ae418a59c215b0e6e807.tar.gz