diff options
author | Joe Ellis <joe.ellis@arm.com> | 2021-01-11 13:46:01 +0000 |
---|---|---|
committer | Joe Ellis <joe.ellis@arm.com> | 2021-01-13 09:44:09 +0000 |
commit | 3122c66aee7b709046753873c4e94db73742b3de (patch) | |
tree | c3cc40c420c95744080bad7ce94a1b9bd756f2c6 | |
parent | 4cd48535eca06245c89a9158844bb177c6f8eb63 (diff) | |
download | llvm-3122c66aee7b709046753873c4e94db73742b3de.tar.gz |
[AArch64][SVE] Remove chains of unnecessary SVE reinterpret intrinsics
This commit extends SVEIntrinsicOpts::optimizeConvertFromSVBool to
identify and remove longer chains of redundant SVE reinterpret
intrinsics. For example, the following chain of SVE reinterprets
is now recognised as redundant:
%a = <vscale x 2 x i1>
%1 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(<vscale x 2 x i1> %a)
%2 = <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %1)
%3 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(<vscale x 4 x i1> %2)
%4 = <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %3)
%5 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(<vscale x 4 x i1> %4)
%6 = <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %5)
ret <vscale x 2 x i1> %6
and will be replaced with:
ret <vscale x 2 x i1> %a
Eliminating these can sometimes mean emitting fewer unnecessary
loads/stores when lowering to assembly.
Differential Revision: https://reviews.llvm.org/D94074
-rw-r--r-- | llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 50 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll | 56 |
2 files changed, 95 insertions, 11 deletions
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 8e8b12c07bbf..9911f33371c6 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -177,22 +177,50 @@ bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { if (isa<PHINode>(I->getArgOperand(0))) return processPhiNode(I); - // If we have a reinterpret intrinsic I of type A which is converting from - // another reinterpret Y of type B, and the source type of Y is A, then we can - // elide away both reinterprets if there are no other users of Y. - auto *Y = isReinterpretToSVBool(I->getArgOperand(0)); - if (!Y) - return false; + SmallVector<Instruction *, 32> CandidatesForRemoval; + Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr; + + const auto *IVTy = cast<VectorType>(I->getType()); + + // Walk the chain of conversions. + while (Cursor) { + // If the type of the cursor has fewer lanes than the final result, zeroing + // must take place, which breaks the equivalence chain. + const auto *CursorVTy = cast<VectorType>(Cursor->getType()); + if (CursorVTy->getElementCount().getKnownMinValue() < + IVTy->getElementCount().getKnownMinValue()) + break; + + // If the cursor has the same type as I, it is a viable replacement. + if (Cursor->getType() == IVTy) + EarliestReplacement = Cursor; - Value *SourceVal = Y->getArgOperand(0); - if (I->getType() != SourceVal->getType()) + auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); + + // If this is not an SVE conversion intrinsic, this is the end of the chain. 
+ if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == + Intrinsic::aarch64_sve_convert_to_svbool || + IntrinsicCursor->getIntrinsicID() == + Intrinsic::aarch64_sve_convert_from_svbool)) + break; + + CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); + Cursor = IntrinsicCursor->getOperand(0); + } + + // If no viable replacement in the conversion chain was found, there is + // nothing to do. + if (!EarliestReplacement) return false; - I->replaceAllUsesWith(SourceVal); + I->replaceAllUsesWith(EarliestReplacement); I->eraseFromParent(); - if (Y->use_empty()) - Y->eraseFromParent(); + while (!CandidatesForRemoval.empty()) { + Instruction *Candidate = CandidatesForRemoval.pop_back_val(); + if (Candidate->use_empty()) + Candidate->eraseFromParent(); + } return true; } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll index 47e0ff8f19c7..22c61d0565af 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll @@ -67,6 +67,62 @@ define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) { ret <vscale x 16 x i1> %2 } +define <vscale x 2 x i1> @reinterpret_test_full_chain(<vscale x 2 x i1> %a) { +; OPT-LABEL: @reinterpret_test_full_chain( +; OPT: ret <vscale x 2 x i1> %a + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) + %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2) + %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3) + %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4) + %6 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x 
i1> %5) + ret <vscale x 2 x i1> %6 +} + +; The last two reinterprets are not necessary, since they are doing the same +; work as the first two. +define <vscale x 4 x i1> @reinterpret_test_partial_chain(<vscale x 2 x i1> %a) { +; OPT-LABEL: @reinterpret_test_partial_chain( +; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) +; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) +; OPT-NEXT: ret <vscale x 4 x i1> %2 + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a) + %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) + %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2) + %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3) + ret <vscale x 4 x i1> %4 +} + +; The chain cannot be reduced because of the second reinterpret, which causes +; zeroing. 
+define <vscale x 8 x i1> @reinterpret_test_irreducible_chain(<vscale x 8 x i1> %a) { +; OPT-LABEL: @reinterpret_test_irreducible_chain( +; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a) +; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) +; OPT-NEXT: %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2) +; OPT-NEXT: %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3) +; OPT-NEXT: ret <vscale x 8 x i1> %4 + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a) + %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) + %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2) + %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3) + ret <vscale x 8 x i1> %4 +} + +; Here, the candidate list is larger than the number of instructions that we +; end up removing. 
+define <vscale x 4 x i1> @reinterpret_test_keep_some_candidates(<vscale x 8 x i1> %a) { +; OPT-LABEL: @reinterpret_test_keep_some_candidates( +; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a) +; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) +; OPT-NEXT: ret <vscale x 4 x i1> %2 + %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a) + %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1) + %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2) + %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3) + ret <vscale x 4 x i1> %4 +} + define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) { ; OPT-LABEL: reinterpret_reductions ; OPT-NOT: convert |