From be653f6292e76b82d3532e6fba0e15ce633bd80a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 25 Apr 2022 16:51:57 +0100 Subject: [X86] combineX86ShuffleChain - don't fold to truncate(concat(V1,V2)) if it was already a PACK op Fixes #55050 (cherry picked from commit e8305c0b8f492272446d517e67d451f390d35dbe) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++- .../X86/vector-shuffle-combining-avx512bwvl.ll | 44 ++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 682932b8f3e6..8bb7e81e19bb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37558,7 +37558,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, (RootVT.is128BitVector() && Subtarget.hasVLX())) && (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { - if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE) + // Bail if this was already a truncation or PACK node. + // We sometimes fail to match PACK if we demand known undef elements. + if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE || + Root.getOpcode() == X86ISD::PACKSS || + Root.getOpcode() == X86ISD::PACKUS)) return SDValue(); // Nothing to do! ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index be65effbc724..3be4e6777d6f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -174,3 +174,47 @@ define <8 x i32> @PR46393(<8 x i16> %a0, i8 %a1) { %sel = select <8 x i1> %mask, <8 x i32> %shl, <8 x i32> zeroinitializer ret <8 x i32> %sel } + +define i64 @PR55050() { +; X86-LABEL: PR55050: +; X86: # %bb.0: # %entry +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: jne .LBB10_2 +; X86-NEXT: # %bb.1: # %if +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB10_2: # %exit +; X86-NEXT: movl %eax, %edx +; X86-NEXT: retl +; +; X64-LABEL: PR55050: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testb %al, %al +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +entry: + %i275 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer) + %i277 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer) + br i1 undef, label %exit, label %if + +if: + %i298 = bitcast <2 x i64> %i275 to <4 x i32> + %i299 = bitcast <2 x i64> %i277 to <4 x i32> + %i300 = shufflevector <4 x i32> %i298, <4 x i32> %i299, <4 x i32> + %i339 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %i300, <4 x i32> undef) + %i354 = shufflevector <8 x i16> %i339, <8 x i16> undef, <8 x i32> + %i356 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %i354, <8 x i16> undef) + %i357 = shufflevector <16 x i8> %i356, <16 x i8> zeroinitializer, <16 x i32> + %i361 = extractelement <16 x i8> %i357, i64 8 + %i360 = and i8 %i361, 63 + %i379 = zext i8 %i360 to i64 + br label %exit + +exit: + %res = phi i64 [ %i379, %if ], [ 0, %entry ] + ret i64 %res +} +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) -- cgit v1.2.1