Diffstat (limited to 'lib/Target/X86')
33 files changed, 790 insertions, 369 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b8ea2f011332..2a784c9822a0 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2791,7 +2791,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
                            isParsingIntelSyntax())) {
   default: llvm_unreachable("Unexpected match result!");
   case Match_Success:
-    if (validateInstruction(Inst, Operands))
+    if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
       return true;
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the
@@ -3082,7 +3082,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
   // instruction will already have been filled in correctly, since the failing
   // matches won't have modified it).
   if (NumSuccessfulMatches == 1) {
-    if (validateInstruction(Inst, Operands))
+    if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
       return true;
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the individual
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 6ff1136cd85a..0c99dbbe328b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -54,12 +54,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   if (TSFlags & X86II::LOCK)
     OS << "\tlock\t";
   if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
-    OS << "\tlock\n";
+    OS << "\tlock\t";
   if (Flags & X86::IP_HAS_REPEAT_NE)
-    OS << "\trepne\n";
+    OS << "\trepne\t";
   else if (Flags & X86::IP_HAS_REPEAT)
-    OS << "\trep\n";
+    OS << "\trep\t";
 
   // Output CALLpcrel32 as "callq" in 64-bit mode.
   // In Intel annotation it's always emitted as "call".
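The two instruction-printer fixes above replace a newline with a tab after the lock/repne/rep prefixes, so a prefix and the instruction it modifies are emitted on one assembly line. A minimal standalone sketch of the effect (not from the commit; a std::string stands in for llvm::raw_ostream):

    #include <iostream>
    #include <string>

    // Simplified model of X86ATTInstPrinter's prefix handling.
    std::string printWithLock(bool HasLock, const std::string &Inst) {
      std::string OS;
      if (HasLock)
        OS += "\tlock\t"; // was "\tlock\n", which left "lock" on its own line
      OS += Inst;
      return OS;
    }

    int main() {
      // Prints "lock" and the locked instruction on a single line, which is
      // what assemblers expect for a prefixed instruction.
      std::cout << printWithLock(true, "addl\t$1, (%rdi)") << '\n';
    }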
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 464941a1bab6..1f02600a7982 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -41,13 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   uint64_t TSFlags = Desc.TSFlags;
 
   if (TSFlags & X86II::LOCK)
-    OS << "\tlock\n";
+    OS << "\tlock\t";
 
   unsigned Flags = MI->getFlags();
   if (Flags & X86::IP_HAS_REPEAT_NE)
-    OS << "\trepne\n";
+    OS << "\trepne\t";
   else if (Flags & X86::IP_HAS_REPEAT)
-    OS << "\trep\n";
+    OS << "\trep\t";
 
   printInstruction(MI, OS);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index f4021d7639b8..34f2956e0c0b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -116,9 +116,15 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
 def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
                                    "Enable AVX2 instructions",
                                    [FeatureAVX]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+                                  "Enable three-operand fused multiple-add",
+                                  [FeatureAVX]>;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+                                   "Support 16-bit floating point conversion instructions",
+                                   [FeatureAVX]>;
 def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
                                      "Enable AVX-512 instructions",
-                                     [FeatureAVX2]>;
+                                     [FeatureAVX2, FeatureFMA, FeatureF16C]>;
 def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
                  "Enable AVX-512 Exponential and Reciprocal Instructions",
                  [FeatureAVX512]>;
@@ -154,9 +160,6 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
 def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                     "Enable packed carry-less multiplication instructions",
                     [FeatureSSE2]>;
-def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
-                                  "Enable three-operand fused multiple-add",
-                                  [FeatureAVX]>;
 def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
                                    "Enable four-operand fused multiple-add",
                                    [FeatureAVX, FeatureSSE4A]>;
@@ -177,9 +180,6 @@ def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
                                     "Support MOVBE instruction">;
 def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
                                      "Support RDRAND instruction">;
-def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
-                  "Support 16-bit floating point conversion instructions",
-                  [FeatureAVX]>;
 def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
                                        "Support FS/GS Base instructions">;
 def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 34e384ba3114..d8b5b7d3fc08 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -34,13 +34,13 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <cassert>
 #include <cstddef>
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 7beb9c6e357b..54f937bcda3e 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -34,6 +34,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
@@ -41,7 +42,6 @@
 #include "llvm/IR/Value.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <cstdint>
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index b2cd622b1e8c..e75276960cc8 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -57,6 +57,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCSchedule.h"
@@ -64,7 +65,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 744510a3a3ba..6dd4631a4844 100644
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -171,7 +171,7 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
   case X86::VALIGNDZ128rri:
   case X86::VALIGNDZ128rmi:
   case X86::VALIGNQZ128rri:
-  case X86::VALIGNQZ128rmi:
+  case X86::VALIGNQZ128rmi: {
     assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
            "Unexpected new opcode!");
     unsigned Scale = (Opc == X86::VALIGNQZ128rri ||
@@ -180,6 +180,24 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
     Imm.setImm(Imm.getImm() * Scale);
     break;
   }
+  case X86::VSHUFF32X4Z256rmi:
+  case X86::VSHUFF32X4Z256rri:
+  case X86::VSHUFF64X2Z256rmi:
+  case X86::VSHUFF64X2Z256rri:
+  case X86::VSHUFI32X4Z256rmi:
+  case X86::VSHUFI32X4Z256rri:
+  case X86::VSHUFI64X2Z256rmi:
+  case X86::VSHUFI64X2Z256rri: {
+    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
+           "Unexpected new opcode!");
+    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+    int64_t ImmVal = Imm.getImm();
+    // Set bit 5, move bit 1 to bit 4, copy bit 0.
+    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
+    break;
+  }
+  }
 }
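The immediate rewrite in performCustomAdjustments() converts a 256-bit VSHUFF/VSHUFI immediate, where bit 0 picks the 128-bit lane of the first source and bit 1 the lane of the second, into VPERM2F128/VPERM2I128 form, where bits [1:0] select the source lane for the low half and bits [5:4] for the high half (values 2-3 address the second source). A worked check of the bit arithmetic (hypothetical helper, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Mirrors "set bit 5, move bit 1 to bit 4, copy bit 0" from the patch.
    static uint8_t shufToPerm2(uint8_t ImmVal) {
      return 0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1);
    }

    int main() {
      assert(shufToPerm2(0) == 0x20); // low = src1 lane 0, high = src2 lane 0
      assert(shufToPerm2(1) == 0x21); // low = src1 lane 1, high = src2 lane 0
      assert(shufToPerm2(2) == 0x30); // low = src1 lane 0, high = src2 lane 1
      assert(shufToPerm2(3) == 0x31); // low = src1 lane 1, high = src2 lane 1
    }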
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index b2b5a78fcdbf..9664c931c35e 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -55,9 +55,9 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
 #define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 9f649dad8bc0..bbc2bffdb703 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -22,9 +22,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
 namespace llvm {
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 5582526541ba..7bcbe1991249 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -37,11 +37,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 988f2967401b..86e65b83ffa9 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1562,6 +1562,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   bool HasFP = hasFP(MF);
   uint64_t NumBytes = 0;
 
+  bool NeedsDwarfCFI =
+      (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+       !MF.getTarget().getTargetTriple().isOSWindows()) &&
+      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+
   if (IsFunclet) {
     assert(HasFP && "EH funclets without FP not yet implemented");
     NumBytes = getWinEHFuncletFrameSize(MF);
@@ -1584,6 +1589,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
             MachineFramePtr)
         .setMIFlag(MachineInstr::FrameDestroy);
+    if (NeedsDwarfCFI) {
+      unsigned DwarfStackPtr =
+          TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+                                  nullptr, DwarfStackPtr, -SlotSize));
+      --MBBI;
+    }
   }
 
   MachineBasicBlock::iterator FirstCSPop = MBBI;
@@ -1647,6 +1659,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   } else if (NumBytes) {
     // Adjust stack pointer back: ESP += numbytes.
     emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+    if (!hasFP(MF) && NeedsDwarfCFI) {
+      // Define the current CFA rule to use the provided offset.
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+                                  nullptr, -CSSize - SlotSize));
+    }
     --MBBI;
   }
 
@@ -1659,6 +1676,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (NeedsWin64CFI && MF.hasWinCFI())
     BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
 
+  if (!hasFP(MF) && NeedsDwarfCFI) {
+    MBBI = FirstCSPop;
+    int64_t Offset = -CSSize - SlotSize;
+    // Mark callee-saved pop instruction.
+    // Define the current CFA rule to use the provided offset.
+    while (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator PI = MBBI;
+      unsigned Opc = PI->getOpcode();
+      ++MBBI;
+      if (Opc == X86::POP32r || Opc == X86::POP64r) {
+        Offset += SlotSize;
+        BuildCFI(MBB, MBBI, DL,
+                 MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+      }
+    }
+  }
+
   if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
     // Add the return addr area delta back since we are not tail calling.
     int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -2577,6 +2611,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
   unsigned Regs[2];
   unsigned FoundRegs = 0;
 
+  auto &MRI = MBB.getParent()->getRegInfo();
   auto RegMask = Prev->getOperand(1);
 
   auto &RegClass =
@@ -2590,6 +2625,10 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
     if (!RegMask.clobbersPhysReg(Candidate))
       continue;
 
+    // Don't clobber reserved registers
+    if (MRI.isReserved(Candidate))
+      continue;
+
     bool IsDef = false;
     for (const MachineOperand &MO : Prev->implicit_operands()) {
       if (MO.isReg() && MO.isDef() &&
@@ -2835,6 +2874,15 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
   return MBBI;
 }
 
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+  return TRI->getSlotSize();
+}
+
+unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
+    const {
+  return TRI->getDwarfRegNum(StackPtr, true);
+}
+
 namespace {
 // Struct used by orderFrameObjects to help sort the stack objects.
 struct X86FrameSortingObject {
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 38ac96e16d4e..2bce79262d01 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
@@ -168,6 +168,10 @@ public:
                                               MachineBasicBlock::iterator MBBI,
                                               const DebugLoc &DL,
                                               bool RestoreSP = false) const;
 
+  int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+  unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+
 private:
   uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
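The epilogue changes re-state the CFA rule as the stack shrinks: after the frame-pointer pop the CFA is redefined relative to RSP/ESP, and each callee-saved pop then advances the recorded offset by SlotSize. A rough standalone model of the pop-scanning loop's arithmetic (values and register names illustrative only; x86-64 SlotSize is 8):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      const int64_t SlotSize = 8;
      const int64_t CSSize = 16; // two callee-saved registers on the stack
      int64_t Offset = -CSSize - SlotSize; // starting value from the patch
      std::vector<const char *> Pops = {"popq %rbx", "popq %r14"};
      for (const char *P : Pops) {
        // Each POP moves the stack pointer one slot closer to the CFA,
        // so a fresh .cfi_def_cfa_offset must follow it.
        Offset += SlotSize;
        std::cout << P << " -> .cfi_def_cfa_offset " << -Offset << '\n';
      }
    }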
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e43fd508de30..0cbf76017900 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -449,15 +449,15 @@ namespace {
   // Returns true if this masked compare can be implemented legally with this
   // type.
   static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
-    if (N->getOpcode() == X86ISD::PCMPEQM ||
-        N->getOpcode() == X86ISD::PCMPGTM ||
-        N->getOpcode() == X86ISD::CMPM ||
-        N->getOpcode() == X86ISD::CMPMU) {
+    unsigned Opcode = N->getOpcode();
+    if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
+        Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
+        Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU) {
       // We can get 256-bit 8 element types here without VLX being enabled. When
       // this happens we will use 512-bit operations and the mask will not be
       // zero extended.
-      if (N->getOperand(0).getValueType() == MVT::v8i32 ||
-          N->getOperand(0).getValueType() == MVT::v8f32)
+      EVT OpVT = N->getOperand(0).getValueType();
+      if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
         return Subtarget->hasVLX();
 
       return true;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b178ad6c13e7..22b4d7997fa6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -380,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Special handling for half-precision floating point conversions.
   // If we don't have F16C support, then lower half float conversions
   // into library calls.
-  if (Subtarget.useSoftFloat() ||
-      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
   }
@@ -4998,6 +4997,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   switch (Opcode) {
   default:
     return false;
+  case X86ISD::TESTM:
+  case X86ISD::TESTNM:
   case X86ISD::PCMPEQM:
   case X86ISD::PCMPGTM:
   case X86ISD::CMPM:
@@ -6746,6 +6747,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Unsupported vector type for broadcast.");
 
+  BitVector UndefElements;
+  SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
   // Attempt to use VBROADCASTM
   // From this paterrn:
   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
@@ -6753,17 +6757,23 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   //
   // Create (VBROADCASTM v2i1 X)
   if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
-    MVT EltType;
-    unsigned NumElts;
+    MVT EltType = VT.getScalarType();
+    unsigned NumElts = VT.getVectorNumElements();
+    SDValue BOperand;
     SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
-    if (ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) {
-      SDValue BOperand = ZeroExtended.getOperand(0);
+    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
+        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
+         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
+      if (ZeroExtended)
+        BOperand = ZeroExtended.getOperand(0);
+      else
+        BOperand = Ld.getOperand(0).getOperand(0);
       if (BOperand.getValueType().isVector() &&
           BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
-        if ((EltType == MVT::i64 &&
-             VT.getVectorElementType() == MVT::i8) || // for broadcastmb2q
-            (EltType == MVT::i32 &&
-             VT.getVectorElementType() == MVT::i16)) { // for broadcastmw2d
+        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+                                     NumElts == 8)) || // for broadcastmb2q
+            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+                                     NumElts == 16))) { // for broadcastmw2d
           SDValue Brdcst =
               DAG.getNode(X86ISD::VBROADCASTM, dl,
                           MVT::getVectorVT(EltType, NumElts), BOperand);
@@ -6773,9 +6783,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
     }
   }
 
-  BitVector UndefElements;
-  SDValue Ld = BVOp->getSplatValue(&UndefElements);
-
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
   if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
@@ -7707,6 +7714,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+//               (extract_elt V, (extract_elt I, 1)),
+//                    ...
+// ->
+//   (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
+// when no native operation available.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  // Look for VPERMV and PSHUFB opportunities.
+  MVT VT = V.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v16i8:
+    if (!Subtarget.hasSSE3())
+      return SDValue();
+    break;
+  case MVT::v8f32:
+  case MVT::v8i32:
+    if (!Subtarget.hasAVX2())
+      return SDValue();
+    break;
+  case MVT::v4i64:
+  case MVT::v4f64:
+    if (!Subtarget.hasVLX())
+      return SDValue();
+    break;
+  case MVT::v16f32:
+  case MVT::v8f64:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    if (!Subtarget.hasAVX512())
+      return SDValue();
+    break;
+  case MVT::v32i16:
+    if (!Subtarget.hasBWI())
+      return SDValue();
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
+      return SDValue();
+    break;
+  case MVT::v64i8:
+    if (!Subtarget.hasVBMI())
+      return SDValue();
+    break;
+  case MVT::v32i8:
+    if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
+      return SDValue();
+    break;
+  }
+  SDValue SrcVec, IndicesVec;
+  // Check for a match of the permute source vector and permute index elements.
+  // This is done by checking that the i-th build_vector operand is of the form:
+  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+    SDValue Op = V.getOperand(Idx);
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // If this is the first extract encountered in V, set the source vector,
+    // otherwise verify the extract is from the previously defined source
+    // vector.
+    if (!SrcVec)
+      SrcVec = Op.getOperand(0);
+    else if (SrcVec != Op.getOperand(0))
+      return SDValue();
+    SDValue ExtractedIndex = Op->getOperand(1);
+    // Peek through extends.
+    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+      ExtractedIndex = ExtractedIndex.getOperand(0);
+    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // If this is the first extract from the index vector candidate, set the
+    // indices vector, otherwise verify the extract is from the previously
+    // defined indices vector.
+    if (!IndicesVec)
+      IndicesVec = ExtractedIndex.getOperand(0);
+    else if (IndicesVec != ExtractedIndex.getOperand(0))
+      return SDValue();
+
+    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+    if (!PermIdx || PermIdx->getZExtValue() != Idx)
+      return SDValue();
+  }
+  MVT IndicesVT = VT;
+  if (VT.isFloatingPoint())
+    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
+                                 VT.getVectorNumElements());
+  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+  return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
+                     SDLoc(V), VT, IndicesVec, SrcVec);
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -7922,6 +8034,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (IsAllConstants)
     return SDValue();
 
+  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+    return V;
+
   // See if we can use a vector load to get all of the elements.
   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
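LowerBUILD_VECTORAsVariablePermute recognizes a gather-by-lane idiom: every build_vector operand extracts from one source vector at a position taken from the matching lane of one index vector. In scalar terms the matched pattern computes the following, which a single VPERMV (or PSHUFB for v16i8) performs in one instruction (illustrative scalar model, not from the patch):

    #include <array>
    #include <cstdint>
    #include <iostream>

    int main() {
      // out[i] = src[idx[i]] for every lane i -- the per-element semantics
      // of the BUILD_VECTOR chain matched above (v8i32 -> vpermd on AVX2).
      std::array<int32_t, 8> src = {10, 11, 12, 13, 14, 15, 16, 17};
      std::array<int32_t, 8> idx = {7, 6, 5, 4, 3, 2, 1, 0};
      std::array<int32_t, 8> out;
      for (std::size_t i = 0; i < out.size(); ++i)
        out[i] = src[idx[i]];
      for (int32_t v : out)
        std::cout << v << ' '; // 17 16 15 14 13 12 11 10
      std::cout << '\n';
    }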
@@ -10716,10 +10831,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
-  if (Subtarget.hasSSSE3())
+  if (Subtarget.hasSSSE3()) {
+    if (Subtarget.hasVLX())
+      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Rotate;
+
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
+  }
 
   // If we have direct support for blends, we should lower by decomposing into
   // a permute. That will be faster than the domain cross.
@@ -11016,10 +11137,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
-  if (Subtarget.hasSSSE3())
+  if (Subtarget.hasSSSE3()) {
+    if (Subtarget.hasVLX())
+      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Rotate;
+
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
+  }
 
   // Assume that a single SHUFPS is faster than an alternative sequence of
   // multiple instructions (even if the CPU has a domain penalty).
@@ -12372,6 +12499,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
       }
     }
+
+    // Try to use SHUF128 if possible.
+    if (Subtarget.hasVLX()) {
+      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+                            ((WidenedMask[1] % 2) << 1);
+        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                           DAG.getConstant(PermMask, DL, MVT::i8));
+      }
+    }
   }
 
   // Otherwise form a 128-bit permutation. After accounting for undefs,
@@ -13697,10 +13834,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  if (SDValue Shuf128 =
-          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
-    return Shuf128;
-
   if (V2.isUndef()) {
     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
     // can use lower latency instructions that will operate on all four
@@ -13722,6 +13855,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                        getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
   }
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
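For the SHUF128 lowering above, each widened mask element names one of the four 128-bit halves across both inputs; once the guard has pinned the low result half to V1 and the high half to V2, only the low bit of each index is still free. A quick check of the mask construction (hypothetical helper mirroring the patch):

    #include <cassert>

    // PermMask built in lowerV2X128VectorShuffle(): bit 0 = which half of V1
    // feeds the low 128 bits, bit 1 = which half of V2 feeds the high 128 bits.
    static unsigned shuf128Mask(int M0, int M1) {
      return ((M0 % 2) << 0) | ((M1 % 2) << 1);
    }

    int main() {
      assert(shuf128Mask(0, 2) == 0); // V1 low half,  V2 low half
      assert(shuf128Mask(1, 2) == 1); // V1 high half, V2 low half
      assert(shuf128Mask(0, 3) == 2); // V1 low half,  V2 high half
      assert(shuf128Mask(1, 3) == 3); // V1 high half, V2 high half
    }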
@@ -17333,6 +17470,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
 
   if (Swap)
     std::swap(Op0, Op1);
+
+  // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
+  if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
+    SDValue A = peekThroughBitcasts(Op0);
+    if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
+        ISD::isBuildVectorAllZeros(Op1.getNode())) {
+      MVT VT0 = Op0.getSimpleValueType();
+      SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
+      SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
+      return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
+                         dl, VT, RHS, LHS);
+    }
+  }
+
   if (Opc)
     return DAG.getNode(Opc, dl, VT, Op0, Op1);
   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
@@ -19838,10 +19989,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       else
         PassThru = Src1;
 
-      SDValue Rnd = Op.getOperand(5);
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd))
+          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+                                                  Op.getValueType(), Src1, Src2,
+                                                  Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+      }
+
       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                               Op.getValueType(), Src1, Src2,
-                                              Src3, Rnd),
+                                              Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
     case IFMA_OP_MASKZ:
@@ -24786,9 +24946,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   case X86ISD::FMINC:              return "X86ISD::FMINC";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
-  case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
   case X86ISD::FRCP:               return "X86ISD::FRCP";
-  case X86ISD::FRCPS:              return "X86ISD::FRCPS";
   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
@@ -24942,10 +25100,18 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
+  case X86ISD::FMADDS1:            return "X86ISD::FMADDS1";
+  case X86ISD::FNMADDS1:           return "X86ISD::FNMADDS1";
+  case X86ISD::FMSUBS1:            return "X86ISD::FMSUBS1";
+  case X86ISD::FNMSUBS1:           return "X86ISD::FNMSUBS1";
   case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
   case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
   case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
   case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
+  case X86ISD::FMADDS3:            return "X86ISD::FMADDS3";
+  case X86ISD::FNMADDS3:           return "X86ISD::FNMADDS3";
+  case X86ISD::FMSUBS3:            return "X86ISD::FMSUBS3";
+  case X86ISD::FNMSUBS3:           return "X86ISD::FNMSUBS3";
   case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
   case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
   case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
@@ -24966,9 +25132,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SELECT:             return "X86ISD::SELECT";
   case X86ISD::SELECTS:            return "X86ISD::SELECTS";
   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
+  case X86ISD::RCP14:              return "X86ISD::RCP14";
+  case X86ISD::RCP14S:             return "X86ISD::RCP14S";
   case X86ISD::RCP28:              return "X86ISD::RCP28";
   case X86ISD::RCP28S:             return "X86ISD::RCP28S";
   case X86ISD::EXP2:               return "X86ISD::EXP2";
+  case X86ISD::RSQRT14:            return "X86ISD::RSQRT14";
+  case X86ISD::RSQRT14S:           return "X86ISD::RSQRT14S";
   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
@@ -25006,6 +25176,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
   case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
   case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
+  case X86ISD::CVTPH2PS_RND:       return "X86ISD::CVTPH2PS_RND";
   case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
   case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
   case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
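The LowerIntVSETCC_AVX512 change folds an equality compare of (and A, B) against zero into the mask-producing test instructions. Per element the two directions behave like this (scalar sketch under my reading of the hunk):

    #include <cassert>
    #include <cstdint>

    // cmpeq(and(a, b), 0)  -> testnm(a, b): mask bit set when a & b == 0.
    static bool testnm(uint32_t a, uint32_t b) { return (a & b) == 0; }
    // cmpneq(and(a, b), 0) -> testm(a, b):  mask bit set when a & b != 0.
    static bool testm(uint32_t a, uint32_t b) { return (a & b) != 0; }

    int main() {
      assert(testnm(0x0F, 0xF0)); // disjoint bit sets
      assert(testm(0xFF, 0x10)); // overlapping bit sets
      assert(testnm(1, 2) == ((1u & 2u) == 0));
    }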
@@ -30314,10 +30485,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
-  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
     return NewOp;
 
-  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+  // TODO - Remove this once we can handle the implicit zero-extension of
+  // X86ISD::PEXTRW/X86ISD::PEXTRB in:
+  // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+  // combineBasicSADPattern.
+  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
     return NewOp;
 
   SDValue InputVector = N->getOperand(0);
@@ -30464,16 +30642,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-// TODO - merge with combineExtractVectorElt once it can handle the implicit
-// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
-// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
-// combineBasicSADPattern.
-static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           const X86Subtarget &Subtarget) {
-  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
-}
-
 /// If a vector select has an operand that is -1 or 0, try to simplify the
 /// select to a bitwise logic operation.
 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
@@ -30674,26 +30842,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
   unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
-  case X86ISD::PALIGNR:
-    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
-    if (!VT.is128BitVector())
-      return false;
-    Opcode = X86ISD::VALIGN;
-    LLVM_FALLTHROUGH;
-  case X86ISD::VALIGN: {
-    if (EltVT != MVT::i32 && EltVT != MVT::i64)
-      return false;
-    uint64_t Imm = Op.getConstantOperandVal(2);
-    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
-    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
-    unsigned EltSize = EltVT.getSizeInBits();
-    // Make sure we can represent the same shift with the new VT.
-    if ((ShiftAmt % EltSize) != 0)
-      return false;
-    Imm = ShiftAmt / EltSize;
-    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
-                                    DAG.getConstant(Imm, DL, MVT::i8));
-  }
   case X86ISD::SHUF128: {
     if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
       return false;
@@ -34441,8 +34589,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// This function transforms vector truncation of 'extended sign-bits' values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+/// This function transforms vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
@@ -34475,10 +34624,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
   unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
-  if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
-    return SDValue();
+  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
+    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+  // Use PACKUS if the input has zero-bits that extend all the way to the
+  // packed/truncated value. e.g. masks, zext_in_reg, etc.
+  KnownBits Known;
+  DAG.computeKnownBits(In, Known);
+  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
 
-  return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+  return SDValue();
 }
 
 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
@@ -34507,7 +34665,7 @@
     return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
-  // Try to truncate extended sign bits with PACKSS.
+  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
     return V;
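combineVectorSignBitsTruncation now chooses between PACKSS (enough extended sign bits) and PACKUS (enough known leading zero bits). With SSE intrinsics, the two pack shapes it ultimately relies on look like this (illustrative; _mm_packus_epi32 needs SSE4.1, matching the hasSSE41() guard above). Both instructions saturate, which is why the sign/zero-bit preconditions matter:

    #include <smmintrin.h> // SSE4.1
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Lanes are sign-extended 16-bit values: PACKSS truncates losslessly
      // because saturation never triggers.
      __m128i s = _mm_set_epi32(-1, 2, -3, 4);
      __m128i viaPackss = _mm_packs_epi32(s, s);
      // Lanes have all-zero upper 16 bits: PACKUS truncates losslessly.
      __m128i z = _mm_set_epi32(1, 2, 3, 4);
      __m128i viaPackus = _mm_packus_epi32(z, z);
      std::printf("%d %d\n", (int16_t)_mm_extract_epi16(viaPackss, 3),
                  (uint16_t)_mm_extract_epi16(viaPackus, 3)); // -1 1
    }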
@@ -35341,9 +35499,11 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
 
   // Do not convert the passthru input of scalar intrinsics.
   // FIXME: We could allow negations of the lower element only.
-  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
+              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
   bool NegB = invertIfNegative(B);
-  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
+              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
 
   // Negative multiplication when NegA xor NegB
   bool NegMul = (NegA != NegB);
@@ -35371,6 +35531,20 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
     }
+  } else if (N->getOpcode() == X86ISD::FMADDS1) {
+    switch (NewOpcode) {
+    case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
+    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS3) {
+    switch (NewOpcode) {
+    case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
+    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
+    }
   } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
     switch (NewOpcode) {
     case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
@@ -36590,10 +36764,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default: break;
   case ISD::EXTRACT_VECTOR_ELT:
-    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case X86ISD::PEXTRW:
   case X86ISD::PEXTRB:
-    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case ISD::INSERT_SUBVECTOR:
     return combineInsertSubvector(N, DAG, DCI, Subtarget);
   case ISD::EXTRACT_SUBVECTOR:
@@ -36689,6 +36862,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMADD_RND:
   case X86ISD::FMADDS1_RND:
   case X86ISD::FMADDS3_RND:
+  case X86ISD::FMADDS1:
+  case X86ISD::FMADDS3:
   case ISD::FMA: return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER: return combineGatherScatter(N, DAG);
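The new FMADDS1/FMADDS3 node pairs are scalar FMAs distinguished only by which operand supplies the untouched upper vector elements (operand 1 vs. operand 3); that is also why combineFMA above refuses to negate the pass-through operand. A scalar model of the distinction (my reading of the node semantics, not authoritative):

    #include <array>
    #include <cassert>

    using V4 = std::array<float, 4>;

    // FMADDS1: lane 0 = a*b + c, upper lanes pass through from operand 1.
    static V4 fmadds1(V4 a, V4 b, V4 c) {
      V4 r = a;
      r[0] = a[0] * b[0] + c[0];
      return r;
    }

    // FMADDS3: same lane-0 arithmetic, pass-through from operand 3.
    static V4 fmadds3(V4 a, V4 b, V4 c) {
      V4 r = c;
      r[0] = a[0] * b[0] + c[0];
      return r;
    }

    int main() {
      V4 a{2, 9, 9, 9}, b{3, 0, 0, 0}, c{4, 7, 7, 7};
      assert(fmadds1(a, b, c)[0] == 10.0f && fmadds1(a, b, c)[1] == 9.0f);
      assert(fmadds3(a, b, c)[0] == 10.0f && fmadds3(a, b, c)[1] == 7.0f);
    }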
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 17cb976a4c77..d1438e59f9b6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -254,7 +254,9 @@ namespace llvm {
       /// Note that these typically require refinement
       /// in order to obtain suitable precision.
       FRSQRT, FRCP,
-      FRSQRTS, FRCPS,
+
+      // AVX-512 reciprocal approximations with a little more precision.
+      RSQRT14, RSQRT14S, RCP14, RCP14S,
 
       // Thread Local Storage.
       TLSADDR,
@@ -487,6 +489,12 @@ namespace llvm {
       FMADDSUB_RND,
       FMSUBADD_RND,
 
+      // Scalar intrinsic FMA.
+      FMADDS1, FMADDS3,
+      FNMADDS1, FNMADDS3,
+      FMSUBS1, FMSUBS3,
+      FNMSUBS1, FNMSUBS3,
+
       // Scalar intrinsic FMA with rounding mode.
       // Two versions, passthru bits on op1 or op3.
       FMADDS1_RND, FMADDS3_RND,
@@ -555,7 +563,7 @@ namespace llvm {
       RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
 
       // Conversions between float and half-float.
-      CVTPS2PH, CVTPH2PS,
+      CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
 
       // LWP insert record.
       LWPINS,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index a73ee19423d3..84b44ac677bf 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -6017,16 +6017,17 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
 }
 
 multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                            string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
-                            SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+                            string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+                            SDNode OpNodeRnds1, SDNode OpNodes3,
+                            SDNode OpNodeRnds3, X86VectorVTInfo _,
+                            string SUFF> {
   let ExeDomain = _.ExeDomain in {
     defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
                 // Operands for intrinsic are in 123 order to preserve passthu
                 // semantics.
-                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
-                       (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
-                       _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
+                (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
+                       _.ScalarIntMemCPat:$src3)),
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
                        (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -6035,10 +6036,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                          (_.ScalarLdFrag addr:$src3)))), 0>;
 
     defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
-                (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
-                       (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
-                       _.RC:$src1, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
+                (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
+                       _.RC:$src1)),
                 (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
                         (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -6050,8 +6050,8 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
     // 213 and 231 patterns this helps tablegen's duplicate pattern detection.
     defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
                 (null_frag),
-                (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
-                       _.RC:$src2, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
+                       _.RC:$src2)),
                 (null_frag),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
                                                   _.FRC:$src2))),
@@ -6061,26 +6061,29 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
 }
 
 multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                        string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                        string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+                        SDNode OpNodeRnds1, SDNode OpNodes3,
                         SDNode OpNodeRnds3> {
   let Predicates = [HasAVX512] in {
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                                 OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+                                 OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+                                 f32x_info, "SS">,
                                  EVEX_CD8<32, CD8VT1>, VEX_LIG;
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                                 OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+                                 OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+                                 f64x_info, "SD">,
                                  EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
   }
 }
 
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
-                           X86FmaddRnds3>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
-                           X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
-                            X86FnmaddRnds1, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
-                            X86FnmsubRnds1, X86FnmsubRnds3>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
+                           X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
+                           X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
+                            X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
+                            X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -6554,7 +6557,7 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
                                             NotMemoryFoldable;
 
 def : Pat<(f64 (fpextend FR32X:$src)),
-          (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
+          (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
           Requires<[HasAVX512]>;
 def : Pat<(f64 (fpextend (loadf32 addr:$src))),
           (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6569,7 +6572,7 @@ def : Pat<(f64 (extloadf32 addr:$src)),
       Requires<[HasAVX512, OptForSpeed]>;
 
 def : Pat<(f32 (fpround FR64X:$src)),
-          (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
+          (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
          Requires<[HasAVX512]>;
 
 def : Pat<(v4f32 (X86Movss
@@ -7174,34 +7177,44 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
 //===----------------------------------------------------------------------===//
 multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop, PatFrag ld_frag> {
-  defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
-                            "vcvtph2ps", "$src", "$src",
-                            (X86cvtph2ps (_src.VT _src.RC:$src),
-                                         (i32 FROUND_CURRENT))>, T8PD;
-  defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
-                            "vcvtph2ps", "$src", "$src",
-                            (X86cvtph2ps (_src.VT
-                                          (bitconvert (ld_frag addr:$src))),
-                                         (i32 FROUND_CURRENT))>, T8PD;
+  defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+                            (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+                            (X86cvtph2ps (_src.VT _src.RC:$src))>, T8PD;
+  defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+                            (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
+                            (X86cvtph2ps (_src.VT
+                                          (bitconvert
+                                           (ld_frag addr:$src))))>, T8PD;
 }
 
 multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
-  defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
-                            "vcvtph2ps", "{sae}, $src", "$src, {sae}",
-                            (X86cvtph2ps (_src.VT _src.RC:$src),
-                                         (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+  defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
+                            (ins _src.RC:$src), "vcvtph2ps",
+                            "{sae}, $src", "$src, {sae}",
+                            (X86cvtph2psRnd (_src.VT _src.RC:$src),
+                                            (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
 }
 
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512] in
   defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
                     avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
-  let Predicates = [HasVLX] in {
-    defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
-                         loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
-    defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
-                         loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
-  }
+
+let Predicates = [HasVLX] in {
+  defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+                       loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+  defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+                       loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+
+  // Pattern match vcvtph2ps of a scalar i64 load.
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (VCVTPH2PSZ128rm addr:$src)>;
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+            (VCVTPH2PSZ128rm addr:$src)>;
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VCVTPH2PSZ128rm addr:$src)>;
 }
 
 multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
@@ -7212,17 +7225,16 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            (X86cvtps2ph (_src.VT _src.RC:$src1),
                                         (i32 imm:$src2)),
                            NoItinerary, 0, 0>, AVX512AIi8Base;
-  def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
-             (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-             [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                            (i32 imm:$src2))),
-                     addr:$dst)]>;
-  let hasSideEffects = 0, mayStore = 1 in
-  def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
-             (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-             []>, EVEX_K;
+  let hasSideEffects = 0, mayStore = 1 in {
+    def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               []>;
+    def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+               []>, EVEX_K;
+  }
 }
 
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
   let hasSideEffects = 0 in
@@ -7242,6 +7254,19 @@ let Predicates = [HasAVX512] in {
     defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
                          EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
   }
+
+  def : Pat<(store (f64 (extractelt
+                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+                         (iPTR 0))), addr:$dst),
+            (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+  def : Pat<(store (i64 (extractelt
+                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+                         (iPTR 0))), addr:$dst),
+            (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+  def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
+            (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
+  def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
+            (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
 }
 
 // Patterns for matching conversions from float to half-float and vice versa.
@@ -7264,35 +7289,6 @@ let Predicates = [HasVLX] in {
                          (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X),
                           4)), FR32X)) >;
 }
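With AVX512F now implying F16C (see the X86.td hunk earlier), half-to-float conversions lower to the vcvtph2ps/vcvtps2ph forms handled above instead of library calls. For reference, a small user-level round trip through the standard F16C intrinsics (not part of the patch; build with -mf16c):

    #include <immintrin.h>
    #include <cstdio>

    int main() {
      // VCVTPS2PH packs four floats to 16-bit halves; the immediate picks
      // the rounding mode, like the $rc operand in the patterns above.
      __m128 f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
      __m128i h = _mm_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT);
      __m128 back = _mm_cvtph_ps(h); // VCVTPH2PS widens them again
      alignas(16) float out[4];
      _mm_store_ps(out, back);
      std::printf("%g %g\n", out[0], out[3]); // 1 4
    }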
-// Patterns for matching float to half-float conversion when AVX512 is supported
-// but F16C isn't. In that case we have to use 512-bit vectors.
-let Predicates = [HasAVX512, NoVLX, NoF16C] in {
-  def : Pat<(fp_to_f16 FR32X:$src),
-            (i16 (EXTRACT_SUBREG
-                  (VMOVPDI2DIZrr
-                   (v8i16 (EXTRACT_SUBREG
-                           (VCVTPS2PHZrr
-                            (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                             (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
-                             sub_xmm), 4), sub_xmm))), sub_16bit))>;
-
-  def : Pat<(f16_to_fp GR16:$src),
-            (f32 (COPY_TO_REGCLASS
-                  (v4f32 (EXTRACT_SUBREG
-                          (VCVTPH2PSZrr
-                           (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
-                            (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
-                            sub_xmm)), sub_xmm)), FR32X))>;
-
-  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
-            (f32 (COPY_TO_REGCLASS
-                  (v4f32 (EXTRACT_SUBREG
-                          (VCVTPH2PSZrr
-                           (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                            (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
-                            sub_xmm), 4)), sub_xmm)), FR32X))>;
-}
-
 // Unordered/Ordered scalar fp compare with Sea and set EFLAGS
 multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
                               string OpcodeStr> {
@@ -7362,13 +7358,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
   }
 }
 
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, f32x_info>,
                 EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, f64x_info>,
                 VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, f32x_info>,
                   EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, f64x_info>,
                   VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
 
 /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
@@ -7414,8 +7410,8 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
   }
 }
 
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14>;
 
 /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
 multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -7582,7 +7578,8 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
-                              string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+                              string SUFF, SDNode OpNode, SDNode OpNodeRnd,
+                              Intrinsic Intr> {
   let ExeDomain = _.ExeDomain in {
     defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -7618,21 +7615,35 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
     }
   }
 
+let Predicates = [HasAVX512] in {
   def : Pat<(_.EltVT (OpNode _.FRC:$src)),
             (!cast<Instruction>(NAME#SUFF#Zr)
                 (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
 
+  def : Pat<(Intr VR128X:$src),
+            (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
+                                                  VR128X:$src)>;
+}
+
+let Predicates = [HasAVX512, OptForSize] in {
   def : Pat<(_.EltVT (OpNode (load addr:$src))),
             (!cast<Instruction>(NAME#SUFF#Zm)
-                (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+                (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+
+  def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
+            (!cast<Instruction>(NAME#SUFF#Zm_Int)
+                (_.VT (IMPLICIT_DEF)), addr:$src2)>;
+}
+
 }
 
 multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
   defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
-                        X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS,
-                        NotMemoryFoldable;
+                        X86fsqrtRnds, int_x86_sse_sqrt_ss>,
+                        EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
   defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
-                        X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
+                        X86fsqrtRnds, int_x86_sse2_sqrt_sd>,
+                        EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
                         NotMemoryFoldable;
 }
 
@@ -7641,19 +7652,6 @@ defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
 defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
 
-let Predicates = [HasAVX512] in {
-  def : Pat<(f32 (X86frsqrt FR32X:$src)),
-            (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
-  def : Pat<(f32 (X86frsqrt (load addr:$src))),
-            (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
-            Requires<[OptForSize]>;
-  def : Pat<(f32 (X86frcp FR32X:$src)),
-            (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
-  def : Pat<(f32 (X86frcp (load addr:$src))),
-            (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
-            Requires<[OptForSize]>;
-}
-
 multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
                                   X86VectorVTInfo _> {
@@ -8911,6 +8909,123 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
                avx512vl_i8_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
 
+// Fragments to help convert valignq into masked valignd. Or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+                                        X86VectorVTInfo From, X86VectorVTInfo To,
+                                        SDNodeXForm ImmXForm> {
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+                                                  To.RC:$src1, To.RC:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+                                                   To.RC:$src1, To.RC:$src2,
+                                                   (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert (To.LdFrag addr:$src2)),
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+                                                  To.RC:$src1, addr:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert (To.LdFrag addr:$src2)),
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+                                                   To.RC:$src1, addr:$src2,
+                                                   (ImmXForm imm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+                                           X86VectorVTInfo From,
+                                           X86VectorVTInfo To,
+                                           SDNodeXForm ImmXForm> :
+      avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+  def : Pat<(From.VT (OpNode From.RC:$src1,
+                             (bitconvert (To.VT (X86VBroadcast
+                                                 (To.ScalarLdFrag addr:$src2)))),
+                             imm:$src3)),
+            (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert
+                                               (To.VT (X86VBroadcast
+                                                       (To.ScalarLdFrag addr:$src2)))),
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+                                                   To.RC:$src1, addr:$src2,
+                                                   (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert
+                                               (To.VT (X86VBroadcast
+                                                       (To.ScalarLdFrag addr:$src2)))),
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+                                                    To.RC:$src1, addr:$src2,
+                                                    (ImmXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX512] in {
+  // For 512-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+                                         v16i32_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX] in {
+  // For 128-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+                                         v4i32x_info, ValignqImm32XForm>;
+  // For 256-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+                                         v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR.
+  defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+                                      v16i8x_info, ValignqImm8XForm>;
+  defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+                                      v16i8x_info, ValigndImm8XForm>;
+}
+
 defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
                 avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
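The SDNodeXForm fragments above rescale the rotation immediate when a masked VALIGNQ is re-expressed on narrower elements: rotating by N quadwords moves the same bytes as rotating by 2N dwords or 8N bytes. A quick sanity check of the scale factors (hypothetical helpers):

    #include <cassert>

    // Scale factors from ValignqImm32XForm / ValignqImm8XForm / ValigndImm8XForm.
    static unsigned valignqToValignd(unsigned Imm) { return Imm * 2; }
    static unsigned valignqToPalignr(unsigned Imm) { return Imm * 8; }
    static unsigned valigndToPalignr(unsigned Imm) { return Imm * 4; }

    int main() {
      // One v2i64 element == two v4i32 elements == eight v16i8 bytes.
      assert(valignqToValignd(1) == 2);
      assert(valignqToPalignr(1) == 8);
      assert(valigndToPalignr(2) == 8);
    }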
- let Predicates = [HasFMA] in { - def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), + let Predicates = [HasFMA, NoAVX512] in { + def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)), (!cast<Instruction>(NAME#"213SSr_Int") VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), + def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)), (!cast<Instruction>(NAME#"213SDr_Int") VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(IntF32 VR128:$src1, VR128:$src2, sse_load_f32:$src3), + def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, + sse_load_f32:$src3)), (!cast<Instruction>(NAME#"213SSm_Int") VR128:$src1, VR128:$src2, sse_load_f32:$src3)>; - def : Pat<(IntF64 VR128:$src1, VR128:$src2, sse_load_f64:$src3), + def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, + sse_load_f64:$src3)), (!cast<Instruction>(NAME#"213SDm_Int") VR128:$src1, VR128:$src2, sse_load_f64:$src3)>; - def : Pat<(IntF32 VR128:$src1, sse_load_f32:$src3, VR128:$src2), + def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3, + VR128:$src2)), (!cast<Instruction>(NAME#"132SSm_Int") VR128:$src1, VR128:$src2, sse_load_f32:$src3)>; - def : Pat<(IntF64 VR128:$src1, sse_load_f64:$src3, VR128:$src2), + def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3, + VR128:$src2)), (!cast<Instruction>(NAME#"132SDm_Int") VR128:$src1, VR128:$src2, sse_load_f64:$src3)>; } } -defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, - int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG; -defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, - int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG; +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG; +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG; -defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, - int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG; -defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, - int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG; +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>, + VEX_LIG; +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>, + VEX_LIG; //===----------------------------------------------------------------------===// @@ -385,26 +385,28 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in } multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, - ComplexPattern mem_cpat, Intrinsic Int> { + ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> { let isCodeGenOnly = 1 in { def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG; + (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W, + VEX_LIG; def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, - mem_cpat:$src3))]>, VEX_W, VEX_LIG; + [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2, + mem_cpat:$src3)))]>, VEX_W, VEX_LIG; def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int 
VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; + (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>, + VEX_LIG; let hasSideEffects = 0 in def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), @@ -475,19 +477,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { let ExeDomain = SSEPackedSingle in { // Scalar Instructions defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; + fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32, + X86Fmadds1>; defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; + fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32, + X86Fmsubs1>; defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; + fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32, + X86Fnmadds1>; defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; + fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32, + X86Fnmsubs1>; // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; @@ -506,19 +508,19 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { // Scalar Instructions defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; + fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64, + X86Fmadds1>; defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; + fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64, + X86Fmsubs1>; defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; + fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64, + X86Fnmadds1>; defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; + fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64, + X86Fnmsubs1>; // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index c4f34bdd37e6..d30400836bbc 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -56,8 +56,6 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; -def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>; -def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; @@ -482,11 +480,22 @@ def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutat def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>; // Scalar FMA intrinsics with 
passthru bits in operand 1. +def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>; +def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>; +def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>; +def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>; + +// Scalar FMA intrinsics with passthru bits in operand 1. def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>; def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>; def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>; def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>; +def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp>; +def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp>; +def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp>; +def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp>; + // Scalar FMA intrinsics with passthru bits in operand 3. def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>; def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>; @@ -498,10 +507,14 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>, def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>; +def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>; +def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>; def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>; +def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>; +def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>; def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>; def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>; def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>; @@ -578,7 +591,12 @@ def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>; def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>; + def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>]> >; + +def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, i16>, SDTCisVT<2, i32>]> >; diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index e665ec1f14dc..02a09c340cef 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -18,7 +18,7 @@ #include "X86InstrFMA3Info.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "X86GenInstrInfo.inc" diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 559a8fcf107a..f00caa130d0b 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -850,7 +850,6 @@ def HasLWP : Predicate<"Subtarget->hasLWP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; -def NoF16C : Predicate<"!Subtarget->hasF16C()">; def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 451303054f56..955a40ee171e 100644 
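The X86rsqrt14/X86rcp14 nodes introduced above model the AVX-512 approximation instructions, which are specified to a relative error of at most 2^-14 (the legacy RCPPS/RSQRTPS estimates are only good to about 2^-12). A usage sketch with the corresponding intrinsics, assuming AVX-512F is available (illustrative only):

#include <immintrin.h>

// Lowers to VRSQRT14PS (X86ISD::RSQRT14 after this patch).
static inline __m512 approx_invsqrt(__m512 x) { return _mm512_rsqrt14_ps(x); }

// Lowers to VRCP14PS (X86ISD::RCP14 after this patch).
static inline __m512 approx_recip(__m512 x) { return _mm512_rcp14_ps(x); }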
--- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1514,13 +1514,12 @@ let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RM>, - XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable; } def : Pat<(f32 (fpround FR64:$src)), - (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, + (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, Requires<[UseAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), @@ -1574,20 +1573,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RR>, - XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, + [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RM>, - XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable; } def : Pat<(f64 (fpextend FR32:$src)), - (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>; + (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; @@ -1899,7 +1896,7 @@ let Predicates = [HasAVX, NoVLX] in { (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), (VCVTTPD2DQrm addr:$src)>; } -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX] def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -3095,7 +3092,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType vt, ValueType ScalarVT, X86MemOperand x86memop, Intrinsic Intr, SDNode OpNode, Domain d, - OpndItins itins, string Suffix> { + OpndItins itins, Predicate target, string Suffix> { let hasSideEffects = 0 in { def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -3126,21 +3123,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, // vrcpss mem, %xmm0, %xmm0 // TODO: In theory, we could fold the load, and avoid the stall caused by // the partial register store, either in ExecutionDepsFix or with smarter RA. 
- let Predicates = [UseAVX] in { + let Predicates = [target] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - } - let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; } - let Predicates = [HasAVX, OptForSize] in { + let Predicates = [target, OptForSize] in { def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; - } - let Predicates = [UseAVX, OptForSize] in { def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; @@ -3186,7 +3179,7 @@ let Predicates = prds in { /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), @@ -3220,41 +3213,41 @@ let Predicates = [HasAVX] in { } multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { + OpndItins itins, Predicate AVXTarget> { defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, SSEPackedSingle, itins, UseSSE1, "SS">, XS; defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, f32mem, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, - SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG, - NotMemoryFoldable; + SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V, + VEX_LIG, VEX_WIG, NotMemoryFoldable; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { + OpndItins itins, Predicate AVXTarget> { defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, f64mem, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), - OpNode, SSEPackedDouble, itins, "SD">, + OpNode, SSEPackedDouble, itins, AVXTarget, "SD">, XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; } // Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; -defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>; // There is no f64 version of the reciprocal approximation instructions. 
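The refinement mentioned in the comment above is conventionally one Newton-Raphson step, y1 = y0 * (1.5 - 0.5 * x * y0 * y0), which lifts the roughly 12-bit RSQRTSS estimate to nearly full single precision. A self-contained sketch with SSE intrinsics (illustrative; the compiler's actual expansion is built in the DAG combiner, not written out like this):

#include <immintrin.h>

static inline __m128 rsqrt_refined(__m128 x) {
  __m128 y0 = _mm_rsqrt_ss(x);                 // ~2^-12 estimate
  __m128 t  = _mm_sub_ss(_mm_set_ss(1.5f),
                         _mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), x),
                                    _mm_mul_ss(y0, y0)));
  return _mm_mul_ss(y0, t);                    // one NR step: ~2^-23
}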
@@ -7692,22 +7685,24 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (Int VR128:$src))]>, + [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, T8PD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, - Sched<[WriteCvtF2FLd]>; + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (X86cvtph2ps (bc_v8i16 + (loadv2i64 addr:$src))))]>, + T8PD, VEX, Sched<[WriteCvtF2FLd]>; } -multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteCvtF2FLd, WriteRMW] in @@ -7717,32 +7712,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { TAPD, VEX; } -let Predicates = [HasF16C] in { - defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; - defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; - defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; - defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; +let Predicates = [HasF16C, NoVLX] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L; // Pattern match vcvtph2ps of a scalar i64 load. 
- def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(int_x86_vcvtph2ps_128 (bitconvert - (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 - (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), - addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 - (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), - addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)), - addr:$dst), - (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(store (f64 (extractelt + (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt + (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 6f5e41bcdc62..9edac22d5baa 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -422,12 +422,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), @@ -1077,12 +1071,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, - X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND), 
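For reference, the conversions these CVTPH2PS/CVTPS2PH patterns select are exposed to C code through the F16C intrinsics. A round-trip sketch (assumes a target with F16C, e.g. -mf16c; illustrative only):

#include <immintrin.h>

static inline __m128 half_roundtrip(__m128 ps) {
  // 4 x f32 -> 4 x f16 (VCVTPS2PH), packed into the low 64 bits.
  __m128i ph = _mm_cvtps_ph(ps, _MM_FROUND_TO_NEAREST_INT);
  // 4 x f16 -> 4 x f32 (VCVTPH2PS).
  return _mm_cvtph_ps(ph);
}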
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK, @@ -1098,8 +1092,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, @@ -1220,8 +1214,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, @@ -1239,8 +1233,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, X86ISD::FMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), @@ -1259,8 +1253,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, @@ -1298,8 +1292,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA, 
X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, @@ -1428,26 +1422,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0), X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0), X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, 
INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), @@ -1468,6 +1462,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0), X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0), X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0), + X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), @@ -1476,6 +1472,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0), X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), @@ -1484,10 +1482,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0), X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0), X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), @@ -1584,6 +1586,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), + 
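A hedged reading of the INTR_TYPE_1OP_MASK entries above: the masked intrinsic computes the unmasked operation, then selects per lane between that result and the pass-through source. Reference semantics for avx512_rsqrt14_ps_512 in plain C++ (the ideal value is shown; the hardware result is merely within 2^-14 of it):

#include <array>
#include <cmath>
#include <cstdint>

using V16F = std::array<float, 16>;

V16F rsqrt14_mask(V16F Src, uint16_t K, V16F PassThru) {
  V16F R{};
  for (int I = 0; I < 16; ++I)
    R[I] = ((K >> I) & 1) ? 1.0f / std::sqrt(Src[I]) // ideal; hw within 2^-14
                          : PassThru[I];
  return R;
}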
X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 98b4863134e5..b1438bf7bc09 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -22,6 +22,38 @@ using namespace llvm; using namespace TargetOpcode; +/// FIXME: The following static functions are SizeChangeStrategy functions +/// that are meant to temporarily mimic the behaviour of the old legalization +/// based on doubling/halving non-legal types as closely as possible. This is +/// not entirely possible as only legalizing the types that are exactly a power +/// of 2 times the size of the legal types would require specifying all those +/// sizes explicitly. +/// In practice, not specifying those isn't a problem, and the below functions +/// should disappear quickly as we add support for legalizing non-power-of-2 +/// sized types further. +static void +addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result, + const LegalizerInfo::SizeAndActionsVec &v) { + for (unsigned i = 0; i < v.size(); ++i) { + result.push_back(v[i]); + if (i + 1 < v[i].first && i + 1 < v.size() && + v[i + 1].first != v[i].first + 1) + result.push_back({v[i].first + 1, LegalizerInfo::Unsupported}); + } +} + +static LegalizerInfo::SizeAndActionsVec +widen_1(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 1); + LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar}, + {2, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + return result; +} + X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM) : Subtarget(STI), TM(TM) { @@ -37,6 +69,17 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizerInfoAVX512DQ(); setLegalizerInfoAVX512BW(); + setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1); + for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1); + for (unsigned MemOp : {G_LOAD, G_STORE}) + setLegalizeScalarToDifferentSizeStrategy(MemOp, 0, + narrowToSmallerAndWidenToSmallest); + setLegalizeScalarToDifferentSizeStrategy( + G_GEP, 1, widenToLargerTypesUnsupportedOtherwise); + setLegalizeScalarToDifferentSizeStrategy( + G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest); + computeTables(); } @@ -47,7 +90,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { const LLT s8 = LLT::scalar(8); const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); - const LLT s64 = LLT::scalar(64); for (auto Ty : {p0, s1, s8, s16, s32}) setAction({G_IMPLICIT_DEF, Ty}, Legal); @@ -55,15 +97,10 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32, p0}) setAction({G_PHI, Ty}, Legal); - setAction({G_PHI, s1}, WidenScalar); - - for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) { + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) for (auto Ty : {s8, s16, s32}) setAction({BinOp, Ty}, Legal); - setAction({BinOp, s1}, WidenScalar); - } - for (unsigned Op : {G_UADDE}) { setAction({Op, s32}, Legal); setAction({Op, 1, s1}, Legal); @@ -73,7 +110,6 @@ void
X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32, p0}) setAction({MemOp, Ty}, Legal); - setAction({MemOp, s1}, WidenScalar); // And everything's fine in addrspace 0. setAction({MemOp, 1, p0}, Legal); } @@ -85,9 +121,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s32}, Legal); - for (auto Ty : {s1, s8, s16}) - setAction({G_GEP, 1, Ty}, WidenScalar); - // Control-flow setAction({G_BRCOND, s1}, Legal); @@ -95,9 +128,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32, p0}) setAction({TargetOpcode::G_CONSTANT, Ty}, Legal); - setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar); - setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar); - // Extensions for (auto Ty : {s8, s16, s32}) { setAction({G_ZEXT, Ty}, Legal); @@ -105,12 +135,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ANYEXT, Ty}, Legal); } - for (auto Ty : {s1, s8, s16}) { - setAction({G_ZEXT, 1, Ty}, Legal); - setAction({G_SEXT, 1, Ty}, Legal); - setAction({G_ANYEXT, 1, Ty}, Legal); - } - // Comparison setAction({G_ICMP, s1}, Legal); @@ -123,7 +147,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { if (!Subtarget.is64Bit()) return; - const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); setAction({G_IMPLICIT_DEF, s64}, Legal); @@ -145,7 +168,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { // Extensions for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) { setAction({extOp, s64}, Legal); - setAction({extOp, 1, s32}, Legal); } // Comparison diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index 0dd13077c37e..67d95c2233de 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -14,8 +14,8 @@ #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/TargetInstrInfo.h" using namespace llvm; diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index 3069d1fd3497..9b7732c1db88 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -23,10 +23,10 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 1f49650340e7..efa0cd2c6bc1 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -27,14 +27,14 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index b0ce1335bd37..66fea1e688f1 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ 
b/lib/Target/X86/X86Subtarget.cpp @@ -161,6 +161,8 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, // In Regcall calling convention those registers are used for passing // parameters. Thus we need to prevent lazy binding in Regcall. return X86II::MO_GOTPCREL; + if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit()) + return X86II::MO_GOTPCREL; return X86II::MO_PLT; } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index a8d7f290688a..a21d068c7f4e 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -463,7 +463,7 @@ public: bool hasPCLMUL() const { return HasPCLMUL; } // Prefer FMA4 to FMA - it's better for commutation/memory folding and // has equal or better performance on all supported targets. - bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; } + bool hasFMA() const { return HasFMA && !HasFMA4; } bool hasFMA4() const { return HasFMA4; } bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } bool hasXOP() const { return HasXOP; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 6e6c724eb0af..11fe84f162da 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -436,4 +436,11 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupLEAs()); addPass(createX86EvexToVexInsts()); } + + // Verify basic block incoming and outgoing CFA offset and register values and + // correct CFA calculation rule where needed by inserting appropriate CFI + // instructions. const Triple &TT = TM->getTargetTriple(); + if (!TT.isOSDarwin() && !TT.isOSWindows()) + addPass(createCFIInstrInserter()); } diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index effbd07fa315..6772d96c7997 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1437,7 +1437,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I); } int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, @@ -2644,12 +2644,15 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 + { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32 { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 - { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 + { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8 + + { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32 }; static const CostTblEntry AVX2InterleavedStoreTbl[] = { diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index fb8c2a71c9ab..ba01f1e25ba5 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -26,13 +26,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include
"llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include <cassert> diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp index fc08f1582ad7..8a186e94d9cf 100644 --- a/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/lib/Target/X86/X86WinAllocaExpander.cpp @@ -25,9 +25,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; |