Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 4
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 6
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 6
-rw-r--r--  lib/Target/X86/X86.td | 14
-rw-r--r--  lib/Target/X86/X86CallFrameOptimization.cpp | 2
-rw-r--r--  lib/Target/X86/X86CallLowering.cpp | 2
-rw-r--r--  lib/Target/X86/X86CmovConversion.cpp | 2
-rw-r--r--  lib/Target/X86/X86EvexToVex.cpp | 20
-rw-r--r--  lib/Target/X86/X86FixupBWInsts.cpp | 2
-rw-r--r--  lib/Target/X86/X86FixupLEAs.cpp | 2
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 2
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 48
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 6
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 12
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 305
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 12
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 331
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 80
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 22
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 2
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 1
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 104
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h | 78
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp | 66
-rw-r--r--  lib/Target/X86/X86MacroFusion.cpp | 2
-rw-r--r--  lib/Target/X86/X86PadShortFunction.cpp | 2
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 4
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 2
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 2
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 7
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 7
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 2
-rw-r--r--  lib/Target/X86/X86WinAllocaExpander.cpp | 2
33 files changed, 790 insertions, 369 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b8ea2f011332..2a784c9822a0 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2791,7 +2791,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
isParsingIntelSyntax())) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
- if (validateInstruction(Inst, Operands))
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
return true;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
@@ -3082,7 +3082,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// instruction will already have been filled in correctly, since the failing
// matches won't have modified it).
if (NumSuccessfulMatches == 1) {
- if (validateInstruction(Inst, Operands))
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
return true;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the individual
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 6ff1136cd85a..0c99dbbe328b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -54,12 +54,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
if (TSFlags & X86II::LOCK)
OS << "\tlock\t";
if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
- OS << "\tlock\n";
+ OS << "\tlock\t";
if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\n";
+ OS << "\trepne\t";
else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\n";
+ OS << "\trep\t";
// Output CALLpcrel32 as "callq" in 64-bit mode.
// In Intel annotation it's always emitted as "call".
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 464941a1bab6..1f02600a7982 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -41,13 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
if (TSFlags & X86II::LOCK)
- OS << "\tlock\n";
+ OS << "\tlock\t";
unsigned Flags = MI->getFlags();
if (Flags & X86::IP_HAS_REPEAT_NE)
- OS << "\trepne\n";
+ OS << "\trepne\t";
else if (Flags & X86::IP_HAS_REPEAT)
- OS << "\trep\n";
+ OS << "\trep\t";
printInstruction(MI, OS);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index f4021d7639b8..34f2956e0c0b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -116,9 +116,15 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+ "Enable three-operand fused multiple-add",
+ [FeatureAVX]>;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
"Enable AVX-512 instructions",
- [FeatureAVX2]>;
+ [FeatureAVX2, FeatureFMA, FeatureF16C]>;
def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
"Enable AVX-512 Exponential and Reciprocal Instructions",
[FeatureAVX512]>;
@@ -154,9 +160,6 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
-def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
- "Enable three-operand fused multiple-add",
- [FeatureAVX]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
[FeatureAVX, FeatureSSE4A]>;
@@ -177,9 +180,6 @@ def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
"Support RDRAND instruction">;
-def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
- "Support 16-bit floating point conversion instructions",
- [FeatureAVX]>;
def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
"Support FS/GS Base instructions">;
def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 34e384ba3114..d8b5b7d3fc08 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -34,13 +34,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cstddef>
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 7beb9c6e357b..54f937bcda3e 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -41,7 +42,6 @@
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <cassert>
#include <cstdint>
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index b2cd622b1e8c..e75276960cc8 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -57,6 +57,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
@@ -64,7 +65,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 744510a3a3ba..6dd4631a4844 100644
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -171,7 +171,7 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
- case X86::VALIGNQZ128rmi:
+ case X86::VALIGNQZ128rmi: {
assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
"Unexpected new opcode!");
unsigned Scale = (Opc == X86::VALIGNQZ128rri ||
@@ -180,6 +180,24 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
Imm.setImm(Imm.getImm() * Scale);
break;
}
+ case X86::VSHUFF32X4Z256rmi:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF64X2Z256rmi:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFI32X4Z256rmi:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI64X2Z256rmi:
+ case X86::VSHUFI64X2Z256rri: {
+ assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+ NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
+ "Unexpected new opcode!");
+ MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Set bit 5, move bit 1 to bit 4, copy bit 0.
+ Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
+ break;
+ }
+ }
}
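To illustrate the EVEX-to-VEX immediate adjustment added above for the 256-bit VSHUFF/VSHUFI cases, the following standalone C++ sketch (illustrative only, not part of the patch) applies the same remapping — "set bit 5, move bit 1 to bit 4, copy bit 0" — to the four possible lane-selection immediates. In the VPERM2F128/VPERM2I128 encoding, bits [1:0] pick the low 128-bit lane of the result and bits [5:4] pick the high lane, with the 0x20 bias selecting the second source for the high half.

#include <cstdio>

// Same arithmetic as performCustomAdjustments above, pulled out so the
// resulting VPERM2x128 immediates can be inspected in isolation.
static unsigned evexToVperm2Imm(unsigned ImmVal) {
  // Set bit 5, move bit 1 to bit 4, copy bit 0.
  return 0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1);
}

int main() {
  for (unsigned Imm = 0; Imm < 4; ++Imm)
    std::printf("0x%x -> 0x%02x\n", Imm, evexToVperm2Imm(Imm));
  // Prints 0x0 -> 0x20, 0x1 -> 0x21, 0x2 -> 0x30, 0x3 -> 0x31.
  return 0;
}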
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index b2b5a78fcdbf..9664c931c35e 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -55,9 +55,9 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 9f649dad8bc0..bbc2bffdb703 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
namespace llvm {
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 5582526541ba..7bcbe1991249 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -37,11 +37,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 988f2967401b..86e65b83ffa9 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1562,6 +1562,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
+ bool NeedsDwarfCFI =
+ (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+ !MF.getTarget().getTargetTriple().isOSWindows()) &&
+ (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
@@ -1584,6 +1589,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
+ if (NeedsDwarfCFI) {
+ unsigned DwarfStackPtr =
+ TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+ nullptr, DwarfStackPtr, -SlotSize));
+ --MBBI;
+ }
}
MachineBasicBlock::iterator FirstCSPop = MBBI;
@@ -1647,6 +1659,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ // Define the current CFA rule to use the provided offset.
+ BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+ nullptr, -CSSize - SlotSize));
+ }
--MBBI;
}
@@ -1659,6 +1676,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+ if (!hasFP(MF) && NeedsDwarfCFI) {
+ MBBI = FirstCSPop;
+ int64_t Offset = -CSSize - SlotSize;
+ // Mark callee-saved pop instruction.
+ // Define the current CFA rule to use the provided offset.
+ while (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator PI = MBBI;
+ unsigned Opc = PI->getOpcode();
+ ++MBBI;
+ if (Opc == X86::POP32r || Opc == X86::POP64r) {
+ Offset += SlotSize;
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+ }
+ }
+ }
+
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -2577,6 +2611,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
unsigned Regs[2];
unsigned FoundRegs = 0;
+ auto &MRI = MBB.getParent()->getRegInfo();
auto RegMask = Prev->getOperand(1);
auto &RegClass =
@@ -2590,6 +2625,10 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
if (!RegMask.clobbersPhysReg(Candidate))
continue;
+ // Don't clobber reserved registers
+ if (MRI.isReserved(Candidate))
+ continue;
+
bool IsDef = false;
for (const MachineOperand &MO : Prev->implicit_operands()) {
if (MO.isReg() && MO.isDef() &&
@@ -2835,6 +2874,15 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
return MBBI;
}
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ return TRI->getSlotSize();
+}
+
+unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
+ const {
+ return TRI->getDwarfRegNum(StackPtr, true);
+}
+
namespace {
// Struct used by orderFrameObjects to help sort the stack objects.
struct X86FrameSortingObject {
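For readers tracing the new epilogue CFI emission above, here is a small standalone model (not LLVM code; the SlotSize and CSSize values are hypothetical) of the bookkeeping done by the loop over the callee-saved POPs: each POP32r/POP64r moves the stack pointer one slot closer to the CFA, so a fresh .cfi_def_cfa_offset is emitted with the tracked offset grown by SlotSize.

#include <cstdio>

int main() {
  // Hypothetical frame: 64-bit target, three callee-saved GPRs.
  const int SlotSize = 8;
  const int CSSize = 3 * SlotSize;
  // Same starting value as the loop above: CSR area plus the return address.
  int Offset = -CSSize - SlotSize;
  std::printf("after SP restore: %d\n", Offset);
  for (int Pop = 1; Pop <= CSSize / SlotSize; ++Pop) {
    Offset += SlotSize; // one callee-saved POP retired
    std::printf("after pop %d: %d\n", Pop, Offset);
  }
  // Ends at -SlotSize: only the return-address slot separates SP from the CFA.
  return 0;
}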
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 38ac96e16d4e..2bce79262d01 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -168,6 +168,10 @@ public:
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP = false) const;
+ int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+ unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e43fd508de30..0cbf76017900 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -449,15 +449,15 @@ namespace {
// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
- if (N->getOpcode() == X86ISD::PCMPEQM ||
- N->getOpcode() == X86ISD::PCMPGTM ||
- N->getOpcode() == X86ISD::CMPM ||
- N->getOpcode() == X86ISD::CMPMU) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
+ Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
+ Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
- if (N->getOperand(0).getValueType() == MVT::v8i32 ||
- N->getOperand(0).getValueType() == MVT::v8f32)
+ EVT OpVT = N->getOperand(0).getValueType();
+ if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
return Subtarget->hasVLX();
return true;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b178ad6c13e7..22b4d7997fa6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -380,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (Subtarget.useSoftFloat() ||
- (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
@@ -4998,6 +4997,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {
default:
return false;
+ case X86ISD::TESTM:
+ case X86ISD::TESTNM:
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
@@ -6746,6 +6747,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
+ BitVector UndefElements;
+ SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
// Attempt to use VBROADCASTM
// From this paterrn:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
@@ -6753,17 +6757,23 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
- MVT EltType;
- unsigned NumElts;
+ MVT EltType = VT.getScalarType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue BOperand;
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
- if (ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) {
- SDValue BOperand = ZeroExtended.getOperand(0);
+ if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
+ (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
+ Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
+ if (ZeroExtended)
+ BOperand = ZeroExtended.getOperand(0);
+ else
+ BOperand = Ld.getOperand(0).getOperand(0);
if (BOperand.getValueType().isVector() &&
BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
- if ((EltType == MVT::i64 &&
- VT.getVectorElementType() == MVT::i8) || // for broadcastmb2q
- (EltType == MVT::i32 &&
- VT.getVectorElementType() == MVT::i16)) { // for broadcastmw2d
+ if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+ NumElts == 8)) || // for broadcastmb2q
+ (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+ NumElts == 16))) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
@@ -6773,9 +6783,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
}
}
- BitVector UndefElements;
- SDValue Ld = BVOp->getSplatValue(&UndefElements);
-
// We need a splat of a single value to use broadcast, and it doesn't
// make any sense if the value is only in one element of the vector.
if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
@@ -7707,6 +7714,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+// (extract_elt V, (extract_elt I, 1)),
+// ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
+// when no native operation available.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Look for VPERMV and PSHUFB opportunities.
+ MVT VT = V.getSimpleValueType();
+ switch (VT.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v16i8:
+ if (!Subtarget.hasSSE3())
+ return SDValue();
+ break;
+ case MVT::v8f32:
+ case MVT::v8i32:
+ if (!Subtarget.hasAVX2())
+ return SDValue();
+ break;
+ case MVT::v4i64:
+ case MVT::v4f64:
+ if (!Subtarget.hasVLX())
+ return SDValue();
+ break;
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+ break;
+ case MVT::v32i16:
+ if (!Subtarget.hasBWI())
+ return SDValue();
+ break;
+ case MVT::v8i16:
+ case MVT::v16i16:
+ if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
+ return SDValue();
+ break;
+ case MVT::v64i8:
+ if (!Subtarget.hasVBMI())
+ return SDValue();
+ break;
+ case MVT::v32i8:
+ if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
+ return SDValue();
+ break;
+ }
+ SDValue SrcVec, IndicesVec;
+ // Check for a match of the permute source vector and permute index elements.
+ // This is done by checking that the i-th build_vector operand is of the form:
+ // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+ for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+ SDValue Op = V.getOperand(Idx);
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract encountered in V, set the source vector,
+ // otherwise verify the extract is from the previously defined source
+ // vector.
+ if (!SrcVec)
+ SrcVec = Op.getOperand(0);
+ else if (SrcVec != Op.getOperand(0))
+ return SDValue();
+ SDValue ExtractedIndex = Op->getOperand(1);
+ // Peek through extends.
+ if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+ ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+ ExtractedIndex = ExtractedIndex.getOperand(0);
+ if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract from the index vector candidate, set the
+ // indices vector, otherwise verify the extract is from the previously
+ // defined indices vector.
+ if (!IndicesVec)
+ IndicesVec = ExtractedIndex.getOperand(0);
+ else if (IndicesVec != ExtractedIndex.getOperand(0))
+ return SDValue();
+
+ auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+ if (!PermIdx || PermIdx->getZExtValue() != Idx)
+ return SDValue();
+ }
+ MVT IndicesVT = VT;
+ if (VT.isFloatingPoint())
+ IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
+ VT.getVectorNumElements());
+ IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+ return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
+ SDLoc(V), VT, IndicesVec, SrcVec);
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -7922,6 +8034,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
+ if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+ return V;
+
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
@@ -10716,10 +10831,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget.hasSSSE3())
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ }
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
@@ -11016,10 +11137,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget.hasSSSE3())
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ }
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
@@ -12372,6 +12499,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
}
+
+ // Try to use SHUF128 if possible.
+ if (Subtarget.hasVLX()) {
+ if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+ unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+ ((WidenedMask[1] % 2) << 1);
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+ }
+ }
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
@@ -13697,10 +13834,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
- return Shuf128;
-
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on all four
@@ -13722,6 +13855,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -17333,6 +17470,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
if (Swap)
std::swap(Op0, Op1);
+
+ // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
+ if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
+ SDValue A = peekThroughBitcasts(Op0);
+ if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
+ ISD::isBuildVectorAllZeros(Op1.getNode())) {
+ MVT VT0 = Op0.getSimpleValueType();
+ SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
+ SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
+ return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
+ dl, VT, RHS, LHS);
+ }
+ }
+
if (Opc)
return DAG.getNode(Opc, dl, VT, Op0, Op1);
Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
@@ -19838,10 +19989,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
PassThru = Src1;
- SDValue Rnd = Op.getOperand(5);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+ Op.getValueType(), Src1, Src2,
+ Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
Op.getValueType(), Src1, Src2,
- Src3, Rnd),
+ Src3),
Mask, PassThru, Subtarget, DAG);
}
case IFMA_OP_MASKZ:
@@ -24786,9 +24946,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
@@ -24942,10 +25100,18 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
+ case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
+ case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
+ case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+ case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
+ case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
+ case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
+ case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
@@ -24966,9 +25132,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SELECT: return "X86ISD::SELECT";
case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP14: return "X86ISD::RCP14";
+ case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
+ case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
@@ -25006,6 +25176,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
+ case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
@@ -30314,10 +30485,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
- if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ // TODO - Remove this once we can handle the implicit zero-extension of
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in:
+ // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+ // combineBasicSADPattern.
+ if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
SDValue InputVector = N->getOperand(0);
@@ -30464,16 +30642,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// TODO - merge with combineExtractVectorElt once it can handle the implicit
-// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
-// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
-// combineBasicSADPattern.
-static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
-}
-
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
@@ -30674,26 +30842,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
- case X86ISD::PALIGNR:
- // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
- if (!VT.is128BitVector())
- return false;
- Opcode = X86ISD::VALIGN;
- LLVM_FALLTHROUGH;
- case X86ISD::VALIGN: {
- if (EltVT != MVT::i32 && EltVT != MVT::i64)
- return false;
- uint64_t Imm = Op.getConstantOperandVal(2);
- MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
- unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
- unsigned EltSize = EltVT.getSizeInBits();
- // Make sure we can represent the same shift with the new VT.
- if ((ShiftAmt % EltSize) != 0)
- return false;
- Imm = ShiftAmt / EltSize;
- return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
- DAG.getConstant(Imm, DL, MVT::i8));
- }
case X86ISD::SHUF128: {
if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
return false;
@@ -34441,8 +34589,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// This function transforms vector truncation of 'extended sign-bits' values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+/// This function transforms vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -34475,10 +34624,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
- if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
- return SDValue();
+ if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+ // Use PACKUS if the input has zero-bits that extend all the way to the
+ // packed/truncated value. e.g. masks, zext_in_reg, etc.
+ KnownBits Known;
+ DAG.computeKnownBits(In, Known);
+ unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+ NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+ return SDValue();
}
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
@@ -34507,7 +34665,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
- // Try to truncate extended sign bits with PACKSS.
+ // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
@@ -35341,9 +35499,11 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
- bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
+ N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
bool NegB = invertIfNegative(B);
- bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+ bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
+ N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);
@@ -35371,6 +35531,20 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
}
+ } else if (N->getOpcode() == X86ISD::FMADDS1) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
+ }
+ } else if (N->getOpcode() == X86ISD::FMADDS3) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
+ }
} else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
switch (NewOpcode) {
case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
@@ -36590,10 +36764,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::EXTRACT_VECTOR_ELT:
- return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
- return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
@@ -36689,6 +36862,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMADD_RND:
case X86ISD::FMADDS1_RND:
case X86ISD::FMADDS3_RND:
+ case X86ISD::FMADDS1:
+ case X86ISD::FMADDS3:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG);
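As a scalar model of the new LowerBUILD_VECTORAsVariablePermute lowering in this file, the sketch below (illustrative C++, not LLVM code; the element values and indices are made up) computes Result[i] = Src[Indices[i]] — the semantics of the (build_vector (extract_elt V, (extract_elt I, i)) ...) pattern the function matches, and what a single VPERMV (or PSHUFB for v16i8) produces from a non-constant index vector.

#include <array>
#include <cstddef>
#include <cstdio>

// Result[i] = Src[Indices[i]] for every lane, i.e. a variable permute.
// The '% N' only keeps this model total; the DAG pattern above leaves
// out-of-range indices to the target instruction's own masking rules.
template <std::size_t N>
std::array<int, N> variablePermute(const std::array<int, N> &Src,
                                   const std::array<unsigned, N> &Indices) {
  std::array<int, N> Result{};
  for (std::size_t I = 0; I != N; ++I)
    Result[I] = Src[Indices[I] % N];
  return Result;
}

int main() {
  std::array<int, 8> Src = {10, 11, 12, 13, 14, 15, 16, 17};
  std::array<unsigned, 8> Idx = {7, 0, 3, 3, 1, 6, 2, 5};
  for (int V : variablePermute(Src, Idx))
    std::printf("%d ", V); // 17 10 13 13 11 16 12 15
  std::printf("\n");
  return 0;
}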
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 17cb976a4c77..d1438e59f9b6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -254,7 +254,9 @@ namespace llvm {
/// Note that these typically require refinement
/// in order to obtain suitable precision.
FRSQRT, FRCP,
- FRSQRTS, FRCPS,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14, RSQRT14S, RCP14, RCP14S,
// Thread Local Storage.
TLSADDR,
@@ -487,6 +489,12 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // Scalar intrinsic FMA.
+ FMADDS1, FMADDS3,
+ FNMADDS1, FNMADDS3,
+ FMSUBS1, FMSUBS3,
+ FNMSUBS1, FNMSUBS3,
+
// Scalar intrinsic FMA with rounding mode.
// Two versions, passthru bits on op1 or op3.
FMADDS1_RND, FMADDS3_RND,
@@ -555,7 +563,7 @@ namespace llvm {
RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
// Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS,
+ CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
// LWP insert record.
LWPINS,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index a73ee19423d3..84b44ac677bf 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -6017,16 +6017,17 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
}
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
- SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+ SDNode OpNodeRnds1, SDNode OpNodes3,
+ SDNode OpNodeRnds3, X86VectorVTInfo _,
+ string SUFF> {
let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
// Operands for intrinsic are in 123 order to preserve passthu
// semantics.
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
- (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
- _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
+ (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
+ _.ScalarIntMemCPat:$src3)),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -6035,10 +6036,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(_.ScalarLdFrag addr:$src3)))), 0>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
- (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
- (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
- _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
+ (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
+ _.RC:$src1)),
(_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -6050,8 +6050,8 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
// 213 and 231 patterns this helps tablegen's duplicate pattern detection.
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
(null_frag),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
- _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
+ _.RC:$src2)),
(null_frag),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
@@ -6061,26 +6061,29 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+ SDNode OpNodeRnds1, SDNode OpNodes3,
SDNode OpNodeRnds3> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+ OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+ f32x_info, "SS">,
EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+ OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+ f64x_info, "SD">,
EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
- X86FmaddRnds3>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
- X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
- X86FnmaddRnds1, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
- X86FnmsubRnds1, X86FnmsubRnds3>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
+ X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
+ X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
+ X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
+ X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -6554,7 +6557,7 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
NotMemoryFoldable;
def : Pat<(f64 (fpextend FR32X:$src)),
- (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
+ (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6569,7 +6572,7 @@ def : Pat<(f64 (extloadf32 addr:$src)),
Requires<[HasAVX512, OptForSpeed]>;
def : Pat<(f32 (fpround FR64X:$src)),
- (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
+ (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
@@ -7174,34 +7177,44 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
//===----------------------------------------------------------------------===//
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag> {
- defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT _src.RC:$src),
- (i32 FROUND_CURRENT))>, T8PD;
- defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
- (i32 FROUND_CURRENT))>, T8PD;
+ defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT _src.RC:$src))>, T8PD;
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+ (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT
+ (bitconvert
+ (ld_frag addr:$src))))>, T8PD;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
- defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
- "vcvtph2ps", "{sae}, $src", "$src, {sae}",
- (X86cvtph2ps (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+ defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps",
+ "{sae}, $src", "$src, {sae}",
+ (X86cvtph2psRnd (_src.VT _src.RC:$src),
+ (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
}
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512] in
defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
- let Predicates = [HasVLX] in {
- defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
- }
+
+let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VCVTPH2PSZ128rm addr:$src)>;
}
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
@@ -7212,17 +7225,16 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
(X86cvtps2ph (_src.VT _src.RC:$src1),
(i32 imm:$src2)),
NoItinerary, 0, 0>, AVX512AIi8Base;
- def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2))),
- addr:$dst)]>;
- let hasSideEffects = 0, mayStore = 1 in
- def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K;
+ let hasSideEffects = 0, mayStore = 1 in {
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>;
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ []>, EVEX_K;
+ }
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
let hasSideEffects = 0 in
@@ -7242,6 +7254,19 @@ let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
+
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
+ def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
@@ -7264,35 +7289,6 @@ let Predicates = [HasVLX] in {
(VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
}
-// Patterns for matching float to half-float conversion when AVX512 is supported
-// but F16C isn't. In that case we have to use 512-bit vectors.
-let Predicates = [HasAVX512, NoVLX, NoF16C] in {
- def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG
- (VMOVPDI2DIZrr
- (v8i16 (EXTRACT_SUBREG
- (VCVTPS2PHZrr
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
- sub_xmm), 4), sub_xmm))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS
- (v4f32 (EXTRACT_SUBREG
- (VCVTPH2PSZrr
- (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
- (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
- sub_xmm)), sub_xmm)), FR32X))>;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS
- (v4f32 (EXTRACT_SUBREG
- (VCVTPH2PSZrr
- (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
- sub_xmm), 4)), sub_xmm)), FR32X))>;
-}
-
// Unordered/Ordered scalar fp compare with Sea and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
@@ -7362,13 +7358,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, f32x_info>,
EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, f64x_info>,
VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, f32x_info>,
EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, f64x_info>,
VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
@@ -7414,8 +7410,8 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
}
}
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -7582,7 +7578,8 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+ string SUFF, SDNode OpNode, SDNode OpNodeRnd,
+ Intrinsic Intr> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -7618,21 +7615,35 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
}
}
+let Predicates = [HasAVX512] in {
def : Pat<(_.EltVT (OpNode _.FRC:$src)),
(!cast<Instruction>(NAME#SUFF#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+ def : Pat<(Intr VR128X:$src),
+ (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
+ VR128X:$src)>;
+}
+
+let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(_.EltVT (OpNode (load addr:$src))),
(!cast<Instruction>(NAME#SUFF#Zm)
- (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
+ (!cast<Instruction>(NAME#SUFF#Zm_Int)
+ (_.VT (IMPLICIT_DEF)), addr:$src2)>;
+}
+
}
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
- X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS,
- NotMemoryFoldable;
+ X86fsqrtRnds, int_x86_sse_sqrt_ss>,
+ EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
- X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
+ X86fsqrtRnds, int_x86_sse2_sqrt_sd>,
+ EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
NotMemoryFoldable;
}
@@ -7641,19 +7652,6 @@ defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
-let Predicates = [HasAVX512] in {
- def : Pat<(f32 (X86frsqrt FR32X:$src)),
- (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
- def : Pat<(f32 (X86frsqrt (load addr:$src))),
- (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
- Requires<[OptForSize]>;
- def : Pat<(f32 (X86frcp FR32X:$src)),
- (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
- def : Pat<(f32 (X86frcp (load addr:$src))),
- (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
- Requires<[OptForSize]>;
-}
-
multiclass
avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
@@ -8911,6 +8909,123 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
avx512vl_i8_info, avx512vl_i8_info>,
EVEX_CD8<8, CD8VF>;
+// Fragments to help convert valignq into masked valignd. Or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> {
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.LdFrag addr:$src2)),
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.LdFrag addr:$src2)),
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> :
+ avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+ def : Pat<(From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3)),
+ (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX512] in {
+ // For 512-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+ v16i32_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX] in {
+ // For 128-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+ v4i32x_info, ValignqImm32XForm>;
+ // For 256-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+ v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+ // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR.
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+ v16i8x_info, ValignqImm8XForm>;
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+ v16i8x_info, ValigndImm8XForm>;
+}
+
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
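
The ValignqImm32XForm/ValignqImm8XForm/ValigndImm8XForm fragments above only rescale the rotate amount for the new element width: a VALIGNQ immediate counts 8-byte elements, so it is doubled when the node is re-selected as VALIGND (4-byte elements) and multiplied by 8 for the byte-granular VPALIGNR form used on 128-bit vectors, while a VALIGND immediate is multiplied by 4. A minimal standalone sketch of that arithmetic (plain C++ for illustration, not the TableGen/SelectionDAG machinery):

#include <cassert>

// Byte distance implied by a VALIGN-style immediate: the immediate counts
// elements of the given width.
unsigned alignByteAmount(unsigned Imm, unsigned EltBytes) {
  return Imm * EltBytes;
}

int main() {
  unsigned ValignqImm = 3;               // rotate by 3 qwords (24 bytes)
  unsigned AsValignd  = ValignqImm * 2;  // ValignqImm32XForm: 6 dword steps
  unsigned AsVpalignr = ValignqImm * 8;  // ValignqImm8XForm: 24 byte steps
  assert(alignByteAmount(ValignqImm, 8) == alignByteAmount(AsValignd, 4));
  assert(alignByteAmount(ValignqImm, 8) == alignByteAmount(AsVpalignr, 1));
  (void)AsValignd; (void)AsVpalignr;     // silence warnings under NDEBUG
  return 0;
}
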
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 453dcd83df1f..15466c2978fc 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -290,8 +290,7 @@ multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, Intrinsic IntF32, Intrinsic IntF64,
- SDNode OpNode> {
+ string OpStr, SDNode OpNodeIntrin, SDNode OpNode> {
let ExeDomain = SSEPackedSingle in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
FR32, f32mem>,
@@ -309,43 +308,44 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// This is because src1 is tied to dest, and the scalar intrinsics
// require the pass-through values to come from the first source
// operand, not the second.
- // TODO: Use AVX512 instructions when possible.
- let Predicates = [HasFMA] in {
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+ let Predicates = [HasFMA, NoAVX512] in {
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
(!cast<Instruction>(NAME#"213SSr_Int")
VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
(!cast<Instruction>(NAME#"213SDr_Int")
VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, sse_load_f32:$src3),
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2,
+ sse_load_f32:$src3)),
(!cast<Instruction>(NAME#"213SSm_Int")
VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
- def : Pat<(IntF64 VR128:$src1, VR128:$src2, sse_load_f64:$src3),
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2,
+ sse_load_f64:$src3)),
(!cast<Instruction>(NAME#"213SDm_Int")
VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
- def : Pat<(IntF32 VR128:$src1, sse_load_f32:$src3, VR128:$src2),
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3,
+ VR128:$src2)),
(!cast<Instruction>(NAME#"132SSm_Int")
VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
- def : Pat<(IntF64 VR128:$src1, sse_load_f64:$src3, VR128:$src2),
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3,
+ VR128:$src2)),
(!cast<Instruction>(NAME#"132SDm_Int")
VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
}
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
- int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
- int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
- int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
- int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>,
+ VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>,
+ VEX_LIG;
//===----------------------------------------------------------------------===//
@@ -385,26 +385,28 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic Int> {
+ ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
let isCodeGenOnly = 1 in {
def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
+ (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
+ VEX_LIG;
def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
- mem_cpat:$src3))]>, VEX_W, VEX_LIG;
+ [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
+ mem_cpat:$src3)))]>, VEX_W, VEX_LIG;
def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+ (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
+ VEX_LIG;
let hasSideEffects = 0 in
def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -475,19 +477,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
- fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfmadd_ss>;
+ fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
+ X86Fmadds1>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
- fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfmsub_ss>;
+ fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
+ X86Fmsubs1>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
X86Fnmadd, loadf32>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfnmadd_ss>;
+ fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
+ X86Fnmadds1>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
X86Fnmsub, loadf32>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfnmsub_ss>;
+ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
+ X86Fnmsubs1>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32>;
@@ -506,19 +508,19 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfmadd_sd>;
+ fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
+ X86Fmadds1>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfmsub_sd>;
+ fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
+ X86Fmsubs1>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
X86Fnmadd, loadf64>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmadd_sd>;
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
+ X86Fnmadds1>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
X86Fnmsub, loadf64>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmsub_sd>;
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
+ X86Fnmsubs1>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64>;
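
As the comment above explains, the FMA3 213/132 scalar forms tie src1 to the destination, so the element that passes through unchanged has to come from the first source operand; that is what the X86Fmadds1-style nodes now selected here model. A rough element-wise sketch of that semantics, using vfmadd*ss as the example (illustrative C++ only, not an LLVM API):

#include <array>
#include <cmath>

// FMADDS1-style semantics: only element 0 is computed, the remaining
// elements pass through from operand 1.
std::array<float, 4> fmadds1(const std::array<float, 4> &a,
                             const std::array<float, 4> &b,
                             const std::array<float, 4> &c) {
  std::array<float, 4> r = a;          // pass-through comes from operand 1
  r[0] = std::fma(a[0], b[0], c[0]);   // fused multiply-add on element 0 only
  return r;
}
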
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c4f34bdd37e6..d30400836bbc 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -56,8 +56,6 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
-def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
-def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>;
def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
@@ -482,11 +480,22 @@ def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
+def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
+def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>;
+def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>;
+
+// Scalar FMA intrinsics with passthru bits in operand 1.
def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp>;
+def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp>;
+def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp>;
+def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp>;
+
// Scalar FMA intrinsics with passthru bits in operand 3.
def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
@@ -498,10 +507,14 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
+def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
+def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
+def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
@@ -578,7 +591,12 @@ def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+
def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]> >;
+
+def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>,
SDTCisVT<2, i32>]> >;
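
The X86Fmadds3-style nodes added here are the counterpart for the avx512_mask3_* intrinsics, whose pass-through value lives in the third operand rather than the first. Continuing the earlier illustrative sketch (same caveats, not an LLVM API):

#include <array>
#include <cmath>

// FMADDS3-style semantics: element 0 is the FMA result, the remaining
// elements pass through from operand 3.
std::array<float, 4> fmadds3(const std::array<float, 4> &a,
                             const std::array<float, 4> &b,
                             const std::array<float, 4> &c) {
  std::array<float, 4> r = c;          // pass-through comes from operand 3
  r[0] = std::fma(a[0], b[0], c[0]);
  return r;
}
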
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index e665ec1f14dc..02a09c340cef 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -18,7 +18,7 @@
#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "X86GenInstrInfo.inc"
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 559a8fcf107a..f00caa130d0b 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -850,7 +850,6 @@ def HasLWP : Predicate<"Subtarget->hasLWP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
-def NoF16C : Predicate<"!Subtarget->hasF16C()">;
def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 451303054f56..955a40ee171e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1514,13 +1514,12 @@ let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>,
- XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+ [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
}
def : Pat<(f32 (fpround FR64:$src)),
- (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
+ (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1574,20 +1573,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RR>,
- XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+ [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>,
- XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+ [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
}
def : Pat<(f64 (fpextend FR32:$src)),
- (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
@@ -1899,7 +1896,7 @@ let Predicates = [HasAVX, NoVLX] in {
(v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
(VCVTTPD2DQrm addr:$src)>;
}
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
@@ -3095,7 +3092,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
X86MemOperand x86memop,
Intrinsic Intr, SDNode OpNode, Domain d,
- OpndItins itins, string Suffix> {
+ OpndItins itins, Predicate target, string Suffix> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3126,21 +3123,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
// the partial register store, either in ExecutionDepsFix or with smarter RA.
- let Predicates = [UseAVX] in {
+ let Predicates = [target] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
- }
- let Predicates = [HasAVX] in {
def : Pat<(Intr VR128:$src),
(!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
VR128:$src)>;
}
- let Predicates = [HasAVX, OptForSize] in {
+ let Predicates = [target, OptForSize] in {
def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
- }
- let Predicates = [UseAVX, OptForSize] in {
def : Pat<(ScalarVT (OpNode (load addr:$src))),
(!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
@@ -3186,7 +3179,7 @@ let Predicates = prds in {
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, OpndItins itins> {
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
@@ -3220,41 +3213,41 @@ let Predicates = [HasAVX] in {
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ OpndItins itins, Predicate AVXTarget> {
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG,
- NotMemoryFoldable;
+ SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
+ VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ OpndItins itins, Predicate AVXTarget> {
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, "SD">,
+ OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
@@ -7692,22 +7685,24 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (Int VR128:$src))]>,
+ [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
T8PD, VEX, Sched<[WriteCvtF2F]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
- Sched<[WriteCvtF2FLd]>;
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (X86cvtph2ps (bc_v8i16
+ (loadv2i64 addr:$src))))]>,
+ T8PD, VEX, Sched<[WriteCvtF2FLd]>;
}
-multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+ [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
TAPD, VEX, Sched<[WriteCvtF2F]>;
let hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteCvtF2FLd, WriteRMW] in
@@ -7717,32 +7712,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
TAPD, VEX;
}
-let Predicates = [HasF16C] in {
- defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
- defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+let Predicates = [HasF16C, NoVLX] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
- (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
- addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
- (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
- addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
- addr:$dst),
- (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
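
The "typically require refinement" note above refers to the Newton-Raphson step that is normally applied after RCPSS/RSQRTSS (or the AVX-512 RCP14/RSQRT14 forms) when more precision is needed. A scalar sketch of one refinement iteration, independent of any LLVM API:

// One Newton-Raphson step refining a hardware estimate x0 ~= 1/a.
float refineRcp(float a, float x0) { return x0 * (2.0f - a * x0); }

// One Newton-Raphson step refining a hardware estimate x0 ~= 1/sqrt(a).
float refineRsqrt(float a, float x0) {
  return x0 * (1.5f - 0.5f * a * x0 * x0);
}
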
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 6f5e41bcdc62..9edac22d5baa 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -422,12 +422,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -1077,12 +1071,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
- X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
@@ -1098,8 +1092,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1220,8 +1214,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1239,8 +1233,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1259,8 +1253,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1298,8 +1292,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
@@ -1428,26 +1422,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
@@ -1468,6 +1462,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
@@ -1476,6 +1472,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
@@ -1484,10 +1482,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
@@ -1584,6 +1586,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 98b4863134e5..b1438bf7bc09 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -22,6 +22,38 @@
using namespace llvm;
using namespace TargetOpcode;
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirely possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+ const LegalizerInfo::SizeAndActionsVec &v) {
+ for (unsigned i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 < v[i].first && i + 1 < v.size() &&
+ v[i + 1].first != v[i].first + 1)
+ result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 1);
+ LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
+ {2, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
+
X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
const X86TargetMachine &TM)
: Subtarget(STI), TM(TM) {
@@ -37,6 +69,17 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizerInfoAVX512DQ();
setLegalizerInfoAVX512BW();
+ setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
+ for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+ narrowToSmallerAndWidenToSmallest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_GEP, 1, widenToLargerTypesUnsupportedOtherwise);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+
computeTables();
}
@@ -47,7 +90,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
- const LLT s64 = LLT::scalar(64);
for (auto Ty : {p0, s1, s8, s16, s32})
setAction({G_IMPLICIT_DEF, Ty}, Legal);
@@ -55,15 +97,10 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({G_PHI, Ty}, Legal);
- setAction({G_PHI, s1}, WidenScalar);
-
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
for (auto Ty : {s8, s16, s32})
setAction({BinOp, Ty}, Legal);
- setAction({BinOp, s1}, WidenScalar);
- }
-
for (unsigned Op : {G_UADDE}) {
setAction({Op, s32}, Legal);
setAction({Op, 1, s1}, Legal);
@@ -73,7 +110,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({MemOp, Ty}, Legal);
- setAction({MemOp, s1}, WidenScalar);
// And everything's fine in addrspace 0.
setAction({MemOp, 1, p0}, Legal);
}
@@ -85,9 +121,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s32}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({G_GEP, 1, Ty}, WidenScalar);
-
// Control-flow
setAction({G_BRCOND, s1}, Legal);
@@ -95,9 +128,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
- setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
- setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
-
// Extensions
for (auto Ty : {s8, s16, s32}) {
setAction({G_ZEXT, Ty}, Legal);
@@ -105,12 +135,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ANYEXT, Ty}, Legal);
}
- for (auto Ty : {s1, s8, s16}) {
- setAction({G_ZEXT, 1, Ty}, Legal);
- setAction({G_SEXT, 1, Ty}, Legal);
- setAction({G_ANYEXT, 1, Ty}, Legal);
- }
-
// Comparison
setAction({G_ICMP, s1}, Legal);
@@ -123,7 +147,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
if (!Subtarget.is64Bit())
return;
- const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
setAction({G_IMPLICIT_DEF, s64}, Legal);
@@ -145,7 +168,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
// Extensions
for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) {
setAction({extOp, s64}, Legal);
- setAction({extOp, 1, s32}, Legal);
}
// Comparison
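
The widen_1 strategy registered above widens 1-bit scalars and marks the sizes between the legal ones as unsupported. A self-contained trace of the vector it builds, on stand-in types rather than the real LegalizerInfo::SizeAndActionsVec (illustrative only):

#include <cstdio>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, Unsupported };
using SizeAndActionsVec = std::vector<std::pair<unsigned, Action>>;

// Mirrors the patch's widen_1 + addAndInterleaveWithUnsupported on stand-in
// types (the real code also asserts that v is non-empty and starts above
// bit width 1).
SizeAndActionsVec widen_1(const SizeAndActionsVec &v) {
  SizeAndActionsVec result = {{1, WidenScalar}, {2, Unsupported}};
  for (unsigned i = 0; i < v.size(); ++i) {
    result.push_back(v[i]);
    if (i + 1 < v[i].first && i + 1 < v.size() &&
        v[i + 1].first != v[i].first + 1)
      result.push_back({v[i].first + 1, Unsupported});
  }
  result.push_back({result.back().first + 1, Unsupported});
  return result;
}

int main() {
  static const char *Names[] = {"Legal", "WidenScalar", "Unsupported"};
  // E.g. when s8/s16/s32 are the legal scalar sizes for an operation:
  for (const auto &SA : widen_1({{8, Legal}, {16, Legal}, {32, Legal}}))
    std::printf("%u -> %s\n", SA.first, Names[SA.second]);
  // Prints: 1 -> WidenScalar, 2 -> Unsupported, 8 -> Legal, 9 -> Unsupported,
  //         16 -> Legal, 17 -> Unsupported, 32 -> Legal, 33 -> Unsupported.
  return 0;
}
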
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 0dd13077c37e..67d95c2233de 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -14,8 +14,8 @@
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 3069d1fd3497..9b7732c1db88 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -23,10 +23,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 1f49650340e7..efa0cd2c6bc1 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -27,14 +27,14 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b0ce1335bd37..66fea1e688f1 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -161,6 +161,8 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
// In Regcall calling convention those registers are used for passing
// parameters. Thus we need to prevent lazy binding in Regcall.
return X86II::MO_GOTPCREL;
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
+ return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a8d7f290688a..a21d068c7f4e 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -463,7 +463,7 @@ public:
bool hasPCLMUL() const { return HasPCLMUL; }
  // Prefer FMA4 to FMA - it's better for commutation/memory folding and
// has equal or better performance on all supported targets.
- bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
+ bool hasFMA() const { return HasFMA && !HasFMA4; }
bool hasFMA4() const { return HasFMA4; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 6e6c724eb0af..11fe84f162da 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -436,4 +436,11 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86FixupLEAs());
addPass(createX86EvexToVexInsts());
}
+
+ // Verify basic block incoming and outgoing cfa offset and register values and
+ // correct CFA calculation rule where needed by inserting appropriate CFI
+ // instructions.
+ const Triple &TT = TM->getTargetTriple();
+ if (!TT.isOSDarwin() && !TT.isOSWindows())
+ addPass(createCFIInstrInserter());
}
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index effbd07fa315..6772d96c7997 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1437,7 +1437,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
@@ -2644,12 +2644,15 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
{ 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
{ 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
{ 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
+ { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
{ 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
{ 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
{ 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
{ 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
- { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+ { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
};
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index fb8c2a71c9ab..ba01f1e25ba5 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -26,13 +26,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index fc08f1582ad7..8a186e94d9cf 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -25,9 +25,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;