Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  2193
1 file changed, 1565 insertions(+), 628 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2a35061..cbaf44e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
@@ -23,6 +22,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -52,6 +52,8 @@
#include <cctype>
using namespace llvm;
+#define DEBUG_TYPE "x86-isel"
+
STATISTIC(NumTailCalls, "Number of tail calls");
// Forward declarations.
@@ -84,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+ makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+ ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
@@ -265,10 +268,10 @@ void X86TargetLowering::resetOperationActions() {
// The _ftol2 runtime function has an unusual calling conv, which
// is modeled by a special pseudo-instruction.
- setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
- setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
- setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
- setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
}
if (Subtarget->isTargetDarwin()) {
@@ -635,15 +638,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho())
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Custom);
- else if (TM.Options.EnableSegmentedStacks)
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Custom);
- else
- setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
- MVT::i64 : MVT::i32, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
@@ -832,7 +828,9 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
@@ -944,6 +942,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v4i32, Legal);
@@ -1036,6 +1038,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
@@ -1064,11 +1070,14 @@ void X86TargetLowering::resetOperationActions() {
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
+ // There is no BLENDI for byte vectors, so we don't custom lower those
+ // vselects for now.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
// i8 and i16 vectors are custom because the source register and source
// memory operand types are not the same width. f32 vectors are
@@ -1111,9 +1120,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
- setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1178,8 +1184,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SRA, MVT::v16i16, Custom);
setOperationAction(ISD::SRA, MVT::v32i8, Custom);
- setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
-
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
@@ -1189,10 +1193,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
@@ -1232,9 +1236,13 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::MUL, MVT::v16i16, Legal);
// Don't lower v32i8 because there is no 128-bit byte mul
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+ setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
- setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1343,7 +1351,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
- setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
@@ -1358,9 +1365,11 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
@@ -1392,6 +1401,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
@@ -1474,6 +1485,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
@@ -1498,9 +1511,9 @@ void X86TargetLowering::resetOperationActions() {
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, 0);
- setLibcallName(RTLIB::SRL_I128, 0);
- setLibcallName(RTLIB::SRA_I128, 0);
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
}
// Combine sin / cos into one node or libcall if possible.
@@ -1516,6 +1529,15 @@ void X86TargetLowering::resetOperationActions() {
}
}
+ if (Subtarget->isTargetWin64()) {
+ setOperationAction(ISD::SDIV, MVT::i128, Custom);
+ setOperationAction(ISD::UDIV, MVT::i128, Custom);
+ setOperationAction(ISD::SREM, MVT::i128, Custom);
+ setOperationAction(ISD::UREM, MVT::i128, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+ }
+
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1540,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1738,7 +1761,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
// FIXME: Why is this routine here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(MVT VT) const{
- const TargetRegisterClass *RRC = 0;
+ const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
@@ -1806,8 +1829,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
-const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
- static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
}
@@ -1930,8 +1953,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(X86ISD::RET_FLAG, dl,
- MVT::Other, &RetOps[0], RetOps.size());
+ return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2285,22 +2307,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
InVals.push_back(ArgValue);
}
- // The x86-64 ABIs require that for returning structs by value we copy
- // the sret argument into %rax/%eax (depending on ABI) for the return.
- // Win32 requires us to put the sret argument to %eax as well.
- // Save the argument into a virtual register so that we can access it
- // from the return points.
- if (MF.getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
- if (!Reg) {
- MVT PtrTy = getPointerTy();
- Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
- FuncInfo->setSRetReturnReg(Reg);
+ if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // The x86-64 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Win32 requires us to put the sret argument to %eax as well.
+ // Save the argument into a virtual register so that we can access it
+ // from the return points.
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy();
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ break;
+ }
}
- SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
}
unsigned StackSize = CCInfo.getNextStackOffset();
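As an illustration of the ABI rule restated in this hunk (not part of the patch): for a struct returned by value through memory, the callee receives a hidden sret pointer (in %rdi on x86-64 SysV) and must hand that same pointer back in %rax (%eax for Win32 MSVC), which is why the lowering above keeps the incoming sret argument alive in a virtual register until the return points.

struct Big { long v[8]; };     // too large for register return

Big makeBig() {                // callee gets a hidden sret pointer argument
  Big B = {};                  // fills the caller-provided buffer through it
  return B;                    // and must return that same pointer in %rax
}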
@@ -2320,17 +2345,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
// FIXME: We should really autogenerate these arrays
- static const uint16_t GPR64ArgRegsWin64[] = {
+ static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
- static const uint16_t GPR64ArgRegs64Bit[] = {
+ static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
- static const uint16_t XMMArgRegs64Bit[] = {
+ static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- const uint16_t *GPR64ArgRegs;
+ const MCPhysReg *GPR64ArgRegs;
unsigned NumXMMRegs = 0;
if (IsWin64) {
@@ -2424,13 +2449,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SaveXMMOps.push_back(Val);
}
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other,
- &SaveXMMOps[0], SaveXMMOps.size()));
+ MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOps[0], MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
}
@@ -2497,10 +2520,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
- SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
- unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ EVT PtrVT, unsigned SlotSize,
+ int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@@ -2537,7 +2560,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
- if (isTailCall) {
+ bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ if (IsMustTail) {
+ // Force this to be a tail call. The verifier rules are enough to ensure
+ // that we can lower this successfully without moving the return address
+ // around.
+ isTailCall = true;
+ } else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
@@ -2578,7 +2607,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
- if (isTailCall && !IsSibcall) {
+ if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2683,7 +2712,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
- if (StackPtr.getNode() == 0)
+ if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -2692,8 +2721,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!MemOpChains.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains[0], MemOpChains.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
@@ -2730,7 +2758,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
- static const uint16_t XMMArgRegs[] = {
+ static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
@@ -2742,8 +2770,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- // For tail calls lower the arguments to the 'real' stack slot.
- if (isTailCall) {
+ // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+ // don't need this because the eligibility check rejects calls that require
+ // shuffling arguments passed in memory.
+ if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
@@ -2755,45 +2785,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.isRegLoc())
- continue;
- assert(VA.isMemLoc());
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- // Create frame index.
- int32_t Offset = VA.getLocMemOffset()+FPDiff;
- uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
- FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
-
- if (Flags.isByVal()) {
- // Copy relative to framepointer.
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
- if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
- MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
- ArgChain,
- Flags, DAG, dl));
- } else {
- // Store relative to framepointer.
- MemOpChains2.push_back(
- DAG.getStore(ArgChain, dl, Arg, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, 0));
- }
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Skip inalloca arguments. They don't require any work.
+ if (Flags.isInAlloca())
+ continue;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
}
}
if (!MemOpChains2.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &MemOpChains2[0], MemOpChains2.size());
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
@@ -2930,10 +2960,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
- return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
- Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
@@ -3927,6 +3957,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
return true;
}
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i.e., if all but one element comes from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+ // TODO: Deal with AVX's VINSERTPS
+ if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+ return false;
+
+ unsigned CorrectPosV1 = 0;
+ unsigned CorrectPosV2 = 0;
+ for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+ if (Mask[i] == i)
+ ++CorrectPosV1;
+ else if (Mask[i] == i + 4)
+ ++CorrectPosV2;
+
+ if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+ // We have 3 elements from one vector, and one from another.
+ return true;
+
+ return false;
+}
+
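For illustration only (not from the patch), the same counting logic as a standalone C++ sketch with the element count fixed at 4 and a hypothetical function name; it accepts masks where three lanes stay in place in one source and the remaining lane comes from the other source.

#include <array>
#include <cassert>

// Mirrors isINSERTPSMask's counting on a 4-element shuffle mask. Mask values
// 0..3 refer to V1, 4..7 to V2; a lane "stays in place" when Mask[i] selects
// element i of V1 or element i of V2.
static bool looksLikeInsertPSMask(const std::array<int, 4> &Mask) {
  unsigned CorrectPosV1 = 0, CorrectPosV2 = 0;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] == i)
      ++CorrectPosV1;
    else if (Mask[i] == i + 4)
      ++CorrectPosV2;
  }
  return CorrectPosV1 == 3 || CorrectPosV2 == 3;
}

int main() {
  std::array<int, 4> ThreeFromV1 = {0, 1, 2, 7}; // insert V2[3] into V1 lane 3
  std::array<int, 4> TwoAndTwo   = {0, 1, 6, 7}; // needs a different shuffle
  assert(looksLikeInsertPSMask(ThreeFromV1));
  assert(!looksLikeInsertPSMask(TwoAndTwo));
}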
//
// Some special combinations that can be optimized.
//
@@ -4146,6 +4199,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
return true;
}
+// Match for INSERTI64x4/INSERTF64x4 instructions (src0[0], src1[0]) or
+// (src1[0], src0[1]); these manipulate the 256-bit sub-vectors of a 512-bit vector.
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+ if (!VT.is512BitVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfSize = NumElts/2;
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+ *Imm = 1;
+ return true;
+ }
+ }
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+ *Imm = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
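As a standalone illustration (names and the v8i64-style 8-element mask are mine, not from the patch) of the two mask shapes this predicate accepts and the immediate each one maps to:

#include <array>
#include <cassert>

// Imm == 1: result = { low 256 bits of src0, low 256 bits of src1 }
// Imm == 0: result = { low 256 bits of src1, high 256 bits of src0 }
// Negative mask entries stand for undef lanes and match anything.
static bool matchInsert64x4(const std::array<int, 8> &Mask, unsigned &Imm) {
  auto Sequential = [&](unsigned Pos, unsigned Start) {
    for (unsigned i = 0; i != 4; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != int(Start + i))
        return false;
    return true;
  };
  if (Sequential(0, 0) && Sequential(4, 8)) { Imm = 1; return true; }
  if (Sequential(0, 8) && Sequential(4, 4)) { Imm = 0; return true; }
  return false;
}

int main() {
  unsigned Imm = 0;
  std::array<int, 8> Lo0ThenLo1 = {0, 1, 2, 3, 8, 9, 10, 11};
  std::array<int, 8> Lo1ThenHi0 = {8, 9, 10, 11, 4, 5, 6, 7};
  assert(matchInsert64x4(Lo0ThenLo1, Imm) && Imm == 1);
  assert(matchInsert64x4(Lo1ThenHi0, Imm) && Imm == 0);
}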
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -4624,11 +4700,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 256);
}
+/// isZero - Returns true if Elt is a constant integer zero
+static bool isZero(SDValue V) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+ return C && C->isNullValue();
+}
+
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
- return CN->isNullValue();
+ if (isZero(Elt))
+ return true;
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
return CFP->getValueAPF().isPosZero();
return false;
@@ -4677,7 +4759,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
return false;
N = N->getOperand(0).getNode();
@@ -4803,28 +4885,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
if (Subtarget->hasInt256()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
}
} else if (VT.is512BitVector()) { // AVX-512
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
llvm_unreachable("Unexpected vector type");
@@ -4844,8 +4922,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
- array_lengthof(Ops));
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
@@ -5307,7 +5384,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
return SDValue();
SDLoc dl(Op);
- SDValue V(0, 0);
+ SDValue V;
bool First = true;
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
@@ -5320,7 +5397,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
if ((i & 1) != 0) {
- SDValue ThisElt(0, 0), LastElt(0, 0);
+ SDValue ThisElt, LastElt;
bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
if (LastIsNonZero) {
LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
@@ -5355,7 +5432,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
return SDValue();
SDLoc dl(Op);
- SDValue V(0, 0);
+ SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
@@ -5376,6 +5453,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
return V;
}
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+ unsigned NonZeros, unsigned NumNonZero,
+ unsigned NumZero, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget,
+ const TargetLowering &TLI) {
+ // We know there's at least one non-zero element
+ unsigned FirstNonZeroIdx = 0;
+ SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+ X86::isZeroNode(FirstNonZero)) {
+ ++FirstNonZeroIdx;
+ FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ }
+
+ if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+ return SDValue();
+
+ SDValue V = FirstNonZero.getOperand(0);
+ MVT VVT = V.getSimpleValueType();
+ if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
+ return SDValue();
+
+ unsigned FirstNonZeroDst =
+ cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+ unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+ unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+ unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+ for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+ SDValue Elem = Op.getOperand(Idx);
+ if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+ continue;
+
+ // TODO: What else can be here? Deal with it.
+ if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // TODO: Some optimizations are still possible here
+ // ex: Getting one element from a vector, and the rest from another.
+ if (Elem.getOperand(0) != V)
+ return SDValue();
+
+ unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+ if (Dst == Idx)
+ ++CorrectIdx;
+ else if (IncorrectIdx == -1U) {
+ IncorrectIdx = Idx;
+ IncorrectDst = Dst;
+ } else
+ // There was already one element with an incorrect index.
+ // We can't optimize this case to an insertps.
+ return SDValue();
+ }
+
+ if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+ SDLoc dl(Op);
+ EVT VT = Op.getSimpleValueType();
+ unsigned ElementMoveMask = 0;
+ if (IncorrectIdx == -1U)
+ ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+ else
+ ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+ SDValue InsertpsMask =
+ DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+ }
+
+ return SDValue();
+}
+
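A worked example of the immediate built above, assuming the input is (build_vector (extract a, 0), (extract a, 1), (extract a, 2), 0.0f): every used element is already in its lane, so the move field stays 0 and only the INSERTPS zero-mask bits (bits 3:0) do any work.

#include <cassert>

int main() {
  unsigned NonZeros = 0x7;        // lanes 0..2 non-zero, lane 3 is zero
  unsigned FirstNonZeroIdx = 0;   // first non-zero operand is already in place
  unsigned ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
  unsigned InsertpsMask = ElementMoveMask | (~NonZeros & 0xf);
  assert(InsertpsMask == 0x8);    // insertps $8: copy lane 0 over itself,
                                  // zero lane 3 -> <a[0], a[1], a[2], 0.0f>
}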
/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -5480,7 +5630,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
- LoadSDNode *LDBase = NULL;
+ LoadSDNode *LDBase = nullptr;
unsigned LastLoadedElt = -1U;
// For each element in the initializer, see if we've found a load or an undef.
@@ -5545,8 +5695,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
- array_lengthof(Ops), MVT::i64,
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
@@ -5661,7 +5810,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
unsigned ScalarSize = CVT.getSizeInBits();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
- const Constant *C = 0;
+ const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
@@ -5706,6 +5855,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
}
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+ SDValue ExtIdx) {
+ int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+ if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+ return Idx;
+
+ // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+ // lowered this:
+ // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+ // to:
+ // (extract_vector_elt (vector_shuffle<2,u,u,u>
+ // (extract_subvector (v8f32 %vreg0), Constant<4>),
+ // undef)
+ // Constant<0>)
+ // In this case the vector is the extract_subvector expression and the index
+ // is 2, as specified by the shuffle.
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+ SDValue ShuffleVec = SVOp->getOperand(0);
+ MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+ assert(ShuffleVecVT.getVectorElementType() ==
+ ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+ int ShuffleIdx = SVOp->getMaskElt(Idx);
+ if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+ ExtractedFromVec = ShuffleVec;
+ return ShuffleIdx;
+ }
+ return Idx;
+}
+
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -5739,34 +5923,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
+ return SDValue();
+ int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
- // Quit if non-constant index.
- if (!isa<ConstantSDNode>(ExtIdx))
- return SDValue();
-
- if (VecIn1.getNode() == 0)
+ if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
- if (VecIn2.getNode() == 0)
+ if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors to shuffle
return SDValue();
}
- unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
-
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
- if (VecIn1.getNode() == 0)
+ if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
@@ -5791,24 +5973,22 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
- Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
- Ops, VT.getVectorNumElements());
+ SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
bool AllContants = true;
uint64_t Immediate = 0;
int NonConstIdx = -1;
bool IsSplat = true;
+ unsigned NumNonConsts = 0;
+ unsigned NumConsts = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.getOpcode() == ISD::UNDEF)
@@ -5816,9 +5996,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(In)) {
AllContants = false;
NonConstIdx = idx;
+ NumNonConsts++;
}
- else if (cast<ConstantSDNode>(In)->getZExtValue())
+ else {
+ NumConsts++;
+ if (cast<ConstantSDNode>(In)->getZExtValue())
Immediate |= (1ULL << idx);
+ }
if (In != Op.getOperand(0))
IsSplat = false;
}
@@ -5830,6 +6014,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0));
}
+ if (NumNonConsts == 1 && NonConstIdx != 0) {
+ SDValue DstVec;
+ if (NumConsts) {
+ SDValue VecAsImm = DAG.getConstant(Immediate,
+ MVT::getIntegerVT(VT.getSizeInBits()));
+ DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
+ }
+ else
+ DstVec = DAG.getUNDEF(VT);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ Op.getOperand(NonConstIdx),
+ DAG.getIntPtrConstant(NonConstIdx));
+ }
if (!IsSplat && (NonConstIdx != 0))
llvm_unreachable("Unsupported BUILD_VECTOR operation");
MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
@@ -6043,9 +6240,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
- NumElems/2);
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[0], NumElems/2));
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+ makeArrayRef(&V[NumElems / 2], NumElems/2));
// Recreate the wider vector with the lower and upper part.
if (VT.is256BitVector())
@@ -6078,6 +6276,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (V.getNode()) return V;
}
+ // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+ if (EVTBits == 32 && NumElems == 4) {
+ SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+ NumZero, DAG, Subtarget, *this);
+ if (V.getNode())
+ return V;
+ }
+
// If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector<SDValue, 8> V(NumElems);
if (NumElems == 4 && NumZero > 0) {
@@ -6332,8 +6538,7 @@ static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
if (ShufVT != VT)
V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
- PshufbMask.data(), PshufbMask.size()));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
}
// v8i16 shuffles - Prefer shuffles in the following order:
@@ -6516,7 +6721,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
@@ -6540,7 +6745,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
@@ -6635,7 +6840,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ MVT::v16i8, pshufbMask));
// As PSHUFB will zero elements with negative indices, it's safe to ignore
// the 2nd operand if it's undefined or zero.
@@ -6653,7 +6858,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ MVT::v16i8, pshufbMask));
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
}
@@ -6771,6 +6976,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
unsigned Scale;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected!");
+ case MVT::v2i64:
+ case MVT::v2f64:
+ return SDValue(SVOp, 0);
case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
@@ -6805,7 +7013,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, SDLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
- LoadSDNode *LD = NULL;
+ LoadSDNode *LD = nullptr;
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
LD = dyn_cast<LoadSDNode>(SrcOp);
if (!LD) {
@@ -6924,8 +7132,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
// Construct the output using a BUILD_VECTOR.
- Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
- SVOps.size());
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
} else if (InputUsed[0] < 0) {
// No input vectors were used! The result is undefined.
Output[l] = DAG.getUNDEF(NVT);
@@ -7207,6 +7414,93 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
getShuffleSHUFImmediate(SVOp), DAG);
}
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+ SelectionDAG &DAG) {
+ SDLoc dl(Load);
+ MVT VT = Load->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue Addr = Load->getOperand(1);
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+ DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+ SDValue NewLoad =
+ DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Load->getMemOperand(), 0, EVT.getStoreSize()));
+ return NewLoad;
+}
+
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+ SelectionDAG &DAG) {
+ // Generate an insertps instruction when inserting an f32 from memory onto a
+ // v4f32 or when copying a member from one v4f32 to another.
+ // We also use it for transferring i32 from one register to another,
+ // since it simply copies the same bits.
+ // If we're transferring an i32 from memory to a specific element in a
+ // register, we output a generic DAG that will match the PINSRD
+ // instruction.
+ MVT VT = SVOp->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ auto Mask = SVOp->getMask();
+ assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+ "unsupported vector type for insertps/pinsrd");
+
+ int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i < 4; });
+
+ SDValue From;
+ SDValue To;
+ unsigned DestIndex;
+ if (FromV1 == 1) {
+ From = V1;
+ To = V2;
+ DestIndex = std::find_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i < 4; }) -
+ Mask.begin();
+ } else {
+ From = V2;
+ To = V1;
+ DestIndex = std::find_if(Mask.begin(), Mask.end(),
+ [](const int &i) { return i >= 4; }) -
+ Mask.begin();
+ }
+
+ if (MayFoldLoad(From)) {
+ // Trivial case, when From comes from a load and is only used by the
+ // shuffle. Make it use insertps from the vector that we need from that
+ // load.
+ SDValue NewLoad =
+ NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+ if (!NewLoad.getNode())
+ return SDValue();
+
+ if (EVT == MVT::f32) {
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+ SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+ InsertpsMask);
+ } else { // EVT == MVT::i32
+ // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+ // instruction, to match the PINSRD instruction, which loads an i32 to a
+ // certain vector element.
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+ DAG.getConstant(DestIndex, MVT::i32));
+ }
+ }
+
+ // Vector-element-to-vector
+ unsigned SrcIndex = Mask[DestIndex] % 4;
+ SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
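A worked example for the register-to-register path above, assuming a v4f32 shuffle mask of <0, 1, 2, 5>: three lanes keep V1, lane 3 takes element 1 of V2, so To = V1, From = V2, DestIndex = 3 and SrcIndex = 5 % 4 = 1 (values illustrative only, not from the patch).

#include <cassert>

int main() {
  int Mask[4] = {0, 1, 2, 5};
  unsigned DestIndex = 3;                   // first lane drawn from V2
  unsigned SrcIndex = Mask[DestIndex] % 4;  // which V2 element gets inserted
  unsigned Imm = DestIndex << 4 | SrcIndex << 6;
  assert(Imm == 0x70);  // insertps $0x70: V1[3] = V2[1], no lanes zeroed
}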
// Reduce a vector shuffle to zext.
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
@@ -7295,9 +7589,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
}
-static SDValue
-NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -7322,31 +7615,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
- VT == MVT::v16i16 || VT == MVT::v32i8) {
+ if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
+ VT == MVT::v32i8) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if ((VT == MVT::v4i32 ||
- (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
// FIXME: Figure out a cleaner way to do this.
- // Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
MVT NewVT = NewOp.getSimpleValueType();
if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
NewVT, true, false))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
- DAG, Subtarget, dl);
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
+ dl);
}
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
MVT NewVT = NewOp.getSimpleValueType();
if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
- DAG, Subtarget, dl);
+ return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
+ dl);
}
}
}
@@ -7609,6 +7900,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShuffleSHUFImmediate(SVOp), DAG);
}
+ unsigned Idx;
+ if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+ return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+ Idx*(NumElems/2), DAG, dl);
+
// Handle VPERM2F128/VPERM2I128 permutations
if (isVPERM2X128Mask(M, VT, HasFp256))
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
@@ -7618,6 +7914,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (BlendOp.getNode())
return BlendOp;
+ if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+ return getINSERTPS(SVOp, dl, DAG);
+
unsigned Imm8;
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
@@ -7631,8 +7930,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
}
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
- &permclMask[0], NumElems);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
if (V2IsUndef)
// Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
return DAG.getNode(X86ISD::VPERMV, dl, VT,
@@ -7684,6 +7982,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+ unsigned &MaskValue) {
+ MaskValue = 0;
+ unsigned NumElems = BuildVector->getNumOperands();
+ // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ unsigned NumLanes = (NumElems - 1) / 8 + 1;
+ unsigned NumElemsInLane = NumElems / NumLanes;
+
+ // Blend for v16i16 should be symmetric for both lanes.
+ for (unsigned i = 0; i < NumElemsInLane; ++i) {
+ SDValue EltCond = BuildVector->getOperand(i);
+ SDValue SndLaneEltCond =
+ (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+ int Lane1Cond = -1, Lane2Cond = -1;
+ if (isa<ConstantSDNode>(EltCond))
+ Lane1Cond = !isZero(EltCond);
+ if (isa<ConstantSDNode>(SndLaneEltCond))
+ Lane2Cond = !isZero(SndLaneEltCond);
+
+ if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+ // Lane1Cond != 0, means we want the first argument.
+ // Lane1Cond == 0, means we want the second argument.
+ // The encoding of this argument is 0 for the first argument, 1
+ // for the second. Therefore, invert the condition.
+ MaskValue |= !Lane1Cond << i;
+ else if (Lane1Cond < 0)
+ MaskValue |= !Lane2Cond << i;
+ else
+ return false;
+ }
+ return true;
+}
+
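A standalone sketch (my own names, single 128-bit lane) of the mask packing above, assuming a v4i32 VSELECT condition of <-1, 0, 0, -1>; as the comment notes, the BLENDI immediate selects the second operand where a bit is set, so each known condition bit is inverted before being packed.

#include <array>
#include <cassert>

int main() {
  std::array<bool, 4> Cond = {true, false, false, true};
  unsigned MaskValue = 0;
  for (unsigned i = 0; i < Cond.size(); ++i)
    MaskValue |= (unsigned)!Cond[i] << i;   // 0 = take LHS, 1 = take RHS
  assert(MaskValue == 0x6);  // blend imm 0b0110: lanes 1 and 2 come from RHS
}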
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Cond = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+ return SDValue();
+ if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+ return SDValue();
+
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ // Check the mask for BLEND and build the value.
+ unsigned MaskValue = 0;
+ if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+ return SDValue();
+
+ // Convert i32 vectors to floating point if it is not AVX2.
+ // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+ MVT BlendVT = VT;
+ if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+ BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+ NumElems);
+ LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+ RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+ }
+
+ SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+ DAG.getConstant(MaskValue, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+ if (BlendOp.getNode())
+ return BlendOp;
+
+ // Some types for vselect were previously set to Expand, not Legal or
+ // Custom. Return an empty SDValue so we fall-through to Expand, after
+ // the Custom lowering phase.
+ MVT VT = Op.getSimpleValueType();
+ switch (VT.SimpleTy) {
+ default:
+ break;
+ case MVT::v8i16:
+ case MVT::v16i16:
+ return SDValue();
+ }
+
+ // We couldn't create a "Blend with immediate" node.
+ // This node should still be legal, but we'll have to emit a blendv*
+ // instruction.
+ return Op;
+}
+
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -7946,10 +8347,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Elt = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ MVT VecVT = Vec.getSimpleValueType();
+
+ if (!isa<ConstantSDNode>(Idx)) {
+ // Non constant index. Extend source and destination,
+ // insert element and then truncate the result.
+ MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, MVT::i8));
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ unsigned MaxSift = rc->getSize()*8 - 1;
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(MaxSift, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+ DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}
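A scalar sketch of the shift trick above (plain integers and my own names, not LLVM types), for a v16i1 mask held in 16 bits: isolate the incoming bit by parking it at the MSB, shift it back down to the target position, then OR it into the existing mask.

#include <cassert>
#include <cstdint>

int main() {
  uint16_t Vec = 0x0011;      // existing mask bits
  uint16_t Elt = 1;           // bit being inserted; only bit 0 is meaningful
  unsigned IdxVal = 5;        // destination bit position
  unsigned MaxShift = 16 - 1;
  uint16_t EltInVec = (uint16_t)(Elt << MaxShift);         // bit 0 -> MSB
  EltInVec = (uint16_t)(EltInVec >> (MaxShift - IdxVal));  // MSB -> bit IdxVal
  assert((Vec | EltInVec) == 0x0031);
}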
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+
+ if (EltVT == MVT::i1)
+ return InsertBitToMaskVector(Op, DAG);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
@@ -8294,10 +8732,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -8325,7 +8763,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
- return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
@@ -8342,7 +8780,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SDValue Base;
if (is64Bit) {
- Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
@@ -8481,7 +8919,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Args[] = { Chain, Offset };
- Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+ Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -8507,10 +8945,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
- // If GV is an alias then use the aliasee for determining
- // thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->getAliasedGlobal();
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
@@ -8609,15 +9043,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
if (Op.getOpcode() == ISD::SHL_PARTS) {
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
} else {
- Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
- Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
}
SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+ return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -8680,8 +9114,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
X86ISD::FILD, DL,
- Tys, Ops, array_lengthof(Ops),
- SrcVT, MMO);
+ Tys, Ops, SrcVT, MMO);
if (useSSE) {
Chain = Result.getValue(1);
@@ -8704,8 +9137,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
- Ops, array_lengthof(Ops),
- Op.getValueType(), MMO);
+ Ops, Op.getValueType(), MMO);
Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
false, false, false, 0);
@@ -8900,7 +9332,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
- array_lengthof(Ops), MVT::i64, MMO);
+ MVT::i64, MMO);
APInt FF(32, 0x5F800000ULL);
@@ -8993,8 +9425,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOLoad, MemSize, MemSize);
- Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
- array_lengthof(Ops), DstTy, MMO);
+ Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
@@ -9008,8 +9439,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
- Ops, array_lengthof(Ops), DstTy,
- MMO);
+ Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
} else {
SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
@@ -9021,8 +9451,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MVT::i32, eax.getValue(2));
SDValue Ops[] = { eax, edx };
SDValue pair = IsReplace
- ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
- : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+ ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+ : DAG.getMergeValues(Ops, DL);
return std::make_pair(pair, SDValue());
}
}
@@ -9217,8 +9647,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
for (unsigned j = 0; j < 8; ++j)
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
- &pshufbMask[0], 32);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
@@ -9284,7 +9713,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
/*IsSigned=*/ true, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (FIST.getNode() == 0) return Op;
+ if (!FIST.getNode()) return Op;
if (StackSlot.getNode())
// Load the result.
@@ -9581,12 +10010,29 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
VecIns.back(), VecIns.back());
}
+/// \brief Return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+ for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look past the truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+
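+    // Flags-only users are BRCOND, SETCC, and SELECT using Op as the
+    // condition operand; anything else reads the arithmetic result.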
+ if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+ return true;
+ }
+ return false;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- SDLoc dl(Op);
-
if (Op.getValueType() == MVT::i1)
// KORTEST instruction should be selected
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
@@ -9687,31 +10133,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
Opcode = X86ISD::ADD;
NumOperands = 2;
break;
- case ISD::AND: {
- // If the primary and result isn't used, don't bother using X86ISD::AND,
- // because a TEST instruction will be better.
- bool NonFlagUse = false;
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
- unsigned UOpNo = UI.getOperandNo();
- if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
- // Look pass truncate.
- UOpNo = User->use_begin().getOperandNo();
- User = *User->use_begin();
- }
-
- if (User->getOpcode() != ISD::BRCOND &&
- User->getOpcode() != ISD::SETCC &&
- !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
- NonFlagUse = true;
+ case ISD::SHL:
+ case ISD::SRL:
+ // If we have a constant logical shift that's only used in a comparison
+    // against zero, turn it into an equivalent AND. This allows turning it into
+ // a TEST instruction later.
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+ unsigned ShAmt = Op->getConstantOperandVal(1);
+ if (ShAmt >= BitWidth) // Avoid undefined shifts.
break;
- }
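+      // For a comparison against zero, (X >> C) is nonzero exactly when one
+      // of the top BitWidth-C bits of X is set (low bits for SHL), so an AND
+      // with that bit mask is equivalent and can later select to TEST.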
+ APInt Mask = ArithOp.getOpcode() == ISD::SRL
+ ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+ : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+ if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ break;
+ SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+ DAG.getConstant(Mask, VT));
+ DAG.ReplaceAllUsesWith(Op, New);
+ Op = New;
}
+ break;
- if (!NonFlagUse)
+ case ISD::AND:
+    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
+ // because a TEST instruction will be better.
+ if (!hasNonFlagsUse(Op))
break;
- }
// FALL THROUGH
case ISD::SUB:
case ISD::OR:
@@ -9794,7 +10244,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
for (unsigned i = 0; i != NumOperands; ++i)
Ops.push_back(Op.getOperand(i));
- SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
return SDValue(New.getNode(), 1);
}
@@ -9802,11 +10252,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- SelectionDAG &DAG) const {
- SDLoc dl(Op0);
+ SDLoc dl, SelectionDAG &DAG) const {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
if (C->getAPIntValue() == 0)
- return EmitTest(Op0, X86CC, DAG);
+ return EmitTest(Op0, X86CC, dl, DAG);
if (Op0.getValueType() == MVT::i1)
llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
@@ -9888,7 +10337,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
APInt Zeros, Ones;
- DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+ DAG.computeKnownBits(Op0, Zeros, Ones);
if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
return SDValue();
}
@@ -10054,7 +10503,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
-static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
+static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
{
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
if (!BV)
@@ -10078,8 +10527,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(),
- ULTOp1.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
@@ -10204,7 +10652,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget->hasAVX())
break;
- SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG);
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
if (ULEOp1.getNode()) {
Op1 = ULEOp1;
Subus = true; Invert = false; Swap = false;
@@ -10383,7 +10831,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (X86CC == X86::COND_INVALID)
return SDValue();
- SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
@@ -10418,11 +10866,6 @@ static bool isX86LogicalCmp(SDValue Op) {
return false;
}
-static bool isZero(SDValue V) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
- return C && C->isNullValue();
-}
-
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
@@ -10517,7 +10960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNOT(DL, Res, Res.getValueType());
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
- if (N2C == 0 || !N2C->isNullValue())
+ if (!N2C || !N2C->isNullValue())
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
}
@@ -10606,7 +11049,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
@@ -10646,7 +11089,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = { Op2, Op1, CC, Cond };
- return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -11027,7 +11470,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11042,13 +11485,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isOSWindows() ||
- getTargetMachine().Options.EnableSegmentedStacks) &&
- "This should be used only on Windows targets or when segmented stacks "
- "are being used");
- assert(!Subtarget->isTargetMacho() && "Not implemented");
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool SplitStack = MF.shouldSplitStack();
+ bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+ SplitStack;
SDLoc dl(Op);
+ if (!Lower) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDNode* Node = Op.getNode();
+
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ EVT VT = Node->getValueType(0);
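+    // Generic expansion: Tmp1/Tmp2 are the node's value and chain results,
+    // and Tmp3 is the requested alignment operand.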
+ SDValue Tmp1 = SDValue(Node, 0);
+ SDValue Tmp2 = SDValue(Node, 1);
+ SDValue Tmp3 = Node->getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+ SDLoc(Node));
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+ const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ unsigned StackAlign = TFI.getStackAlignment();
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Align, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+ Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true), SDValue(),
+ SDLoc(Node));
+
+ SDValue Ops[2] = { Tmp1, Tmp2 };
+ return DAG.getMergeValues(Ops, dl);
+ }
+
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
@@ -11058,8 +11538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
bool Is64Bit = Subtarget->is64Bit();
EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- if (getTargetMachine().Options.EnableSegmentedStacks) {
- MachineFunction &MF = DAG.getMachineFunction();
+ if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
@@ -11081,7 +11560,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
SDValue Ops1[2] = { Value, Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ return DAG.getMergeValues(Ops1, dl);
} else {
SDValue Flag;
unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
@@ -11105,7 +11584,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
SDValue Ops1[2] = { SP, Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ return DAG.getMergeValues(Ops1, dl);
}
}
@@ -11166,8 +11645,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, 16), false, false, 0);
MemOps.push_back(Store);
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- &MemOps[0], MemOps.size());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -11221,8 +11699,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
InstOps.push_back(DAG.getConstant(Align, MVT::i32));
SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
- VTs, &InstOps[0], InstOps.size(),
- MVT::i64,
+ VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
/*Volatile=*/false,
@@ -11262,6 +11739,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
+ // Fold this packed shift into its first operand if ShiftAmt is 0.
+ if (ShiftAmt == 0)
+ return SrcOp;
+
// Check for ShiftAmt >= element width
if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
@@ -11282,7 +11763,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
ConstantSDNode *ND;
switch(Opc) {
- default: llvm_unreachable(0);
+ default: llvm_unreachable(nullptr);
case X86ISD::VSHLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
@@ -11321,7 +11802,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
break;
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
@@ -11353,7 +11834,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, MVT::i32);
ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
// The return type has to be a 128-bit type with the same element
// type as the input type.
@@ -11476,6 +11957,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
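+  // The pmulh intrinsics map directly to the generic MULHU/MULHS nodes, which
+  // later select back to the corresponding instructions.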
+ case Intrinsic::x86_sse2_pmulhu_w:
+ case Intrinsic::x86_avx2_pmulhu_w:
+ return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pmulh_w:
+ case Intrinsic::x86_avx2_pmulh_w:
+ return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// SSE2/AVX2 sub with unsigned saturation intrinsics
case Intrinsic::x86_sse2_psubus_b:
case Intrinsic::x86_sse2_psubus_w:
@@ -11927,7 +12423,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8),
SDValue(PCMP.getNode(), 1));
@@ -11944,7 +12440,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_fma_vfmadd_ps:
case Intrinsic::x86_fma_vfmadd_pd:
@@ -12042,27 +12538,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Base, SDValue Index,
- SDValue ScaleOp, SDValue Chain,
- const X86Subtarget * Subtarget) {
- SDLoc dl(Op);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
- assert(C && "Invalid scale type");
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
- SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
- SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
-}
-
-static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget * Subtarget) {
@@ -12072,7 +12547,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -12081,12 +12561,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+ return DAG.getMergeValues(RetOps, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Base, SDValue Index,
- SDValue ScaleOp, SDValue Chain) {
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
@@ -12095,52 +12575,218 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
-static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain) {
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Mask, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- return SDValue(Res, 1);
+ EVT MaskVT =
+ MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
+}
+
+// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
+// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
+// also used to custom lower READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+ SDValue LO, HI;
+
+ // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+ // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+ // and the EAX register is loaded with the low-order 32 bits.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ SDValue Chain = HI.getValue(1);
+
+ if (Opcode == X86ISD::RDTSCP_DAG) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+ // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+ // the ECX register. Add 'ecx' explicitly to the chain.
+ SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+ HI.getValue(2));
+ // Explicitly store the content of ECX at the location passed in input
+ // to the 'rdtscp' intrinsic.
+ Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+ MachinePointerInfo(), false, false, 0);
+ }
+
+ if (Subtarget->is64Bit()) {
+ // The EDX register is loaded with the high-order 32 bits of the MSR, and
+ // the EAX register is loaded with the low-order 32 bits.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 2> Results;
+ SDLoc DL(Op);
+ getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ return DAG.getMergeValues(Results, DL);
+}
+
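+// Describes how an INTRINSIC_W_CHAIN intrinsic is lowered: the kind of
+// operation and up to two target opcodes used to implement it.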
+enum IntrinsicType {
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+};
+
+struct IntrinsicData {
+ IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
+ :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
+ IntrinsicType Type;
+ unsigned Opc0;
+ unsigned Opc1;
+};
+
+std::map < unsigned, IntrinsicData> IntrMap;
+static void InitIntrinsicsMap() {
+ static bool Initialized = false;
+ if (Initialized)
+ return;
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+                     IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
+ IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
+ IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
+ IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512,
+ IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512,
+ IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512,
+ IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512,
+ IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
+
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
+ IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512,
+ IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512,
+ IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512,
+ IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
+
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
+ X86::VGATHERPF1QPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
+ X86::VGATHERPF1QPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
+ X86::VGATHERPF1DPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
+ X86::VGATHERPF1DPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+ X86::VSCATTERPF1QPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+ X86::VSCATTERPF1QPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+ X86::VSCATTERPF1DPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+ X86::VSCATTERPF1DPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+ IntrinsicData(XTEST, X86ISD::XTEST, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
+ IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+ IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
+ Initialized = true;
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- SDLoc dl(Op);
+  InitIntrinsicsMap();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: return SDValue(); // Don't custom lower most intrinsics.
+ std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+ if (itr == IntrMap.end())
+ return SDValue();
- // RDRAND/RDSEED intrinsics.
- case Intrinsic::x86_rdrand_16:
- case Intrinsic::x86_rdrand_32:
- case Intrinsic::x86_rdrand_64:
- case Intrinsic::x86_rdseed_16:
- case Intrinsic::x86_rdseed_32:
- case Intrinsic::x86_rdseed_64: {
- unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
- IntNo == Intrinsic::x86_rdseed_32 ||
- IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
- X86ISD::RDRAND;
+ SDLoc dl(Op);
+ IntrinsicData Intr = itr->second;
+ switch(Intr.Type) {
+ case RDSEED:
+ case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
- SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+ SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -12150,152 +12796,55 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
- Ops, array_lengthof(Ops));
+ Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
- //int_gather(index, base, scale);
- case Intrinsic::x86_avx512_gather_qpd_512:
- case Intrinsic::x86_avx512_gather_qps_512:
- case Intrinsic::x86_avx512_gather_dpd_512:
- case Intrinsic::x86_avx512_gather_qpi_512:
- case Intrinsic::x86_avx512_gather_qpq_512:
- case Intrinsic::x86_avx512_gather_dpq_512:
- case Intrinsic::x86_avx512_gather_dps_512:
- case Intrinsic::x86_avx512_gather_dpi_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
- }
- SDValue Chain = Op.getOperand(0);
- SDValue Index = Op.getOperand(2);
- SDValue Base = Op.getOperand(3);
- SDValue Scale = Op.getOperand(4);
- return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
- }
- //int_gather_mask(v1, mask, index, base, scale);
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- case Intrinsic::x86_avx512_gather_dpq_mask_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_mask_512:
- Opc = X86::VPGATHERDQZrm; break;
- }
+ case GATHER: {
+ //gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
+ SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
- SDValue Base = Op.getOperand(5);
+ SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+ return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
Subtarget);
}
- //int_scatter(base, index, v1, scale);
- case Intrinsic::x86_avx512_scatter_qpd_512:
- case Intrinsic::x86_avx512_scatter_qps_512:
- case Intrinsic::x86_avx512_scatter_dpd_512:
- case Intrinsic::x86_avx512_scatter_qpi_512:
- case Intrinsic::x86_avx512_scatter_qpq_512:
- case Intrinsic::x86_avx512_scatter_dpq_512:
- case Intrinsic::x86_avx512_scatter_dps_512:
- case Intrinsic::x86_avx512_scatter_dpi_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_scatter_qpd_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_512:
- Opc = X86::VPSCATTERDDZmr; break;
- }
- SDValue Chain = Op.getOperand(0);
- SDValue Base = Op.getOperand(2);
- SDValue Index = Op.getOperand(3);
- SDValue Src = Op.getOperand(4);
- SDValue Scale = Op.getOperand(5);
- return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
- }
- //int_scatter_mask(base, mask, index, v1, scale);
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_mask_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- Opc = X86::VPSCATTERDDZmr; break;
- }
+ case SCATTER: {
+ //scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ }
+ case PREFETCH: {
+ SDValue Hint = Op.getOperand(6);
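+    // The hint operand selects between the T0 (Opc0) and T1 (Opc1) prefetch forms.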
+    ConstantSDNode *HintC = dyn_cast<ConstantSDNode>(Hint);
+    if (!HintC || HintC->getZExtValue() > 1)
+      llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+    unsigned HintVal = HintC->getZExtValue();
+ unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Base = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+ }
+ // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+ case RDTSC: {
+ SmallVector<SDValue, 2> Results;
+ getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
- case Intrinsic::x86_xtest: {
+ case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -12306,6 +12855,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
Ret, SDValue(InTrans.getNode(), 1));
}
}
+ llvm_unreachable("Unknown Intrinsic Type");
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -12358,6 +12908,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
return FrameAddr;
}
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName,
+ EVT VT) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("esp", X86::ESP)
+ .Case("rsp", X86::RSP)
+ .Default(0);
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo =
@@ -12477,7 +13040,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -12557,7 +13120,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
@@ -12600,8 +13163,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
- Ops, array_lengthof(Ops), MVT::i16,
- MMO);
+ Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
@@ -12654,7 +13216,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
// Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
@@ -12706,7 +13268,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(X86::COND_E, MVT::i8),
Op.getValue(1)
};
- return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
@@ -12824,59 +13386,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
-static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- MVT EltTy = VT.getVectorElementType();
- unsigned NumElts = VT.getVectorNumElements();
- SDValue N0 = Op.getOperand(0);
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWin64() && "Unexpected target");
+ EVT VT = Op.getValueType();
+ assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+ "Unexpected return type for lowering");
+
+ RTLIB::Libcall LC;
+ bool isSigned;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
+ case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
+ case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
+ case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
+ case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
+ case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+ }
+
SDLoc dl(Op);
+ SDValue InChain = DAG.getEntryNode();
- // Lower sdiv X, pow2-const.
- BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
- if (!C)
- return SDValue();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
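+  // The Win64 ABI passes 128-bit integer arguments indirectly: each operand
+  // is spilled to a 16-byte aligned stack slot and its address is passed.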
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Op->getOperand(i).getValueType();
+ assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+ "Unexpected argument type for lowering");
+ SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ Entry.Node = StackPtr;
+ InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
+ false, false, 16);
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Ty = PointerType::get(ArgTy,0);
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy());
- APInt SplatValue, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
- HasAnyUndefs) ||
- EltTy.getSizeInBits() < SplatBitSize)
- return SDValue();
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain)
+ .setCallee(getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+ Callee, &Args, 0)
+ .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
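+  // The 128-bit result comes back in a vector register (hence the v2i64
+  // return type above) and is bitcast back to the original integer type.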
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+ return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+}
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ EVT VT = Op0.getValueType();
+ SDLoc dl(Op);
- if ((SplatValue != 0) &&
- (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
- unsigned Lg2 = SplatValue.countTrailingZeros();
- // Splat the sign bit.
- SmallVector<SDValue, 16> Sz(NumElts,
- DAG.getConstant(EltTy.getSizeInBits() - 1,
- EltTy));
- SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
- NumElts));
- // Add (N0 < 0) ? abs2 - 1 : 0;
- SmallVector<SDValue, 16> Amt(NumElts,
- DAG.getConstant(EltTy.getSizeInBits() - Lg2,
- EltTy));
- SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
- NumElts));
- SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
- SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
- SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
- NumElts));
-
- // If we're dividing by a positive value, we're done. Otherwise, we must
- // negate the result.
- if (SplatValue.isNonNegative())
- return SRA;
-
- SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
- SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
- return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+ assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget->hasInt256()));
+
+ // Get the high parts.
+ const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
+ SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+ SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
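+  // The shuffles move each odd element into an even lane, so the PMULUDQ/
+  // PMULDQ below (which multiply only the even 32-bit lanes) also produce
+  // the products of the odd elements.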
+
+ // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+ // ints.
+ MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+ bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+
+ // Shuffle it back into the right order.
+ const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
+ SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
+ SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+
+  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
+  // unsigned multiply.
+ if (IsSigned && !Subtarget->hasSSE41()) {
+ SDValue ShAmt =
+ DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
}
- return SDValue();
+
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -12920,7 +13527,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -12933,7 +13540,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
@@ -12946,7 +13553,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
@@ -12966,7 +13573,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U << ShiftAmt),
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
@@ -12979,7 +13586,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
@@ -12992,7 +13599,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
@@ -13014,7 +13621,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = 0;
for (unsigned i = 0; i != Ratio; ++i) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
- if (C == 0)
+ if (!C)
return SDValue();
// 6 == Log2(64)
ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
@@ -13025,7 +13632,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
for (unsigned j = 0; j != Ratio; ++j) {
ConstantSDNode *C =
dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
- if (C == 0)
+ if (!C)
return SDValue();
// 6 == Log2(64)
ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
@@ -13107,7 +13714,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
BaseShAmt = InVec.getOperand(1);
}
}
- if (BaseShAmt.getNode() == 0)
+ if (!BaseShAmt.getNode())
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
DAG.getIntPtrConstant(0));
}
@@ -13260,7 +13867,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
}
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
return DAG.getNode(ISD::MUL, dl, VT, R, BV);
}
@@ -13274,6 +13881,79 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
+ // If possible, lower this shift as a sequence of two shifts by
+ // constant plus a MOVSS/MOVSD instead of scalarizing it.
+ // Example:
+ // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+ //
+ // Could be rewritten as:
+ // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+ //
+ // The advantage is that the two shifts from the example would be
+ // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+ // the vector shift into four scalar shifts plus four pairs of vector
+ // insert/extract.
+ if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ unsigned TargetOpcode = X86ISD::MOVSS;
+ bool CanBeSimplified;
+ // The splat value for the first packed shift (the 'X' from the example).
+ SDValue Amt1 = Amt->getOperand(0);
+ // The splat value for the second packed shift (the 'Y' from the example).
+ SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
+ Amt->getOperand(2);
+
+ // See if it is possible to replace this node with a sequence of
+ // two shifts followed by a MOVSS/MOVSD
+ if (VT == MVT::v4i32) {
+ // Check if it is legal to use a MOVSS.
+ CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+ Amt2 == Amt->getOperand(3);
+ if (!CanBeSimplified) {
+ // Otherwise, check if we can still simplify this node using a MOVSD.
+ CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+ Amt->getOperand(2) == Amt->getOperand(3);
+ TargetOpcode = X86ISD::MOVSD;
+ Amt2 = Amt->getOperand(2);
+ }
+ } else {
+ // Do similar checks for the case where the machine value type
+ // is MVT::v8i16.
+ CanBeSimplified = Amt1 == Amt->getOperand(1);
+ for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+ if (!CanBeSimplified) {
+ TargetOpcode = X86ISD::MOVSD;
+ CanBeSimplified = true;
+ Amt2 = Amt->getOperand(4);
+ for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+ CanBeSimplified = Amt1 == Amt->getOperand(i);
+ for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+ CanBeSimplified = Amt2 == Amt->getOperand(j);
+ }
+ }
+
+ if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+ isa<ConstantSDNode>(Amt2)) {
+ // Replace this node with two shifts followed by a MOVSS/MOVSD.
+ EVT CastVT = MVT::v4i32;
+ SDValue Splat1 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
+ SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+ SDValue Splat2 =
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
+ SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
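+      // MOVSS blends 32-bit lanes (v4i32); MOVSD blends 64-bit lanes, so it
+      // needs a v2i64 bitcast first.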
+ if (TargetOpcode == X86ISD::MOVSD)
+ CastVT = MVT::v2i64;
+ SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
+ SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+ SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
+ BitCast1, DAG);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
+ }
+
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
@@ -13351,10 +14031,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
for (unsigned i = NumElems/2; i != NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt1Csts[0], NumElems/2);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt2Csts[0], NumElems/2);
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
} else {
// Variable shift amount
Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
@@ -13585,35 +14263,47 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
- Ops, array_lengthof(Ops), T, MMO);
+ Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
return cpOut;
}
-static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- assert(Subtarget->is64Bit() && "Result not type legalized?");
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue TheChain = Op.getOperand(0);
- SDLoc dl(Op);
- SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
- SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
- SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
- rax.getValue(2));
- SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
- DAG.getConstant(32, MVT::i8));
- SDValue Ops[] = {
- DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
- rdx.getValue(1)
- };
- return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
-}
-
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
+
+ if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ if (DstVT != MVT::f64)
+ // This conversion needs to be expanded.
+ return SDValue();
+
+ SDValue InVec = Op->getOperand(0);
+ SDLoc dl(Op);
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ EVT SVT = SrcVT.getVectorElementType();
+
+    // Widen the input vector in the case of MVT::v2i32.
+ // Example: from MVT::v2i32 to MVT::v4i32.
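+    // The widened vector is then bitcast to v2f64 and its low element, which
+    // holds all of the original bits, is extracted as the f64 result.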
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
+ DAG.getIntPtrConstant(i)));
+
+ // Explicitly mark the extra elements as Undef.
+ SDValue Undef = DAG.getUNDEF(SVT);
+ for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
+ Elts.push_back(Undef);
+
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+ SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+ DAG.getIntPtrConstant(0));
+ }
+
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
@@ -13641,8 +14331,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
cast<AtomicSDNode>(Node)->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), negOp,
- cast<AtomicSDNode>(Node)->getSrcValue(),
- cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
cast<AtomicSDNode>(Node)->getOrdering(),
cast<AtomicSDNode>(Node)->getSynchScope());
}
@@ -13730,12 +14419,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
Type *RetTy = isF64
? (Type*)StructType::get(ArgTy, ArgTy, NULL)
: (Type*)VectorType::get(ArgTy, 4);
- TargetLowering::
- CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
- false, false, false, false, 0,
- CallingConv::C, /*isTaillCall=*/false,
- /*doesNotRet=*/false, /*isReturnValueUsed*/true,
- Callee, Args, DAG, dl);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
@@ -13764,6 +14452,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
@@ -13815,6 +14504,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
@@ -13832,7 +14523,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
- case ISD::SDIV: return LowerSDIV(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
}
}
@@ -13875,10 +14565,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SDValue Ops[] = { Chain, In1, In2L, In2H };
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
cast<MemSDNode>(Node)->getMemOperand());
SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
Results.push_back(Result.getValue(2));
}
@@ -13899,6 +14589,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SUBE:
// We don't want to expand or promote these.
return;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ Results.push_back(V);
+ return;
+ }
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -13909,10 +14609,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
- if (FIST.getNode() != 0) {
+ if (FIST.getNode()) {
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
- if (StackSlot.getNode() != 0)
+ if (StackSlot.getNode())
Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
MachinePointerInfo(),
false, false, false, 0));
@@ -13945,20 +14645,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(V);
return;
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default : llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
+ case Intrinsic::x86_rdtsc:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
+ case Intrinsic::x86_rdtscp:
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+ Results);
+ }
+ }
case ISD::READCYCLECOUNTER: {
- SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue TheChain = N->getOperand(0);
- SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
- SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
- rd.getValue(1));
- SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
- eax.getValue(2));
- // Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { eax, edx };
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
- array_lengthof(Ops)));
- Results.push_back(edx.getValue(1));
- return;
+ return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+ Results);
}
case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0);
@@ -13994,8 +14696,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
X86ISD::LCMPXCHG8_DAG;
- SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
- Ops, array_lengthof(Ops), T, MMO);
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -14003,7 +14704,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(cpOutH.getValue(1));
return;
}
@@ -14058,14 +14759,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
return;
}
- case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_LOAD: {
ReplaceATOMIC_LOAD(N, Results, DAG);
+ return;
+ }
+ case ISD::BITCAST: {
+ assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+ if (SrcVT != MVT::f64 ||
+ (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+ return;
+
+ unsigned NumElts = DstVT.getVectorNumElements();
+ EVT SVT = DstVT.getVectorElementType();
+ EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, N->getOperand(0));
+ SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+
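+    // The f64 source occupies only the low half of the widened integer
+    // vector, so just the first NumElts extracted elements carry its bits.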
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+ ToVecInt, DAG.getIntPtrConstant(i)));
+
+ Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+ }
}
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
- default: return NULL;
+ default: return nullptr;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
@@ -14176,7 +14902,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BZHI: return "X86ISD::BZHI";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
@@ -14203,6 +14928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
+ case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
@@ -14210,6 +14936,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
+ case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -14240,7 +14967,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
Reloc::Model R = getTargetMachine().getRelocationModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
- if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
@@ -14418,7 +15145,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
if (VT.getSizeInBits() == 64)
return false;
- // FIXME: pshufb, blends, shifts.
+  // If this is a single-input shuffle with no 128-bit lane crossings, we can
+ // lower it into pshufb.
+ if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+ (SVT.is256BitVector() && Subtarget->hasInt256())) {
+ bool isLegal = true;
+ for (unsigned I = 0, E = M.size(); I != E; ++I) {
+ if (M[I] >= (int)SVT.getVectorNumElements() ||
+ ShuffleCrosses128bitLane(SVT, I, M[I])) {
+ isLegal = false;
+ break;
+ }
+ }
+ if (isLegal)
+ return true;
+ }
+
+ // FIXME: blends, shifts.
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
@@ -15366,7 +16109,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
- offsetMBB = NULL;
+ offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
@@ -15736,7 +16479,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- assert(getTargetMachine().Options.EnableSegmentedStacks);
+ assert(MF->shouldSplitStack());
unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
@@ -16509,11 +17252,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
-void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
unsigned BitWidth = KnownZero.getBitWidth();
unsigned Opc = Op.getOpcode();
assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -16576,8 +17319,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
}
}
-unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
- unsigned Depth) const {
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+ SDValue Op,
+ const SelectionDAG &,
+ unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
return Op.getValueType().getScalarType().getSizeInBits();
@@ -16679,7 +17424,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- array_lengthof(Ops),
Ld->getMemoryVT(),
Ld->getPointerInfo(),
Ld->getAlignment(),
@@ -17036,6 +17780,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
return std::make_pair(Opc, NeedSplit);
}
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+ SDValue CondSrc = Cond->getOperand(0);
+ if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+ Cond = CondSrc->getOperand(0);
+ }
+
+ MVT VT = N->getSimpleValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+ return SDValue();
+ if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+ return SDValue();
+
+ if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ return SDValue();
+
+ unsigned MaskValue = 0;
+ if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+ return SDValue();
+
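+  // Turn the immediate blend mask into a shuffle mask: lane i is taken from
+  // LHS (index i) when bit i of MaskValue is clear, and from RHS (index
+  // i + NumElems) when it is set.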
+ SmallVector<int, 8> ShuffleMask(NumElems, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ // Be sure we emit undef where we can.
+ if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+ ShuffleMask[i] = -1;
+ else
+ ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+ }
+
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -17378,7 +18167,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
- // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+ // FIXME: Would it be better to use computeKnownBits to determine whether
// it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
@@ -17544,7 +18333,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
// to simplify previous instructions.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ !DCI.isBeforeLegalize() &&
+ // We explicitly check against v8i16 and v16i16 because, although
+ // they're marked as Custom, they might only be legal when Cond is a
+      // build_vector of constants. This will be taken care of in a later
+      // condition.
+ (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+ VT != MVT::v8i16)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
// Don't optimize vector selects that map to mask-registers.
@@ -17571,6 +18366,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
DCI.CommitTargetLoweringOpt(TLO);
}
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
return SDValue();
}
@@ -17605,7 +18417,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
- const ConstantSDNode* C = 0;
+ const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
@@ -17740,8 +18552,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
(FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
- Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
// If this is a select between two integer constants, try to do some
@@ -17856,7 +18667,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
- ConstantSDNode *CmpAgainst = 0;
+ ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
@@ -17871,8 +18682,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, MVT::i8), Cond };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
- array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
}
}
}
@@ -17880,6 +18690,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue();
+ // SSE/AVX/AVX2 blend intrinsics.
+ case Intrinsic::x86_avx2_pblendvb:
+ case Intrinsic::x86_avx2_pblendw:
+ case Intrinsic::x86_avx2_pblendd_128:
+ case Intrinsic::x86_avx2_pblendd_256:
+ // Don't try to simplify this intrinsic if we don't have AVX2.
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+ // FALL-THROUGH
+ case Intrinsic::x86_avx_blend_pd_256:
+ case Intrinsic::x86_avx_blend_ps_256:
+ case Intrinsic::x86_avx_blendv_pd_256:
+ case Intrinsic::x86_avx_blendv_ps_256:
+ // Don't try to simplify this intrinsic if we don't have AVX.
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ // FALL-THROUGH
+ case Intrinsic::x86_sse41_pblendw:
+ case Intrinsic::x86_sse41_blendpd:
+ case Intrinsic::x86_sse41_blendps:
+ case Intrinsic::x86_sse41_blendvps:
+ case Intrinsic::x86_sse41_blendvpd:
+ case Intrinsic::x86_sse41_pblendvb: {
+ SDValue Op0 = N->getOperand(1);
+ SDValue Op1 = N->getOperand(2);
+ SDValue Mask = N->getOperand(3);
+
+ // Don't try to simplify this intrinsic if we don't have SSE4.1.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ // fold (blend A, A, Mask) -> A
+ if (Op0 == Op1)
+ return Op0;
+ // fold (blend A, B, allZeros) -> A
+ if (ISD::isBuildVectorAllZeros(Mask.getNode()))
+ return Op0;
+ // fold (blend A, B, allOnes) -> B
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return Op1;
+
+ // Simplify the case where the mask is a constant i32 value.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
+ if (C->isNullValue())
+ return Op0;
+ if (C->isAllOnesValue())
+ return Op1;
+ }
+ }
+
+ // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d: {
+ SDValue Op0 = N->getOperand(1);
+ SDValue Op1 = N->getOperand(2);
+ EVT VT = Op0.getValueType();
+ assert(VT.isVector() && "Expected a vector type!");
+
+ if (isa<BuildVectorSDNode>(Op1))
+ Op1 = Op1.getOperand(0);
+
+ if (!isa<ConstantSDNode>(Op1))
+ return SDValue();
+
+ EVT SVT = VT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+
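+    // Rebuild the shift amount as an element-sized constant; getConstant
+    // with a vector type splats it across all the lanes of VT below.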
+ ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+ const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+
+ // Don't try to convert this shift into a ISD::SRA if the shift
+ // count is bigger than or equal to the element size.
+ if (ShAmt >= SVTBits)
+ return SDValue();
+
+ // Trivial case: if the shift count is zero, then fold this
+ // into the first operand.
+ if (ShAmt == 0)
+ return Op0;
+
+ // Replace this packed shift intrinsic with a target independent
+ // shift dag node.
+ SDValue Splat = DAG.getConstant(C, VT);
+ return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+ }
+ }
+}
+
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
@@ -18223,7 +19133,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
N1->getOperand(0));
SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
- N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
@@ -18260,40 +19170,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
- // Create BEXTR and BZHI instructions
- // BZHI is X & ((1 << Y) - 1)
+ // Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
if (VT == MVT::i32 || VT == MVT::i64) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (Subtarget->hasBMI2()) {
- // Check for (and (add (shl 1, Y), -1), X)
- if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::SHL) {
- SDValue N001 = N00.getOperand(1);
- assert(N001.getValueType() == MVT::i8 && "unexpected type");
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
- if (C && C->getZExtValue() == 1)
- return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
- }
- }
-
- // Check for (and X, (add (shl 1, Y), -1))
- if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
- SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == ISD::SHL) {
- SDValue N101 = N10.getOperand(1);
- assert(N101.getValueType() == MVT::i8 && "unexpected type");
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
- if (C && C->getZExtValue() == 1)
- return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
- }
- }
- }
-
// Check for BEXTR.
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
(N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -18533,8 +19416,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Ops[] = { N0.getOperand(0), Neg,
DAG.getConstant(X86::COND_GE, MVT::i8),
SDValue(Neg.getNode(), 1) };
- return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
- Ops, array_lengthof(Ops));
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
}
return SDValue();
}
@@ -18691,8 +19573,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
}
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
- Chains.size());
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
@@ -18867,8 +19748,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
Chains.push_back(Ch);
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
- Chains.size());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
@@ -18891,7 +19771,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
SDNode* LdVal = St->getValue().getNode();
- LoadSDNode *Ld = 0;
+ LoadSDNode *Ld = nullptr;
int TokenFactorIndex = -1;
SmallVector<SDValue, 8> Ops;
SDNode* ChainVal = St->getChain().getNode();
@@ -18934,8 +19814,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue NewChain = NewLd.getValue(1);
if (TokenFactorIndex != -1) {
Ops.push_back(NewChain);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
- Ops.size());
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
St->getPointerInfo(),
@@ -18962,8 +19841,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
if (TokenFactorIndex != -1) {
Ops.push_back(LoLd);
Ops.push_back(HiLd);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
- Ops.size());
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
LoAddr = St->getBasePtr();
@@ -19432,6 +20310,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ MVT VT = N->getOperand(1)->getSimpleValueType(0);
+ assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+ "X86insertps is only defined for v4x32");
+
+ SDValue Ld = N->getOperand(1);
+ if (MayFoldLoad(Ld)) {
+ // Extract the countS bits from the immediate so we can get the proper
+ // address when narrowing the vector load to a specific element.
+    // When the second source op is a memory address, insertps doesn't use
+ // countS and just gets an f32 from that address.
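+    // (In the insertps immediate, bits [7:6] are countS, bits [5:4] are
+    // countD, and bits [3:0] are the zero mask, hence the shift right by 6.)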
+ unsigned DestIndex =
+ cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+ Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+ } else
+ return SDValue();
+
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+}
+
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
@@ -19711,7 +20616,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
- case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+ case ISD::SIGN_EXTEND_INREG:
+ return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
@@ -19732,6 +20638,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS:
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
}
return SDValue();
@@ -20006,7 +20916,7 @@ TargetLowering::ConstraintWeight
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
- if (CallOperandVal == NULL)
+ if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
@@ -20124,7 +21034,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
- SDValue Result(0, 0);
+ SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
@@ -20207,7 +21117,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
- GlobalAddressSDNode *GA = 0;
+ GlobalAddressSDNode *GA = nullptr;
int64_t Offset = 0;
// Match either (GA), (GA+C), (GA+C1+C2), etc.
@@ -20363,7 +21273,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
// Not found as a standard register?
- if (Res.second == 0) {
+ if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' &&
@@ -20488,3 +21398,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+ Type *Ty) const {
+ // Scaling factors are not free at all.
+ // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+ // will take 2 allocations in the out of order engine instead of 1
+ // for plain addressing mode, i.e. inst (reg1).
+ // E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+ // Requires two allocations (one for the load, one for the computation)
+ // whereas:
+ // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., it frees up an allocation for other
+  // operations and executes fewer micro-operations.
+ //
+ // For some X86 architectures, this is even worse because for instance for
+ // stores, the complex addressing mode forces the instruction to use the
+ // "load" ports instead of the dedicated "store" port.
+ // E.g., on Haswell:
+ // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
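+  //
+  // Return 0 when the addressing mode is free, a strictly positive cost when
+  // a scaled index register is used, and a negative value when the mode is
+  // not legal for this type at all.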
+ if (isLegalAddressingMode(AM, Ty))
+ // Scale represents reg2 * scale, thus account for 1
+ // as soon as we use a second register.
+ return AM.Scale != 0;
+ return -1;
+}