Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 2193
1 files changed, 1565 insertions, 628 deletions
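One of the central changes in this patch is switching several VSELECT types from Legal to Custom so that a constant condition vector can be folded into a single BLENDI immediate (see the new BUILD_VECTORtoBlendMask and LowerVSELECTtoBlend helpers in the diff below). As a rough illustration of that folding — a standalone sketch, not code from the commit, with a made-up function name, simplified types, and no undef handling — the immediate can be derived like this:

#include <optional>
#include <vector>

// Illustrative only: fold a constant VSELECT condition into a blend immediate.
// A non-zero condition element selects the first operand; BLENDI uses bit i == 1
// to take the second source, so each condition bit is inverted. For two-lane
// (16-element) vectors the mask must repeat identically in both 128-bit lanes,
// mirroring the v16i16 restriction in the patch.
std::optional<unsigned> blendImmFromConstCond(const std::vector<int> &Cond) {
  unsigned NumElems = Cond.size();
  unsigned NumLanes = (NumElems - 1) / 8 + 1;   // one lane up to 8 elements, else two
  unsigned NumElemsInLane = NumElems / NumLanes;
  unsigned Imm = 0;
  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    bool Lane1 = Cond[i] != 0;
    bool Lane2 = (NumLanes == 2) ? Cond[i + NumElemsInLane] != 0 : Lane1;
    if (Lane1 != Lane2)
      return std::nullopt;                      // not expressible as one immediate
    Imm |= static_cast<unsigned>(!Lane1) << i;  // inverted: a set bit takes the second source
  }
  return Imm;
}

For example, a v4i32 condition of {-1, 0, -1, 0} yields an immediate of 0b1010, matching the encoding the lowering below hands to X86ISD::BLENDI.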
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2a35061..cbaf44e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-isel" #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" @@ -23,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -52,6 +52,8 @@ #include <cctype> using namespace llvm; +#define DEBUG_TYPE "x86-isel" + STATISTIC(NumTailCalls, "Number of tail calls"); // Forward declarations. @@ -84,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk); + makeArrayRef(Vec->op_begin()+NormalizedIdxVal, + ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, @@ -265,10 +268,10 @@ void X86TargetLowering::resetOperationActions() { // The _ftol2 runtime function has an unusual calling conv, which // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); - setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); - setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); - setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); + setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); + setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); + setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); + setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -635,15 +638,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); - else if (TM.Options.EnableSegmentedStacks) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. 
@@ -832,7 +828,9 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); @@ -944,6 +942,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v8i16, Legal); + setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); @@ -1036,6 +1038,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); + + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { @@ -1064,11 +1070,14 @@ void X86TargetLowering::resetOperationActions() { // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); + // There is no BLENDI for byte vectors. We don't need to custom lower + // some vselects for now. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are @@ -1111,9 +1120,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); - - setOperationAction(ISD::SDIV, MVT::v8i16, Custom); - setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { @@ -1178,8 +1184,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); - setOperationAction(ISD::SDIV, MVT::v16i16, Custom); - setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); @@ -1189,10 +1193,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); @@ -1232,9 +1236,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MUL, MVT::v16i16, Legal); // Don't lower v32i8 because there is no 128-bit byte mul - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, Legal); + setOperationAction(ISD::MULHS, MVT::v16i16, Legal); - setOperationAction(ISD::SDIV, MVT::v8i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1343,7 +1351,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::SDIV, MVT::v16i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); @@ -1358,9 +1365,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -1392,6 +1401,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, 
MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); @@ -1474,6 +1485,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + if (!Subtarget->is64Bit()) + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1498,9 +1511,9 @@ void X86TargetLowering::resetOperationActions() { if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, 0); - setLibcallName(RTLIB::SRL_I128, 0); - setLibcallName(RTLIB::SRA_I128, 0); + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. @@ -1516,6 +1529,15 @@ void X86TargetLowering::resetOperationActions() { } } + if (Subtarget->isTargetWin64()) { + setOperationAction(ISD::SDIV, MVT::i128, Custom); + setOperationAction(ISD::UDIV, MVT::i128, Custom); + setOperationAction(ISD::SREM, MVT::i128, Custom); + setOperationAction(ISD::UREM, MVT::i128, Custom); + setOperationAction(ISD::SDIVREM, MVT::i128, Custom); + setOperationAction(ISD::UDIVREM, MVT::i128, Custom); + } + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -1540,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1738,7 +1761,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, // FIXME: Why this routine is here? Move to RegInfo! std::pair<const TargetRegisterClass*, uint8_t> X86TargetLowering::findRepresentativeClass(MVT VT) const{ - const TargetRegisterClass *RRC = 0; + const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: @@ -1806,8 +1829,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, return CCInfo.CheckReturn(Outs, RetCC_X86); } -const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { - static const uint16_t ScratchRegs[] = { X86::R11, 0 }; +const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { + static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } @@ -1930,8 +1953,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(X86ISD::RET_FLAG, dl, - MVT::Other, &RetOps[0], RetOps.size()); + return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2285,22 +2307,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - // The x86-64 ABIs require that for returning structs by value we copy - // the sret argument into %rax/%eax (depending on ABI) for the return. - // Win32 requires us to put the sret argument to %eax as well. 
- // Save the argument into a virtual register so that we can access it - // from the return points. - if (MF.getFunction()->hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { - X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - unsigned Reg = FuncInfo->getSRetReturnReg(); - if (!Reg) { - MVT PtrTy = getPointerTy(); - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); - FuncInfo->setSRetReturnReg(Reg); + if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. + // Save the argument into a virtual register so that we can access it + // from the return points. + if (Ins[i].Flags.isSRet()) { + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; + } } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); } unsigned StackSize = CCInfo.getNextStackOffset(); @@ -2320,17 +2345,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; // FIXME: We should really autogenerate these arrays - static const uint16_t GPR64ArgRegsWin64[] = { + static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - static const uint16_t GPR64ArgRegs64Bit[] = { + static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; - static const uint16_t XMMArgRegs64Bit[] = { + static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - const uint16_t *GPR64ArgRegs; + const MCPhysReg *GPR64ArgRegs; unsigned NumXMMRegs = 0; if (IsWin64) { @@ -2424,13 +2449,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SaveXMMOps.push_back(Val); } MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, - &SaveXMMOps[0], SaveXMMOps.size())); + MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } } @@ -2497,10 +2520,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). -static SDValue -EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, - unsigned SlotSize, int FPDiff, SDLoc dl) { +static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, + EVT PtrVT, unsigned SlotSize, + int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. 
@@ -2537,7 +2560,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; - if (isTailCall) { + bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); + if (IsMustTail) { + // Force this to be a tail call. The verifier rules are enough to ensure + // that we can lower this successfully without moving the return address + // around. + isTailCall = true; + } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, @@ -2578,7 +2607,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; - if (isTailCall && !IsSibcall) { + if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); @@ -2683,7 +2712,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -2692,8 +2721,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT @@ -2730,7 +2758,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. - static const uint16_t XMMArgRegs[] = { + static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -2742,8 +2770,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } - // For tail calls lower the arguments to the 'real' stack slot. - if (isTailCall) { + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls + // don't need this because the eligibility check rejects calls that require + // shuffling arguments passed in memory. + if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and @@ -2755,45 +2785,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - if (VA.isRegLoc()) - continue; - assert(VA.isMemLoc()); - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; - // Create frame index. 
- int32_t Offset = VA.getLocMemOffset()+FPDiff; - uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - FIN = DAG.getFrameIndex(FI, getPointerTy()); - - if (Flags.isByVal()) { - // Copy relative to framepointer. - SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, - RegInfo->getStackRegister(), - getPointerTy()); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); - - MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, - ArgChain, - Flags, DAG, dl)); - } else { - // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); - } + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) + continue; + assert(VA.isMemLoc()); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Skip inalloca arguments. They don't require any work. + if (Flags.isInAlloca()) + continue; + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FIN = DAG.getFrameIndex(FI, getPointerTy()); + + if (Flags.isByVal()) { + // Copy relative to framepointer. + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, + RegInfo->getStackRegister(), + getPointerTy()); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); + + MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, + ArgChain, + Flags, DAG, dl)); + } else { + // Store relative to framepointer. + MemOpChains2.push_back( + DAG.getStore(ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); } } if (!MemOpChains2.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains2[0], MemOpChains2.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, @@ -2930,10 +2960,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } - Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -3927,6 +3957,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { return true; } +/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to INSERTPS. +/// i. e: If all but one element come from the same vector. 
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { + // TODO: Deal with AVX's VINSERTPS + if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) + return false; + + unsigned CorrectPosV1 = 0; + unsigned CorrectPosV2 = 0; + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + if (Mask[i] == i) + ++CorrectPosV1; + else if (Mask[i] == i + 4) + ++CorrectPosV2; + + if (CorrectPosV1 == 3 || CorrectPosV2 == 3) + // We have 3 elements from one vector, and one from another. + return true; + + return false; +} + // // Some special combinations that can be optimized. // @@ -4146,6 +4199,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { return true; } +// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or +// (src1[0], src0[1]), manipulation with 256-bit sub-vectors +static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) { + if (!VT.is512BitVector()) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfSize = NumElts/2; + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { + *Imm = 1; + return true; + } + } + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { + *Imm = 0; + return true; + } + } + return false; +} + /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. @@ -4624,11 +4700,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } +/// isZero - Returns true if Elt is a constant integer zero +static bool isZero(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isNullValue(); +} + /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt)) - return CN->isNullValue(); + if (isZero(Elt)) + return true; if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) return CFP->getValueAPF().isPosZero(); return false; @@ -4677,7 +4759,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { /// isScalarLoadToVector - Returns true if the node is a scalar load that /// is promoted to a vector. It also returns the LoadSDNode by reference if /// required. -static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { +static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) return false; N = N->getOperand(0).getNode(); @@ -4803,28 +4885,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); SDValue Cst = DAG.getTargetConstant(0, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); @@ -4844,8 +4922,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); @@ -5307,7 +5384,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return SDValue(); SDLoc dl(Op); - SDValue V(0, 0); + SDValue V; bool First = true; for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; @@ -5320,7 +5397,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } if ((i & 1) != 0) { - SDValue ThisElt(0, 0), LastElt(0, 0); + SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, @@ -5355,7 +5432,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return SDValue(); SDLoc dl(Op); - SDValue V(0, 0); + SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; @@ -5376,6 +5453,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } +/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. 
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, + unsigned NonZeros, unsigned NumNonZero, + unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // We know there's at least one non-zero element + unsigned FirstNonZeroIdx = 0; + SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); + while (FirstNonZero.getOpcode() == ISD::UNDEF || + X86::isZeroNode(FirstNonZero)) { + ++FirstNonZeroIdx; + FirstNonZero = Op->getOperand(FirstNonZeroIdx); + } + + if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(FirstNonZero.getOperand(1))) + return SDValue(); + + SDValue V = FirstNonZero.getOperand(0); + MVT VVT = V.getSimpleValueType(); + if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) + return SDValue(); + + unsigned FirstNonZeroDst = + cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue(); + unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; + unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; + unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + + for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { + SDValue Elem = Op.getOperand(Idx); + if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) + continue; + + // TODO: What else can be here? Deal with it. + if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // TODO: Some optimizations are still possible here + // ex: Getting one element from a vector, and the rest from another. + if (Elem.getOperand(0) != V) + return SDValue(); + + unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue(); + if (Dst == Idx) + ++CorrectIdx; + else if (IncorrectIdx == -1U) { + IncorrectIdx = Idx; + IncorrectDst = Dst; + } else + // There was already one element with an incorrect index. + // We can't optimize this case to an insertps. + return SDValue(); + } + + if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { + SDLoc dl(Op); + EVT VT = Op.getSimpleValueType(); + unsigned ElementMoveMask = 0; + if (IncorrectIdx == -1U) + ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; + else + ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + + SDValue InsertpsMask = + DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); + } + + return SDValue(); +} + /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, @@ -5480,7 +5630,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); - LoadSDNode *LDBase = NULL; + LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. 
@@ -5545,8 +5695,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, - array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, @@ -5661,7 +5810,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, unsigned ScalarSize = CVT.getSizeInBits(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { - const Constant *C = 0; + const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) @@ -5706,6 +5855,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real +/// underlying vector and index. +/// +/// Modifies \p ExtractedFromVec to the real vector and returns the real +/// index. +static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, + SDValue ExtIdx) { + int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) + return Idx; + + // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already + // lowered this: + // (extract_vector_elt (v8f32 %vreg1), Constant<6>) + // to: + // (extract_vector_elt (vector_shuffle<2,u,u,u> + // (extract_subvector (v8f32 %vreg0), Constant<4>), + // undef) + // Constant<0>) + // In this case the vector is the extract_subvector expression and the index + // is 2, as specified by the shuffle. + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); + SDValue ShuffleVec = SVOp->getOperand(0); + MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); + assert(ShuffleVecVT.getVectorElementType() == + ExtractedFromVec.getSimpleValueType().getVectorElementType()); + + int ShuffleIdx = SVOp->getMaskElt(Idx); + if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { + ExtractedFromVec = ShuffleVec; + return ShuffleIdx; + } + return Idx; +} + static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -5739,34 +5923,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) + return SDValue(); + int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); - // Quit if non-constant index. 
- if (!isa<ConstantSDNode>(ExtIdx)) - return SDValue(); - - if (VecIn1.getNode() == 0) + if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { - if (VecIn2.getNode() == 0) + if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } - unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); - if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } - if (VecIn1.getNode() == 0) + if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); @@ -5791,24 +5973,22 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } bool AllContants = true; uint64_t Immediate = 0; int NonConstIdx = -1; bool IsSplat = true; + unsigned NumNonConsts = 0; + unsigned NumConsts = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) @@ -5816,9 +5996,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa<ConstantSDNode>(In)) { AllContants = false; NonConstIdx = idx; + NumNonConsts++; } - else if (cast<ConstantSDNode>(In)->getZExtValue()) + else { + NumConsts++; + if (cast<ConstantSDNode>(In)->getZExtValue()) Immediate |= (1ULL << idx); + } if (In != Op.getOperand(0)) IsSplat = false; } @@ -5830,6 +6014,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0)); } + if (NumNonConsts == 1 && NonConstIdx != 0) { + SDValue DstVec; + if (NumConsts) { + SDValue VecAsImm = DAG.getConstant(Immediate, + MVT::getIntegerVT(VT.getSizeInBits())); + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); + } + else + DstVec = DAG.getUNDEF(VT); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(NonConstIdx), + DAG.getIntPtrConstant(NonConstIdx)); + } if (!IsSplat && (NonConstIdx != 0)) llvm_unreachable("Unsupported BUILD_VECTOR operation"); MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; @@ -6043,9 +6240,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. 
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); - SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], - NumElems/2); + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[0], NumElems/2)); + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[NumElems / 2], NumElems/2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) @@ -6078,6 +6276,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (V.getNode()) return V; } + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) { + SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, + NumZero, DAG, Subtarget, *this); + if (V.getNode()) + return V; + } + // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { @@ -6332,8 +6538,7 @@ static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl, if (ShufVT != VT) V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, - PshufbMask.data(), PshufbMask.size())); + DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); } // v8i16 shuffles - Prefer shuffles in the following order: @@ -6516,7 +6721,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), @@ -6540,7 +6745,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), @@ -6635,7 +6840,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, &pshufbMask[0], 16)); + MVT::v16i8, pshufbMask)); // As PSHUFB will zero elements with negative indices, it's safe to ignore // the 2nd operand if it's undefined or zero. 
@@ -6653,7 +6858,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, &pshufbMask[0], 16)); + MVT::v16i8, pshufbMask)); return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); } @@ -6771,6 +6976,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned Scale; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); + case MVT::v2i64: + case MVT::v2f64: + return SDValue(SVOp, 0); case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; @@ -6805,7 +7013,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, SDLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { - LoadSDNode *LD = NULL; + LoadSDNode *LD = nullptr; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) LD = dyn_cast<LoadSDNode>(SrcOp); if (!LD) { @@ -6924,8 +7132,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Construct the output using a BUILD_VECTOR. - Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], - SVOps.size()); + Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. Output[l] = DAG.getUNDEF(NVT); @@ -7207,6 +7414,93 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, + SelectionDAG &DAG) { + SDLoc dl(Load); + MVT VT = Load->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue Addr = Load->getOperand(1); + SDValue NewAddr = DAG.getNode( + ISD::ADD, dl, Addr.getSimpleValueType(), Addr, + DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); + + SDValue NewLoad = + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Load->getMemOperand(), 0, EVT.getStoreSize())); + return NewLoad; +} + +// It is only safe to call this function if isINSERTPSMask is true for +// this shufflevector mask. +static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, + SelectionDAG &DAG) { + // Generate an insertps instruction when inserting an f32 from memory onto a + // v4f32 or when copying a member from one v4f32 to another. + // We also use it for transferring i32 from one register to another, + // since it simply copies the same bits. + // If we're transferring an i32 from memory to a specific element in a + // register, we output a generic DAG that will match the PINSRD + // instruction. 
+ MVT VT = SVOp->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + auto Mask = SVOp->getMask(); + assert((VT == MVT::v4f32 || VT == MVT::v4i32) && + "unsupported vector type for insertps/pinsrd"); + + int FromV1 = std::count_if(Mask.begin(), Mask.end(), + [](const int &i) { return i < 4; }); + + SDValue From; + SDValue To; + unsigned DestIndex; + if (FromV1 == 1) { + From = V1; + To = V2; + DestIndex = std::find_if(Mask.begin(), Mask.end(), + [](const int &i) { return i < 4; }) - + Mask.begin(); + } else { + From = V2; + To = V1; + DestIndex = std::find_if(Mask.begin(), Mask.end(), + [](const int &i) { return i >= 4; }) - + Mask.begin(); + } + + if (MayFoldLoad(From)) { + // Trivial case, when From comes from a load and is only used by the + // shuffle. Make it use insertps from the vector that we need from that + // load. + SDValue NewLoad = + NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG); + if (!NewLoad.getNode()) + return SDValue(); + + if (EVT == MVT::f32) { + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, + InsertpsMask); + } else { // EVT == MVT::i32 + // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT + // instruction, to match the PINSRD instruction, which loads an i32 to a + // certain vector element. + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, + DAG.getConstant(DestIndex, MVT::i32)); + } + } + + // Vector-element-to-vector + unsigned SrcIndex = Mask[DestIndex] % 4; + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); +} + // Reduce a vector shuffle to zext. static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -7295,9 +7589,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } -static SDValue -NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -7322,31 +7615,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8 || - VT == MVT::v16i16 || VT == MVT::v32i8) { + if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || + VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); - } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { + } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { // FIXME: Figure out a cleaner way to do this. - // Try to make use of movq to zero out the top part. 
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT, true, false)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(0), - DAG, Subtarget, dl); + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, + dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(1), - DAG, Subtarget, dl); + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, + dl); } } } @@ -7609,6 +7900,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShuffleSHUFImmediate(SVOp), DAG); } + unsigned Idx; + if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) + return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), + Idx*(NumElems/2), DAG, dl); + // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, @@ -7618,6 +7914,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; + if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) + return getINSERTPS(SVOp, dl, DAG); + unsigned Imm8; if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); @@ -7631,8 +7930,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, - &permclMask[0], NumElems); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); if (V2IsUndef) // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 return DAG.getNode(X86ISD::VPERMV, dl, VT, @@ -7684,6 +7982,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// This function assumes its argument is a BUILD_VECTOR of constants or +// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is +// true. +static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, + unsigned &MaskValue) { + MaskValue = 0; + unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + SDValue EltCond = BuildVector->getOperand(i); + SDValue SndLaneEltCond = + (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; + + int Lane1Cond = -1, Lane2Cond = -1; + if (isa<ConstantSDNode>(EltCond)) + Lane1Cond = !isZero(EltCond); + if (isa<ConstantSDNode>(SndLaneEltCond)) + Lane2Cond = !isZero(SndLaneEltCond); + + if (Lane1Cond == Lane2Cond || Lane2Cond < 0) + // Lane1Cond != 0, means we want the first argument. + // Lane1Cond == 0, means we want the second argument. + // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition. 
+ MaskValue |= !Lane1Cond << i; + else if (Lane1Cond < 0) + MaskValue |= !Lane2Cond << i; + else + return false; + } + return true; +} + +// Try to lower a vselect node into a simple blend instruction. +static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // Check the mask for BLEND and build the value. + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + // Convert i32 vectors to floating point if it is not AVX2. + // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. + MVT BlendVT = VT; + if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); + RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, + DAG.getConstant(MaskValue, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + +SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; + + // Some types for vselect were previously set to Expand, not Legal or + // Custom. Return an empty SDValue so we fall-through to Expand, after + // the Custom lowering phase. + MVT VT = Op.getSimpleValueType(); + switch (VT.SimpleTy) { + default: + break; + case MVT::v8i16: + case MVT::v16i16: + return SDValue(); + } + + // We couldn't create a "Blend with immediate" node. + // This node should still be legal, but we'll have to emit a blendv* + // instruction. + return Op; +} + static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -7946,10 +8347,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa<ConstantSDNode>(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, MVT::i8)); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + unsigned MaxSift = rc->getSize()*8 - 1; + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); +} SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); + + if (EltVT == MVT::i1) + return InsertBitToMaskVector(Op, DAG); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); @@ -8294,10 +8732,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -8325,7 +8763,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { - return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } @@ -8342,7 +8780,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SDValue Base; if (is64Bit) { - Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; @@ -8481,7 +8919,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Args[] = { Chain, Offset }; - Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -8507,10 +8945,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array - // If GV is an alias then use the aliasee for determining - // thread-localness. 
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->getAliasedGlobal(); SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); @@ -8609,15 +9043,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -8680,8 +9114,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, - Tys, Ops, array_lengthof(Ops), - SrcVT, MMO); + Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); @@ -8704,8 +9137,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, array_lengthof(Ops), - Op.getValueType(), MMO); + Ops, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, false, 0); @@ -8900,7 +9332,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - array_lengthof(Ops), MVT::i64, MMO); + MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); @@ -8993,8 +9425,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, - array_lengthof(Ops), DstTy, MMO); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -9008,8 +9439,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), DstTy, - MMO); + Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, @@ -9021,8 +9451,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) - : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); + ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) + : DAG.getMergeValues(Ops, DL); return std::make_pair(pair, SDValue()); } } @@ -9217,8 +9647,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, - &pshufbMask[0], 32); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); @@ -9284,7 +9713,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (FIST.getNode() == 0) return Op; + if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. @@ -9581,12 +10010,29 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, VecIns.back(), VecIns.back()); } +/// \brief return true if \c Op has a use that doesn't just read flags. +static bool hasNonFlagsUse(SDValue Op) { + for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; + ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look pass truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) + return true; + } + return false; +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. -SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - SDLoc dl(Op); - if (Op.getValueType() == MVT::i1) // KORTEST instruction should be selected return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, @@ -9687,31 +10133,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, Opcode = X86ISD::ADD; NumOperands = 2; break; - case ISD::AND: { - // If the primary and result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. - bool NonFlagUse = false; - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - unsigned UOpNo = UI.getOperandNo(); - if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { - // Look pass truncate. - UOpNo = User->use_begin().getOperandNo(); - User = *User->use_begin(); - } - - if (User->getOpcode() != ISD::BRCOND && - User->getOpcode() != ISD::SETCC && - !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { - NonFlagUse = true; + case ISD::SHL: + case ISD::SRL: + // If we have a constant logical shift that's only used in a comparison + // against zero turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op->getConstantOperandVal(1); + if (ShAmt >= BitWidth) // Avoid undefined shifts. break; - } + APInt Mask = ArithOp.getOpcode() == ISD::SRL + ? 
APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (!Mask.isSignedIntN(32)) // Avoid large immediates. + break; + SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, VT)); + DAG.ReplaceAllUsesWith(Op, New); + Op = New; } + break; - if (!NonFlagUse) + case ISD::AND: + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + if (!hasNonFlagsUse(Op)) break; - } // FALL THROUGH case ISD::SUB: case ISD::OR: @@ -9794,7 +10244,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, for (unsigned i = 0; i != NumOperands; ++i) Ops.push_back(Op.getOperand(i)); - SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } @@ -9802,11 +10252,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - SelectionDAG &DAG) const { - SDLoc dl(Op0); + SDLoc dl, SelectionDAG &DAG) const { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, DAG); + return EmitTest(Op0, X86CC, dl, DAG); if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); @@ -9888,7 +10337,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { APInt Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Zeros, Ones); + DAG.computeKnownBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -10054,7 +10503,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. -static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG) +static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); if (!BV) @@ -10078,8 +10527,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG) ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(), - ULTOp1.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, @@ -10204,7 +10652,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Only do this pre-AVX since vpcmp* is no longer destructive. 
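Stepping back to the ISD::SHL/ISD::SRL case added to EmitTest above: a minimal scalar sketch (not part of the patch) of the identities it relies on. A constant logical shift that is only compared against zero can be replaced by an AND with a high-bit or low-bit mask, which later selects as a single TEST with an immediate.

#include <cstdint>
#include <cassert>

// (x >> c) == 0  <=>  (x & (~0u << c)) == 0   // keep the high 32-c bits
// (x << c) == 0  <=>  (x & (~0u >> c)) == 0   // keep the low 32-c bits
static bool srlIsZero(uint32_t x, unsigned c) { return (x & (~0u << c)) == 0; }
static bool shlIsZero(uint32_t x, unsigned c) { return (x & (~0u >> c)) == 0; }

int main() {
  for (unsigned c = 0; c < 32; ++c)
    for (uint32_t x : {0u, 1u, 0x80000000u, 0xdeadbeefu}) {
      assert(srlIsZero(x, c) == ((x >> c) == 0));
      assert(shlIsZero(x, c) == ((x << c) == 0));
    }
  return 0;
}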
if (Subtarget->hasAVX()) break; - SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG); + SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); if (ULEOp1.getNode()) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; @@ -10383,7 +10831,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (X86CC == X86::COND_INVALID) return SDValue(); - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); @@ -10418,11 +10866,6 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isNullValue(); -} - static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; @@ -10517,7 +10960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNOT(DL, Res, Res.getValueType()); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - if (N2C == 0 || !N2C->isNullValue()) + if (!N2C || !N2C->isNullValue()) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -10606,7 +11049,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -10646,7 +11089,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; - return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { @@ -11027,7 +11470,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -11042,13 +11485,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - assert((Subtarget->isOSWindows() || - getTargetMachine().Options.EnableSegmentedStacks) && - "This should be used only on Windows targets or when segmented stacks " - "are being used"); - assert(!Subtarget->isTargetMacho() && "Not implemented"); + MachineFunction &MF = DAG.getMachineFunction(); + bool SplitStack = MF.shouldSplitStack(); + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + SplitStack; SDLoc dl(Op); + if (!Lower) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDNode* Node = Op.getNode(); + + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" + " not tell us which reg is the stack pointer!"); + EVT VT = Node->getValueType(0); + SDValue Tmp1 = SDValue(Node, 0); + SDValue Tmp2 = SDValue(Node, 1); + SDValue Tmp3 = Node->getOperand(2); + SDValue Chain = Tmp1.getOperand(0); + + // Chain the dynamic stack allocation so that it doesn't modify 
the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), + SDLoc(Node)); + + SDValue Size = Tmp2.getOperand(1); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); + const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + unsigned StackAlign = TFI.getStackAlignment(); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (Align > StackAlign) + Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Align, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + + Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), + DAG.getIntPtrConstant(0, true), SDValue(), + SDLoc(Node)); + + SDValue Ops[2] = { Tmp1, Tmp2 }; + return DAG.getMergeValues(Ops, dl); + } + // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); @@ -11058,8 +11538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (getTargetMachine().Options.EnableSegmentedStacks) { - MachineFunction &MF = DAG.getMachineFunction(); + if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -11081,7 +11560,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); @@ -11105,7 +11584,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } } @@ -11166,8 +11645,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOps[0], MemOps.size()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -11221,8 +11699,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { InstOps.push_back(DAG.getConstant(Align, MVT::i32)); SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, - VTs, &InstOps[0], InstOps.size(), - MVT::i64, + VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, /*Volatile=*/false, @@ -11262,6 +11739,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); + // Fold this packed shift into its first operand if ShiftAmt is 0. 
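For reference, a scalar model of the inline DYNAMIC_STACKALLOC expansion added above, i.e. the SUB from the stack pointer followed by the optional AND with -Align when the requested alignment exceeds the stack alignment. This is a sketch only, assuming a power-of-two Align and a downward-growing stack.

#include <cstdint>

static uintptr_t expandDynAlloca(uintptr_t SP, uintptr_t Size,
                                 uintptr_t Align, uintptr_t StackAlign) {
  uintptr_t P = SP - Size;          // Tmp1 = SUB(SP, Size)
  if (Align > StackAlign)
    P &= ~(Align - 1);              // AND with -Align for power-of-two Align
  return P;                         // new stack pointer == allocated address
}

The CALLSEQ_START/CALLSEQ_END bracket in the DAG code appears to exist only so that other stack-adjusting nodes cannot be scheduled between the read and the write of the stack pointer.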
+ if (ShiftAmt == 0) + return SrcOp; + // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) @@ -11282,7 +11763,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, ConstantSDNode *ND; switch(Opc) { - default: llvm_unreachable(0); + default: llvm_unreachable(nullptr); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); @@ -11321,7 +11802,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, break; } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); @@ -11353,7 +11834,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); // The return type has to be a 128-bit type with the same element // type as the input type. @@ -11476,6 +11957,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulhu_w: + case Intrinsic::x86_avx2_pmulhu_w: + return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulh_w: + case Intrinsic::x86_avx2_pmulh_w: + return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // SSE2/AVX2 sub with unsigned saturation intrinsics case Intrinsic::x86_sse2_psubus_b: case Intrinsic::x86_sse2_psubus_w: @@ -11927,7 +12423,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), SDValue(PCMP.getNode(), 1)); @@ -11944,7 +12440,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: @@ -12042,27 +12538,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain, - const X86Subtarget * Subtarget) { - SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); - SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); - EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getConstant(~0, 
MaskVT); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); -} - -static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { @@ -12072,7 +12547,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -12081,12 +12561,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); + return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); @@ -12095,52 +12575,218 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } -static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { +static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Mask, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = 
DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - return SDValue(Res, 1); + EVT MaskVT = + MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + //SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); + return SDValue(Res, 0); +} + +// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that +// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is +// also used to custom lower READCYCLECOUNTER nodes. +static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); + SDValue LO, HI; + + // The processor's time-stamp counter (a 64-bit MSR) is stored into the + // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR + // and the EAX register is loaded with the low-order 32 bits. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + SDValue Chain = HI.getValue(1); + + if (Opcode == X86ISD::RDTSCP_DAG) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, + HI.getValue(2)); + // Explicitly store the content of ECX at the location passed in input + // to the 'rdtscp' intrinsic. + Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), + MachinePointerInfo(), false, false, 0); + } + + if (Subtarget->is64Bit()) { + // The EDX register is loaded with the high-order 32 bits of the MSR, and + // the EAX register is loaded with the low-order 32 bits. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
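The 64-bit path above shifts the RDX copy left by 32 and ORs in RAX; the 32-bit path below merges EDX:EAX with a BUILD_PAIR. Both compute the usual combination of the two counter halves, sketched here with an illustrative helper name:

#include <cstdint>

static uint64_t combineTSC(uint32_t Lo /*EAX*/, uint32_t Hi /*EDX*/) {
  return ((uint64_t)Hi << 32) | Lo;
}

Keeping this in one helper is what lets READCYCLECOUNTER and the rdtsc/rdtscp intrinsics share the same lowering.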
+ SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SmallVector<SDValue, 2> Results; + SDLoc DL(Op); + getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, DL); +} + +enum IntrinsicType { + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST +}; + +struct IntrinsicData { + IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1) + :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {} + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; +}; + +std::map < unsigned, IntrinsicData> IntrMap; +static void InitIntinsicsMap() { + static bool Initialized = false; + if (Initialized) + return; + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, + IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, + IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512, + IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512, + IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512, + IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, + IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, + IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, + IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, + IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0))); + + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512, + IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, + IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, + IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, + IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, + IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, + IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, + IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, + IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0))); + + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm, + X86::VGATHERPF1QPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm, + X86::VGATHERPF1QPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm, + X86::VGATHERPF1DPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm, + X86::VGATHERPF1DPSm))); + 
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm, + X86::VSCATTERPF1QPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm, + X86::VSCATTERPF1QPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm, + X86::VSCATTERPF1DPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, + IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm, + X86::VSCATTERPF1DPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16, + IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32, + IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64, + IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16, + IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32, + IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64, + IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_xtest, + IntrinsicData(XTEST, X86ISD::XTEST, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc, + IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, + IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); + Initialized = true; } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDLoc dl(Op); + InitIntinsicsMap(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); // Don't custom lower most intrinsics. + std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo); + if (itr == IntrMap.end()) + return SDValue(); - // RDRAND/RDSEED intrinsics. - case Intrinsic::x86_rdrand_16: - case Intrinsic::x86_rdrand_32: - case Intrinsic::x86_rdrand_64: - case Intrinsic::x86_rdseed_16: - case Intrinsic::x86_rdseed_32: - case Intrinsic::x86_rdseed_64: { - unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || - IntNo == Intrinsic::x86_rdseed_32 || - IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : - X86ISD::RDRAND; + SDLoc dl(Op); + IntrinsicData Intr = itr->second; + switch(Intr.Type) { + case RDSEED: + case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. @@ -12150,152 +12796,55 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), - Ops, array_lengthof(Ops)); + Ops); // Return { result, isValid, chain }. 
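A usage sketch from the C side, not taken from this patch: the flag result that the COND_B CMOV above inspects is the carry flag, which RDRAND sets only when a random value was actually delivered, so callers are expected to retry on failure. Assumes a compiler and target with RDRAND support (e.g. -mrdrnd).

#include <immintrin.h>
#include <cstdint>

static bool tryGetRandom32(uint32_t &Out) {
  unsigned int V;
  for (int i = 0; i < 16; ++i)     // the value may be transiently unavailable
    if (_rdrand32_step(&V)) {      // returns 1 iff CF was set by RDRAND
      Out = V;
      return true;
    }
  return false;
}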
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } - //int_gather(index, base, scale); - case Intrinsic::x86_avx512_gather_qpd_512: - case Intrinsic::x86_avx512_gather_qps_512: - case Intrinsic::x86_avx512_gather_dpd_512: - case Intrinsic::x86_avx512_gather_qpi_512: - case Intrinsic::x86_avx512_gather_qpq_512: - case Intrinsic::x86_avx512_gather_dpq_512: - case Intrinsic::x86_avx512_gather_dps_512: - case Intrinsic::x86_avx512_gather_dpi_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; - } - SDValue Chain = Op.getOperand(0); - SDValue Index = Op.getOperand(2); - SDValue Base = Op.getOperand(3); - SDValue Scale = Op.getOperand(4); - return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); - } - //int_gather_mask(v1, mask, index, base, scale); - case Intrinsic::x86_avx512_gather_qps_mask_512: - case Intrinsic::x86_avx512_gather_qpd_mask_512: - case Intrinsic::x86_avx512_gather_dpd_mask_512: - case Intrinsic::x86_avx512_gather_dps_mask_512: - case Intrinsic::x86_avx512_gather_qpi_mask_512: - case Intrinsic::x86_avx512_gather_qpq_mask_512: - case Intrinsic::x86_avx512_gather_dpi_mask_512: - case Intrinsic::x86_avx512_gather_dpq_mask_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_avx512_gather_qps_mask_512: - Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_mask_512: - Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_mask_512: - Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_mask_512: - Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_mask_512: - Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_mask_512: - Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_mask_512: - Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_mask_512: - Opc = X86::VPGATHERDQZrm; break; - } + case GATHER: { + //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); + SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); - SDValue Base = Op.getOperand(5); + SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } - //int_scatter(base, index, v1, scale); - case Intrinsic::x86_avx512_scatter_qpd_512: - case Intrinsic::x86_avx512_scatter_qps_512: - case Intrinsic::x86_avx512_scatter_dpd_512: - case Intrinsic::x86_avx512_scatter_qpi_512: - case Intrinsic::x86_avx512_scatter_qpq_512: - case Intrinsic::x86_avx512_scatter_dpq_512: - case Intrinsic::x86_avx512_scatter_dps_512: - case Intrinsic::x86_avx512_scatter_dpi_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx512_scatter_qpd_512: - Opc = X86::VSCATTERQPDZmr; break; - case Intrinsic::x86_avx512_scatter_qps_512: - Opc = X86::VSCATTERQPSZmr; break; - case Intrinsic::x86_avx512_scatter_dpd_512: - Opc = X86::VSCATTERDPDZmr; break; - case Intrinsic::x86_avx512_scatter_dps_512: - Opc = X86::VSCATTERDPSZmr; break; - case Intrinsic::x86_avx512_scatter_qpi_512: - Opc = X86::VPSCATTERQDZmr; break; - case Intrinsic::x86_avx512_scatter_qpq_512: - Opc = X86::VPSCATTERQQZmr; break; - case Intrinsic::x86_avx512_scatter_dpq_512: - Opc = X86::VPSCATTERDQZmr; break; - case Intrinsic::x86_avx512_scatter_dpi_512: - Opc = X86::VPSCATTERDDZmr; break; - } - SDValue Chain = Op.getOperand(0); - SDValue Base = Op.getOperand(2); - SDValue Index = Op.getOperand(3); - SDValue Src = Op.getOperand(4); - SDValue Scale = Op.getOperand(5); - return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); - } - //int_scatter_mask(base, mask, index, v1, scale); - case Intrinsic::x86_avx512_scatter_qps_mask_512: - case Intrinsic::x86_avx512_scatter_qpd_mask_512: - case Intrinsic::x86_avx512_scatter_dpd_mask_512: - case Intrinsic::x86_avx512_scatter_dps_mask_512: - case Intrinsic::x86_avx512_scatter_qpi_mask_512: - case Intrinsic::x86_avx512_scatter_qpq_mask_512: - case Intrinsic::x86_avx512_scatter_dpi_mask_512: - case Intrinsic::x86_avx512_scatter_dpq_mask_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_avx512_scatter_qpd_mask_512: - Opc = X86::VSCATTERQPDZmr; break; - case Intrinsic::x86_avx512_scatter_qps_mask_512: - Opc = X86::VSCATTERQPSZmr; break; - case Intrinsic::x86_avx512_scatter_dpd_mask_512: - Opc = X86::VSCATTERDPDZmr; break; - case Intrinsic::x86_avx512_scatter_dps_mask_512: - Opc = X86::VSCATTERDPSZmr; break; - case Intrinsic::x86_avx512_scatter_qpi_mask_512: - Opc = X86::VPSCATTERQDZmr; break; - case Intrinsic::x86_avx512_scatter_qpq_mask_512: - Opc = X86::VPSCATTERQQZmr; break; - case Intrinsic::x86_avx512_scatter_dpq_mask_512: - Opc = X86::VPSCATTERDQZmr; break; - case Intrinsic::x86_avx512_scatter_dpi_mask_512: - Opc = X86::VPSCATTERDDZmr; break; - } + case SCATTER: { + //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + } + case PREFETCH: { + SDValue Hint = Op.getOperand(6); + unsigned HintVal; + if (dyn_cast<ConstantSDNode> (Hint) == 0 || + (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) + llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); + unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); + SDValue Chain = Op.getOperand(0); + SDValue Mask = Op.getOperand(2); + SDValue Index = Op.getOperand(3); + SDValue Base = Op.getOperand(4); + SDValue Scale = Op.getOperand(5); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + } + // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). + case RDTSC: { + SmallVector<SDValue, 2> Results; + getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. - case Intrinsic::x86_xtest: { + case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, @@ -12306,6 +12855,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Ret, SDValue(InTrans.getNode(), 1)); } } + llvm_unreachable("Unknown Intrinsic Type"); } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, @@ -12358,6 +12908,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. 
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, + EVT VT) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("esp", X86::ESP) + .Case("rsp", X86::RSP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = @@ -12477,7 +13040,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 22), false, false, 0); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); @@ -12557,7 +13120,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 6), false, false, 1); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } @@ -12600,8 +13163,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), MVT::i16, - MMO); + Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, @@ -12654,7 +13216,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); @@ -12706,7 +13268,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -12824,59 +13386,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT EltTy = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - SDValue N0 = Op.getOperand(0); +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWin64() && "Unexpected target"); + EVT VT = Op.getValueType(); + assert(VT.isInteger() && VT.getSizeInBits() == 128 && + "Unexpected return type for lowering"); + + RTLIB::Libcall LC; + bool isSigned; + switch (Op->getOpcode()) { + default: llvm_unreachable("Unexpected request for libcall!"); + case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; + case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; + case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; + case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; + case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; + case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; + } + SDLoc dl(Op); + SDValue InChain = DAG.getEntryNode(); - // Lower sdiv X, pow2-const. 
- BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); - if (!C) - return SDValue(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && + "Unexpected argument type for lowering"); + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + Entry.Node = StackPtr; + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), + false, false, 16); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Ty = PointerType::get(ArgTy,0); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy()); - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs) || - EltTy.getSizeInBits() < SplatBitSize) - return SDValue(); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), + Callee, &Args, 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); +} + +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + EVT VT = Op0.getValueType(); + SDLoc dl(Op); - if ((SplatValue != 0) && - (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { - unsigned Lg2 = SplatValue.countTrailingZeros(); - // Splat the sign bit. - SmallVector<SDValue, 16> Sz(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - 1, - EltTy)); - SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], - NumElts)); - // Add (N0 < 0) ? abs2 - 1 : 0; - SmallVector<SDValue, 16> Amt(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - Lg2, - EltTy)); - SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], - NumElts)); - SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); - SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); - SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], - NumElts)); - - // If we're dividing by a positive value, we're done. Otherwise, we must - // negate the result. - if (SplatValue.isNonNegative()) - return SRA; - - SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); - return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || + (VT == MVT::v8i32 && Subtarget->hasInt256())); + + // Get the high parts. + const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; + SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + + // Emit two multiplies, one for the lower 2 ints and one for the higher 2 + // ints. + MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; + unsigned Opcode = + (!IsSigned || !Subtarget->hasSSE41()) ? 
X86ISD::PMULUDQ : X86ISD::PMULDQ; + SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); + + // Shuffle it back into the right order. + const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; + SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; + SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + + // If we have a signed multiply but no PMULDQ fix up the high parts of a + // unsigned multiply. + if (IsSigned && !Subtarget->hasSSE41()) { + SDValue ShAmt = + DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); + SDValue T1 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); + SDValue T2 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); + + SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); + Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - return SDValue(); + + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -12920,7 +13527,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -12933,7 +13540,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12946,7 +13553,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -12966,7 +13573,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
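A scalar model of the signed fix-up in LowerMUL_LOHI above (illustrative helper names): when only PMULUDQ is available, the signed high half is recovered from the unsigned high half as mulhs(a,b) = mulhu(a,b) - (a<0 ? b : 0) - (b<0 ? a : 0), computed modulo 2^32, which is what the per-lane SRA/AND/ADD/SUB sequence produces.

#include <cstdint>
#include <cassert>

static uint32_t mulhu32(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

static int32_t mulhs32ViaUnsigned(int32_t A, int32_t B) {
  uint32_t Hi = mulhu32((uint32_t)A, (uint32_t)B);
  uint32_t T1 = A < 0 ? (uint32_t)B : 0;   // (SRA(A,31) & B) in the DAG code
  uint32_t T2 = B < 0 ? (uint32_t)A : 0;   // (SRA(B,31) & A)
  return (int32_t)(Hi - (T1 + T2));
}

int main() {
  int32_t A = -123456789, B = 987654321;
  assert(mulhs32ViaUnsigned(A, B) == (int32_t)(((int64_t)A * B) >> 32));
  return 0;
}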
@@ -12979,7 +13586,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12992,7 +13599,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -13014,7 +13621,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = 0; for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); @@ -13025,7 +13632,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); @@ -13107,7 +13714,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (!BaseShAmt.getNode()) BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, DAG.getIntPtrConstant(0)); } @@ -13260,7 +13867,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } @@ -13274,6 +13881,79 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getNode(ISD::MUL, dl, VT, Op, R); } + // If possible, lower this shift as a sequence of two shifts by + // constant plus a MOVSS/MOVSD instead of scalarizing it. + // Example: + // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) + // + // Could be rewritten as: + // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) + // + // The advantage is that the two shifts from the example would be + // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing + // the vector shift into four scalar shifts plus four pairs of vector + // insert/extract. + if ((VT == MVT::v8i16 || VT == MVT::v4i32) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + unsigned TargetOpcode = X86ISD::MOVSS; + bool CanBeSimplified; + // The splat value for the first packed shift (the 'X' from the example). + SDValue Amt1 = Amt->getOperand(0); + // The splat value for the second packed shift (the 'Y' from the example). + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : + Amt->getOperand(2); + + // See if it is possible to replace this node with a sequence of + // two shifts followed by a MOVSS/MOVSD + if (VT == MVT::v4i32) { + // Check if it is legal to use a MOVSS. 
+ CanBeSimplified = Amt2 == Amt->getOperand(2) && + Amt2 == Amt->getOperand(3); + if (!CanBeSimplified) { + // Otherwise, check if we can still simplify this node using a MOVSD. + CanBeSimplified = Amt1 == Amt->getOperand(1) && + Amt->getOperand(2) == Amt->getOperand(3); + TargetOpcode = X86ISD::MOVSD; + Amt2 = Amt->getOperand(2); + } + } else { + // Do similar checks for the case where the machine value type + // is MVT::v8i16. + CanBeSimplified = Amt1 == Amt->getOperand(1); + for (unsigned i=3; i != 8 && CanBeSimplified; ++i) + CanBeSimplified = Amt2 == Amt->getOperand(i); + + if (!CanBeSimplified) { + TargetOpcode = X86ISD::MOVSD; + CanBeSimplified = true; + Amt2 = Amt->getOperand(4); + for (unsigned i=0; i != 4 && CanBeSimplified; ++i) + CanBeSimplified = Amt1 == Amt->getOperand(i); + for (unsigned j=4; j != 8 && CanBeSimplified; ++j) + CanBeSimplified = Amt2 == Amt->getOperand(j); + } + } + + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && + isa<ConstantSDNode>(Amt2)) { + // Replace this node with two shifts followed by a MOVSS/MOVSD. + EVT CastVT = MVT::v4i32; + SDValue Splat1 = + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); + SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); + SDValue Splat2 = + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); + SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); + if (TargetOpcode == X86ISD::MOVSD) + CastVT = MVT::v2i64; + SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); + SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); + SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, + BitCast1, DAG); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); @@ -13351,10 +14031,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); - Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt1Csts[0], NumElems/2); - Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt2Csts[0], NumElems/2); + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); } else { // Variable shift amount Amt1 = Extract128BitVector(Amt, 0, DAG, dl); @@ -13585,35 +14263,47 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, array_lengthof(Ops), T, MMO); + Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; } -static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - assert(Subtarget->is64Bit() && "Result not type legalized?"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = Op.getOperand(0); - SDLoc dl(Op); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); - SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, - rax.getValue(2)); - SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, - DAG.getConstant(32, MVT::i8)); - SDValue Ops[] = { - DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), - 
rdx.getValue(1) - }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); -} - static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (DstVT != MVT::f64) + // This conversion needs to be expanded. + return SDValue(); + + SDValue InVec = Op->getOperand(0); + SDLoc dl(Op); + unsigned NumElts = SrcVT.getVectorNumElements(); + EVT SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, + DAG.getIntPtrConstant(i))); + + // Explicitly mark the extra elements as Undef. + SDValue Undef = DAG.getUNDEF(SVT); + for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) + Elts.push_back(Undef); + + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); + SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, + DAG.getIntPtrConstant(0)); + } + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || @@ -13641,8 +14331,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { cast<AtomicSDNode>(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, - cast<AtomicSDNode>(Node)->getSrcValue(), - cast<AtomicSDNode>(Node)->getAlignment(), + cast<AtomicSDNode>(Node)->getMemOperand(), cast<AtomicSDNode>(Node)->getOrdering(), cast<AtomicSDNode>(Node)->getSynchScope()); } @@ -13730,12 +14419,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, Type *RetTy = isF64 ? 
(Type*)StructType::get(ArgTy, ArgTy, NULL) : (Type*)VectorType::get(ArgTy, 4); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, - false, false, false, false, 0, - CallingConv::C, /*isTaillCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed*/true, - Callee, Args, DAG, dl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); if (isF64) @@ -13764,6 +14452,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); @@ -13815,6 +14504,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -13832,7 +14523,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); } } @@ -13875,10 +14565,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, SDValue Ops[] = { Chain, In1, In2L, In2H }; SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, cast<MemSDNode>(Node)->getMemOperand()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); Results.push_back(Result.getValue(2)); } @@ -13899,6 +14589,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + Results.push_back(V); + return; + } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; @@ -13909,10 +14609,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode() != 0) { + if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
- if (StackSlot.getNode() != 0) + if (StackSlot.getNode()) Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0)); @@ -13945,20 +14645,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default : llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); + case Intrinsic::x86_rdtsc: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdtscp: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + Results); + } + } case ISD::READCYCLECOUNTER: { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = N->getOperand(0); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, - rd.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, - eax.getValue(2)); - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { eax, edx }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, - array_lengthof(Ops))); - Results.push_back(edx.getValue(1)); - return; + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); } case ISD::ATOMIC_CMP_SWAP: { EVT T = N->getValueType(0); @@ -13994,8 +14696,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; - SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, - Ops, array_lengthof(Ops), T, MMO); + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -14003,7 +14704,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? 
X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(cpOutH.getValue(1)); return; } @@ -14058,14 +14759,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); return; } - case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); + return; + } + case ISD::BITCAST: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + EVT DstVT = N->getValueType(0); + EVT SrcVT = N->getOperand(0)->getValueType(0); + + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + + SmallVector<SDValue, 8> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, + ToVecInt, DAG.getIntPtrConstant(i))); + + Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); + } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return NULL; + default: return nullptr; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; @@ -14176,7 +14902,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; @@ -14203,6 +14928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; @@ -14210,6 +14936,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -14240,7 +14967,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. 
- if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { @@ -14418,7 +15145,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, if (VT.getSizeInBits() == 64) return false; - // FIXME: pshufb, blends, shifts. + // If this is a single-input shuffle with no 128 bit lane crossings we can + // lower it into pshufb. + if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || + (SVT.is256BitVector() && Subtarget->hasInt256())) { + bool isLegal = true; + for (unsigned I = 0, E = M.size(); I != E; ++I) { + if (M[I] >= (int)SVT.getVectorNumElements() || + ShuffleCrosses128bitLane(SVT, I, M[I])) { + isLegal = false; + break; + } + } + if (isLegal) + return true; + } + + // FIXME: blends, shifts. return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || @@ -15366,7 +16109,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( OffsetDestReg = 0; // unused OverflowDestReg = DestReg; - offsetMBB = NULL; + offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { @@ -15736,7 +16479,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(getTargetMachine().Options.EnableSegmentedStacks); + assert(MF->shouldSplitStack()); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -16509,11 +17252,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // X86 Optimization Hooks //===----------------------------------------------------------------------===// -void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { +void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || @@ -16576,8 +17319,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } } -unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, - unsigned Depth) const { +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &, + unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 
if (Op.getOpcode() == X86ISD::SETCC_CARRY) return Op.getValueType().getScalarType().getSizeInBits(); @@ -16679,7 +17424,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - array_lengthof(Ops), Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), @@ -17036,6 +17780,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, return std::make_pair(Opc, NeedSplit); } +static SDValue +TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (Cond.getOpcode() == ISD::SIGN_EXTEND) { + SDValue CondSrc = Cond->getOperand(0); + if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) + Cond = CondSrc->getOperand(0); + } + + MVT VT = N->getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + SmallVector<int, 8> ShuffleMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + // Be sure we emit undef where we can. + if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) + ShuffleMask[i] = -1; + else + ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); + } + + return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); +} + /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, @@ -17378,7 +18167,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. - // FIXME: Would it be better to use ComputeMaskedBits to determine whether + // FIXME: Would it be better to use computeKnownBits to determine whether // it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && @@ -17544,7 +18333,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // depend on the highest bit in each word. Try to use SimplifyDemandedBits // to simplify previous instructions. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && - !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { + !DCI.isBeforeLegalize() && + // We explicitly check against v8i16 and v16i16 because, although + // they're marked as Custom, they might only be legal when Cond is a + // build_vector of constants. This will be taken care in a later + // condition. + (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && + VT != MVT::v8i16)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. 
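The mask construction in TransformVSELECTtoBlendVECTOR_SHUFFLE maps the blend immediate onto an ordinary shuffle mask: lane i reads from LHS (index i) when bit i of the immediate is clear, and from RHS (index i + NumElems) when it is set. A minimal standalone sketch of that index computation (illustrative helper, not part of the patch):

#include <vector>

// Illustrative model of the blend-immediate to shuffle-mask mapping above:
// bit i clear -> lane i of the first operand, bit i set -> lane i of the
// second operand (shuffle indices NumElems..2*NumElems-1). Undef condition
// lanes, which the real code turns into -1, are not modeled here.
static std::vector<int> blendImmToShuffleMask(unsigned MaskValue,
                                              unsigned NumElems) {
  std::vector<int> Mask(NumElems, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    Mask[i] = int(i + NumElems * ((MaskValue >> i) & 1));
  return Mask;
}

For example, NumElems = 4 with MaskValue = 0b0101 yields {4, 1, 6, 3}: lanes 0 and 2 are taken from RHS, lanes 1 and 3 from LHS, a shuffle that LowerVECTOR_SHUFFLEtoBlend can then match back to a blend.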
@@ -17571,6 +18366,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, DCI.CommitTargetLoweringOpt(TLO); } + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { + SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + return SDValue(); } @@ -17605,7 +18417,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; - const ConstantSDNode* C = 0; + const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? @@ -17740,8 +18552,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), - Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); } // If this is a select between two integer constants, try to do some @@ -17856,7 +18667,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. - ConstantSDNode *CmpAgainst = 0; + ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && !isa<ConstantSDNode>(Cond.getOperand(0))) { @@ -17871,8 +18682,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, MVT::i8), Cond }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops, - array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); } } } @@ -17880,6 +18690,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); + // SSE/AVX/AVX2 blend intrinsics. + case Intrinsic::x86_avx2_pblendvb: + case Intrinsic::x86_avx2_pblendw: + case Intrinsic::x86_avx2_pblendd_128: + case Intrinsic::x86_avx2_pblendd_256: + // Don't try to simplify this intrinsic if we don't have AVX2. + if (!Subtarget->hasAVX2()) + return SDValue(); + // FALL-THROUGH + case Intrinsic::x86_avx_blend_pd_256: + case Intrinsic::x86_avx_blend_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx_blendv_ps_256: + // Don't try to simplify this intrinsic if we don't have AVX. 
+ if (!Subtarget->hasAVX()) + return SDValue(); + // FALL-THROUGH + case Intrinsic::x86_sse41_pblendw: + case Intrinsic::x86_sse41_blendpd: + case Intrinsic::x86_sse41_blendps: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_sse41_pblendvb: { + SDValue Op0 = N->getOperand(1); + SDValue Op1 = N->getOperand(2); + SDValue Mask = N->getOperand(3); + + // Don't try to simplify this intrinsic if we don't have SSE4.1. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return Op0; + // fold (blend A, B, allZeros) -> A + if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return Op0; + // fold (blend A, B, allOnes) -> B + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return Op1; + + // Simplify the case where the mask is a constant i32 value. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { + if (C->isNullValue()) + return Op0; + if (C->isAllOnesValue()) + return Op1; + } + } + + // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: { + SDValue Op0 = N->getOperand(1); + SDValue Op1 = N->getOperand(2); + EVT VT = Op0.getValueType(); + assert(VT.isVector() && "Expected a vector type!"); + + if (isa<BuildVectorSDNode>(Op1)) + Op1 = Op1.getOperand(0); + + if (!isa<ConstantSDNode>(Op1)) + return SDValue(); + + EVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + + ConstantSDNode *CND = cast<ConstantSDNode>(Op1); + const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); + uint64_t ShAmt = C.getZExtValue(); + + // Don't try to convert this shift into a ISD::SRA if the shift + // count is bigger than or equal to the element size. + if (ShAmt >= SVTBits) + return SDValue(); + + // Trivial case: if the shift count is zero, then fold this + // into the first operand. + if (ShAmt == 0) + return Op0; + + // Replace this packed shift intrinsic with a target independent + // shift dag node. + SDValue Splat = DAG.getConstant(C, VT); + return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); + } + } +} + /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. 
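The arithmetic-shift part of PerformINTRINSIC_WO_CHAINCombine rewrites psrai/psra intrinsics with a constant count into a generic ISD::SRA by a splatted constant, folding a zero count to the first operand and leaving counts of the element width or more to the intrinsic. A scalar sketch of the equivalence for a v4i32 shift (illustrative only):

#include <array>
#include <cstdint>

// Scalar model of the fold above for a constant-count psrai on v4i32:
// count 0 returns the first operand unchanged, and counts below the element
// width become a plain element-wise arithmetic shift, which is what the
// generic ISD::SRA by a splatted constant expresses. Counts >= 32 are not
// modeled; the combine above bails out and keeps the intrinsic for those.
static std::array<int32_t, 4> sraiByConst(std::array<int32_t, 4> V,
                                          unsigned ShAmt) {
  if (ShAmt == 0)
    return V;
  for (int32_t &X : V)
    X >>= ShAmt; // arithmetic right shift on signed elements
  return V;
}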
@@ -18223,7 +19133,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), N1->getOperand(0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); - N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { N1 = N1->getOperand(0); } @@ -18260,40 +19170,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (R.getNode()) return R; - // Create BEXTR and BZHI instructions - // BZHI is X & ((1 << Y) - 1) + // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (Subtarget->hasBMI2()) { - // Check for (and (add (shl 1, Y), -1), X) - if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::SHL) { - SDValue N001 = N00.getOperand(1); - assert(N001.getValueType() == MVT::i8 && "unexpected type"); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0)); - if (C && C->getZExtValue() == 1) - return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001); - } - } - - // Check for (and X, (add (shl 1, Y), -1)) - if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) { - SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::SHL) { - SDValue N101 = N10.getOperand(1); - assert(N101.getValueType() == MVT::i8 && "unexpected type"); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0)); - if (C && C->getZExtValue() == 1) - return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101); - } - } - } - // Check for BEXTR. if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { @@ -18533,8 +19416,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { SDValue Ops[] = { N0.getOperand(0), Neg, DAG.getConstant(X86::COND_GE, MVT::i8), SDValue(Neg.getNode(), 1) }; - return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), - Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); } return SDValue(); } @@ -18691,8 +19573,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], - Chains.size()); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. @@ -18867,8 +19748,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Chains.push_back(Ch); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], - Chains.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. 
This avoids clobbering @@ -18891,7 +19771,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, !cast<LoadSDNode>(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { SDNode* LdVal = St->getValue().getNode(); - LoadSDNode *Ld = 0; + LoadSDNode *Ld = nullptr; int TokenFactorIndex = -1; SmallVector<SDValue, 8> Ops; SDNode* ChainVal = St->getChain().getNode(); @@ -18934,8 +19814,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); - NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], - Ops.size()); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), St->getPointerInfo(), @@ -18962,8 +19841,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, if (TokenFactorIndex != -1) { Ops.push_back(LoLd); Ops.push_back(HiLd); - NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], - Ops.size()); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } LoAddr = St->getBasePtr(); @@ -19432,6 +20310,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + MVT VT = N->getOperand(1)->getSimpleValueType(0); + assert((VT == MVT::v4f32 || VT == MVT::v4i32) && + "X86insertps is only defined for v4x32"); + + SDValue Ld = N->getOperand(1); + if (MayFoldLoad(Ld)) { + // Extract the countS bits from the immediate so we can get the proper + // address when narrowing the vector load to a specific element. + // When the second source op is a memory address, interps doesn't use + // countS and just gets an f32 from that address. + unsigned DestIndex = + cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; + Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); + } else + return SDValue(); + + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); + // countS bits are ignored when loading from memory on insertps, which + // means we don't need to explicitly set them to 0. + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), + LoadScalarToVector, N->getOperand(2)); +} + // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. 
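PerformINSERTPSCombine reads the insertps immediate directly: assuming the standard SSE4.1 encoding, bits [7:6] (countS) select which element of the second source is inserted, bits [5:4] (countD) select the destination lane, and bits [3:0] are the zero mask. When the second source is a vector load, the combine uses Imm >> 6 to narrow that load to the single element being inserted; in the resulting memory form of insertps the countS bits are ignored, as the comment above notes. Hypothetical decoding helpers, for illustration only:

// Hypothetical helpers for the insertps immediate, assuming the standard
// SSE4.1 layout: [7:6] = countS, [5:4] = countD, [3:0] = zero mask.
static unsigned insertpsCountS(unsigned Imm) { return (Imm >> 6) & 0x3; }
static unsigned insertpsCountD(unsigned Imm) { return (Imm >> 4) & 0x3; }
static unsigned insertpsZMask(unsigned Imm)  { return Imm & 0xF; }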
@@ -19711,7 +20616,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); - case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); + case ISD::SIGN_EXTEND_INREG: + return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); @@ -19732,6 +20638,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); + case X86ISD::INSERTPS: + return PerformINSERTPSCombine(N, DAG, Subtarget); } return SDValue(); @@ -20006,7 +20916,7 @@ TargetLowering::ConstraintWeight Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. @@ -20124,7 +21034,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result; // Only support length 1 constraints for now. if (Constraint.length() > 1) return; @@ -20207,7 +21117,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. - GlobalAddressSDNode *GA = 0; + GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. @@ -20363,7 +21273,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); // Not found as a standard register? - if (Res.second == 0) { + if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && @@ -20488,3 +21398,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return Res; } + +int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, + Type *Ty) const { + // Scaling factors are not free at all. + // An indexed folded instruction, i.e., inst (reg1, reg2, scale), + // will take 2 allocations in the out of order engine instead of 1 + // for plain addressing mode, i.e. inst (reg1). + // E.g., + // vaddps (%rsi,%drx), %ymm0, %ymm1 + // Requires two allocations (one for the load, one for the computation) + // whereas: + // vaddps (%rsi), %ymm0, %ymm1 + // Requires just 1 allocation, i.e., freeing allocations for other operations + // and having less micro operations to execute. + // + // For some X86 architectures, this is even worse because for instance for + // stores, the complex addressing mode forces the instruction to use the + // "load" ports instead of the dedicated "store" port. + // E.g., on Haswell: + // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. 
+ if (isLegalAddressingMode(AM, Ty)) + // Scale represents reg2 * scale, thus account for 1 + // as soon as we use a second register. + return AM.Scale != 0; + return -1; +}
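The policy in getScalingFactorCost can be summarized with a standalone model (illustrative only, not the TargetLowering API): a legal addressing mode is charged one extra unit as soon as it uses a second (index) register, and an illegal mode reports -1.

// Illustrative model of the cost policy above, not the LLVM API itself.
struct AddrModeModel {
  long BaseOffs = 0;     // constant displacement
  bool HasBaseReg = false;
  long Scale = 0;        // non-zero means an index register is used
};

static int scalingFactorCostModel(const AddrModeModel &AM, bool IsLegal) {
  if (IsLegal)
    // e.g. (%rsi) costs 0 extra, while (%rsi,%rdx) or (%rsi,%rdx,2) cost 1.
    return AM.Scale != 0;
  return -1;
}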