diff options
Diffstat (limited to 'lib/Target/ARM')
-rw-r--r-- | lib/Target/ARM/ARMISelDAGToDAG.cpp | 492 | ||||
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 176 | ||||
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.h | 23 |
3 files changed, 546 insertions, 145 deletions
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index fbdc2fb..5dd8434 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -196,26 +196,30 @@ private: /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. - SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, + SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. - SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, + SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for /// load/store of D registers and Q registers. - SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs, + SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. (Q registers are not supported.) - SDNode *SelectVLDDup(SDNode *N, unsigned NumVecs, unsigned *Opcodes); + SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *Opcodes); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be @@ -1439,14 +1443,15 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs, return CurDAG->getTargetConstant(Alignment, MVT::i32); } -SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, +SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; - if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + unsigned AddrOpIdx = isUpdating ? 1 : 2; + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; SDValue Chain = N->getOperand(0); @@ -1482,46 +1487,39 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, ResTyElts *= 2; ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); } + std::vector<EVT> ResTys; + ResTys.push_back(ResTy); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SDValue SuperReg; - if (is64BitVector) { - const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - SDNode *VLd = CurDAG->getMachineNode(DOpcodes[OpcodeIndex], dl, - ResTy, MVT::Other, Ops, 5); - if (NumVecs == 1) - return VLd; - - SuperReg = SDValue(VLd, 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec, - dl, VT, SuperReg); - ReplaceUses(SDValue(N, Vec), D); - } - ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); - return NULL; - } - - if (NumVecs <= 2) { - // Quad registers are directly supported for VLD1 and VLD2, - // loading pairs of D regs. - const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - SDNode *VLd = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, - ResTy, MVT::Other, Ops, 5); - if (NumVecs == 1) - return VLd; + SDNode *VLd; + SmallVector<SDValue, 7> Ops; - SuperReg = SDValue(VLd, 0); - Chain = SDValue(VLd, 1); + // Double registers and VLD1/VLD2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); } else { // Otherwise, quad registers are loaded with two separate instructions, // where one loads the even registers and the other loads the odd registers. EVT AddrTy = MemAddr.getValueType(); - // Load the even subregs. + // Load the even subregs. This is always an updating load, so that it + // provides the address to the second load for the odd subregs. SDValue ImplDef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; @@ -1530,37 +1528,54 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, Chain = SDValue(VLdA, 2); // Load the odd subregs. - const SDValue OpsB[] = { SDValue(VLdA, 1), Align, SDValue(VLdA, 0), - Pred, Reg0, Chain }; - SDNode *VLdB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, - ResTy, MVT::Other, OpsB, 6); - SuperReg = SDValue(VLdB, 0); - Chain = SDValue(VLdB, 1); - } - - // Extract out the Q registers. - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, - dl, VT, SuperReg); - ReplaceUses(SDValue(N, Vec), Q); - } - ReplaceUses(SDValue(N, NumVecs), Chain); + Ops.push_back(SDValue(VLdA, 1)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VLD3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(SDValue(VLdA, 0)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + } + + if (NumVecs == 1) + return VLd; + + // Extract out the subregisters. + SDValue SuperReg = SDValue(VLd, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); return NULL; } -SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, +SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; - if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; SDValue Chain = N->getOperand(0); - EVT VT = N->getOperand(3).getValueType(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); Align = GetVLDSTAlign(Align, NumVecs, is64BitVector); @@ -1583,64 +1598,71 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, break; } + std::vector<EVT> ResTys; + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SmallVector<SDValue, 7> Ops; - if (is64BitVector) { + // Double registers and VST1/VST2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { SDValue SrcReg; if (NumVecs == 1) { - SrcReg = N->getOperand(3); - } else { - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - + SrcReg = N->getOperand(Vec0Idx); + } else if (is64BitVector) { // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); if (NumVecs == 2) SrcReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); else { - SDValue V2 = N->getOperand(2+3); + SDValue V2 = N->getOperand(Vec0Idx + 2); // If it's a vst3, form a quad D-register and leave the last part as // an undef. SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); + : N->getOperand(Vec0Idx + 3); SrcReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } - } - const SDValue Ops[] = { MemAddr, Align, SrcReg, Pred, Reg0, Chain }; - return CurDAG->getMachineNode(DOpcodes[OpcodeIndex], dl, - MVT::Other, Ops, 6); - } - - if (NumVecs <= 2) { - // Quad registers are directly supported for VST1 and VST2. - SDValue SrcReg; - if (NumVecs == 1) { - SrcReg = N->getOperand(3); } else { // Form a QQ register. - SDValue Q0 = N->getOperand(3); - SDValue Q1 = N->getOperand(4); + SDValue Q0 = N->getOperand(Vec0Idx); + SDValue Q1 = N->getOperand(Vec0Idx + 1); SrcReg = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0); } - const SDValue Ops[] = { MemAddr, Align, SrcReg, Pred, Reg0, Chain }; - return CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, - MVT::Other, Ops, 6); + + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(SrcReg); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); } // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. // Form the QQQQ REG_SEQUENCE. - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - SDValue V2 = N->getOperand(2+3); + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + SDValue V2 = N->getOperand(Vec0Idx + 2); SDValue V3 = (NumVecs == 3) ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) - : N->getOperand(3+3); + : N->getOperand(Vec0Idx + 3); SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - // Store the even D registers. + // Store the even D registers. This is always an updating store, so that it + // provides the address to the second store for the odd subregs. const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain }; SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), @@ -1648,28 +1670,40 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Chain = SDValue(VStA, 1); // Store the odd D registers. - const SDValue OpsB[] = { SDValue(VStA, 0), Align, RegSeq, Pred, Reg0, Chain }; - SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, - MVT::Other, OpsB, 6); - Chain = SDValue(VStB, 0); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; + Ops.push_back(SDValue(VStA, 0)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VST3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(RegSeq); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, - unsigned NumVecs, unsigned *DOpcodes, + bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; - if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align)) + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; SDValue Chain = N->getOperand(0); unsigned Lane = - cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue(); - EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType(); + cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); unsigned Alignment = 0; @@ -1701,29 +1735,42 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, case MVT::v4i32: OpcodeIndex = 1; break; } + std::vector<EVT> ResTys; + if (IsLoad) { + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), + MVT::i64, ResTyElts)); + } + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SmallVector<SDValue, 7> Ops; + SmallVector<SDValue, 8> Ops; Ops.push_back(MemAddr); Ops.push_back(Align); - - unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : - QOpcodes[OpcodeIndex]); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } SDValue SuperReg; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); if (NumVecs == 2) { if (is64BitVector) SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); else SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); } else { - SDValue V2 = N->getOperand(2+3); + SDValue V2 = N->getOperand(Vec0Idx + 2); SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(Vec0Idx + 3); if (is64BitVector) SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); else @@ -1735,33 +1782,29 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, Ops.push_back(Reg0); Ops.push_back(Chain); + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes[OpcodeIndex]); + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, + Ops.data(), Ops.size()); if (!IsLoad) - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 7); - - EVT ResTy; - unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; - if (!is64BitVector) - ResTyElts *= 2; - ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); - - SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, - Ops.data(), 7); - SuperReg = SDValue(VLdLn, 0); - Chain = SDValue(VLdLn, 1); + return VLdLn; // Extract the subregisters. - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + SuperReg = SDValue(VLdLn, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); - ReplaceUses(SDValue(N, NumVecs), Chain); + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); return NULL; } -SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs, - unsigned *Opcodes) { +SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, unsigned *Opcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1800,13 +1843,26 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs, SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SDValue SuperReg; unsigned Opc = Opcodes[OpcodeIndex]; - const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; + SmallVector<SDValue, 6> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(2); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); - SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5); + std::vector<EVT> ResTys; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts)); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDNode *VLdDup = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); SuperReg = SDValue(VLdDup, 0); - Chain = SDValue(VLdDup, 1); // Extract the subregisters. assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); @@ -1814,7 +1870,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs, for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); - ReplaceUses(SDValue(N, NumVecs), Chain); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); return NULL; } @@ -2470,19 +2528,165 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ARMISD::VLD2DUP: { unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo, ARM::VLD2DUPd32Pseudo }; - return SelectVLDDup(N, 2, Opcodes); + return SelectVLDDup(N, false, 2, Opcodes); } case ARMISD::VLD3DUP: { unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd32Pseudo }; - return SelectVLDDup(N, 3, Opcodes); + return SelectVLDDup(N, false, 3, Opcodes); } case ARMISD::VLD4DUP: { unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd32Pseudo }; - return SelectVLDDup(N, 4, Opcodes); + return SelectVLDDup(N, false, 4, Opcodes); + } + + case ARMISD::VLD2DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd16Pseudo_UPD, + ARM::VLD2DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 2, Opcodes); + } + + case ARMISD::VLD3DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 3, Opcodes); + } + + case ARMISD::VLD4DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 4, Opcodes); + } + + case ARMISD::VLD1_UPD: { + unsigned DOpcodes[] = { ARM::VLD1d8_UPD, ARM::VLD1d16_UPD, + ARM::VLD1d32_UPD, ARM::VLD1d64_UPD }; + unsigned QOpcodes[] = { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q16Pseudo_UPD, + ARM::VLD1q32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD }; + return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VLD2_UPD: { + unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD, + ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD, + ARM::VLD2q32Pseudo_UPD }; + return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VLD3_UPD: { + unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, + ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; + return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD4_UPD: { + unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, + ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; + return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD2LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD, + ARM::VLD2LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, + ARM::VLD2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VLD3LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD, + ARM::VLD3LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, + ARM::VLD3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VLD4LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD, + ARM::VLD4LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, + ARM::VLD4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); + } + + case ARMISD::VST1_UPD: { + unsigned DOpcodes[] = { ARM::VST1d8_UPD, ARM::VST1d16_UPD, + ARM::VST1d32_UPD, ARM::VST1d64_UPD }; + unsigned QOpcodes[] = { ARM::VST1q8Pseudo_UPD, ARM::VST1q16Pseudo_UPD, + ARM::VST1q32Pseudo_UPD, ARM::VST1q64Pseudo_UPD }; + return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VST2_UPD: { + unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD, + ARM::VST2d32Pseudo_UPD, ARM::VST1q64Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD, + ARM::VST2q32Pseudo_UPD }; + return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VST3_UPD: { + unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, + ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; + return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST4_UPD: { + unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, + ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; + return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST2LN_UPD: { + unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD, + ARM::VST2LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, + ARM::VST2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VST3LN_UPD: { + unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD, + ARM::VST3LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, + ARM::VST3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VST4LN_UPD: { + unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD, + ARM::VST4LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, + ARM::VST4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); } case ISD::INTRINSIC_VOID: @@ -2497,7 +2701,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD1d32, ARM::VLD1d64 }; unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo, ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo }; - return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld2: { @@ -2505,7 +2709,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo }; unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, ARM::VLD2q32Pseudo }; - return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); + return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { @@ -2517,7 +2721,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo, ARM::VLD3q16oddPseudo, ARM::VLD3q32oddPseudo }; - return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1); + return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld4: { @@ -2529,28 +2733,28 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo, ARM::VLD4q16oddPseudo, ARM::VLD4q32oddPseudo }; - return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1); + return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld2lane: { unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, ARM::VLD2LNd32Pseudo }; unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; - return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes); + return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld3lane: { unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo, ARM::VLD3LNd32Pseudo }; unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; - return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes); + return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld4lane: { unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, ARM::VLD4LNd32Pseudo }; unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; - return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes); + return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst1: { @@ -2558,7 +2762,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST1d32, ARM::VST1d64 }; unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo, ARM::VST1q32Pseudo, ARM::VST1q64Pseudo }; - return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst2: { @@ -2566,7 +2770,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST2d32Pseudo, ARM::VST1q64Pseudo }; unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, ARM::VST2q32Pseudo }; - return SelectVST(N, 2, DOpcodes, QOpcodes, 0); + return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { @@ -2578,7 +2782,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo, ARM::VST3q16oddPseudo, ARM::VST3q32oddPseudo }; - return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1); + return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vst4: { @@ -2590,28 +2794,28 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo, ARM::VST4q16oddPseudo, ARM::VST4q32oddPseudo }; - return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1); + return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vst2lane: { unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo, ARM::VST2LNd32Pseudo }; unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo }; - return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes); + return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst3lane: { unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo, ARM::VST3LNd32Pseudo }; unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo }; - return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes); + return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst4lane: { unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo, ARM::VST4LNd32Pseudo }; unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo }; - return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes); + return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); } } break; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 92ea6cb..59a7155 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -457,6 +457,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); @@ -857,6 +859,23 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; + case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; + case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; + case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; + case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; + case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; + case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; + case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; + case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; + case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; + case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; + case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; + case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; + case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; + case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; + case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; + case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; + case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; } } @@ -5210,6 +5229,138 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { DAG.getUNDEF(VT), NewMask.data()); } +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and +/// NEON load/store intrinsics to merge base address updates. +static SDValue CombineBaseUpdate(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool isLoad = true; + bool isLaneOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: assert(0 && "unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoad = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; + NumVecs = 2; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; + NumVecs = 3; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; + NumVecs = 4; isLoad = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: assert(0 && "unexpected opcode for Neon base update"); + case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; + case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; + case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + } + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoad) + VecTy = N->getValueType(0); + else + VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint64_t IncVal = CInc->getZExtValue(); + if (IncVal != NumBytes) + continue; + } else if (NumBytes >= 3 * 16) { + // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two + // separate instructions that make it harder to use a non-constant update. + continue; + } + + // Create the new updating load/store node. + EVT Tys[6]; + unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i32; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2); + SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(AddrOpIdx)); + Ops.push_back(Inc); + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys, + Ops.data(), Ops.size(), + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector<SDValue> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -5720,6 +5871,31 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); + case ARMISD::VLD2DUP: + case ARMISD::VLD3DUP: + case ARMISD::VLD4DUP: + return CombineBaseUpdate(N, DCI); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: + return CombineBaseUpdate(N, DCI); + default: break; + } + break; } return SDValue(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index b06b8d3..dc400c4 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -181,7 +181,28 @@ namespace llvm { // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, - VLD4DUP + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST2_UPD, + VST3_UPD, + VST4_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD }; } |