-rw-r--r--  include/llvm/IntrinsicsARM.td      |   5
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp |   8
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td     |  34
-rw-r--r--  lib/VMCore/AutoUpgrade.cpp         |  29
-rw-r--r--  test/Bitcode/neon-intrinsics.ll    |  29
-rw-r--r--  test/Bitcode/neon-intrinsics.ll.bc | bin 0 -> 820 bytes
-rw-r--r--  test/CodeGen/ARM/neon-ops.ll       |   7
-rw-r--r--  test/CodeGen/ARM/vmov.ll           |  20
8 files changed, 86 insertions, 46 deletions
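
In IR terms, the change is simple: the llvm.arm.neon.vmovls and llvm.arm.neon.vmovlu intrinsics are deleted, vector lengthening moves are expressed with the generic sext and zext instructions, the new ARMInstrNEON.td patterns select those to VMOVL, and AutoUpgrade.cpp rewrites old bitcode on load. A minimal before/after sketch in LLVM IR (the function name here is hypothetical; the vmov.ll hunks below show the same rewrite applied to the real tests):

    ; Before: old bitcode called the intrinsic, e.g.
    ;   %tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
    ; After: the same widening is a plain sign-extend, selected as vmovl.s8.
    define <8 x i16> @widen_s8(<8 x i8> %a) nounwind {
      %w = sext <8 x i8> %a to <8 x i16>
      ret <8 x i16> %w
    }
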
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index f4a80bb..37d8131 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -60,9 +60,6 @@ let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
   class Neon_1Arg_Narrow_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
-  class Neon_1Arg_Long_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty],
-                [LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
   class Neon_2Arg_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
@@ -322,8 +319,6 @@ def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic;
 def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
 def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
 def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
-def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic;
-def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic;
 
 // Vector Table Lookup.
 // The first 1-4 arguments are the table.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 9e9198b..034114c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -125,12 +125,14 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
   setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
   setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::ZERO_EXTEND, VT.getSimpleVT(), Expand);
   if (VT.isInteger()) {
     setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
     setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
     setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+    setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand);
   }
+  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
 
   // Promote all bit-wise operations.
   if (VT.isInteger() && VT != PromotedBitwiseVT) {
@@ -320,6 +322,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
 
+    setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+
     // Neon does not support some operations on v1i64 and v2i64 types.
     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
@@ -3786,7 +3790,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
-  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::FLT_ROUNDS_:  return LowerFLT_ROUNDS_(Op, DAG);
   }
   return SDValue();
 }
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 595a2fc..9c01fcd 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -888,14 +888,14 @@ class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
         [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
 
-// Long 2-register intrinsics (currently only used for VMOVL).
-class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
-              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
-              InstrItinClass itin, string OpcodeStr, string Dt,
-              ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+// Long 2-register operations (currently only used for VMOVL).
+class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode>
   : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4,
         (outs QPR:$dst), (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
-        [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
+        [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src))))]>;
 
 // 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
 class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt>
@@ -1508,14 +1508,14 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
 
 // Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
 // source operand element sizes of 16, 32 and 64 bits:
-multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
-                       string OpcodeStr, string Dt, Intrinsic IntOp> {
-  def v8i16 : N2VLInt<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
-                      OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
-  def v4i32 : N2VLInt<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
-                      OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
-  def v2i64 : N2VLInt<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
-                      OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                    string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
+  def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
 }
 
 
@@ -3123,10 +3123,8 @@ defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD,
                             "vqmovn", "u", int_arm_neon_vqmovnu>;
 defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
                             "vqmovun", "s", int_arm_neon_vqmovnsu>;
 // VMOVL : Vector Lengthening Move
-defm VMOVLs : N2VLInt_QHS<0b01,0b10100,0,1, "vmovl", "s",
-                          int_arm_neon_vmovls>;
-defm VMOVLu : N2VLInt_QHS<0b11,0b10100,0,1, "vmovl", "u",
-                          int_arm_neon_vmovlu>;
+defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>;
+defm VMOVLu : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>;
 
 // Vector Conversions.
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 4f4daeb..f76d0d2 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -78,6 +78,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       NewFn = F;
       return true;
     }
+  } else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
+    if (Name.compare(14, 7, "vmovls.", 7) == 0 ||
+        Name.compare(14, 7, "vmovlu.", 7) == 0) {
+      // Calls to these are transformed into IR without intrinsics.
+      NewFn = 0;
+      return true;
+    }
   }
   break;
 case 'b':
@@ -320,6 +327,28 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   assert(F && "CallInst has no function associated with it.");
 
   if (!NewFn) {
+    // Get the Function's name.
+    const std::string& Name = F->getName();
+
+    // Upgrade ARM NEON intrinsics.
+    if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
+      Instruction *NewI;
+      if (Name.compare(14, 7, "vmovls.", 7) == 0) {
+        NewI = new SExtInst(CI->getArgOperand(0), CI->getType(),
+                            "upgraded." + CI->getName(), CI);
+      } else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
+        NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
+                            "upgraded." + CI->getName(), CI);
+      } else {
+        llvm_unreachable("Unknown arm.neon function for CallInst upgrade.");
+      }
+      // Replace any uses of the old CallInst.
+      if (!CI->use_empty())
+        CI->replaceAllUsesWith(NewI);
+      CI->eraseFromParent();
+      return;
+    }
+
     bool isLoadH = false, isLoadL = false, isMovL = false;
     bool isMovSD = false, isShufPD = false;
     bool isUnpckhPD = false, isUnpcklPD = false;
diff --git a/test/Bitcode/neon-intrinsics.ll b/test/Bitcode/neon-intrinsics.ll
new file mode 100644
index 0000000..73ca707
--- /dev/null
+++ b/test/Bitcode/neon-intrinsics.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; vmovls should be auto-upgraded to sext
+
+; CHECK: vmovls8
+; CHECK-NOT: arm.neon.vmovls.v8i16
+; CHECK: sext <8 x i8>
+
+; CHECK: vmovls16
+; CHECK-NOT: arm.neon.vmovls.v4i32
+; CHECK: sext <4 x i16>
+
+; CHECK: vmovls32
+; CHECK-NOT: arm.neon.vmovls.v2i64
+; CHECK: sext <2 x i32>
+
+; vmovlu should be auto-upgraded to zext
+
+; CHECK: vmovlu8
+; CHECK-NOT: arm.neon.vmovlu.v8i16
+; CHECK: zext <8 x i8>
+
+; CHECK: vmovlu16
+; CHECK-NOT: arm.neon.vmovlu.v4i32
+; CHECK: zext <4 x i16>
+
+; CHECK: vmovlu32
+; CHECK-NOT: arm.neon.vmovlu.v2i64
+; CHECK: zext <2 x i32>
diff --git a/test/Bitcode/neon-intrinsics.ll.bc b/test/Bitcode/neon-intrinsics.ll.bc
new file mode 100644
index 0000000..93eeabc
--- /dev/null
+++ b/test/Bitcode/neon-intrinsics.ll.bc
Binary files differ
diff --git a/test/CodeGen/ARM/neon-ops.ll b/test/CodeGen/ARM/neon-ops.ll
deleted file mode 100644
index d7e893f..0000000
--- a/test/CodeGen/ARM/neon-ops.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc -march=arm -mattr=+neon -O2 -o /dev/null
-
-; This used to crash.
-define <4 x i32> @test1(<4 x i16> %a) {
-  %A = zext <4 x i16> %a to <4 x i32>
-  ret <4 x i32> %A
-}
diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll
index 5e872ab..f863bf8 100644
--- a/test/CodeGen/ARM/vmov.ll
+++ b/test/CodeGen/ARM/vmov.ll
@@ -192,7 +192,7 @@ define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
 ;CHECK: vmovls8:
 ;CHECK: vmovl.s8
 	%tmp1 = load <8 x i8>* %A
-	%tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
+	%tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
 	ret <8 x i16> %tmp2
 }
 
@@ -200,7 +200,7 @@ define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
 ;CHECK: vmovls16:
 ;CHECK: vmovl.s16
 	%tmp1 = load <4 x i16>* %A
-	%tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
+	%tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
 	ret <4 x i32> %tmp2
 }
 
@@ -208,7 +208,7 @@ define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
 ;CHECK: vmovls32:
 ;CHECK: vmovl.s32
 	%tmp1 = load <2 x i32>* %A
-	%tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
+	%tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
 	ret <2 x i64> %tmp2
 }
 
@@ -216,7 +216,7 @@ define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
 ;CHECK: vmovlu8:
 ;CHECK: vmovl.u8
 	%tmp1 = load <8 x i8>* %A
-	%tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
+	%tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
 	ret <8 x i16> %tmp2
 }
 
@@ -224,7 +224,7 @@ define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
 ;CHECK: vmovlu16:
 ;CHECK: vmovl.u16
 	%tmp1 = load <4 x i16>* %A
-	%tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
+	%tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
 	ret <4 x i32> %tmp2
 }
 
@@ -232,18 +232,10 @@ define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
 ;CHECK: vmovlu32:
 ;CHECK: vmovl.u32
 	%tmp1 = load <2 x i32>* %A
-	%tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
+	%tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
 	ret <2 x i64> %tmp2
 }
 
-declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
-
-declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone
-
 define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
 ;CHECK: vmovni16:
 ;CHECK: vmovn.i16
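
Two of the hunks above cooperate. Dropping the blanket ISD::ZERO_EXTEND Expand action in ARMISelLowering.cpp lets the new sext/zext patterns select VMOVL directly, which is why the crash test in the deleted neon-ops.ll is no longer needed; the added setLoadExtAction calls keep extending vector loads out of instruction selection by expanding them into a plain load plus a separate extend. A sketch of both shapes (hypothetical function names, era-appropriate typed-pointer syntax; the expected machine code follows the CHECK lines in vmov.ll rather than anything verified here):

    ; zext of a vector argument: the shape that used to crash llc per the
    ; deleted neon-ops.ll test; it should now select directly to vmovl.u16.
    define <4 x i32> @widen_arg(<4 x i16> %a) nounwind {
      %w = zext <4 x i16> %a to <4 x i32>
      ret <4 x i32> %w
    }

    ; zext of a loaded vector: the Expand'd ZEXTLOAD splits this into an
    ; ordinary vector load plus the zext above, matching the vmovlu16
    ; test's vmovl.u16 check in vmov.ll.
    define <4 x i32> @widen_load(<4 x i16>* %p) nounwind {
      %v = load <4 x i16>* %p
      %w = zext <4 x i16> %v to <4 x i32>
      ret <4 x i32> %w
    }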