author     Jiangning Liu <jiangning.liu@arm.com>  2013-11-14 01:57:32 +0000
committer  Jiangning Liu <jiangning.liu@arm.com>  2013-11-14 01:57:32 +0000
commit     082ac99cc86b17c7cd2a1f2a6faa2d1adc184e17
tree       b00ca0129ad9280fe864b875e93ea6e5f9c6d01d
parent     2999b2f2ccc3a48c834dffe19bb39c67641a3afd
Implement AArch64 NEON instruction set AdvSIMD (table).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194648 91177308-0d34-0410-b5e6-96231b3b80d8
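
For orientation (an illustrative sketch assembled from the intrinsic declarations and CHECK lines in the tests added below; it is not part of the commit itself): the new intrinsics expose the AdvSIMD table-lookup instructions directly at the IR level. A single-table lookup looks like this:

    declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>)

    define <8 x i8> @tbl1_example(<16 x i8> %table, <8 x i8> %index) {
      ; Each result byte is %table[%index[i]], and 0 when %index[i] >= 16.
      ; Expected to select to:  tbl v0.8b, {v1.16b}, v2.8b
      ; (register numbers here are illustrative, not fixed by the patch)
      %r = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %table, <8 x i8> %index)
      ret <8 x i8> %r
    }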
 include/llvm/IR/IntrinsicsAArch64.td               |  45
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp         |  99
 lib/Target/AArch64/AArch64InstrFormats.td          |  19
 lib/Target/AArch64/AArch64InstrNEON.td             |  50
 test/CodeGen/AArch64/neon-simd-tbl.ll              | 828
 test/MC/AArch64/neon-diagnostics.s                 |  48
 test/MC/AArch64/neon-tbl.s                         |  56
 test/MC/Disassembler/AArch64/neon-instructions.txt |  41
 8 files changed, 1185 insertions(+), 1 deletion(-)
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 782fea2..4d2e053 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -84,6 +84,51 @@ def int_aarch64_neon_vminv : Neon_Across_Intrinsic;
 def int_aarch64_neon_vmaxnmv : Neon_Across_Intrinsic;
 def int_aarch64_neon_vminnmv : Neon_Across_Intrinsic;
 
+// Vector Table Lookup.
+def int_aarch64_neon_vtbl1 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbl2 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<0>],
+            [IntrNoMem]>;
+
+def int_aarch64_neon_vtbl3 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>,
+             LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbl4 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>,
+             LLVMMatchType<1>, LLVMMatchType<0>], [IntrNoMem]>;
+
+// Vector Table Extension.
+// Some elements of the destination vector may not be updated, so the original
+// value of that vector is passed as the first argument. The next 1-4
+// arguments after that are the table.
+def int_aarch64_neon_vtbx1 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+            [IntrNoMem]>;
+
+def int_aarch64_neon_vtbx2 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>,
+             LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbx3 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>,
+             LLVMMatchType<1>, LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbx4 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>,
+             LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<0>],
+            [IntrNoMem]>;
+
 // Scalar Add
 def int_aarch64_neon_vaddds :
   Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 4d79c78..872f99c 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -109,6 +109,13 @@ public:
   SDNode* Select(SDNode*);
 private:
+  /// Get the opcode for table lookup instruction
+  unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
+
+  /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4.
+  /// IsExt is to indicate if the result will be extended with an argument.
+  SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
+
   /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
   SDNode *SelectVLD(SDNode *N, unsigned NumVecs, bool isUpdating,
                     const uint16_t *Opcode);
@@ -682,6 +689,73 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
   return VSt;
 }
 
+unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
+                                        unsigned NumOfVec) {
+  assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range");
+
+  unsigned Opc = 0;
+  switch (NumOfVec) {
+  default:
+    break;
+  case 1:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b;
+    break;
+  case 2:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b;
+    break;
+  case 3:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b;
+    break;
+  case 4:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b;
+    break;
+  }
+
+  return Opc;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs,
+                                        bool IsExt) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  // Check the element of look up table is 64-bit or not
+  unsigned Vec0Idx = IsExt ? 2 : 1;
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  EVT VT = V0.getValueType();
+  assert(!VT.is64BitVector() &&
+         "The element of lookup table for vtbl and vtbx must be 128-bit");
+
+  // Check the return value type is 64-bit or not
+  EVT ResVT = N->getValueType(0);
+  bool is64BitRes = ResVT.is64BitVector();
+
+  // Create new SDValue for vector list
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  SDValue TblReg = createQTuple(Regs);
+  unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs);
+
+  SmallVector<SDValue, 3> Ops;
+  if (IsExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(TblReg);
+  Ops.push_back(N->getOperand(Vec0Idx + NumVecs));
+  return CurDAG->getMachineNode(Opc, dl, ResVT, Ops);
+}
+
 SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
   DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
@@ -900,6 +974,31 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
     };
     return SelectVST(Node, 4, true, Opcodes);
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    bool IsExt = false;
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_vtbx1:
+      IsExt = true;
+    case Intrinsic::aarch64_neon_vtbl1:
+      return SelectVTBL(Node, 1, IsExt);
+    case Intrinsic::aarch64_neon_vtbx2:
+      IsExt = true;
+    case Intrinsic::aarch64_neon_vtbl2:
+      return SelectVTBL(Node, 2, IsExt);
+    case Intrinsic::aarch64_neon_vtbx3:
+      IsExt = true;
+    case Intrinsic::aarch64_neon_vtbl3:
+      return SelectVTBL(Node, 3, IsExt);
+    case Intrinsic::aarch64_neon_vtbx4:
+      IsExt = true;
+    case Intrinsic::aarch64_neon_vtbl4:
+      return SelectVTBL(Node, 4, IsExt);
+    }
+    break;
+  }
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 2c8cc6b..2a0cca8 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1019,6 +1019,25 @@ class NeonI_Perm<bit q, bits<2> size, bits<3> opcode,
   // Inherit Rd in 4-0
 }
 
+// Format AdvSIMD table lookup
+class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = op2;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-13} = len;
+  let Inst{12} = op;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
 // Format AdvSIMD 3 vector registers with same vector type
 class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
                    dag outs, dag ins, string asmstr,
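
To make the selection path above concrete (a hedged sketch distilled from the new CodeGen tests below, not text from the commit): SelectVTBL gathers the 128-bit table operands into a Q-register tuple via createQTuple, and getTBLOpc picks the _8b or _16b opcode from the width of the result. For the extension (tbx) form, IsExt makes the accumulator the first machine operand, which the instruction definitions tie to the destination register:

    declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>)

    define <8 x i8> @tbx1_example(<8 x i8> %acc, <16 x i8> %table, <8 x i8> %index) {
      ; Lanes whose index is out of range keep their value from %acc,
      ; so %acc is passed through as the tied first operand of TBX1_8b.
      ; Expected selection:  tbx v0.8b, {v1.16b}, v2.8b  (registers illustrative)
      %r = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %acc, <16 x i8> %table, <8 x i8> %index)
      ret <8 x i8> %r
    }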
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index c1b43a8..958d1a0 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -5163,6 +5163,56 @@ def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>;
 def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>;
 def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>;
 
+// Table lookup
+class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL<q, op2, len, op,
+              (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
+              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+              [],
+              NoItinerary>;
+
+// The vectors in look up table are always 16b
+multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
+  def _8b  : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+
+  def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
+defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
+defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
+defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
+
+// Table lookup extention
+class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL<q, op2, len, op,
+              (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
+              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+              [],
+              NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// The vectors in look up table are always 16b
+multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> {
+  def _8b  : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+
+  def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
+defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
+defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
+defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
+
 // The followings are for instruction class (3V Elem)
 
 // Variant 1
diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll
new file mode 100644
index 0000000..8eac1e8
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-tbl.ll
@@ -0,0 +1,828 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8>, <16 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+  %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+  ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+  ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx4.i
+}
+
+define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+  %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+  ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+  ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx2.i
+}
{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx3.i +} + +define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx4q_u8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx4.i +} + +define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtbl1_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl11.i +} + +define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) { +; CHECK: test_vqtbl1_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + ret <8 x i8> %vtbl1.i +} + +define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl2_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 + %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl17.i +} + +define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl2_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl2.i +} + +define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl3_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = 
extractvalue [3 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + ret <8 x i8> %vtbl212.i +} + +define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl3_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl3.i +} + +define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl4_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + ret <8 x i8> %vtbl216.i +} + +define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl4_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl4.i +} + +define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vqtbl1q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbl1.i = tail call <16 x i8> 
@llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %vtbl1.i +} + +define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl2q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl2.i +} + +define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl3q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl3.i +} + +define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl4q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl4.i +} + +define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vtbx1_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx2_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 + %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + ret <8 x i8> %vtbx17.i +} + +define <8 x i8> @test_vtbx3_p8(<8 x i8> 
%a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx3_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx4_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 + %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + ret <8 x i8> %vtbx216.i +} + +define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { +; CHECK: test_vqtbx1_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + ret <8 x i8> %vtbx1.i +} + +define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx2_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx2.i +} + +define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx3_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x 
+define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+  ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx4.i
+}
+
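The assembler diagnostics added next pin down two operand constraints: every register in a table list uses the .16b arrangement, because a lookup table always occupies full 16-byte registers even when the destination and index vectors are 8b, and a list names at most four registers. Schematically (a sketch of the two checks only, not the actual AsmParser code or its real ordering):

#include <string>

// NumVecs and TableIs16b stand in for properties of the parsed register
// list; the returned strings mirror the diagnostics checked below.
static std::string checkTableList(unsigned NumVecs, bool TableIs16b) {
  if (NumVecs < 1 || NumVecs > 4)
    return "invalid number of vectors";
  if (!TableIs16b)
    return "invalid operand for instruction";
  return ""; // operand list is acceptable
}
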
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
index 0a2332b..12d56a5 100644
--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -5928,3 +5928,51 @@
 // CHECK-ERROR: error: lane number incompatible with layout
 // CHECK-ERROR: dup b1, v3.b[16]
 // CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Table lookup
+//----------------------------------------------------------------------
+
+        tbl v0.8b, {v1.8b}, v2.8b
+        tbl v0.8b, {v1.8b, v2.8b}, v2.8b
+        tbl v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+        tbl v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbl v0.8b, {v1.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbl v0.8b, {v1.8b, v2.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbl v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbl v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR: tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+// CHECK-ERROR: ^
+
+        tbx v0.8b, {v1.8b}, v2.8b
+        tbx v0.8b, {v1.8b, v2.8b}, v2.8b
+        tbx v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+        tbx v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbx v0.8b, {v1.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbx v0.8b, {v1.8b, v2.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbx v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: tbx v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR: tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+// CHECK-ERROR: ^
diff --git a/test/MC/AArch64/neon-tbl.s b/test/MC/AArch64/neon-tbl.s
new file mode 100644
index 0000000..ff3e86b
--- /dev/null
+++ b/test/MC/AArch64/neon-tbl.s
@@ -0,0 +1,56 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Vector table lookup instructions
+//------------------------------------------------------------------------------
+
+        tbl v0.8b, {v1.16b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b
+        tbl v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b
+
+// CHECK: tbl v0.8b, {v1.16b}, v2.8b // encoding: [0x20,0x00,0x02,0x0e]
+// CHECK: tbl v0.8b, {v1.16b, v2.16b}, v2.8b // encoding: [0x20,0x20,0x02,0x0e]
+// CHECK: tbl v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b // encoding: [0x20,0x40,0x02,0x0e]
+// CHECK: tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b // encoding: [0x20,0x60,0x02,0x0e]
+// CHECK: tbl v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b // encoding: [0xe0,0x63,0x02,0x0e]
+
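The encodings in these CHECK lines follow a single fixed-field layout. Reading the bytes as little-endian words, bit 30 selects the 8b or 16b arrangement, bits 16-19 hold Rm (the index register), bits 13-14 hold the list length minus one, bit 12 distinguishes tbx from tbl, and bits 5-9 and 0-4 hold Rn and Rd. A sketch of an encoder under that assumed layout, checked against two encodings from this file:

#include <cassert>
#include <cstdint>

// Assumed field layout, reconstructed from the encodings in this test.
static uint32_t encodeTblTbx(bool Q, unsigned Len, bool IsTbx, unsigned Rm,
                             unsigned Rn, unsigned Rd) {
  assert(Len < 4 && Rm < 32 && Rn < 32 && Rd < 32);
  return 0x0E000000u | (uint32_t(Q) << 30) | (Rm << 16) | (Len << 13) |
         (uint32_t(IsTbx) << 12) | (Rn << 5) | Rd;
}

int main() {
  // tbl v0.8b, {v1.16b}, v2.8b           -> [0x20,0x00,0x02,0x0e]
  assert(encodeTblTbx(false, 0, false, 2, 1, 0) == 0x0E020020u);
  // tbx v0.16b, {v1.16b, v2.16b}, v2.16b -> [0x20,0x30,0x02,0x4e]
  assert(encodeTblTbx(true, 1, true, 2, 1, 0) == 0x4E023020u);
  return 0;
}

The second example comes from the tbx block later in this file; both asserts hold for the byte sequences shown in the corresponding CHECK lines.
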
+        tbl v0.16b, {v1.16b}, v2.16b
+        tbl v0.16b, {v1.16b, v2.16b}, v2.16b
+        tbl v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+        tbl v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b
+        tbl v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b
+
+// CHECK: tbl v0.16b, {v1.16b}, v2.16b // encoding: [0x20,0x00,0x02,0x4e]
+// CHECK: tbl v0.16b, {v1.16b, v2.16b}, v2.16b // encoding: [0x20,0x20,0x02,0x4e]
+// CHECK: tbl v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b // encoding: [0x20,0x40,0x02,0x4e]
+// CHECK: tbl v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b // encoding: [0x20,0x60,0x02,0x4e]
+// CHECK: tbl v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b // encoding: [0xc0,0x63,0x02,0x4e]
+
+        tbx v0.8b, {v1.16b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b
+        tbx v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b
+
+// CHECK: tbx v0.8b, {v1.16b}, v2.8b // encoding: [0x20,0x10,0x02,0x0e]
+// CHECK: tbx v0.8b, {v1.16b, v2.16b}, v2.8b // encoding: [0x20,0x30,0x02,0x0e]
+// CHECK: tbx v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b // encoding: [0x20,0x50,0x02,0x0e]
+// CHECK: tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b // encoding: [0x20,0x70,0x02,0x0e]
+// CHECK: tbx v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b // encoding: [0xe0,0x73,0x02,0x0e]
+
+        tbx v0.16b, {v1.16b}, v2.16b
+        tbx v0.16b, {v1.16b, v2.16b}, v2.16b
+        tbx v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+        tbx v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b
+        tbx v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b
+
+// CHECK: tbx v0.16b, {v1.16b}, v2.16b // encoding: [0x20,0x10,0x02,0x4e]
+// CHECK: tbx v0.16b, {v1.16b, v2.16b}, v2.16b // encoding: [0x20,0x30,0x02,0x4e]
+// CHECK: tbx v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b // encoding: [0x20,0x50,0x02,0x4e]
+// CHECK: tbx v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b // encoding: [0x20,0x70,0x02,0x4e]
+// CHECK: tbx v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b // encoding: [0xc0,0x73,0x02,0x4e]
+
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
index b9ea7c1..c165901 100644
--- a/test/MC/Disassembler/AArch64/neon-instructions.txt
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -1,4 +1,4 @@
-G# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s | FileCheck %s
 
 #------------------------------------------------------------------------------
 # Vector Integer Add/Sub
@@ -2387,3 +2387,42 @@ G# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s |
 0x51 0x04 0x14 0x5e
 0x86 0x05 0x18 0x5e
+#----------------------------------------------------------------------
+# Table lookup
+#----------------------------------------------------------------------
+0x20,0x00,0x02,0x0e
+0xf0,0x23,0x02,0x0e
+0x20,0x40,0x02,0x0e
+0xf0,0x62,0x02,0x0e
+# CHECK: tbl v0.8b, {v1.16b}, v2.8b
+# CHECK: tbl v16.8b, {v31.16b, v0.16b}, v2.8b
+# CHECK: tbl v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+# CHECK: tbl v16.8b, {v23.16b, v24.16b, v25.16b, v26.16b}, v2.8b
+
+0x20,0x00,0x02,0x4e
+0xf0,0x23,0x02,0x4e
+0x20,0x40,0x02,0x4e
+0xe0,0x63,0x02,0x4e
+# CHECK: tbl v0.16b, {v1.16b}, v2.16b
+# CHECK: tbl v16.16b, {v31.16b, v0.16b}, v2.16b
+# CHECK: tbl v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+# CHECK: tbl v0.16b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.16b
+
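Several of the disassembled lists above and below wrap past v31, for example {v31.16b, v0.16b, v1.16b, v2.16b}: a table list is Rn followed by consecutive register numbers modulo 32. A small sketch of how a printer could expand a list from the decoded Rn and len fields (a hypothetical helper, not the actual InstPrinter code):

#include <string>

// Expands a table register list: Len is the encoded field, i.e. the number
// of registers minus one, and register numbers wrap modulo 32.
static std::string formatTableList(unsigned Rn, unsigned Len) {
  std::string Out = "{";
  for (unsigned i = 0; i <= Len; ++i) {
    if (i)
      Out += ", ";
    Out += "v" + std::to_string((Rn + i) % 32) + ".16b";
  }
  return Out + "}";
}

For the fourth encoding in the first block above (Rn = 23, len = 3) this yields {v23.16b, v24.16b, v25.16b, v26.16b}, matching the CHECK line.
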
+0x20,0x10,0x02,0x0e
+0xf0,0x33,0x02,0x0e
+0x20,0x50,0x02,0x0e
+0xf0,0x72,0x02,0x0e
+# CHECK: tbx v0.8b, {v1.16b}, v2.8b
+# CHECK: tbx v16.8b, {v31.16b, v0.16b}, v2.8b
+# CHECK: tbx v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+# CHECK: tbx v16.8b, {v23.16b, v24.16b, v25.16b, v26.16b}, v2.8b
+
+0x20,0x10,0x02,0x4e
+0xf0,0x33,0x02,0x4e
+0x20,0x50,0x02,0x4e
+0xf0,0x73,0x02,0x4e
+# CHECK: tbx v0.16b, {v1.16b}, v2.16b
+# CHECK: tbx v16.16b, {v31.16b, v0.16b}, v2.16b
+# CHECK: tbx v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+# CHECK: tbx v16.16b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.16b