From d0b69cf1198dadbb7bdfc385334b67f60f756539 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Wed, 1 Sep 2010 23:50:19 +0000 Subject: Remove NEON vmull, vmlal, and vmlsl intrinsics, replacing them with multiply, add, and subtract operations with zero-extended or sign-extended vectors. Update tests. Add auto-upgrade support for the old intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112773 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMInstrNEON.td | 187 +++++++++++++++++++++++++++++++++-------- 1 file changed, 151 insertions(+), 36 deletions(-) (limited to 'lib/Target/ARM/ARMInstrNEON.td') diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 1132911..536b132 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -93,6 +93,11 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; +def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; +def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; + def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>; def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; @@ -1255,6 +1260,42 @@ class N3VQInt3 op21_20, bits<4> op11_8, bit op4, [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; +// Long Multiply-Add/Sub operations. +class N3VLMulOp op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V; +class N3VLMulOpSL op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V; +class N3VLMulOpSL16 op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V; + + // Neon Long 3-argument intrinsic. The destination register is // a quad-register and is also used as the first source operand register. class N3VLInt3 op21_20, bits<4> op11_8, bit op4, @@ -1306,8 +1347,37 @@ class N3VNInt op21_20, bits<4> op11_8, bit op4, // Long 3-register operations. class N3VL op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, - bit Commutable> + ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable> + : N3V { + let isCommutable = Commutable; +} +class N3VLSL op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3V; +class N3VLSL16 op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3V; + +// Long 3-register operations with explicitly extended operands. +class N3VLExt op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, + bit Commutable> : N3V op11_8, bit op4, multiclass N3VL_QHS op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, string OpcodeStr, string Dt, - SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + SDNode OpNode, bit Commutable = 0> { + def v8i16 : N3VL; def v4i32 : N3VL; + v4i32, v4i16, OpNode, Commutable>; def v2i64 : N3VL; - def v8i16 : N3VL; + v2i64, v2i32, OpNode, Commutable>; +} + +multiclass N3VLSL_HS op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v4i16 : N3VLSL16; + def v2i32 : N3VLSL; +} + +multiclass N3VLExt_QHS op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLExt; + def v4i32 : N3VLExt; + def v2i64 : N3VLExt; } // Neon Long 3-register vector intrinsics. @@ -1857,6 +1951,29 @@ multiclass N3VInt3_QHS op11_8, bit op4, } +// Neon Long Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VLMulOp_QHS op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, SDNode MulOp, + SDNode OpNode> { + def v8i16 : N3VLMulOp; + def v4i32 : N3VLMulOp; + def v2i64 : N3VLMulOp; +} + +multiclass N3VLMulOpSL_HS op11_8, string OpcodeStr, + string Dt, SDNode MulOp, SDNode OpNode> { + def v4i16 : N3VLMulOpSL16; + def v2i32 : N3VLMulOpSL; +} + + // Neon Long 3-argument intrinsics. // First with only element sizes of 16 and 32 bits: @@ -2130,10 +2247,10 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", v4f32, v4f32, fadd, 1>; // VADDL : Vector Add Long (Q = D + D) -defm VADDLs : N3VL_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, - "vaddl", "s", add, sext, 1>; -defm VADDLu : N3VL_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, - "vaddl", "u", add, zext, 1>; +defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "s", add, sext, 1>; +defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "u", add, zext, 1>; // VADDW : Vector Add Wide (Q = Q + D) defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>; defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>; @@ -2247,16 +2364,14 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) -defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "s", int_arm_neon_vmulls, 1>; -defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "u", int_arm_neon_vmullu, 1>; +defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "s", NEONvmulls, 1>; +defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "u", NEONvmullu, 1>; def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", v8i16, v8i8, int_arm_neon_vmullp, 1>; -defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", - int_arm_neon_vmulls>; -defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", - int_arm_neon_vmullu>; +defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>; +defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>; // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D, @@ -2306,13 +2421,13 @@ def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMLAL : Vector Multiply Accumulate Long (Q += D * D) -defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlal", "s", int_arm_neon_vmlals>; -defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlal", "u", int_arm_neon_vmlalu>; +defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "s", NEONvmulls, add>; +defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "u", NEONvmullu, add>; -defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>; -defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>; +defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; +defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, @@ -2358,13 +2473,13 @@ def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMLSL : Vector Multiply Subtract Long (Q -= D * D) -defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlsl", "s", int_arm_neon_vmlsls>; -defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlsl", "u", int_arm_neon_vmlslu>; +defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "u", NEONvmullu, sub>; -defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>; -defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>; +defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>; // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, @@ -2381,10 +2496,10 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", v4f32, v4f32, fsub, 0>; // VSUBL : Vector Subtract Long (Q = D - D) -defm VSUBLs : N3VL_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, - "vsubl", "s", sub, sext, 0>; -defm VSUBLu : N3VL_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, - "vsubl", "u", sub, zext, 0>; +defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "s", sub, sext, 0>; +defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "u", sub, zext, 0>; // VSUBW : Vector Subtract Wide (Q = Q - D) defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>; defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>; -- cgit v1.1