diff options
author | Nate Begeman <natebegeman@mac.com> | 2008-02-11 04:19:36 +0000 |
---|---|---|
committer | Nate Begeman <natebegeman@mac.com> | 2008-02-11 04:19:36 +0000 |
commit | d77e59efa7ae2abffd48a834336ee4bedf5399eb (patch) | |
tree | d7bcb670b24ecb227f91407faf81ac2da765ada0 /lib/Target/X86 | |
parent | 10c8575ba747e8255d45e04564b2d216ad9aad18 (diff) | |
download | external_llvm-d77e59efa7ae2abffd48a834336ee4bedf5399eb.zip external_llvm-d77e59efa7ae2abffd48a834336ee4bedf5399eb.tar.gz external_llvm-d77e59efa7ae2abffd48a834336ee4bedf5399eb.tar.bz2 |
Enable SSE4 codegen and pattern matching.
Add some notes to the README.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@46949 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86')
-rw-r--r-- | lib/Target/X86/README-SSE.txt | 20 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 95 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 14 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 134 |
4 files changed, 240 insertions, 23 deletions
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index d3f91bf..d9a03a3 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -761,3 +761,23 @@ an X86 fxor. This means that we need to handle this case in the x86 backend instead of in target independent code. //===---------------------------------------------------------------------===// + +Non-SSE4 insert into 16 x i8 is atrociously bad. + +//===---------------------------------------------------------------------===// + +<2 x i64> extract is substantially worse than <2 x f64>, even if the destination +is memory. + +//===---------------------------------------------------------------------===// + +SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext +sitting between the truncate and the extract. + +//===---------------------------------------------------------------------===// + +INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert +any number of 0.0 simultaneously. Currently we only use it for simple +insertions. + +See comments in LowerINSERT_VECTOR_ELT_SSE4. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f131c57..b8a6b18 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -678,6 +678,33 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); } + + if (Subtarget->hasSSE41()) { + // FIXME: Do we need to handle scalar-to-vector here? + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + + // i8 and i16 vectors are custom, because the source register and source + // memory operand types are not the same width. f32 vectors are + // custom since the immediate controlling the insert encodes additional + // information. 
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + } + } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -3655,10 +3682,34 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { } SDOperand +X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, + SelectionDAG &DAG) { + MVT::ValueType VT = Op.getValueType(); + if (MVT::getSizeInBits(VT) == 8) { + SDOperand Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, VT, Assert); + } else if (MVT::getSizeInBits(VT) == 16) { + SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, VT, Assert); + } + return SDOperand(); +} + + +SDOperand X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDOperand(); + if (Subtarget->hasSSE41()) + return 
LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); + MVT::ValueType VT = Op.getValueType(); // TODO: handle v16i8. if (MVT::getSizeInBits(VT) == 16) { @@ -3699,6 +3750,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, DAG.getIntPtrConstant(0)); } else if (MVT::getSizeInBits(VT) == 64) { + // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b + // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught + // to match extract_elt for f64. unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); if (Idx == 0) return Op; @@ -3724,9 +3778,47 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { } SDOperand +X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG){ + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType EVT = MVT::getVectorElementType(VT); + + SDOperand N0 = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + SDOperand N2 = Op.getOperand(2); + + if ((MVT::getSizeInBits(EVT) == 8) || (MVT::getSizeInBits(EVT) == 16)) { + unsigned Opc = (MVT::getSizeInBits(EVT) == 8) ? X86ISD::PINSRB + : X86ISD::PINSRW; + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue()); + return DAG.getNode(Opc, VT, N0, N1, N2); + } else if (EVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into these + // bits. For example (insert (extract, 3), 2) could be matched by putting + // the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. 
The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue() << 4); + return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2); + } + return SDOperand(); +} + +SDOperand X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { MVT::ValueType VT = Op.getValueType(); MVT::ValueType EVT = MVT::getVectorElementType(VT); + + if (Subtarget->hasSSE41()) + return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); + if (EVT == MVT::i8) return SDOperand(); @@ -5273,7 +5365,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::S2VEC: return "X86ISD::S2VEC"; + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; + case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index b14e3dc..95998b3 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -170,10 +170,22 @@ namespace llvm { /// have to match the operand type. S2VEC, + /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRW. PEXTRW, + /// INSERTPS - Insert any element of a 4 x float vector into any element + /// of a destination 4 x float vector. + INSERTPS, + + /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. 
PINSRW, @@ -493,7 +505,9 @@ namespace llvm { SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG); SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG); SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG); SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG); SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG); SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG); SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 83e446c..e50716b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -35,8 +35,19 @@ def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>; -def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>; -def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insrtps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>; //===----------------------------------------------------------------------===// // SSE 'Special' Instructions @@ -2087,23 +2098,21 @@ def 
PEXTRWri : PDIi8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), - (iPTR imm:$src2)))]>; + imm:$src2))]>; let isTwoAddress = 1 in { def PINSRWrri : PDIi8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (v8i16 (X86pinsrw (v8i16 VR128:$src1), - GR32:$src2, (iPTR imm:$src3))))]>; + (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>; def PINSRWrmi : PDIi8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (v8i16 (X86pinsrw (v8i16 VR128:$src1), - (i32 (anyext (loadi16 addr:$src2))), - (iPTR imm:$src3))))]>; + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))]>; } // Mask creation @@ -3255,7 +3264,7 @@ defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", /// SS41I_binop_rmi_int - SSE 4.1 binary operator with immediate -let isTwoAddress = 1 in { +let Uses = [XMM0], isTwoAddress = 1 in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -3328,26 +3337,44 @@ defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", int_x86_sse41_pmovzxbq>; -/// SS41I_binop_ext8 - SSE 4.1 binary operator with immediate -multiclass SS41I_binop_ext8<bits<8> opc, string OpcodeStr> { +/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set GR32:$dst, (zext - (extractelt (v16i8 VR128:$src1), imm:$src2)))]>, OpSize; + 
[(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, + OpSize; def mr : SS4AI<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (extractelt (v16i8 VR128:$src1), imm:$src2), - addr:$dst)]>, OpSize; + []>, OpSize; +// FIXME: +// There's an AssertZext in the way of writing the store pattern +// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) +} + +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; + + +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + def mr : SS4AI<opc, MRMDestMem, (outs), + (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; +// FIXME: +// There's an AssertZext in the way of writing the store pattern +// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst) } -defm PEXTRB : SS41I_binop_ext8<0x14, "pextrb">; +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; -/// SS41I_binop_ext32 - SSE 4.1 binary operator with immediate -multiclass SS41I_binop_ext32<bits<8> opc, string OpcodeStr> { + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, @@ -3362,10 +3389,11 @@ multiclass SS41I_binop_ext32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -defm PEXTRD : SS41I_binop_ext32<0x16, "pextrd">; +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; + -/// SS41I_binop_extf32 - SSE 4.1 binary operator with immediate -multiclass SS41I_binop_extf32<bits<8> opc, string OpcodeStr> { +/// SS41I_extractf32 - SSE 4.1 extract 32 bits to fp reg or memory destination +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { def rr : SS4AI<opc, MRMSrcReg, (outs 
FR32:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, @@ -3380,5 +3408,65 @@ multiclass SS41I_binop_extf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -defm EXTRACTPS : SS41I_binop_extf32<0x17, "extractps">; +defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; + +let isTwoAddress = 1 in { + multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> { + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, OpSize; + } +} + +defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +let isTwoAddress = 1 in { + multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> { + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, OpSize; + } +} + +defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +let isTwoAddress = 1 in { + multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> { + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, FR32:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize; + def rm 
: SS4AI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (X86insrtps VR128:$src1, (loadf32 addr:$src2), + imm:$src3))]>, OpSize; + } +} +defm INSERTPS : SS41I_insertf32<0x31, "insertps">; |