From 8f40f7b8676ae7931baaecb1046a21f09471384b Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Sun, 1 Jul 2012 06:12:26 +0000 Subject: Optimization of shuffle node that can fit to the register form of VBROADCAST instruction on AVX2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@159504 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 12 ++++++++++-- lib/Target/X86/X86InstrSSE.td | 33 +++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e6a0df7..ba66593 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5047,8 +5047,16 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { SDValue Sc = Op.getOperand(0); if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && - Sc.getOpcode() != ISD::BUILD_VECTOR) - return SDValue(); + Sc.getOpcode() != ISD::BUILD_VECTOR) { + + if (!Subtarget->hasAVX2()) + return SDValue(); + + // Use the register form of the broadcast instruction available on AVX2. + if (VT.is256BitVector()) + Sc = Extract128BitVector(Sc, 0, DAG, dl); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index ad8d15d..5319455 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7272,8 +7272,8 @@ let ExeDomain = SSEPackedSingle in { int_x86_avx2_vbroadcast_ss_ps_256>; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256>; +def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, + int_x86_avx2_vbroadcast_sd_pd_256>; let Predicates = [HasAVX2] in def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, @@ -7684,6 +7684,31 @@ let Predicates = [HasAVX2] in { def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), (VPBROADCASTQYrm addr:$src)>; + def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBrr VR128:$src)>; + def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBYrr VR128:$src)>; + def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWrr VR128:$src)>; + def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWYrr VR128:$src)>; + def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDrr VR128:$src)>; + def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDYrr VR128:$src)>; + def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQYrr VR128:$src)>; + def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSrr VR128:$src)>; + def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSYrr VR128:$src)>; + def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), + (VBROADCASTSDYrr VR128:$src)>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. let AddedComplexity = 20 in { @@ -7694,7 +7719,7 @@ let Predicates = [HasAVX2] in { (VBROADCASTSSYrr (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), - (VBROADCASTSDrr + (VBROADCASTSDYrr (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>; def : Pat<(v4i32 (X86VBroadcast GR32:$src)), @@ -7704,7 +7729,7 @@ let Predicates = [HasAVX2] in { (VBROADCASTSSYrr (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), - (VBROADCASTSDrr + (VBROADCASTSDYrr (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd))>; } } -- cgit v1.1