-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp |  47
-rwxr-xr-x  test/CodeGen/X86/avx-sext.ll       | 119
2 files changed, 135 insertions, 31 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 444163d..fd88307 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16043,14 +16043,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   ISD::LoadExtType Ext = Ld->getExtensionType();
 
   // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. We need SSSE3 shuffles.
-  // SEXT loads are suppoted starting SSE41.
-  // We generate X86ISD::VSEXT for them.
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and an arithmetic shift.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() &&
-      ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
-       (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))){
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");
 
@@ -16143,9 +16143,40 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     unsigned SizeRatio = RegSz/MemSz;
 
     if (Ext == ISD::SEXTLOAD) {
-      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
-      return DCI.CombineTo(N, Sext, TF, true);
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      SmallVector<SDValue, 8> C(NumElems,
+                                DAG.getConstant(Amt, RegVT.getScalarType()));
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+      return DCI.CombineTo(N, Shuff, TF, true);
     }
+
     // Redistribute the loaded elements into the different locations.
     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
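
The heart of the new pre-SSE4.1 SEXTLOAD path is the shuffle-then-shift idiom: the shuffle mask (ShuffleVec[i*SizeRatio + SizeRatio-1] = i) parks each narrow element in the high bits of its wide lane, and the arithmetic right shift by Amt then smears the sign bit across the rest of the lane. Below is a minimal standalone sketch of the same trick on one scalar lane; it is not code from the patch, the function name is ours, and it assumes two's-complement conversion and an arithmetic right shift for signed integers (both guaranteed only since C++20, though near-universal in practice):

// Hypothetical illustration, not part of the patch: sign-extend i16 -> i32
// via a "shuffle into the high half" plus an arithmetic shift right by
// 32 - 16 = 16, the same Amt the patch computes per element.
#include <cstdint>
#include <cstdio>

int32_t sext_via_shift(int16_t x) {
  // Park the 16-bit payload in the high half of the 32-bit lane; the low
  // half is don't-care, like the undef elements left by the shuffle.
  uint32_t widened = static_cast<uint32_t>(static_cast<uint16_t>(x)) << 16;
  // The arithmetic shift replicates the sign bit down over the junk bits.
  return static_cast<int32_t>(widened) >> 16;
}

int main() {
  printf("%d %d\n", sext_via_shift(-5), sext_via_shift(1234)); // -5 1234
  return 0;
}

This is also why the patch bails out through the ISD::SRA legality check: the idiom only pays off when the target can actually shift the wide element type in a vector register, and otherwise scalarized movsx loads are the better expansion.
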
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index 425d09c..8d7d79d 100755
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -1,69 +1,142 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
 
 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_8i16_to_8i32
-;CHECK: vpmovsxwd
+; AVX: sext_8i16_to_8i32
+; AVX: vpmovsxwd
 
   %B = sext <8 x i16> %A to <8 x i32>
   ret <8 x i32>%B
 }
 
 define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_4i32_to_4i64
-;CHECK: vpmovsxdq
+; AVX: sext_4i32_to_4i64
+; AVX: vpmovsxdq
 
   %B = sext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
 
-; CHECK: load_sext_test1
-; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test1
+; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test1
+; SSSE3: movq
+; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSSE3: psrad $16
+; SSSE3: ret
+
+; SSE2: load_sext_test1
+; SSE2: movq
+; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSE2: psrad $16
+; SSE2: ret
 define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
 %X = load <4 x i16>* %ptr
 %Y = sext <4 x i16> %X to <4 x i32>
 ret <4 x i32>%Y
 }
 
-; CHECK: load_sext_test2
-; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test2
+; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test2
+; SSSE3: movd
+; SSSE3: pshufb
+; SSSE3: psrad $24
+; SSSE3: ret
+
+; SSE2: load_sext_test2
+; SSE2: movl
+; SSE2: psrad $24
+; SSE2: ret
 define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
 %X = load <4 x i8>* %ptr
 %Y = sext <4 x i8> %X to <4 x i32>
 ret <4 x i32>%Y
 }
 
-; CHECK: load_sext_test3
-; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test3
+; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test3
+; SSSE3: movsbq
+; SSSE3: movsbq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test3
+; SSE2: movsbq
+; SSE2: movsbq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
 %X = load <2 x i8>* %ptr
 %Y = sext <2 x i8> %X to <2 x i64>
 ret <2 x i64>%Y
 }
 
-; CHECK: load_sext_test4
-; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test4
+; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test4
+; SSSE3: movswq
+; SSSE3: movswq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test4
+; SSE2: movswq
+; SSE2: movswq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
 %X = load <2 x i16>* %ptr
 %Y = sext <2 x i16> %X to <2 x i64>
 ret <2 x i64>%Y
 }
 
-; CHECK: load_sext_test5
-; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test5
+; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test5
+; SSSE3: movslq
+; SSSE3: movslq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test5
+; SSE2: movslq
+; SSE2: movslq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
 %X = load <2 x i32>* %ptr
 %Y = sext <2 x i32> %X to <2 x i64>
 ret <2 x i64>%Y
 }
 
-; CHECK: load_sext_test6
-; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test6
+; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test6
+; SSSE3: movq
+; SSSE3: punpcklbw
+; SSSE3: psraw $8
+; SSSE3: ret
+
+; SSE2: load_sext_test6
+; SSE2: movq
+; SSE2: punpcklbw
+; SSE2: psraw $8
+; SSE2: ret
 define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
 %X = load <8 x i8>* %ptr
 %Y = sext <8 x i8> %X to <8 x i16>
 ret <8 x i16>%Y
 }
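
For the <4 x i16> to <4 x i32> case, the SSE2 sequence the updated load_sext_test1 now checks (movq, punpcklwd, psrad $16) can be written by hand with intrinsics. The following is a sketch of that expected codegen, not code from the patch, and the function name is ours:

// Hand-written SSE2 equivalent of the combine's output for
// load <4 x i16> + sext to <4 x i32>.
#include <emmintrin.h>  // SSE2 intrinsics

__m128i sext_4i16_to_4i32(const short *p) {
  // movq: pull the four 16-bit elements into the low 64 bits of an XMM reg.
  __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(p));
  // punpcklwd: interleave so each element lands in the high 16 bits of its
  // 32-bit lane (the low 16 bits hold a junk copy standing in for undef).
  __m128i hi = _mm_unpacklo_epi16(v, v);
  // psrad $16: the arithmetic shift smears the sign bit over the junk.
  return _mm_srai_epi32(hi, 16);
}

On SSE4.1 and later the same function collapses to a single pmovsxwd from memory, which is exactly why the patch keeps the direct VSEXT path behind Subtarget->hasSSE41() and uses the shuffle-plus-shift expansion only as the fallback.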