-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp |  47
-rwxr-xr-x  test/CodeGen/X86/avx-sext.ll       | 119
2 files changed, 135 insertions, 31 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 444163d..fd88307 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16043,14 +16043,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
ISD::LoadExtType Ext = Ld->getExtensionType();
// If this is a vector EXT Load then attempt to optimize it using a
- // shuffle. We need SSSE3 shuffles.
- // SEXT loads are suppoted starting SSE41.
- // We generate X86ISD::VSEXT for them.
+ // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+ // expansion is still better than scalar code.
+ // We generate X86ISD::VSEXT for SEXTLOADs if SSE4.1 is available, otherwise
+ // we emit a shuffle and an arithmetic shift.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
- if (RegVT.isVector() && RegVT.isInteger() &&
- ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
- (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))){
+ if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+ (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
@@ -16143,9 +16143,40 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
unsigned SizeRatio = RegSz/MemSz;
if (Ext == ISD::SEXTLOAD) {
- SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
- return DCI.CombineTo(N, Sext, TF, true);
+ // If we have SSE4.1 we can directly emit a VSEXT node.
+ if (Subtarget->hasSSE41()) {
+ SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ return DCI.CombineTo(N, Sext, TF, true);
+ }
+
+ // Otherwise we'll shuffle the small elements into the high bits of the
+ // larger type and perform an arithmetic shift. If the shift is not legal,
+ // it's better to scalarize.
+ if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+ return SDValue();
+
+ // Redistribute the loaded elements into the different locations.
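+ // For example, sign extending <4 x i16> to <4 x i32> (SizeRatio == 2) uses
+ // the mask <-1,0,-1,1,-1,2,-1,3>, so on little-endian x86 each i16 lands in
+ // the high half of its 32-bit lane before the shift below.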
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+ // Build the arithmetic shift.
+ unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+ MemVT.getVectorElementType().getSizeInBits();
+ SmallVector<SDValue, 8> C(NumElems,
+ DAG.getConstant(Amt, RegVT.getScalarType()));
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+ Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+ return DCI.CombineTo(N, Shuff, TF, true);
}
+
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
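
As an illustration of the SSE2 fallback above, here is a minimal intrinsics
sketch (not part of the patch) of the sequence the combine produces for a
sign-extending load of <4 x i16> to <4 x i32>. The function name and the reuse
of the loaded value as both unpack operands are illustrative; the output
corresponds to the movq/punpcklwd/psrad $16 sequence checked in the test below.

#include <emmintrin.h>
#include <stdint.h>

// Sketch of the SSE2 lowering: place each 16-bit element in the high half of
// a 32-bit lane, then arithmetic-shift right to replicate the sign bit.
static __m128i sext_load_4i16_to_4i32(const int16_t *ptr) {
  __m128i v  = _mm_loadl_epi64((const __m128i *)ptr); // movq: load 4 x i16
  __m128i hi = _mm_unpacklo_epi16(v, v);              // punpcklwd: i16 into high halves
  return _mm_srai_epi32(hi, 16);                      // psrad $16: sign extend
}
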
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index 425d09c..8d7d79d 100755
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -1,69 +1,142 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_8i16_to_8i32
-;CHECK: vpmovsxwd
+; AVX: sext_8i16_to_8i32
+; AVX: vpmovsxwd
%B = sext <8 x i16> %A to <8 x i32>
ret <8 x i32>%B
}
define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_4i32_to_4i64
-;CHECK: vpmovsxdq
+; AVX: sext_4i32_to_4i64
+; AVX: vpmovsxdq
%B = sext <4 x i32> %A to <4 x i64>
ret <4 x i64>%B
}
-; CHECK: load_sext_test1
-; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test1
+; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test1
+; SSSE3: movq
+; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSSE3: psrad $16
+; SSSE3: ret
+
+; SSE2: load_sext_test1
+; SSE2: movq
+; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSE2: psrad $16
+; SSE2: ret
define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
%X = load <4 x i16>* %ptr
%Y = sext <4 x i16> %X to <4 x i32>
ret <4 x i32>%Y
}
-; CHECK: load_sext_test2
-; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test2
+; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test2
+; SSSE3: movd
+; SSSE3: pshufb
+; SSSE3: psrad $24
+; SSSE3: ret
+
+; SSE2: load_sext_test2
+; SSE2: movl
+; SSE2: psrad $24
+; SSE2: ret
define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
%X = load <4 x i8>* %ptr
%Y = sext <4 x i8> %X to <4 x i32>
ret <4 x i32>%Y
}
-; CHECK: load_sext_test3
-; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test3
+; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test3
+; SSSE3: movsbq
+; SSSE3: movsbq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test3
+; SSE2: movsbq
+; SSE2: movsbq
+; SSE2: punpcklqdq
+; SSE2: ret
define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
%X = load <2 x i8>* %ptr
%Y = sext <2 x i8> %X to <2 x i64>
ret <2 x i64>%Y
}
-; CHECK: load_sext_test4
-; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test4
+; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test4
+; SSSE3: movswq
+; SSSE3: movswq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test4
+; SSE2: movswq
+; SSE2: movswq
+; SSE2: punpcklqdq
+; SSE2: ret
define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
%X = load <2 x i16>* %ptr
%Y = sext <2 x i16> %X to <2 x i64>
ret <2 x i64>%Y
}
-; CHECK: load_sext_test5
-; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test5
+; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test5
+; SSSE3: movslq
+; SSSE3: movslq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test5
+; SSE2: movslq
+; SSE2: movslq
+; SSE2: punpcklqdq
+; SSE2: ret
define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
%X = load <2 x i32>* %ptr
%Y = sext <2 x i32> %X to <2 x i64>
ret <2 x i64>%Y
}
-; CHECK: load_sext_test6
-; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test6
+; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test6
+; SSSE3: movq
+; SSSE3: punpcklbw
+; SSSE3: psraw $8
+; SSSE3: ret
+
+; SSE2: load_sext_test6
+; SSE2: movq
+; SSE2: punpcklbw
+; SSE2: psraw $8
+; SSE2: ret
define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
%X = load <8 x i8>* %ptr
%Y = sext <8 x i8> %X to <8 x i16>