-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp |  47
-rwxr-xr-x  test/CodeGen/X86/avx-sext.ll       | 119
2 files changed, 135 insertions, 31 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 444163d..fd88307 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16043,14 +16043,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   ISD::LoadExtType Ext = Ld->getExtensionType();
 
   // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. We need SSSE3 shuffles.
-  // SEXT loads are suppoted starting SSE41.
-  // We generate X86ISD::VSEXT for them.
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and an arithmetic shift.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() &&
-      ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
-       (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))){
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");
 
@@ -16143,9 +16143,40 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     unsigned SizeRatio = RegSz/MemSz;
 
     if (Ext == ISD::SEXTLOAD) {
-      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
-      return DCI.CombineTo(N, Sext, TF, true);
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      SmallVector<SDValue, 8> C(NumElems,
+                                DAG.getConstant(Amt, RegVT.getScalarType()));
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+      return DCI.CombineTo(N, Shuff, TF, true);
     }
+
     // Redistribute the loaded elements into the different locations.
     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
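
The heart of the new pre-SSE4.1 SEXTLOAD path is the shuffle-then-shift idiom: the shuffle mask (ShuffleVec[i*SizeRatio + SizeRatio-1] = i) parks each narrow element in the high bits of its wide lane, and the arithmetic right shift by Amt then smears the sign bit across the rest of the lane. Below is a minimal standalone sketch of the same trick on one scalar lane; it is not code from the patch, the function name is ours, and it assumes two's-complement conversion and an arithmetic right shift for signed integers (both guaranteed only since C++20, though near-universal in practice):

// Hypothetical illustration, not part of the patch: sign-extend i16 -> i32
// via a "shuffle into the high half" plus an arithmetic shift right by
// 32 - 16 = 16, the same Amt the patch computes per element.
#include <cstdint>
#include <cstdio>

int32_t sext_via_shift(int16_t x) {
  // Park the 16-bit payload in the high half of the 32-bit lane; the low
  // half is don't-care, like the undef elements left by the shuffle.
  uint32_t widened = static_cast<uint32_t>(static_cast<uint16_t>(x)) << 16;
  // The arithmetic shift replicates the sign bit down over the junk bits.
  return static_cast<int32_t>(widened) >> 16;
}

int main() {
  printf("%d %d\n", sext_via_shift(-5), sext_via_shift(1234)); // -5 1234
  return 0;
}

This is also why the patch bails out through the ISD::SRA legality check: the idiom only pays off when the target can actually shift the wide element type in a vector register, and otherwise scalarized movsx loads are the better expansion.
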
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index 425d09c..8d7d79d 100755
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -1,69 +1,142 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
 
 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_8i16_to_8i32
-;CHECK: vpmovsxwd
+; AVX: sext_8i16_to_8i32
+; AVX: vpmovsxwd
 
   %B = sext <8 x i16> %A to <8 x i32>
   ret <8 x i32>%B
 }
 
 define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_4i32_to_4i64
-;CHECK: vpmovsxdq
+; AVX: sext_4i32_to_4i64
+; AVX: vpmovsxdq
 
   %B = sext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
 
-; CHECK: load_sext_test1
-; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test1
+; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test1
+; SSSE3: movq
+; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSSE3: psrad $16
+; SSSE3: ret
+
+; SSE2: load_sext_test1
+; SSE2: movq
+; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSE2: psrad $16
+; SSE2: ret
 define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
 %X = load <4 x i16>* %ptr
 %Y = sext <4 x i16> %X to <4 x i32>
 ret <4 x i32>%Y
 }
 
-; CHECK: load_sext_test2
-; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test2
+; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test2
+; SSSE3: movd
+; SSSE3: pshufb
+; SSSE3: psrad $24
+; SSSE3: ret
+
+; SSE2: load_sext_test2
+; SSE2: movl
+; SSE2: psrad $24
+; SSE2: ret
 define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
 %X = load <4 x i8>* %ptr
 %Y = sext <4 x i8> %X to <4 x i32>
 ret <4 x i32>%Y
 }
 
-; CHECK: load_sext_test3
-; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test3
+; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test3
+; SSSE3: movsbq
+; SSSE3: movsbq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test3
+; SSE2: movsbq
+; SSE2: movsbq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
 %X = load <2 x i8>* %ptr
 %Y = sext <2 x i8> %X to <2 x i64>
 ret <2 x i64>%Y
 }
 
-; CHECK: load_sext_test4
-; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test4
+; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test4
+; SSSE3: movswq
+; SSSE3: movswq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test4
+; SSE2: movswq
+; SSE2: movswq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
 %X = load <2 x i16>* %ptr
 %Y = sext <2 x i16> %X to <2 x i64>
 ret <2 x i64>%Y
 }
 
-; CHECK: load_sext_test5
-; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test5
+; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test5
+; SSSE3: movslq
+; SSSE3: movslq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test5
+; SSE2: movslq
+; SSE2: movslq
+; SSE2: punpcklqdq
+; SSE2: ret
 define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
 %X = load <2 x i32>* %ptr
 %Y = sext <2 x i32> %X to <2 x i64>
 ret <2 x i64>%Y
 }
 
-; CHECK: load_sext_test6
-; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; CHECK: ret
+; AVX: load_sext_test6
+; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test6
+; SSSE3: movq
+; SSSE3: punpcklbw
+; SSSE3: psraw $8
+; SSSE3: ret
+
+; SSE2: load_sext_test6
+; SSE2: movq
+; SSE2: punpcklbw
+; SSE2: psraw $8
+; SSE2: ret
 define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
 %X = load <8 x i8>* %ptr
 %Y = sext <8 x i8> %X to <8 x i16>
 ret <8 x i16>%Y
 }
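
For the <4 x i16> to <4 x i32> case, the SSE2 sequence the updated load_sext_test1 now checks (movq, punpcklwd, psrad $16) can be written by hand with intrinsics. The following is a sketch of that expected codegen, not code from the patch, and the function name is ours:

// Hand-written SSE2 equivalent of the combine's output for
// load <4 x i16> + sext to <4 x i32>.
#include <emmintrin.h>  // SSE2 intrinsics

__m128i sext_4i16_to_4i32(const short *p) {
  // movq: pull the four 16-bit elements into the low 64 bits of an XMM reg.
  __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(p));
  // punpcklwd: interleave so each element lands in the high 16 bits of its
  // 32-bit lane (the low 16 bits hold a junk copy standing in for undef).
  __m128i hi = _mm_unpacklo_epi16(v, v);
  // psrad $16: the arithmetic shift smears the sign bit over the junk.
  return _mm_srai_epi32(hi, 16);
}

On SSE4.1 and later the same function collapses to a single pmovsxwd from memory, which is exactly why the patch keeps the direct VSEXT path behind Subtarget->hasSSE41() and uses the shuffle-plus-shift expansion only as the fallback.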