diff options
author | Nate Begeman <natebegeman@mac.com> | 2009-02-23 08:49:38 +0000 |
---|---|---|
committer | Nate Begeman <natebegeman@mac.com> | 2009-02-23 08:49:38 +0000 |
commit | b9a47b824f6c8ef3989a796018bf974c09cd243f (patch) | |
tree | 811b73672faedd85c04ce43b74adf44775b55b1c /test/CodeGen | |
parent | a9183c8020c3203f7597d1fbfe4244dd99c7981e (diff) | |
download | external_llvm-b9a47b824f6c8ef3989a796018bf974c09cd243f.zip external_llvm-b9a47b824f6c8ef3989a796018bf974c09cd243f.tar.gz external_llvm-b9a47b824f6c8ef3989a796018bf974c09cd243f.tar.bz2 |
Generate better code for v8i16 shuffles on SSE2
Generate better code for v16i8 shuffles on SSE2 (avoids stack)
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for further
cleanups later.
Add new tests that cover the above.
Examples:
New:
_shuf2:
pextrw $7, %xmm0, %eax
punpcklqdq %xmm1, %xmm0
pshuflw $128, %xmm0, %xmm0
pinsrw $2, %eax, %xmm0
Old:
_shuf2:
pextrw $2, %xmm0, %eax
pextrw $7, %xmm0, %ecx
pinsrw $2, %ecx, %xmm0
pinsrw $3, %eax, %xmm0
movd %xmm1, %eax
pinsrw $4, %eax, %xmm0
ret
=========
New:
_shuf4:
punpcklqdq %xmm1, %xmm0
pshufb LCPI1_0, %xmm0
Old:
_shuf4:
pextrw $3, %xmm0, %eax
movsd %xmm1, %xmm0
pextrw $3, %xmm1, %ecx
pinsrw $4, %ecx, %xmm0
pinsrw $5, %eax, %xmm0
========
New:
_shuf1:
pushl %ebx
pushl %edi
pushl %esi
pextrw $1, %xmm0, %eax
rolw $8, %ax
movd %xmm0, %ecx
rolw $8, %cx
pextrw $5, %xmm0, %edx
pextrw $4, %xmm0, %esi
pextrw $3, %xmm0, %edi
pextrw $2, %xmm0, %ebx
movaps %xmm0, %xmm1
pinsrw $0, %ecx, %xmm1
pinsrw $1, %eax, %xmm1
rolw $8, %bx
pinsrw $2, %ebx, %xmm1
rolw $8, %di
pinsrw $3, %edi, %xmm1
rolw $8, %si
pinsrw $4, %esi, %xmm1
rolw $8, %dx
pinsrw $5, %edx, %xmm1
pextrw $7, %xmm0, %eax
rolw $8, %ax
movaps %xmm1, %xmm0
pinsrw $7, %eax, %xmm0
popl %esi
popl %edi
popl %ebx
ret
Old:
_shuf1:
subl $252, %esp
movaps %xmm0, (%esp)
movaps %xmm0, 16(%esp)
movaps %xmm0, 32(%esp)
movaps %xmm0, 48(%esp)
movaps %xmm0, 64(%esp)
movaps %xmm0, 80(%esp)
movaps %xmm0, 96(%esp)
movaps %xmm0, 224(%esp)
movaps %xmm0, 208(%esp)
movaps %xmm0, 192(%esp)
movaps %xmm0, 176(%esp)
movaps %xmm0, 160(%esp)
movaps %xmm0, 144(%esp)
movaps %xmm0, 128(%esp)
movaps %xmm0, 112(%esp)
movzbl 14(%esp), %eax
movd %eax, %xmm1
movzbl 22(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm1, %xmm2
movzbl 42(%esp), %eax
movd %eax, %xmm1
movzbl 50(%esp), %eax
movd %eax, %xmm3
punpcklbw %xmm1, %xmm3
punpcklbw %xmm2, %xmm3
movzbl 77(%esp), %eax
movd %eax, %xmm1
movzbl 84(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm1, %xmm2
movzbl 104(%esp), %eax
movd %eax, %xmm1
punpcklbw %xmm1, %xmm0
punpcklbw %xmm2, %xmm0
movaps %xmm0, %xmm1
punpcklbw %xmm3, %xmm1
movzbl 127(%esp), %eax
movd %eax, %xmm0
movzbl 135(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl 155(%esp), %eax
movd %eax, %xmm0
movzbl 163(%esp), %eax
movd %eax, %xmm3
punpcklbw %xmm0, %xmm3
punpcklbw %xmm2, %xmm3
movzbl 188(%esp), %eax
movd %eax, %xmm0
movzbl 197(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl 217(%esp), %eax
movd %eax, %xmm4
movzbl 225(%esp), %eax
movd %eax, %xmm0
punpcklbw %xmm4, %xmm0
punpcklbw %xmm2, %xmm0
punpcklbw %xmm3, %xmm0
punpcklbw %xmm1, %xmm0
addl $252, %esp
ret
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@65311 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-12.ll | 4 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-13.ll | 4 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-2.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-21.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-28.ll | 33 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-29.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-31.ll | 13 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-32.ll | 13 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-33.ll | 11 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-34.ll | 13 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shuffle-35.ll | 20 |
11 files changed, 85 insertions, 32 deletions
diff --git a/test/CodeGen/X86/vec_shuffle-12.ll b/test/CodeGen/X86/vec_shuffle-12.ll index aad27ea..98b455a 100644 --- a/test/CodeGen/X86/vec_shuffle-12.ll +++ b/test/CodeGen/X86/vec_shuffle-12.ll @@ -1,8 +1,8 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t ; RUN: not grep punpck %t ; RUN: grep pextrw %t | count 4 ; RUN: grep pinsrw %t | count 6 -; RUN: grep pshuflw %t | count 3 +; RUN: grep pshuflw %t | count 1 ; RUN: grep pshufhw %t | count 2 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { diff --git a/test/CodeGen/X86/vec_shuffle-13.ll b/test/CodeGen/X86/vec_shuffle-13.ll index 4511e95..61cd128 100644 --- a/test/CodeGen/X86/vec_shuffle-13.ll +++ b/test/CodeGen/X86/vec_shuffle-13.ll @@ -1,7 +1,7 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t ; RUN: grep movlhps %t | count 1 -; RUN: grep movss %t | count 1 ; RUN: grep pshufd %t | count 1 +; RUN: grep movss %t | count 1 ; RUN: grep pshuflw %t | count 1 ; RUN: grep pshufhw %t | count 1 diff --git a/test/CodeGen/X86/vec_shuffle-2.ll b/test/CodeGen/X86/vec_shuffle-2.ll index ae69801..b049565 100644 --- a/test/CodeGen/X86/vec_shuffle-2.ll +++ b/test/CodeGen/X86/vec_shuffle-2.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f ; RUN: grep pshufhw %t | count 1 ; RUN: grep pshuflw %t | count 1 ; RUN: grep movhps %t | count 1 diff --git a/test/CodeGen/X86/vec_shuffle-21.ll b/test/CodeGen/X86/vec_shuffle-21.ll index 5409acf..eec2691 100644 --- a/test/CodeGen/X86/vec_shuffle-21.ll +++ b/test/CodeGen/X86/vec_shuffle-21.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f ; RUN: grep pshuflw %t | count 1 ; RUN: grep pextrw %t | count 2 ; RUN: grep pinsrw %t | count 2 diff --git a/test/CodeGen/X86/vec_shuffle-28.ll 
b/test/CodeGen/X86/vec_shuffle-28.ll index 0c81e77..f7e5001 100644 --- a/test/CodeGen/X86/vec_shuffle-28.ll +++ b/test/CodeGen/X86/vec_shuffle-28.ll @@ -1,8 +1,12 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f -; RUN: grep punpcklwd %t | count 1 -; RUN: grep pextrw %t | count 6 -; RUN: grep pinsrw %t | count 8 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep movd %t | count 1 +; RUN: grep pshuflw %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 1 +; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently. +; Don't XFAIL it because it's still better than the previous code. ; Pack various elements via shuffles. define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { @@ -10,24 +14,3 @@ entry: %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp7 } - - -define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp8 -} - - -define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > - ret <8 x i16> %tmp9 -} - - -define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > - ret <8 x i16> %tmp9 -} diff --git a/test/CodeGen/X86/vec_shuffle-29.ll b/test/CodeGen/X86/vec_shuffle-29.ll index aac63c3..49cb0a7 100644 --- a/test/CodeGen/X86/vec_shuffle-29.ll +++ b/test/CodeGen/X86/vec_shuffle-29.ll @@ -1,4 +1,4 @@ -; 
RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -disable-mmx -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41,-ssse3 -disable-mmx -o %t -f ; RUN: not grep pextrw %t ; RUN: grep pinsrw %t diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll new file mode 100644 index 0000000..0a9dc1f --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-31.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep pextrw %t | count 1 +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pshufhw %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 1 + +define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > + ret <8 x i16> %tmp9 +} diff --git a/test/CodeGen/X86/vec_shuffle-32.ll b/test/CodeGen/X86/vec_shuffle-32.ll new file mode 100644 index 0000000..3a81948 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-32.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pextrw %t | count 1 +; RUN: grep pshufd %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 1 + +define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > + ret <8 x i16> %tmp9 +} diff --git a/test/CodeGen/X86/vec_shuffle-33.ll b/test/CodeGen/X86/vec_shuffle-33.ll new file mode 100644 index 0000000..e3d6304 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-33.ll @@ -0,0 +1,11 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pshufhw %t | 
count 1 +; RUN: not grep pextrw %t +; RUN: not grep pinsrw %t + +define <8 x i16> @shuf5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef > + ret <8 x i16> %tmp9 +} diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll new file mode 100644 index 0000000..99c95d1 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-34.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep pextrw %t | count 1 +; RUN: grep punpcklqdq %t | count 1 +; RUN: grep pshuflw %t | count 1 +; RUN: grep pinsrw %t | count 1 +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 2 + +define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +entry: + %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > + ret <8 x i16> %tmp8 +} diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll new file mode 100644 index 0000000..f2b241f --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-35.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f +; RUN: grep pextrw %t | count 13 +; RUN: grep pinsrw %t | count 14 +; RUN: grep rolw %t | count 13 +; RUN: not grep esp %t +; RUN: not grep ebp %t +; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f +; RUN: grep pshufb %t | count 3 + +define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone { +entry: + %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > + ret <16 x i8> %tmp8 +} + +define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { +entry: + %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 
undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > + ret <16 x i8> %tmp8 +} |