diff options
author | fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-10-13 07:14:02 +0000 |
---|---|---|
committer | fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-10-13 07:14:02 +0000 |
commit | a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed (patch) | |
tree | 799554004578e3a5416eb70b353624feb81e63e4 /media | |
parent | 6b1753827c4f458b55eae64bfa0f8a699ebcee4c (diff) | |
download | chromium_src-a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed.zip chromium_src-a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed.tar.gz chromium_src-a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed.tar.bz2 |
64 bit linux yuv
BUG=23263
TEST=media_unittest should pass and run much faster: faster than 32 bit even.
Review URL: http://codereview.chromium.org/268029
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28802 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- | media/base/yuv_row.h | 2 |
-rw-r--r-- | media/base/yuv_row_linux.cc | 158 |
2 files changed, 120 insertions, 40 deletions
diff --git a/media/base/yuv_row.h b/media/base/yuv_row.h index 03ebf03..31f1788 100644 --- a/media/base/yuv_row.h +++ b/media/base/yuv_row.h @@ -64,7 +64,7 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, #if !defined(USE_MMX) // Windows, Mac and Linux x86 use MMX; x64 and other CPUs do not. -#if defined(ARCH_CPU_X86) +#if defined(ARCH_CPU_X86) || (defined(ARCH_CPU_X86_64) && defined(OS_LINUX)) #define USE_MMX 1 #else #define USE_MMX 0 diff --git a/media/base/yuv_row_linux.cc b/media/base/yuv_row_linux.cc index 9410c2f..5825960 100644 --- a/media/base/yuv_row_linux.cc +++ b/media/base/yuv_row_linux.cc @@ -245,45 +245,123 @@ MMX_ALIGNED(int16 kCoefficientsRgbV[256][4]) = { #undef RGBV #undef MMX_ALIGNED -// TODO(fbarchard): Use the following function instead of -// pure assembly to help make code more portable to 64 bit -// and Mac, which has different labels. -// no-gcse eliminates the frame pointer, freeing up ebp. - -#if defined(FUTURE_64BIT_VERSION) -void __attribute__((optimize("O2", "no-gcse"))) - NewFastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +#if defined(ARCH_CPU_X86_64) + +void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width); // r8 + asm( - "shr %4\n" -"1:\n" - "movzb (%1),%%eax\n" - "add $0x1,%1\n" - "movzb (%2),%%ebx\n" - "add $0x1,%2\n" - "movq kCoefficientsRgbU(,%%eax,8),%%mm0\n" - "movzb (%0),%%eax\n" - "paddsw kCoefficientsRgbV(,%%ebx,8),%%mm0\n" - "movzb 0x1(%0),%%ebx\n" - "movq kCoefficientsRgbY(,%%eax,8),%%mm1\n" - "add $0x2,%0\n" - "movq kCoefficientsRgbY(,%%ebx,8),%%mm2\n" - "paddsw %%mm0,%%mm1\n" - "paddsw %%mm0,%%mm2\n" - "psraw $0x6,%%mm1\n" - "psraw $0x6,%%mm2\n" - "packuswb %%mm2,%%mm1\n" - "movntq %%mm1,0x0(%3)\n" - "add $0x8,%3\n" - "sub $0x1,%4\n" - "jne 1b\n" - : : "r"(y_buf),"r"(u_buf),"r"(v_buf),"r"(rgb_buf),"r"(width) - : "eax","ebx"); -} -#endif + ".global 
FastConvertYUVToRGB32Row\n" +"FastConvertYUVToRGB32Row:\n" + "jmp convertend\n" + +"convertloop:" + "movzb (%rsi),%r10\n" + "add $0x1,%rsi\n" + "movzb (%rdx),%r11\n" + "add $0x1,%rdx\n" + "movq kCoefficientsRgbU(,%r10,8),%mm0\n" + "movzb (%rdi),%r10\n" + "paddsw kCoefficientsRgbV(,%r11,8),%mm0\n" + "movzb 0x1(%rdi),%r11\n" + "movq kCoefficientsRgbY(,%r10,8),%mm1\n" + "add $0x2,%rdi\n" + "movq kCoefficientsRgbY(,%r11,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%rcx)\n" + "add $0x8,%rcx\n" +"convertend:" + "sub $0x2,%r8\n" + "jns convertloop\n" + + "and $0x1,%r8\n" + "je convertdone\n" + + "movzb (%rsi),%r10\n" + "movq kCoefficientsRgbU(,%r10,8),%mm0\n" + "movzb (%rdx),%r10\n" + "paddsw kCoefficientsRgbV(,%r10,8),%mm0\n" + "movzb (%rdi),%r10\n" + "movq kCoefficientsRgbY(,%r10,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%rcx)\n" +"convertdone:" + "ret\n" +); + + +void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width, // r8 + int scaled_dx); // r9 + + asm( + ".global ScaleYUVToRGB32Row\n" +"ScaleYUVToRGB32Row:\n" + "xor %r11,%r11\n" + "jmp scaleend\n" + +"scaleloop:" + "mov %r11,%r10\n" + "sar $0x5,%r10\n" + "movzb (%rsi,%r10,1),%rax\n" + "movq kCoefficientsRgbU(,%rax,8),%mm0\n" + "movzb (%rdx,%r10,1),%rax\n" + "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n" + "mov %r11,%r10\n" + "add %r9,%r11\n" + "sar $0x4,%r10\n" + "movzb (%rdi,%r10,1),%rax\n" + "movq kCoefficientsRgbY(,%rax,8),%mm1\n" + "mov %r11,%r10\n" + "add %r9,%r11\n" + "sar $0x4,%r10\n" + "movzb (%rdi,%r10,1),%rax\n" + "movq kCoefficientsRgbY(,%rax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%rcx)\n" + "add $0x8,%rcx\n" +"scaleend:" + "sub $0x2,%r8\n" + "jns scaleloop\n" + 
+ "and $0x1,%r8\n" + "je scaledone\n" + + "mov %r11,%r10\n" + "sar $0x5,%r10\n" + "movzb (%rsi,%r10,1),%rax\n" + "movq kCoefficientsRgbU(,%rax,8),%mm0\n" + "movzb (%rdx,%r10,1),%rax\n" + "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n" + "sar $0x4,%r11\n" + "movzb (%rdi,%r11,1),%rax\n" + "movq kCoefficientsRgbY(,%rax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%rcx)\n" + +"scaledone:" + "ret\n" +); + +#else void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, @@ -418,6 +496,8 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, "ret\n" ); +#endif + #else // USE_MMX // Reference version of YUV converter. @@ -601,6 +681,6 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, scaled_x += scaled_dx; } } -#endif // USE_MMX +#endif // USE_MMX } // extern "C" |