summaryrefslogtreecommitdiffstats
path: root/media/base
diff options
context:
space:
mode:
authorfbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-10-13 07:14:02 +0000
committerfbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-10-13 07:14:02 +0000
commita9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed (patch)
tree799554004578e3a5416eb70b353624feb81e63e4 /media/base
parent6b1753827c4f458b55eae64bfa0f8a699ebcee4c (diff)
downloadchromium_src-a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed.zip
chromium_src-a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed.tar.gz
chromium_src-a9b4babd9d2b4a95ab24a5dd8c61d2ab1c3881ed.tar.bz2
64 bit linux yuv
BUG=23263 TEST=media_unittest should pass and run much faster, even faster than the 32-bit build. Review URL: http://codereview.chromium.org/268029 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28802 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base')
-rw-r--r--media/base/yuv_row.h2
-rw-r--r--media/base/yuv_row_linux.cc158
2 files changed, 120 insertions, 40 deletions
diff --git a/media/base/yuv_row.h b/media/base/yuv_row.h
index 03ebf03..31f1788 100644
--- a/media/base/yuv_row.h
+++ b/media/base/yuv_row.h
@@ -64,7 +64,7 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
#if !defined(USE_MMX)
// Windows, Mac and Linux x86 use MMX, as does Linux x64; other x64 builds and other CPUs do not.
-#if defined(ARCH_CPU_X86)
+#if defined(ARCH_CPU_X86) || (defined(ARCH_CPU_X86_64) && defined(OS_LINUX))
#define USE_MMX 1
#else
#define USE_MMX 0
diff --git a/media/base/yuv_row_linux.cc b/media/base/yuv_row_linux.cc
index 9410c2f..5825960 100644
--- a/media/base/yuv_row_linux.cc
+++ b/media/base/yuv_row_linux.cc
@@ -245,45 +245,123 @@ MMX_ALIGNED(int16 kCoefficientsRgbV[256][4]) = {
#undef RGBV
#undef MMX_ALIGNED
-// TODO(fbarchard): Use the following function instead of
-// pure assembly to help make code more portable to 64 bit
-// and Mac, which has different labels.
-// no-gcse eliminates the frame pointer, freeing up ebp.
-
-#if defined(FUTURE_64BIT_VERSION)
-void __attribute__((optimize("O2", "no-gcse")))
- NewFastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+#if defined(ARCH_CPU_X86_64)
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi: two bytes read per loop pass, each indexes kCoefficientsRgbY
+ const uint8* u_buf, // rsi: one byte read per loop pass, indexes kCoefficientsRgbU
+ const uint8* v_buf, // rdx: one byte read per loop pass, indexes kCoefficientsRgbV
+ uint8* rgb_buf, // rcx: destination, 8 bytes written per loop pass
+ int width); // r8: count, consumed two per loop pass; an odd remainder is handled after the loop
+
asm( // File-scope assembly supplying the body of the prototype above; arguments are used straight from the registers noted on the parameters.
+ ".global FastConvertYUVToRGB32Row\n"
+"FastConvertYUVToRGB32Row:\n" // NOTE(review): no section directive precedes this label, so the code lands in whichever section is current - confirm it is .text
+ "jmp convertend\n" // enter at the loop test so a width below 2 never runs the loop body
+
+"convertloop:" // no "\n" here: the label shares an assembler line with the next instruction
+ "movzb (%rsi),%r10\n" // r10 = *u_buf, zero-extended
+ "add $0x1,%rsi\n" // u_buf += 1
+ "movzb (%rdx),%r11\n" // r11 = *v_buf, zero-extended
+ "add $0x1,%rdx\n" // v_buf += 1
+ "movq kCoefficientsRgbU(,%r10,8),%mm0\n" // mm0 = 8-byte U table entry
+ "movzb (%rdi),%r10\n" // r10 = y_buf[0]
+ "paddsw kCoefficientsRgbV(,%r11,8),%mm0\n" // mm0 += V table entry (signed saturating add per 16-bit lane)
+ "movzb 0x1(%rdi),%r11\n" // r11 = y_buf[1]
+ "movq kCoefficientsRgbY(,%r10,8),%mm1\n" // mm1 = Y table entry for the first pixel
+ "add $0x2,%rdi\n" // y_buf += 2
+ "movq kCoefficientsRgbY(,%r11,8),%mm2\n" // mm2 = Y table entry for the second pixel
+ "paddsw %mm0,%mm1\n" // both pixels receive the same U+V contribution
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n" // arithmetic right shift of every 16-bit lane by 6
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n" // saturate lanes to unsigned bytes: low 4 bytes from mm1, high 4 bytes from mm2
+ "movntq %mm1,0x0(%rcx)\n" // non-temporal 8-byte store of both pixels
+ "add $0x8,%rcx\n" // rgb_buf += 8
+"convertend:"
+ "sub $0x2,%r8\n" // width -= 2; NOTE(review): width is declared int but the full 64-bit %r8 is used - confirm the upper half is well-defined at call sites
+ "jns convertloop\n" // result still non-negative: at least two more were available
+
+ "and $0x1,%r8\n" // subtracting 2 preserves parity, so bit 0 is set only for an odd original width
+ "je convertdone\n" // even width: nothing left to do
+
+ "movzb (%rsi),%r10\n" // odd tail: same table sums as the loop, for a single Y byte
+ "movq kCoefficientsRgbU(,%r10,8),%mm0\n"
+ "movzb (%rdx),%r10\n"
+ "paddsw kCoefficientsRgbV(,%r10,8),%mm0\n"
+ "movzb (%rdi),%r10\n"
+ "movq kCoefficientsRgbY(,%r10,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n" // pack with itself; only the low 4 bytes are stored below
+ "movd %mm1,0x0(%rcx)\n" // ordinary 4-byte store of the last pixel
+"convertdone:"
+ "ret\n" // NOTE(review): no emms is executed before returning; presumably the caller resets MMX state - confirm
);
+
+
+void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi: read at index (position >> 4)
+ const uint8* u_buf, // rsi: read at index (position >> 5)
+ const uint8* v_buf, // rdx: read at index (position >> 5)
+ uint8* rgb_buf, // rcx: destination, 8 bytes written per loop pass
+ int width, // r8: count, consumed two per loop pass; an odd remainder is handled after the loop
+ int scaled_dx); // r9: added to the position after each Y read
+
 asm( // File-scope assembly supplying the body of the prototype above; arguments are used straight from the registers noted on the parameters.
+ ".global ScaleYUVToRGB32Row\n"
+"ScaleYUVToRGB32Row:\n"
+ "xor %r11,%r11\n" // r11 = 0: running source position; its low 4 bits are dropped when indexing y_buf
+ "jmp scaleend\n" // enter at the loop test so a width below 2 never runs the loop body
+
+"scaleloop:" // no "\n" here: the label shares an assembler line with the next instruction
+ "mov %r11,%r10\n"
+ "sar $0x5,%r10\n" // r10 = position >> 5: index shared by u_buf and v_buf
+ "movzb (%rsi,%r10,1),%rax\n" // rax = u_buf[r10], zero-extended
+ "movq kCoefficientsRgbU(,%rax,8),%mm0\n" // mm0 = 8-byte U table entry
+ "movzb (%rdx,%r10,1),%rax\n" // rax = v_buf[r10], zero-extended
+ "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n" // mm0 += V table entry (signed saturating add per 16-bit lane)
+ "mov %r11,%r10\n" // keep the current position for the first Y index
+ "add %r9,%r11\n" // position += scaled_dx; NOTE(review): scaled_dx is declared int but the full 64-bit %r9 is added - confirm the upper half is well-defined at call sites
+ "sar $0x4,%r10\n" // r10 = position >> 4 (value before the add)
+ "movzb (%rdi,%r10,1),%rax\n" // rax = y_buf[r10]
+ "movq kCoefficientsRgbY(,%rax,8),%mm1\n" // mm1 = Y table entry for the first pixel
+ "mov %r11,%r10\n" // same sequence again for the second pixel
+ "add %r9,%r11\n"
+ "sar $0x4,%r10\n"
+ "movzb (%rdi,%r10,1),%rax\n"
+ "movq kCoefficientsRgbY(,%rax,8),%mm2\n" // mm2 = Y table entry for the second pixel
+ "paddsw %mm0,%mm1\n" // both pixels receive the same U+V contribution
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n" // arithmetic right shift of every 16-bit lane by 6
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n" // saturate lanes to unsigned bytes: low 4 bytes from mm1, high 4 bytes from mm2
+ "movntq %mm1,0x0(%rcx)\n" // non-temporal 8-byte store of both pixels
+ "add $0x8,%rcx\n" // rgb_buf += 8
+"scaleend:"
+ "sub $0x2,%r8\n" // width -= 2; NOTE(review): width is declared int but the full 64-bit %r8 is used - confirm the upper half is well-defined at call sites
+ "jns scaleloop\n" // result still non-negative: at least two more were available
+
+ "and $0x1,%r8\n" // subtracting 2 preserves parity, so bit 0 is set only for an odd original width
+ "je scaledone\n" // even width: nothing left to do
+
+ "mov %r11,%r10\n" // odd tail: one more pixel taken at the current position
+ "sar $0x5,%r10\n"
+ "movzb (%rsi,%r10,1),%rax\n"
+ "movq kCoefficientsRgbU(,%rax,8),%mm0\n"
+ "movzb (%rdx,%r10,1),%rax\n"
+ "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n"
+ "sar $0x4,%r11\n" // the position is not needed afterwards, so r11 is shifted in place
+ "movzb (%rdi,%r11,1),%rax\n"
+ "movq kCoefficientsRgbY(,%rax,8),%mm1\n"
+ "paddsw %mm0,%mm1\n"
+ "psraw $0x6,%mm1\n"
+ "packuswb %mm1,%mm1\n" // pack with itself; only the low 4 bytes are stored below
+ "movd %mm1,0x0(%rcx)\n" // ordinary 4-byte store of the last pixel
+
+"scaledone:"
+ "ret\n" // NOTE(review): no emms is executed before returning; presumably the caller resets MMX state - confirm
);
+
+#else
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
@@ -418,6 +496,8 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
"ret\n"
);
+#endif
+
#else // USE_MMX
// Reference version of YUV converter.
@@ -601,6 +681,6 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
scaled_x += scaled_dx;
}
}
-#endif // USE_MMX
+#endif // USE_MMX
} // extern "C"