summaryrefslogtreecommitdiffstats
path: root/media/base
diff options
context:
space:
mode:
author: fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-15 10:24:29 +0000
committer: fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-15 10:24:29 +0000
commit: bf52d7684c05e86bc77bd533a43df7f518181fb2 (patch)
tree: a8f1c974914a5f3e1c8bbff5abf7c9c1cf0953a2 /media/base
parent: b8a91950aeff0d1bf19cbfc6840887263c59ba7a (diff)
downloadchromium_src-bf52d7684c05e86bc77bd533a43df7f518181fb2.zip
chromium_src-bf52d7684c05e86bc77bd533a43df7f518181fb2.tar.gz
chromium_src-bf52d7684c05e86bc77bd533a43df7f518181fb2.tar.bz2
MMX2 improvements on Linux 64 bit.
Use MMX2 to avoid EMMS; use LEA to remove 2 instructions from the Scale loop; use a shuffle to remove one instruction; a sub at the top of the loop avoids one branch. Review URL: http://codereview.chromium.org/269088 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@29107 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base')
-rw-r--r-- media/base/yuv_convert_unittest.cc 61
-rw-r--r-- media/base/yuv_row.h 5
-rw-r--r-- media/base/yuv_row_linux.cc 98
3 files changed, 87 insertions, 77 deletions
diff --git a/media/base/yuv_convert_unittest.cc b/media/base/yuv_convert_unittest.cc
index d75488a..9438eef 100644
--- a/media/base/yuv_convert_unittest.cc
+++ b/media/base/yuv_convert_unittest.cc
@@ -33,8 +33,7 @@ static const size_t kRGBSize = kWidth * kHeight * kBpp;
static const size_t kRGBSizeConverted = kWidth * kHeight * kBpp;
// Set to 100 to time ConvertYUVToRGB32.
-// This will take approximately 40 to 200 ms.
-static const int kTestTimes = 1;
+static const int kTestTimes = 100;
TEST(YUVConvertTest, YV12) {
// Allocate all surfaces.
@@ -99,16 +98,18 @@ TEST(YUVConvertTest, YV16) {
reinterpret_cast<char*>(yuv_bytes.get()),
static_cast<int>(kYUV16Size)));
- // Convert a frame of YUV to 32 bit ARGB.
- media::ConvertYUVToRGB32(yuv_bytes.get(), // Y
- yuv_bytes.get() + kWidth * kHeight, // U
- yuv_bytes.get() + kWidth * kHeight * 3 / 2, // V
- rgb_converted_bytes.get(), // RGB output
- kWidth, kHeight, // Dimensions
- kWidth, // YStride
- kWidth / 2, // UVStride
- kWidth * kBpp, // RGBStride
- media::YV16);
+ for (int i = 0; i < kTestTimes; ++i) {
+ // Convert a frame of YUV to 32 bit ARGB.
+ media::ConvertYUVToRGB32(yuv_bytes.get(), // Y
+ yuv_bytes.get() + kWidth * kHeight, // U
+ yuv_bytes.get() + kWidth * kHeight * 3 / 2, // V
+ rgb_converted_bytes.get(), // RGB output
+ kWidth, kHeight, // Dimensions
+ kWidth, // YStride
+ kWidth / 2, // UVStride
+ kWidth * kBpp, // RGBStride
+ media::YV16);
+ }
unsigned int rgb_hash = DJB2Hash(rgb_converted_bytes.get(), kRGBSizeConverted,
kDJB2HashSeed);
@@ -124,7 +125,7 @@ TEST(YUVConvertTest, YV16) {
#endif
}
-TEST(YuvScaleTest, YV12) {
+TEST(YUVScaleTest, YV12) {
// Read YUV reference data from file.
FilePath yuv_url;
EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &yuv_url));
@@ -143,17 +144,19 @@ TEST(YuvScaleTest, YV12) {
const size_t size_of_rgb_scaled = kScaledWidth * kScaledHeight * kBpp;
scoped_array<uint8> rgb_scaled_bytes(new uint8[size_of_rgb_scaled]);
- media::ScaleYUVToRGB32(yuv_bytes.get(), // Y
+ for (int i = 0; i < kTestTimes; ++i) {
+ media::ScaleYUVToRGB32(yuv_bytes.get(), // Y
yuv_bytes.get() + kWidth * kHeight, // U
yuv_bytes.get() + kWidth * kHeight * 5 / 4, // V
- rgb_scaled_bytes.get(), // Rgb output
- kWidth, kHeight, // Dimensions
- kScaledWidth, kScaledHeight, // Dimensions
- kWidth, // YStride
- kWidth / 2, // UvStride
- kScaledWidth * kBpp, // RgbStride
+ rgb_scaled_bytes.get(), // Rgb output
+ kWidth, kHeight, // Dimensions
+ kScaledWidth, kScaledHeight, // Dimensions
+ kWidth, // YStride
+ kWidth / 2, // UvStride
+ kScaledWidth * kBpp, // RgbStride
media::YV12,
media::ROTATE_0);
+ }
unsigned int rgb_hash = DJB2Hash(rgb_scaled_bytes.get(), size_of_rgb_scaled,
kDJB2HashSeed);
@@ -169,7 +172,7 @@ TEST(YuvScaleTest, YV12) {
#endif
}
-TEST(YuvScaleTest, YV16) {
+TEST(YUVScaleTest, YV16) {
// Read YV16 reference data from file.
FilePath yuv_url;
EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &yuv_url));
@@ -188,17 +191,19 @@ TEST(YuvScaleTest, YV16) {
const size_t size_of_rgb_scaled = kScaledWidth * kScaledHeight * kBpp;
scoped_array<uint8> rgb_scaled_bytes(new uint8[size_of_rgb_scaled]);
- media::ScaleYUVToRGB32(yuv_bytes.get(), // Y
+ for (int i = 0; i < kTestTimes; ++i) {
+ media::ScaleYUVToRGB32(yuv_bytes.get(), // Y
yuv_bytes.get() + kWidth * kHeight, // U
yuv_bytes.get() + kWidth * kHeight * 3 / 2, // V
- rgb_scaled_bytes.get(), // Rgb output
- kWidth, kHeight, // Dimensions
- kScaledWidth, kScaledHeight, // Dimensions
- kWidth, // YStride
- kWidth / 2, // UvStride
- kScaledWidth * kBpp, // RgbStride
+ rgb_scaled_bytes.get(), // Rgb output
+ kWidth, kHeight, // Dimensions
+ kScaledWidth, kScaledHeight, // Dimensions
+ kWidth, // YStride
+ kWidth / 2, // UvStride
+ kScaledWidth * kBpp, // RgbStride
media::YV16,
media::ROTATE_0);
+ }
unsigned int rgb_hash = DJB2Hash(rgb_scaled_bytes.get(), size_of_rgb_scaled,
kDJB2HashSeed);
diff --git a/media/base/yuv_row.h b/media/base/yuv_row.h
index 31f1788..ac5c6fd 100644
--- a/media/base/yuv_row.h
+++ b/media/base/yuv_row.h
@@ -63,7 +63,7 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
} // extern "C"
#if !defined(USE_MMX)
-// Windows, Mac and Linux x86 use MMX; x64 and other CPUs do not.
+// Windows, Mac and Linux use MMX
#if defined(ARCH_CPU_X86) || (defined(ARCH_CPU_X86_64) && defined(OS_LINUX))
#define USE_MMX 1
#else
@@ -71,7 +71,8 @@ void ScaleYUVToRGB32Row(const uint8* y_buf,
#endif
#endif
-#if USE_MMX
+// x64 uses MMX2 (SSE) so emms is not required.
+#if USE_MMX && !defined(ARCH_CPU_X86_64)
#if defined(_MSC_VER)
#define EMMS() __asm emms
#else
diff --git a/media/base/yuv_row_linux.cc b/media/base/yuv_row_linux.cc
index 5825960..6e956b9 100644
--- a/media/base/yuv_row_linux.cc
+++ b/media/base/yuv_row_linux.cc
@@ -254,7 +254,7 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
int width); // r8
asm(
- ".global FastConvertYUVToRGB32Row\n"
+".global FastConvertYUVToRGB32Row\n"
"FastConvertYUVToRGB32Row:\n"
"jmp convertend\n"
@@ -263,37 +263,40 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
"add $0x1,%rsi\n"
"movzb (%rdx),%r11\n"
"add $0x1,%rdx\n"
- "movq kCoefficientsRgbU(,%r10,8),%mm0\n"
+ "movq kCoefficientsRgbU(,%r10,8),%xmm0\n"
"movzb (%rdi),%r10\n"
- "paddsw kCoefficientsRgbV(,%r11,8),%mm0\n"
+ "movq kCoefficientsRgbV(,%r11,8),%xmm1\n"
"movzb 0x1(%rdi),%r11\n"
- "movq kCoefficientsRgbY(,%r10,8),%mm1\n"
+ "paddsw %xmm1,%xmm0\n"
+ "movq kCoefficientsRgbY(,%r10,8),%xmm2\n"
"add $0x2,%rdi\n"
- "movq kCoefficientsRgbY(,%r11,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%rcx)\n"
+ "movq kCoefficientsRgbY(,%r11,8),%xmm3\n"
+ "paddsw %xmm0,%xmm2\n"
+ "paddsw %xmm0,%xmm3\n"
+ "shufps $0x44,%xmm3,%xmm2\n"
+ "psraw $0x6,%xmm2\n"
+ "packuswb %xmm2,%xmm2\n"
+ "movq %xmm2,0x0(%rcx)\n"
"add $0x8,%rcx\n"
"convertend:"
"sub $0x2,%r8\n"
"jns convertloop\n"
- "and $0x1,%r8\n"
- "je convertdone\n"
+"convertnext:"
+ "add $0x1,%r8\n"
+ "js convertdone\n"
"movzb (%rsi),%r10\n"
- "movq kCoefficientsRgbU(,%r10,8),%mm0\n"
+ "movq kCoefficientsRgbU(,%r10,8),%xmm0\n"
"movzb (%rdx),%r10\n"
- "paddsw kCoefficientsRgbV(,%r10,8),%mm0\n"
+ "movq kCoefficientsRgbV(,%r10,8),%xmm1\n"
+ "paddsw %xmm1,%xmm0\n"
"movzb (%rdi),%r10\n"
- "movq kCoefficientsRgbY(,%r10,8),%mm1\n"
- "paddsw %mm0,%mm1\n"
- "psraw $0x6,%mm1\n"
- "packuswb %mm1,%mm1\n"
- "movd %mm1,0x0(%rcx)\n"
+ "movq kCoefficientsRgbY(,%r10,8),%xmm1\n"
+ "paddsw %xmm0,%xmm1\n"
+ "psraw $0x6,%xmm1\n"
+ "packuswb %xmm1,%xmm1\n"
+ "movd %xmm1,0x0(%rcx)\n"
"convertdone:"
"ret\n"
);
@@ -310,52 +313,53 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
".global ScaleYUVToRGB32Row\n"
"ScaleYUVToRGB32Row:\n"
"xor %r11,%r11\n"
- "jmp scaleend\n"
+ "sub $0x2,%r8\n"
+ "js scalenext\n"
"scaleloop:"
"mov %r11,%r10\n"
"sar $0x5,%r10\n"
"movzb (%rsi,%r10,1),%rax\n"
- "movq kCoefficientsRgbU(,%rax,8),%mm0\n"
+ "movq kCoefficientsRgbU(,%rax,8),%xmm0\n"
"movzb (%rdx,%r10,1),%rax\n"
- "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n"
- "mov %r11,%r10\n"
- "add %r9,%r11\n"
- "sar $0x4,%r10\n"
- "movzb (%rdi,%r10,1),%rax\n"
- "movq kCoefficientsRgbY(,%rax,8),%mm1\n"
- "mov %r11,%r10\n"
- "add %r9,%r11\n"
+ "movq kCoefficientsRgbV(,%rax,8),%xmm1\n"
+ "lea (%r11,%r9),%r10\n"
+ "sar $0x4,%r11\n"
+ "movzb (%rdi,%r11,1),%rax\n"
+ "paddsw %xmm1,%xmm0\n"
+ "movq kCoefficientsRgbY(,%rax,8),%xmm1\n"
+ "lea (%r10,%r9),%r11\n"
"sar $0x4,%r10\n"
"movzb (%rdi,%r10,1),%rax\n"
- "movq kCoefficientsRgbY(,%rax,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%rcx)\n"
+ "movq kCoefficientsRgbY(,%rax,8),%xmm2\n"
+ "paddsw %xmm0,%xmm1\n"
+ "paddsw %xmm0,%xmm2\n"
+ "shufps $0x44,%xmm2,%xmm1\n"
+ "psraw $0x6,%xmm1\n"
+ "packuswb %xmm1,%xmm1\n"
+ "movq %xmm1,0x0(%rcx)\n"
"add $0x8,%rcx\n"
-"scaleend:"
"sub $0x2,%r8\n"
"jns scaleloop\n"
- "and $0x1,%r8\n"
- "je scaledone\n"
+"scalenext:"
+ "add $0x1,%r8\n"
+ "js scaledone\n"
"mov %r11,%r10\n"
"sar $0x5,%r10\n"
"movzb (%rsi,%r10,1),%rax\n"
- "movq kCoefficientsRgbU(,%rax,8),%mm0\n"
+ "movq kCoefficientsRgbU(,%rax,8),%xmm0\n"
"movzb (%rdx,%r10,1),%rax\n"
- "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n"
+ "movq kCoefficientsRgbV(,%rax,8),%xmm1\n"
+ "paddsw %xmm1,%xmm0\n"
"sar $0x4,%r11\n"
"movzb (%rdi,%r11,1),%rax\n"
- "movq kCoefficientsRgbY(,%rax,8),%mm1\n"
- "paddsw %mm0,%mm1\n"
- "psraw $0x6,%mm1\n"
- "packuswb %mm1,%mm1\n"
- "movd %mm1,0x0(%rcx)\n"
+ "movq kCoefficientsRgbY(,%rax,8),%xmm1\n"
+ "paddsw %xmm0,%xmm1\n"
+ "psraw $0x6,%xmm1\n"
+ "packuswb %xmm1,%xmm1\n"
+ "movd %xmm1,0x0(%rcx)\n"
"scaledone:"
"ret\n"