diff options
author | fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-10-15 10:24:29 +0000 |
---|---|---|
committer | fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-10-15 10:24:29 +0000 |
commit | bf52d7684c05e86bc77bd533a43df7f518181fb2 (patch) | |
tree | a8f1c974914a5f3e1c8bbff5abf7c9c1cf0953a2 /media/base | |
parent | b8a91950aeff0d1bf19cbfc6840887263c59ba7a (diff) | |
download | chromium_src-bf52d7684c05e86bc77bd533a43df7f518181fb2.zip chromium_src-bf52d7684c05e86bc77bd533a43df7f518181fb2.tar.gz chromium_src-bf52d7684c05e86bc77bd533a43df7f518181fb2.tar.bz2 |
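MMX2 improvements for 64-bit Linux.
Use MMX2 (SSE) instructions on xmm registers instead of MMX registers, so EMMS is no longer required.
Use LEA to remove two instructions from the Scale loop.
Use a shuffle (shufps) to combine the two pixels before shifting, which removes one instruction.
Perform the sub at the top of the Scale loop to avoid one branch.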
Review URL: http://codereview.chromium.org/269088
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@29107 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base')
-rw-r--r-- | media/base/yuv_convert_unittest.cc | 61 | ||||
-rw-r--r-- | media/base/yuv_row.h | 5 | ||||
-rw-r--r-- | media/base/yuv_row_linux.cc | 98 |
3 files changed, 87 insertions, 77 deletions
diff --git a/media/base/yuv_convert_unittest.cc b/media/base/yuv_convert_unittest.cc index d75488a..9438eef 100644 --- a/media/base/yuv_convert_unittest.cc +++ b/media/base/yuv_convert_unittest.cc @@ -33,8 +33,7 @@ static const size_t kRGBSize = kWidth * kHeight * kBpp; static const size_t kRGBSizeConverted = kWidth * kHeight * kBpp; // Set to 100 to time ConvertYUVToRGB32. -// This will take approximately 40 to 200 ms. -static const int kTestTimes = 1; +static const int kTestTimes = 100; TEST(YUVConvertTest, YV12) { // Allocate all surfaces. @@ -99,16 +98,18 @@ TEST(YUVConvertTest, YV16) { reinterpret_cast<char*>(yuv_bytes.get()), static_cast<int>(kYUV16Size))); - // Convert a frame of YUV to 32 bit ARGB. - media::ConvertYUVToRGB32(yuv_bytes.get(), // Y - yuv_bytes.get() + kWidth * kHeight, // U - yuv_bytes.get() + kWidth * kHeight * 3 / 2, // V - rgb_converted_bytes.get(), // RGB output - kWidth, kHeight, // Dimensions - kWidth, // YStride - kWidth / 2, // UVStride - kWidth * kBpp, // RGBStride - media::YV16); + for (int i = 0; i < kTestTimes; ++i) { + // Convert a frame of YUV to 32 bit ARGB. + media::ConvertYUVToRGB32(yuv_bytes.get(), // Y + yuv_bytes.get() + kWidth * kHeight, // U + yuv_bytes.get() + kWidth * kHeight * 3 / 2, // V + rgb_converted_bytes.get(), // RGB output + kWidth, kHeight, // Dimensions + kWidth, // YStride + kWidth / 2, // UVStride + kWidth * kBpp, // RGBStride + media::YV16); + } unsigned int rgb_hash = DJB2Hash(rgb_converted_bytes.get(), kRGBSizeConverted, kDJB2HashSeed); @@ -124,7 +125,7 @@ TEST(YUVConvertTest, YV16) { #endif } -TEST(YuvScaleTest, YV12) { +TEST(YUVScaleTest, YV12) { // Read YUV reference data from file. 
FilePath yuv_url; EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &yuv_url)); @@ -143,17 +144,19 @@ TEST(YuvScaleTest, YV12) { const size_t size_of_rgb_scaled = kScaledWidth * kScaledHeight * kBpp; scoped_array<uint8> rgb_scaled_bytes(new uint8[size_of_rgb_scaled]); - media::ScaleYUVToRGB32(yuv_bytes.get(), // Y + for (int i = 0; i < kTestTimes; ++i) { + media::ScaleYUVToRGB32(yuv_bytes.get(), // Y yuv_bytes.get() + kWidth * kHeight, // U yuv_bytes.get() + kWidth * kHeight * 5 / 4, // V - rgb_scaled_bytes.get(), // Rgb output - kWidth, kHeight, // Dimensions - kScaledWidth, kScaledHeight, // Dimensions - kWidth, // YStride - kWidth / 2, // UvStride - kScaledWidth * kBpp, // RgbStride + rgb_scaled_bytes.get(), // Rgb output + kWidth, kHeight, // Dimensions + kScaledWidth, kScaledHeight, // Dimensions + kWidth, // YStride + kWidth / 2, // UvStride + kScaledWidth * kBpp, // RgbStride media::YV12, media::ROTATE_0); + } unsigned int rgb_hash = DJB2Hash(rgb_scaled_bytes.get(), size_of_rgb_scaled, kDJB2HashSeed); @@ -169,7 +172,7 @@ TEST(YuvScaleTest, YV12) { #endif } -TEST(YuvScaleTest, YV16) { +TEST(YUVScaleTest, YV16) { // Read YV16 reference data from file. 
FilePath yuv_url; EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &yuv_url)); @@ -188,17 +191,19 @@ TEST(YuvScaleTest, YV16) { const size_t size_of_rgb_scaled = kScaledWidth * kScaledHeight * kBpp; scoped_array<uint8> rgb_scaled_bytes(new uint8[size_of_rgb_scaled]); - media::ScaleYUVToRGB32(yuv_bytes.get(), // Y + for (int i = 0; i < kTestTimes; ++i) { + media::ScaleYUVToRGB32(yuv_bytes.get(), // Y yuv_bytes.get() + kWidth * kHeight, // U yuv_bytes.get() + kWidth * kHeight * 3 / 2, // V - rgb_scaled_bytes.get(), // Rgb output - kWidth, kHeight, // Dimensions - kScaledWidth, kScaledHeight, // Dimensions - kWidth, // YStride - kWidth / 2, // UvStride - kScaledWidth * kBpp, // RgbStride + rgb_scaled_bytes.get(), // Rgb output + kWidth, kHeight, // Dimensions + kScaledWidth, kScaledHeight, // Dimensions + kWidth, // YStride + kWidth / 2, // UvStride + kScaledWidth * kBpp, // RgbStride media::YV16, media::ROTATE_0); + } unsigned int rgb_hash = DJB2Hash(rgb_scaled_bytes.get(), size_of_rgb_scaled, kDJB2HashSeed); diff --git a/media/base/yuv_row.h b/media/base/yuv_row.h index 31f1788..ac5c6fd 100644 --- a/media/base/yuv_row.h +++ b/media/base/yuv_row.h @@ -63,7 +63,7 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, } // extern "C" #if !defined(USE_MMX) -// Windows, Mac and Linux x86 use MMX; x64 and other CPUs do not. +// Windows, Mac and Linux use MMX #if defined(ARCH_CPU_X86) || (defined(ARCH_CPU_X86_64) && defined(OS_LINUX)) #define USE_MMX 1 #else @@ -71,7 +71,8 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, #endif #endif -#if USE_MMX +// x64 uses MMX2 (SSE) so emms is not required. 
+#if USE_MMX && !defined(ARCH_CPU_X86_64) #if defined(_MSC_VER) #define EMMS() __asm emms #else diff --git a/media/base/yuv_row_linux.cc b/media/base/yuv_row_linux.cc index 5825960..6e956b9 100644 --- a/media/base/yuv_row_linux.cc +++ b/media/base/yuv_row_linux.cc @@ -254,7 +254,7 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi int width); // r8 asm( - ".global FastConvertYUVToRGB32Row\n" +".global FastConvertYUVToRGB32Row\n" "FastConvertYUVToRGB32Row:\n" "jmp convertend\n" @@ -263,37 +263,40 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi "add $0x1,%rsi\n" "movzb (%rdx),%r11\n" "add $0x1,%rdx\n" - "movq kCoefficientsRgbU(,%r10,8),%mm0\n" + "movq kCoefficientsRgbU(,%r10,8),%xmm0\n" "movzb (%rdi),%r10\n" - "paddsw kCoefficientsRgbV(,%r11,8),%mm0\n" + "movq kCoefficientsRgbV(,%r11,8),%xmm1\n" "movzb 0x1(%rdi),%r11\n" - "movq kCoefficientsRgbY(,%r10,8),%mm1\n" + "paddsw %xmm1,%xmm0\n" + "movq kCoefficientsRgbY(,%r10,8),%xmm2\n" "add $0x2,%rdi\n" - "movq kCoefficientsRgbY(,%r11,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%rcx)\n" + "movq kCoefficientsRgbY(,%r11,8),%xmm3\n" + "paddsw %xmm0,%xmm2\n" + "paddsw %xmm0,%xmm3\n" + "shufps $0x44,%xmm3,%xmm2\n" + "psraw $0x6,%xmm2\n" + "packuswb %xmm2,%xmm2\n" + "movq %xmm2,0x0(%rcx)\n" "add $0x8,%rcx\n" "convertend:" "sub $0x2,%r8\n" "jns convertloop\n" - "and $0x1,%r8\n" - "je convertdone\n" +"convertnext:" + "add $0x1,%r8\n" + "js convertdone\n" "movzb (%rsi),%r10\n" - "movq kCoefficientsRgbU(,%r10,8),%mm0\n" + "movq kCoefficientsRgbU(,%r10,8),%xmm0\n" "movzb (%rdx),%r10\n" - "paddsw kCoefficientsRgbV(,%r10,8),%mm0\n" + "movq kCoefficientsRgbV(,%r10,8),%xmm1\n" + "paddsw %xmm1,%xmm0\n" "movzb (%rdi),%r10\n" - "movq kCoefficientsRgbY(,%r10,8),%mm1\n" - "paddsw %mm0,%mm1\n" - "psraw $0x6,%mm1\n" - "packuswb %mm1,%mm1\n" - "movd %mm1,0x0(%rcx)\n" + "movq kCoefficientsRgbY(,%r10,8),%xmm1\n" + "paddsw 
%xmm0,%xmm1\n" + "psraw $0x6,%xmm1\n" + "packuswb %xmm1,%xmm1\n" + "movd %xmm1,0x0(%rcx)\n" "convertdone:" "ret\n" ); @@ -310,52 +313,53 @@ void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi ".global ScaleYUVToRGB32Row\n" "ScaleYUVToRGB32Row:\n" "xor %r11,%r11\n" - "jmp scaleend\n" + "sub $0x2,%r8\n" + "js scalenext\n" "scaleloop:" "mov %r11,%r10\n" "sar $0x5,%r10\n" "movzb (%rsi,%r10,1),%rax\n" - "movq kCoefficientsRgbU(,%rax,8),%mm0\n" + "movq kCoefficientsRgbU(,%rax,8),%xmm0\n" "movzb (%rdx,%r10,1),%rax\n" - "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n" - "mov %r11,%r10\n" - "add %r9,%r11\n" - "sar $0x4,%r10\n" - "movzb (%rdi,%r10,1),%rax\n" - "movq kCoefficientsRgbY(,%rax,8),%mm1\n" - "mov %r11,%r10\n" - "add %r9,%r11\n" + "movq kCoefficientsRgbV(,%rax,8),%xmm1\n" + "lea (%r11,%r9),%r10\n" + "sar $0x4,%r11\n" + "movzb (%rdi,%r11,1),%rax\n" + "paddsw %xmm1,%xmm0\n" + "movq kCoefficientsRgbY(,%rax,8),%xmm1\n" + "lea (%r10,%r9),%r11\n" "sar $0x4,%r10\n" "movzb (%rdi,%r10,1),%rax\n" - "movq kCoefficientsRgbY(,%rax,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%rcx)\n" + "movq kCoefficientsRgbY(,%rax,8),%xmm2\n" + "paddsw %xmm0,%xmm1\n" + "paddsw %xmm0,%xmm2\n" + "shufps $0x44,%xmm2,%xmm1\n" + "psraw $0x6,%xmm1\n" + "packuswb %xmm1,%xmm1\n" + "movq %xmm1,0x0(%rcx)\n" "add $0x8,%rcx\n" -"scaleend:" "sub $0x2,%r8\n" "jns scaleloop\n" - "and $0x1,%r8\n" - "je scaledone\n" +"scalenext:" + "add $0x1,%r8\n" + "js scaledone\n" "mov %r11,%r10\n" "sar $0x5,%r10\n" "movzb (%rsi,%r10,1),%rax\n" - "movq kCoefficientsRgbU(,%rax,8),%mm0\n" + "movq kCoefficientsRgbU(,%rax,8),%xmm0\n" "movzb (%rdx,%r10,1),%rax\n" - "paddsw kCoefficientsRgbV(,%rax,8),%mm0\n" + "movq kCoefficientsRgbV(,%rax,8),%xmm1\n" + "paddsw %xmm1,%xmm0\n" "sar $0x4,%r11\n" "movzb (%rdi,%r11,1),%rax\n" - "movq kCoefficientsRgbY(,%rax,8),%mm1\n" - "paddsw %mm0,%mm1\n" - "psraw $0x6,%mm1\n" - "packuswb %mm1,%mm1\n" - 
"movd %mm1,0x0(%rcx)\n" + "movq kCoefficientsRgbY(,%rax,8),%xmm1\n" + "paddsw %xmm0,%xmm1\n" + "psraw $0x6,%xmm1\n" + "packuswb %xmm1,%xmm1\n" + "movd %xmm1,0x0(%rcx)\n" "scaledone:" "ret\n" |