summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--media/base/cpu_features.h6
-rw-r--r--media/base/cpu_features_x86.cc10
-rw-r--r--media/base/simd/convert_rgb_to_yuv_x86.cc101
-rw-r--r--media/base/simd/convert_yuv_to_rgb.h150
-rw-r--r--media/base/simd/convert_yuv_to_rgb_c.cc155
-rw-r--r--media/base/simd/convert_yuv_to_rgb_mmx.asm22
-rw-r--r--media/base/simd/convert_yuv_to_rgb_mmx.inc119
-rw-r--r--media/base/simd/convert_yuv_to_rgb_sse.asm40
-rw-r--r--media/base/simd/convert_yuv_to_rgb_x86.cc71
-rw-r--r--media/base/simd/filter_yuv.h29
-rw-r--r--media/base/simd/filter_yuv_c.cc29
-rw-r--r--media/base/simd/filter_yuv_mmx.cc58
-rw-r--r--media/base/simd/filter_yuv_sse2.cc49
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_mmx.asm23
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_mmx.inc166
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm142
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_sse.asm23
-rw-r--r--media/base/simd/scale_yuv_to_rgb_mmx.asm23
-rw-r--r--media/base/simd/scale_yuv_to_rgb_mmx.inc115
-rw-r--r--media/base/simd/scale_yuv_to_rgb_sse.asm31
-rw-r--r--media/base/simd/scale_yuv_to_rgb_sse2_x64.asm109
-rw-r--r--media/base/simd/x86inc.asm8
-rw-r--r--media/base/yuv_convert.cc298
-rw-r--r--media/base/yuv_convert.h4
-rw-r--r--media/base/yuv_convert_internal.h18
-rw-r--r--media/base/yuv_convert_unittest.cc252
-rw-r--r--media/base/yuv_row_posix.cc1
-rw-r--r--media/media.gyp55
28 files changed, 1916 insertions, 191 deletions
diff --git a/media/base/cpu_features.h b/media/base/cpu_features.h
index c2762d8..0878385 100644
--- a/media/base/cpu_features.h
+++ b/media/base/cpu_features.h
@@ -10,6 +10,12 @@
namespace media {
+// Returns true if CPU has MMX support.
+bool hasMMX();
+
+// Returns true if CPU has SSE support.
+bool hasSSE();
+
// Returns true if CPU has SSE2 support.
bool hasSSE2();
diff --git a/media/base/cpu_features_x86.cc b/media/base/cpu_features_x86.cc
index bf7d05d..4fb9304 100644
--- a/media/base/cpu_features_x86.cc
+++ b/media/base/cpu_features_x86.cc
@@ -48,6 +48,16 @@ static inline void getcpuid(int info_type, int info[4]) {
}
#endif
+bool hasMMX() {
+ // TODO(hclam): Actually check it.
+ return true;
+}
+
+bool hasSSE() {
+ // TODO(hclam): Actually check it.
+ return true;
+}
+
bool hasSSE2() {
#if defined(ARCH_CPU_X86_64)
/* All x86_64 machines have SSE2, so don't even bother checking. */
diff --git a/media/base/simd/convert_rgb_to_yuv_x86.cc b/media/base/simd/convert_rgb_to_yuv_x86.cc
new file mode 100644
index 0000000..2bd6930
--- /dev/null
+++ b/media/base/simd/convert_rgb_to_yuv_x86.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/convert_rgb_to_yuv.h"
+
+#include "build/build_config.h"
+#include "media/base/cpu_features.h"
+#include "media/base/simd/convert_rgb_to_yuv_ssse3.h"
+
+namespace media {
+
+void ConvertRGB32ToYUV_SSSE3(const uint8* rgbframe,
+ uint8* yplane,
+ uint8* uplane,
+ uint8* vplane,
+ int width,
+ int height,
+ int rgbstride,
+ int ystride,
+ int uvstride) {
+#ifdef ENABLE_SUBSAMPLING
+ for (; height >= 2; height -= 2) {
+ ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertARGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#else
+ for (; height >= 2; height -= 2) {
+ ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#endif
+}
+
+void ConvertRGB24ToYUV_SSSE3(const uint8* rgbframe,
+ uint8* yplane,
+ uint8* uplane,
+ uint8* vplane,
+ int width,
+ int height,
+ int rgbstride,
+ int ystride,
+ int uvstride) {
+#ifdef ENABLE_SUBSAMPLING
+ for (; height >= 2; height -= 2) {
+ ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertRGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#else
+ for (; height >= 2; height -= 2) {
+ ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#endif
+}
+
+} // namespace media
diff --git a/media/base/simd/convert_yuv_to_rgb.h b/media/base/simd/convert_yuv_to_rgb.h
new file mode 100644
index 0000000..5f3df2c6
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
+#define MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
+
+#include "base/basictypes.h"
+#include "media/base/yuv_convert.h"
+
+namespace media {
+
+typedef void (*ConvertYUVToRGB32Proc)(const uint8*,
+ const uint8*,
+ const uint8*,
+ uint8*,
+ int,
+ int,
+ int,
+ int,
+ int,
+ YUVType);
+
+void ConvertYUVToRGB32_C(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+void ConvertYUVToRGB32_SSE(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+void ConvertYUVToRGB32_MMX(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+} // namespace media
+
+// Assembly functions are declared without namespace.
+extern "C" {
+
+typedef void (*ConvertYUVToRGB32RowProc)(const uint8*,
+ const uint8*,
+ const uint8*,
+ uint8*,
+ int);
+typedef void (*ScaleYUVToRGB32RowProc)(const uint8*,
+ const uint8*,
+ const uint8*,
+ uint8*,
+ int,
+ int);
+
+void ConvertYUVToRGB32Row_C(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width);
+
+void ConvertYUVToRGB32Row_MMX(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width);
+
+void ConvertYUVToRGB32Row_SSE(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_MMX_X64(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+}
+
+#endif // MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
diff --git a/media/base/simd/convert_yuv_to_rgb_c.cc b/media/base/simd/convert_yuv_to_rgb_c.cc
new file mode 100644
index 0000000..f8e70b2
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_c.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/convert_yuv_to_rgb.h"
+// TODO(hclam): Shouldn't depend on yuv_row.h.
+#include "media/base/yuv_row.h"
+
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+ (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+static inline void YUVPixel(uint8 y,
+ uint8 u,
+ uint8 v,
+ uint8* rgb_buf) {
+
+ int b = kCoefficientsRgbY[256+u][0];
+ int g = kCoefficientsRgbY[256+u][1];
+ int r = kCoefficientsRgbY[256+u][2];
+ int a = kCoefficientsRgbY[256+u][3];
+
+ b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+ g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+ r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+ a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+
+ b = paddsw(b, kCoefficientsRgbY[y][0]);
+ g = paddsw(g, kCoefficientsRgbY[y][1]);
+ r = paddsw(r, kCoefficientsRgbY[y][2]);
+ a = paddsw(a, kCoefficientsRgbY[y][3]);
+
+ b >>= 6;
+ g >>= 6;
+ r >>= 6;
+ a >>= 6;
+
+ *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
+ (packuswb(g) << 8) |
+ (packuswb(r) << 16) |
+ (packuswb(a) << 24);
+}
+
+extern "C" {
+
+void ConvertYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width; x += 2) {
+ uint8 u = u_buf[x >> 1];
+ uint8 v = v_buf[x >> 1];
+ uint8 y0 = y_buf[x];
+ YUVPixel(y0, u, v, rgb_buf);
+ if ((x + 1) < width) {
+ uint8 y1 = y_buf[x + 1];
+ YUVPixel(y1, u, v, rgb_buf + 4);
+ }
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
+
+// 16.16 fixed point is used. A shift by 16 isolates the integer.
+// A shift by 17 is used to further subsample the chrominance channels.
+// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+// for 1/65536 pixel accurate interpolation.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ int x = 0;
+ for (int i = 0; i < width; i += 2) {
+ int y = y_buf[x >> 16];
+ int u = u_buf[(x >> 17)];
+ int v = v_buf[(x >> 17)];
+ YUVPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y = y_buf[x >> 16];
+ YUVPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+}
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ int x = 0;
+ if (source_dx >= 0x20000) {
+ x = 32768;
+ }
+ for (int i = 0; i < width; i += 2) {
+ int y0 = y_buf[x >> 16];
+ int y1 = y_buf[(x >> 16) + 1];
+ int u0 = u_buf[(x >> 17)];
+ int u1 = u_buf[(x >> 17) + 1];
+ int v0 = v_buf[(x >> 17)];
+ int v1 = v_buf[(x >> 17) + 1];
+ int y_frac = (x & 65535);
+ int uv_frac = ((x >> 1) & 65535);
+ int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+ int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+ YUVPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y0 = y_buf[x >> 16];
+ y1 = y_buf[(x >> 16) + 1];
+ y_frac = (x & 65535);
+ y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ YUVPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+}
+
+}
+
+namespace media {
+
+void ConvertYUVToRGB32_C(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type;
+ for (int y = 0; y < height; ++y) {
+ uint8* rgb_row = rgbframe + y * rgbstride;
+ const uint8* y_ptr = yplane + y * ystride;
+ const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;
+ const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+ ConvertYUVToRGB32Row_C(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ width);
+ }
+}
+
+} // namespace media
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.asm b/media/base/simd/convert_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..e044474
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.asm
@@ -0,0 +1,22 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+ SECTION_TEXT
+ CPU MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; extern "C" void ConvertYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width);
+%define SYMBOL ConvertYUVToRGB32Row_MMX
+%include "convert_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..b9555ce
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc
@@ -0,0 +1,119 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+
+; Non-PIC code is the fastest so use this if possible.
+%ifndef PIC
+mangle(SYMBOL):
+ %assign stack_offset 0
+ PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV
+ extern mangle(kCoefficientsRgbY)
+ jmp .convertend
+
+.convertloop:
+ movzx TEMPUd, BYTE [Uq]
+ add Uq, 1
+ movzx TEMPVd, BYTE [Vq]
+ add Vq, 1
+ movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
+ movzx TEMPUd, BYTE [Yq]
+ paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
+ movzx TEMPVd, BYTE [Yq + 1]
+ movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
+ add Yq, 2
+ movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ MOVQ [ARGBq], mm1
+ add ARGBq, 8
+
+.convertend:
+ sub WIDTHq, 2
+ jns .convertloop
+
+ ; If number of pixels is odd then compute it.
+ and WIDTHq, 1
+ jz .convertdone
+
+ movzx TEMPUd, BYTE [Uq]
+ movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
+ movzx TEMPVd, BYTE [Vq]
+ paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
+ movzx TEMPUd, BYTE [Yq]
+ movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+
+.convertdone:
+ RET
+%endif
+
+; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
+; This code is slower than the above version.
+%ifdef PIC
+mangle(SYMBOL):
+ %assign stack_offset 0
+ PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE
+
+ extern mangle(kCoefficientsRgbY)
+ LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
+
+ jmp .convertend
+
+.convertloop:
+ movzx TEMPd, BYTE [Uq]
+ movq mm0, [TABLEq + 2048 + 8 * TEMPq]
+ add Uq, 1
+
+ movzx TEMPd, BYTE [Vq]
+ paddsw mm0, [TABLEq + 4096 + 8 * TEMPq]
+ add Vq, 1
+
+ movzx TEMPd, BYTE [Yq]
+ movq mm1, [TABLEq + 8 * TEMPq]
+
+ movzx TEMPd, BYTE [Yq + 1]
+ movq mm2, [TABLEq + 8 * TEMPq]
+ add Yq, 2
+
+ ; Add UV components to Y component.
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+
+ ; Down shift and then pack.
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ MOVQ [ARGBq], mm1
+ add ARGBq, 8
+
+.convertend:
+ sub WIDTHq, 2
+ jns .convertloop
+
+ ; If number of pixels is odd then compute it.
+ and WIDTHq, 1
+ jz .convertdone
+
+ movzx TEMPd, BYTE [Uq]
+ movq mm0, [TABLEq + 2048 + 8 * TEMPq]
+ movzx TEMPd, BYTE [Vq]
+ paddsw mm0, [TABLEq + 4096 + 8 * TEMPq]
+ movzx TEMPd, BYTE [Yq]
+ movq mm1, [TABLEq + 8 * TEMPq]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+
+.convertdone:
+ RET
+%endif
diff --git a/media/base/simd/convert_yuv_to_rgb_sse.asm b/media/base/simd/convert_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..28d2214
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_sse.asm
@@ -0,0 +1,40 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+ SECTION_TEXT
+ CPU MMX, SSE
+
+; Use the SSE instruction movntq to save the output; it can write faster.
+%define MOVQ movntq
+
+;
+; extern "C" void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width);
+%define SYMBOL ConvertYUVToRGB32Row_SSE
+%include "convert_yuv_to_rgb_mmx.inc"
+
+; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE
+%include "scale_yuv_to_rgb_mmx.inc"
+
+; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+
diff --git a/media/base/simd/convert_yuv_to_rgb_x86.cc b/media/base/simd/convert_yuv_to_rgb_x86.cc
new file mode 100644
index 0000000..3e03ef9
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_x86.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#endif
+
+#include "media/base/cpu_features.h"
+#include "media/base/simd/convert_yuv_to_rgb.h"
+#include "media/base/yuv_convert.h"
+
+namespace media {
+
+void ConvertYUVToRGB32_MMX(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type;
+ for (int y = 0; y < height; ++y) {
+ uint8* rgb_row = rgbframe + y * rgbstride;
+ const uint8* y_ptr = yplane + y * ystride;
+ const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;
+ const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+ ConvertYUVToRGB32Row_MMX(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ width);
+ }
+
+ _mm_empty();
+}
+
+void ConvertYUVToRGB32_SSE(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type;
+ for (int y = 0; y < height; ++y) {
+ uint8* rgb_row = rgbframe + y * rgbstride;
+ const uint8* y_ptr = yplane + y * ystride;
+ const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;
+ const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+ ConvertYUVToRGB32Row_SSE(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ width);
+ }
+
+ _mm_empty();
+}
+
+} // namespace media
diff --git a/media/base/simd/filter_yuv.h b/media/base/simd/filter_yuv.h
new file mode 100644
index 0000000..5a9cf11
--- /dev/null
+++ b/media/base/simd/filter_yuv.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_SIMD_FILTER_YUV_H_
+#define MEDIA_BASE_SIMD_FILTER_YUV_H_
+
+#include "base/basictypes.h"
+
+namespace media {
+
+typedef void (*FilterYUVRowsProc)(uint8*,
+ const uint8*,
+ const uint8*,
+ int,
+ int);
+
+void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+
+void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+
+void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+
+} // namespace media
+
+#endif // MEDIA_BASE_SIMD_FILTER_YUV_H_
diff --git a/media/base/simd/filter_yuv_c.cc b/media/base/simd/filter_yuv_c.cc
new file mode 100644
index 0000000..95ae01a
--- /dev/null
+++ b/media/base/simd/filter_yuv_c.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ uint8* end = ybuf + source_width;
+ do {
+ ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
+ ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+ ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+ ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+ ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
+ ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+ ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
+ ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
+ y0_ptr += 8;
+ y1_ptr += 8;
+ ybuf += 8;
+ } while (ybuf < end);
+}
+
+} // namespace media
diff --git a/media/base/simd/filter_yuv_mmx.cc b/media/base/simd/filter_yuv_mmx.cc
new file mode 100644
index 0000000..77698dc
--- /dev/null
+++ b/media/base/simd/filter_yuv_mmx.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "build/build_config.h"
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+#if defined(COMPILER_MSVC)
+// Warning 4799 is about calling emms before the function exits.
+// We call emms at the frame level, so suppress this warning.
+#pragma warning(disable: 4799)
+#endif
+
+void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m64 zero = _mm_setzero_si64();
+ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+
+ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+ __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+ do {
+ __m64 y0 = *y0_ptr64++;
+ __m64 y1 = *y1_ptr64++;
+ __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+ __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+ y0 = _mm_unpacklo_pi8(y0, zero);
+ y1 = _mm_unpacklo_pi8(y1, zero);
+ y0 = _mm_mullo_pi16(y0, y0_fraction);
+ y1 = _mm_mullo_pi16(y1, y1_fraction);
+ y2 = _mm_mullo_pi16(y2, y0_fraction);
+ y3 = _mm_mullo_pi16(y3, y1_fraction);
+ y0 = _mm_add_pi16(y0, y1);
+ y2 = _mm_add_pi16(y2, y3);
+ y0 = _mm_srli_pi16(y0, 8);
+ y2 = _mm_srli_pi16(y2, 8);
+ y0 = _mm_packs_pu16(y0, y2);
+ *dest64++ = y0;
+ } while (dest64 < end64);
+}
+
+#if defined(COMPILER_MSVC)
+#pragma warning(default: 4799)
+#endif
+
+} // namespace media
diff --git a/media/base/simd/filter_yuv_sse2.cc b/media/base/simd/filter_yuv_sse2.cc
new file mode 100644
index 0000000..137ac94
--- /dev/null
+++ b/media/base/simd/filter_yuv_sse2.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+
+ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+ do {
+ __m128i y0 = _mm_loadu_si128(y0_ptr128);
+ __m128i y1 = _mm_loadu_si128(y1_ptr128);
+ __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+ __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+ y0 = _mm_unpacklo_epi8(y0, zero);
+ y1 = _mm_unpacklo_epi8(y1, zero);
+ y0 = _mm_mullo_epi16(y0, y0_fraction);
+ y1 = _mm_mullo_epi16(y1, y1_fraction);
+ y2 = _mm_mullo_epi16(y2, y0_fraction);
+ y3 = _mm_mullo_epi16(y3, y1_fraction);
+ y0 = _mm_add_epi16(y0, y1);
+ y2 = _mm_add_epi16(y2, y3);
+ y0 = _mm_srli_epi16(y0, 8);
+ y2 = _mm_srli_epi16(y2, 8);
+ y0 = _mm_packus_epi16(y0, y2);
+ *dest128++ = y0;
+ ++y0_ptr128;
+ ++y1_ptr128;
+ } while (dest128 < end128);
+}
+
+} // namespace media
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..7f7e0e8
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+ SECTION_TEXT
+ CPU MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_MMX
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..91c06a5
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,166 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+
+mangle(SYMBOL):
+ %assign stack_offset 0
+
+ extern mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
+
+%if gprsize == 8
+%define WORD_SIZE QWORD
+%else
+%define WORD_SIZE DWORD
+%endif
+
+; Define register aliases.
+%define Xq R1q ; Current X position
+%define COMPLq R2q ; Component A value
+%define COMPLd R2d ; Component A value
+%define U_ARG_REGq R0q ; U plane address argument
+%define V_ARG_REGq R1q ; V plane address argument
+%define SOURCE_DX_ARG_REGq R3q ; Source dx argument
+%define WIDTH_ARG_REGq R2q ; Width argument
+
+%ifdef PIC
+; PIC code shares one register among COMPR, U and V, so the code must be careful
+; not to mix them up. This frees R3q to hold the address of the YUV table.
+%define COMPRq R0q ; Component B value
+%define COMPRd R0d ; Component B value
+%define Uq R0q ; U plane address
+%define Vq R0q ; V plane address
+%define U_PLANE WORD_SIZE [rsp + 3 * gprsize]
+%define TABLE R3q ; Address of the table
+%else
+; Non-PIC code defines.
+%define COMPRq R3q ; Component B value
+%define COMPRd R3d ; Component B value
+%define Uq R0q ; U plane address
+%define Vq R3q ; V plane address
+%define TABLE mangle(kCoefficientsRgbY)
+%endif
+
+; Defines for stack variables. These are used in both PIC and non-PIC code.
+%define V_PLANE WORD_SIZE [rsp + 2 * gprsize]
+%define SOURCE_DX WORD_SIZE [rsp + gprsize]
+%define SOURCE_WIDTH WORD_SIZE [rsp]
+
+; Handle stack variables differently for PIC and non-PIC code.
+
+%ifdef PIC
+; Define stack usage for PIC code. PIC code pushes the U plane onto the stack.
+ PUSH U_ARG_REGq
+ PUSH V_ARG_REGq
+ PUSH SOURCE_DX_ARG_REGq
+ imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx
+ PUSH WIDTH_ARG_REGq
+
+; Load the address of kCoefficientsRgbY into TABLE
+ mov TEMPq, SOURCE_DX_ARG_REGq ; Need to save source_dx first
+ LOAD_SYM TABLE, mangle(kCoefficientsRgbY)
+%define SOURCE_DX_ARG_REGq TEMPq ; Overwrite SOURCE_DX_ARG_REGq to TEMPq
+%else
+; Define stack usage. Non-PIC code just pushes 3 registers onto the stack.
+ PUSH V_ARG_REGq
+ PUSH SOURCE_DX_ARG_REGq
+ imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx
+ PUSH WIDTH_ARG_REGq
+%endif
+
+%macro EPILOGUE 0
+%ifdef PIC
+ ADD rsp, 4 * gprsize
+%else
+ ADD rsp, 3 * gprsize
+%endif
+%endmacro
+
+ xor Xq, Xq ; x = 0
+ cmp SOURCE_DX_ARG_REGq, 0x20000
+ jl .lscaleend
+ mov Xq, 0x8000 ; x = 0.5 for 1/2 or less
+ jmp .lscaleend
+
+.lscaleloop:
+%ifdef PIC
+ mov Uq, U_PLANE ; PIC code saves U_PLANE on stack.
+%endif
+
+; Define macros for scaling YUV components since they are reused.
+%macro SCALEUV 1
+ mov TEMPq, Xq
+ sar TEMPq, 0x11
+ movzx COMPLd, BYTE [%1 + TEMPq]
+ movzx COMPRd, BYTE [%1 + TEMPq + 1]
+ mov TEMPq, Xq
+ and TEMPq, 0x1fffe
+ imul COMPRq, TEMPq
+ xor TEMPq, 0x1fffe
+ imul COMPLq, TEMPq
+ add COMPLq, COMPRq
+ shr COMPLq, 17
+%endmacro
+ SCALEUV Uq ; Use the above macro to scale U
+ movq mm0, [TABLE + 2048 + 8 * COMPLq]
+
+ mov Vq, V_PLANE ; Read V address from stack
+ SCALEUV Vq ; Use the above macro to scale V
+ paddsw mm0, [TABLE + 4096 + 8 * COMPLq]
+
+%macro SCALEY 0
+ mov TEMPq, Xq
+ sar TEMPq, 0x10
+ movzx COMPLd, BYTE [Yq + TEMPq]
+ movzx COMPRd, BYTE [Yq + TEMPq + 1]
+ mov TEMPq, Xq
+ add Xq, SOURCE_DX ; Add source_dx from stack
+ and TEMPq, 0xffff
+ imul COMPRq, TEMPq
+ xor TEMPq, 0xffff
+ imul COMPLq, TEMPq
+ add COMPLq, COMPRq
+ shr COMPLq, 16
+%endmacro
+ SCALEY ; Use the above macro to scale Y1
+ movq mm1, [TABLE + 8 * COMPLq]
+
+ cmp Xq, SOURCE_WIDTH ; Compare source_width from stack
+ jge .lscalelastpixel
+
+ SCALEY ; Use the above macro to scale Y2
+ movq mm2, [TABLE + 8 * COMPLq]
+
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 0x6
+ psraw mm2, 0x6
+ packuswb mm1, mm2
+ MOVQ [ARGBq], mm1
+ add ARGBq, 0x8
+
+.lscaleend:
+ cmp Xq, SOURCE_WIDTH ; Compare source_width from stack
+ jl .lscaleloop
+ EPILOGUE
+ RET
+
+.lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+ EPILOGUE
+ RET
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm
new file mode 100644
index 0000000..db7854457
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm
@@ -0,0 +1,142 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions plus the SSE movntq store.
+;
+ SECTION_TEXT
+ CPU MMX
+
+%define SYMBOL LinearScaleYUVToRGB32Row_MMX_X64
+  global mangle(SYMBOL) PRIVATE
+  align function_align
+
+; Converts one row of YUV to 32-bit ARGB while scaling it horizontally with
+; linear interpolation. The source position x advances by source dx (16.16
+; fixed point) for every output pixel, and each loop iteration produces two
+; output pixels that share one interpolated U/V pair.
+mangle(SYMBOL):
+  %assign stack_offset 0
+  extern mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMPL
+
+; Extra registers used on top of the seven declared in PROLOGUE.
+%define TABLEq r10
+%define Xq r11
+%define INDEXq r12
+%define COMPRd r13d
+%define COMPRq r13
+%define FRACTIONq r14
+
+  ; Save the extra registers; EPILOGUE restores them in reverse order.
+  PUSH TABLEq
+  PUSH Xq
+  PUSH INDEXq
+  PUSH COMPRq
+  PUSH FRACTIONq
+
+%macro EPILOGUE 0
+  POP FRACTIONq
+  POP COMPRq
+  POP INDEXq
+  POP Xq
+  POP TABLEq
+%endmacro
+
+  LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
+
+  ; Width and source dx are passed as 32-bit ints, so the upper halves of
+  ; their 64-bit registers are not guaranteed to hold anything meaningful.
+  ; Sign-extend both before they take part in 64-bit arithmetic.
+  movsxd WIDTHq, WIDTHd
+  movsxd SOURCE_DXq, SOURCE_DXd
+
+  imul WIDTHq, SOURCE_DXq          ; source_width = width * source_dx
+  xor Xq, Xq                       ; x = 0
+  cmp SOURCE_DXq, 0x20000
+  jl .lscaleend
+  mov Xq, 0x8000                   ; x = 0.5 for 1/2 or less
+  jmp .lscaleend
+
+.lscaleloop:
+  ; Interpolate U
+  ; The U/V index is x >> 17 and the blend weight is x & 0x1fffe.
+  mov INDEXq, Xq
+  sar INDEXq, 0x11
+  movzx COMPLd, BYTE [Uq + INDEXq]
+  movzx COMPRd, BYTE [Uq + INDEXq + 1]
+  mov FRACTIONq, Xq
+  and FRACTIONq, 0x1fffe
+  imul COMPRq, FRACTIONq
+  xor FRACTIONq, 0x1fffe           ; FRACTIONq is now the left-sample weight.
+  imul COMPLq, FRACTIONq
+  add COMPLq, COMPRq
+  shr COMPLq, 17
+  movq mm0, [TABLEq + 2048 + 8 * COMPLq]
+
+  ; Interpolate V
+  movzx COMPLd, BYTE [Vq + INDEXq]
+  movzx COMPRd, BYTE [Vq + INDEXq + 1]
+  ; Trick here to imul COMPL first then COMPR.
+  ; FRACTIONq still holds the left-sample weight from the U step, so no
+  ; reload is needed. Saves two instructions. :)
+  imul COMPLq, FRACTIONq
+  xor FRACTIONq, 0x1fffe
+  imul COMPRq, FRACTIONq
+  add COMPLq, COMPRq
+  shr COMPLq, 17
+  paddsw mm0, [TABLEq + 4096 + 8 * COMPLq]
+
+  ; Interpolate first Y1.
+  lea INDEXq, [Xq + SOURCE_DXq]    ; INDEXq now points to next pixel.
+                                   ; Xq points to current pixel.
+  mov FRACTIONq, Xq
+  sar Xq, 0x10
+  movzx COMPLd, BYTE [Yq + Xq]
+  movzx COMPRd, BYTE [Yq + Xq + 1]
+  and FRACTIONq, 0xffff
+  imul COMPRq, FRACTIONq
+  xor FRACTIONq, 0xffff
+  imul COMPLq, FRACTIONq
+  add COMPLq, COMPRq
+  shr COMPLq, 16
+  movq mm1, [TABLEq + 8 * COMPLq]
+
+  ; Interpolate Y2 if available.
+  cmp INDEXq, WIDTHq
+  jge .lscalelastpixel
+
+  lea Xq, [INDEXq + SOURCE_DXq]    ; Xq points to next pixel.
+                                   ; INDEXq points to current pixel.
+  mov FRACTIONq, INDEXq
+  sar INDEXq, 0x10
+  movzx COMPLd, BYTE [Yq + INDEXq]
+  movzx COMPRd, BYTE [Yq + INDEXq + 1]
+  and FRACTIONq, 0xffff
+  imul COMPRq, FRACTIONq
+  xor FRACTIONq, 0xffff
+  imul COMPLq, FRACTIONq
+  add COMPLq, COMPRq
+  shr COMPLq, 16
+  movq mm2, [TABLEq + 8 * COMPLq]
+
+  ; Add the shared U/V contribution to both pixels, drop the 6 fraction
+  ; bits, saturate to bytes and store the two ARGB pixels at once.
+  paddsw mm1, mm0
+  paddsw mm2, mm0
+  psraw mm1, 0x6
+  psraw mm2, 0x6
+  packuswb mm1, mm2
+  movntq [ARGBq], mm1
+  add ARGBq, 0x8
+
+.lscaleend:
+  cmp Xq, WIDTHq
+  jl .lscaleloop
+  jmp .epilogue
+
+.lscalelastpixel:
+  ; Only one pixel is left: convert and store a single ARGB pixel.
+  paddsw mm1, mm0
+  psraw mm1, 6
+  packuswb mm1, mm1
+  movd [ARGBq], mm1
+
+.epilogue:
+  EPILOGUE
+  RET
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_sse.asm b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..847911c
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+ SECTION_TEXT
+ CPU MMX, SSE
+
+; Use movntq (non-temporal store) to save the output.
+%define MOVQ movntq
+
+; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_SSE
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.asm b/media/base/simd/scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..6a83757
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+ SECTION_TEXT
+ CPU MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_MMX
+%include "scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..94c101c
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,115 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+  global mangle(SYMBOL) PRIVATE
+  align function_align
+
+; Converts one row of YUV to 32-bit ARGB while scaling it horizontally by
+; point sampling (no interpolation). The including file defines SYMBOL (the
+; function name) and MOVQ (the instruction used for the 8-byte store).
+mangle(SYMBOL):
+  %assign stack_offset 0
+
+  extern mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
+
+%ifdef ARCH_X86_64
+%define WORD_SIZE QWORD
+  ; Width (R1) and source dx (R2) are passed as 32-bit ints, so the upper
+  ; halves of their 64-bit registers are not guaranteed to be meaningful.
+  ; Sign-extend them before they are pushed and used in 64-bit arithmetic.
+  movsxd R1q, R1d
+  movsxd R2q, R2d
+%else
+%define WORD_SIZE DWORD
+%endif
+
+%ifdef PIC
+  PUSH R1q  ; Width
+%endif
+  PUSH R2q  ; Source dx
+
+%define SOURCE_DX WORD_SIZE [rsp]
+
+; PIC code.
+; R1 is needed for the table address, so width lives on the stack instead.
+%ifdef PIC
+  LOAD_SYM R1q, mangle(kCoefficientsRgbY)
+%define WIDTH WORD_SIZE [rsp + gprsize]
+%define TABLE R1q
+%define Xq R2q
+
+; Non-PIC code.
+%else
+%define WIDTH R1q
+%define TABLE mangle(kCoefficientsRgbY)
+%define Xq R2q
+%endif
+
+  ; Set Xq index to 0.
+  ; Source dx was saved on the stack above, so R2 can be reused for x.
+  xor Xq, Xq
+  jmp .scaleend
+
+.scaleloop:
+  ; TABLE can either be a register or a symbol depending on this is
+  ; PIC or not.
+  ; U and V are indexed with x >> 17 and shared by the two pixels below.
+  mov TEMPq, Xq
+  sar TEMPq, 17
+  movzx TEMPd, BYTE [Uq + TEMPq]
+  movq mm0, [TABLE + 2048 + 8 * TEMPq]
+  mov TEMPq, Xq
+  sar TEMPq, 17
+  movzx TEMPd, BYTE [Vq + TEMPq]
+  paddsw mm0, [TABLE + 4096 + 8 * TEMPq]
+  ; First Y sample, indexed with x >> 16; x then advances by source dx.
+  mov TEMPq, Xq
+  add Xq, SOURCE_DX
+  sar TEMPq, 16
+  movzx TEMPd, BYTE [Yq + TEMPq]
+  movq mm1, [TABLE + 8 * TEMPq]
+  ; Second Y sample.
+  mov TEMPq, Xq
+  add Xq, SOURCE_DX
+  sar TEMPq, 16
+  movzx TEMPd, BYTE [Yq + TEMPq]
+  movq mm2, [TABLE + 8 * TEMPq]
+  ; Combine with U/V, drop the 6 fraction bits, saturate to bytes and store
+  ; two ARGB pixels.
+  paddsw mm1, mm0
+  paddsw mm2, mm0
+  psraw mm1, 6
+  psraw mm2, 6
+  packuswb mm1, mm2
+  MOVQ QWORD [ARGBq], mm1
+  add ARGBq, 8
+
+.scaleend:
+  ; WIDTH can either be a register or memory depending on this is
+  ; PIC or not.
+  sub WIDTH, 2
+  jns .scaleloop
+
+  and WIDTH, 1  ; odd number of pixels?
+  jz .scaledone
+
+  ; Convert the single remaining pixel.
+  mov TEMPq, Xq
+  sar TEMPq, 17
+  movzx TEMPd, BYTE [Uq + TEMPq]
+  movq mm0, [TABLE + 2048 + 8 * TEMPq]
+  mov TEMPq, Xq
+  sar TEMPq, 17
+  movzx TEMPd, BYTE [Vq + TEMPq]
+  paddsw mm0, [TABLE + 4096 + 8 * TEMPq]
+  mov TEMPq, Xq
+  sar TEMPq, 16
+  movzx TEMPd, BYTE [Yq + TEMPq]
+  movq mm1, [TABLE + 8 * TEMPq]
+  paddsw mm1, mm0
+  psraw mm1, 6
+  packuswb mm1, mm1
+  movd DWORD [ARGBq], mm1
+
+.scaledone:
+  ; Discard the values pushed after PROLOGUE.
+%ifdef PIC
+  ADD rsp, 2 * gprsize
+%else
+  ADD rsp, gprsize
+%endif
+  RET
diff --git a/media/base/simd/scale_yuv_to_rgb_sse.asm b/media/base/simd/scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..bdd5625
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,31 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+ SECTION_TEXT
+ CPU MMX, SSE
+
+; Use movntq (non-temporal store) to save the output.
+%define MOVQ movntq
+
+; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE
+%include "scale_yuv_to_rgb_mmx.inc"
+
+; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+
diff --git a/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
new file mode 100644
index 0000000..e021457
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
@@ -0,0 +1,109 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses SSE and SSE2 instructions.
+;
+ SECTION_TEXT
+ CPU SSE2
+
+; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+;                                  const uint8* u_buf,
+;                                  const uint8* v_buf,
+;                                  uint8* rgb_buf,
+;                                  int width,
+;                                  int source_dx);
+;
+; Converts one row of YUV to 32-bit ARGB while scaling it horizontally by
+; point sampling. x advances by source_dx (16.16 fixed point) per output
+; pixel; the main loop emits two pixels per iteration.
+%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64
+
+  global mangle(SYMBOL) PRIVATE
+  align function_align
+
+mangle(SYMBOL):
+  %assign stack_offset 0
+  extern mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMP
+
+; Extra registers used on top of the seven declared in PROLOGUE. They are
+; saved here and restored at .scaledone.
+%define TABLEq r10
+%define Xq r11
+%define INDEXq r12
+  PUSH r10
+  PUSH r11
+  PUSH r12
+
+  LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
+
+  ; width and source_dx are 32-bit ints, so the upper halves of their
+  ; 64-bit registers are not guaranteed to hold anything meaningful.
+  ; Sign-extend both before they take part in 64-bit arithmetic.
+  movsxd WIDTHq, WIDTHd
+  movsxd SOURCE_DXq, SOURCE_DXd
+
+  ; Set Xq index to 0.
+  xor Xq, Xq
+  jmp .scaleend
+
+.scaleloop:
+  ; Read UV pixels.
+  mov INDEXq, Xq
+  sar INDEXq, 17
+  movzx COMPd, BYTE [Uq + INDEXq]
+  movq xmm0, [TABLEq + 2048 + 8 * COMPq]
+  movzx COMPd, BYTE [Vq + INDEXq]
+  movq xmm1, [TABLEq + 4096 + 8 * COMPq]
+
+  ; Read first Y pixel.
+  lea INDEXq, [Xq + SOURCE_DXq]    ; INDEXq now points to next pixel.
+  sar Xq, 16
+  movzx COMPd, BYTE [Yq + Xq]
+  paddsw xmm0, xmm1                ; Hide an ADD after memory load.
+  movq xmm1, [TABLEq + 8 * COMPq]
+
+  ; Read next Y pixel.
+  lea Xq, [INDEXq + SOURCE_DXq]    ; Xq now points to next pixel.
+  sar INDEXq, 16
+  movzx COMPd, BYTE [Yq + INDEXq]
+  movq xmm2, [TABLEq + 8 * COMPq]
+  paddsw xmm1, xmm0
+  paddsw xmm2, xmm0
+  shufps xmm1, xmm2, 0x44          ; Join two pixels into one XMM register
+  psraw xmm1, 6
+  packuswb xmm1, xmm1
+  movq QWORD [ARGBq], xmm1
+  add ARGBq, 8
+
+.scaleend:
+  sub WIDTHq, 2
+  jns .scaleloop
+
+  and WIDTHq, 1                    ; odd number of pixels?
+  jz .scaledone
+
+  ; Read U V components.
+  mov INDEXq, Xq
+  sar INDEXq, 17
+  movzx COMPd, BYTE [Uq + INDEXq]
+  movq xmm0, [TABLEq + 2048 + 8 * COMPq]
+  movzx COMPd, BYTE [Vq + INDEXq]
+  paddsw xmm0, [TABLEq + 4096 + 8 * COMPq]
+
+  ; Read one Y component.
+  mov INDEXq, Xq
+  sar INDEXq, 16
+  movzx COMPd, BYTE [Yq + INDEXq]
+  movq xmm1, [TABLEq + 8 * COMPq]
+  paddsw xmm1, xmm0
+  psraw xmm1, 6
+  packuswb xmm1, xmm1
+  movd DWORD [ARGBq], xmm1
+
+.scaledone:
+  POP r12
+  POP r11
+  POP r10
+  RET
diff --git a/media/base/simd/x86inc.asm b/media/base/simd/x86inc.asm
index 956b999..5e0ca20 100644
--- a/media/base/simd/x86inc.asm
+++ b/media/base/simd/x86inc.asm
@@ -95,11 +95,14 @@
%ifdef WIN64
%define PIC
%elifndef ARCH_X86_64
+; For chromium we may build PIC code even for 32 bits system.
+%ifndef CHROMIUM
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
+%endif
%ifdef PIC
default rel
%endif
@@ -947,6 +950,11 @@ AVX_INSTR pfmul, 1, 0
;=============================================================================
%ifdef CHROMIUM
+; Always build PIC code on Mac for Chromium.
+%ifdef MACHO
+%define PIC
+%endif
+
;
; LOAD_SYM %1 (reg), %2 (sym)
; Copies the address to a local symbol to the specified register.
diff --git a/media/base/yuv_convert.cc b/media/base/yuv_convert.cc
index cbf7f57..22f1a24 100644
--- a/media/base/yuv_convert.cc
+++ b/media/base/yuv_convert.cc
@@ -17,151 +17,94 @@
#include "media/base/yuv_convert.h"
+#include "base/logging.h"
#include "build/build_config.h"
#include "media/base/cpu_features.h"
#include "media/base/simd/convert_rgb_to_yuv.h"
+#include "media/base/simd/convert_yuv_to_rgb.h"
+#include "media/base/simd/filter_yuv.h"
#include "media/base/yuv_convert_internal.h"
#include "media/base/yuv_row.h"
-#if USE_MMX
+#if defined(ARCH_CPU_X86_FAMILY)
#if defined(_MSC_VER)
#include <intrin.h>
#else
+#include <emmintrin.h>
#include <mmintrin.h>
#endif
#endif
-#if USE_SSE2
-#include <emmintrin.h>
-#endif
-
namespace media {
-// 16.16 fixed point arithmetic
-const int kFractionBits = 16;
-const int kFractionMax = 1 << kFractionBits;
-const int kFractionMask = ((1 << kFractionBits) - 1);
-
-// Convert a frame of YUV to 32 bit ARGB.
-void ConvertYUVToRGB32(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width,
- int height,
- int y_pitch,
- int uv_pitch,
- int rgb_pitch,
- YUVType yuv_type) {
- unsigned int y_shift = yuv_type;
- for (int y = 0; y < height; ++y) {
- uint8* rgb_row = rgb_buf + y * rgb_pitch;
- const uint8* y_ptr = y_buf + y * y_pitch;
- const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
- const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
-
- FastConvertYUVToRGB32Row(y_ptr,
- u_ptr,
- v_ptr,
- rgb_row,
- width);
- }
-
- // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
- EMMS();
+// Returns the row filter to use: SSE2 if the CPU has it, otherwise MMX,
+// otherwise the plain C implementation.
+static FilterYUVRowsProc ChooseFilterYUVRowsProc() {
+  FilterYUVRowsProc proc = &FilterYUVRows_C;
+#if defined(ARCH_CPU_X86_FAMILY)
+  if (hasSSE2())
+    proc = &FilterYUVRows_SSE2;
+  else if (hasMMX())
+    proc = &FilterYUVRows_MMX;
+#endif
+  return proc;
}
-#if USE_SSE2
-// FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version does 16 pixels at a time
-
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
- int source_width, int source_y_fraction) {
- __m128i zero = _mm_setzero_si128();
- __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
- __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
-
- const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
- const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
- __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
- __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
-
- do {
- __m128i y0 = _mm_loadu_si128(y0_ptr128);
- __m128i y1 = _mm_loadu_si128(y1_ptr128);
- __m128i y2 = _mm_unpackhi_epi8(y0, zero);
- __m128i y3 = _mm_unpackhi_epi8(y1, zero);
- y0 = _mm_unpacklo_epi8(y0, zero);
- y1 = _mm_unpacklo_epi8(y1, zero);
- y0 = _mm_mullo_epi16(y0, y0_fraction);
- y1 = _mm_mullo_epi16(y1, y1_fraction);
- y2 = _mm_mullo_epi16(y2, y0_fraction);
- y3 = _mm_mullo_epi16(y3, y1_fraction);
- y0 = _mm_add_epi16(y0, y1);
- y2 = _mm_add_epi16(y2, y3);
- y0 = _mm_srli_epi16(y0, 8);
- y2 = _mm_srli_epi16(y2, 8);
- y0 = _mm_packus_epi16(y0, y2);
- *dest128++ = y0;
- ++y0_ptr128;
- ++y1_ptr128;
- } while (dest128 < end128);
+// Returns the unscaled row converter to use: SSE if the CPU has it,
+// otherwise MMX, otherwise the plain C implementation.
+static ConvertYUVToRGB32RowProc ChooseConvertYUVToRGB32RowProc() {
+  ConvertYUVToRGB32RowProc proc = &ConvertYUVToRGB32Row_C;
+#if defined(ARCH_CPU_X86_FAMILY)
+  if (hasSSE())
+    proc = &ConvertYUVToRGB32Row_SSE;
+  else if (hasMMX())
+    proc = &ConvertYUVToRGB32Row_MMX;
+#endif
+  return proc;
}
-#elif USE_MMX
-// MMX version does 8 pixels at a time
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
- int source_width, int source_y_fraction) {
- __m64 zero = _mm_setzero_si64();
- __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
- __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
-
- const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
- const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
- __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
- __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
-
- do {
- __m64 y0 = *y0_ptr64++;
- __m64 y1 = *y1_ptr64++;
- __m64 y2 = _mm_unpackhi_pi8(y0, zero);
- __m64 y3 = _mm_unpackhi_pi8(y1, zero);
- y0 = _mm_unpacklo_pi8(y0, zero);
- y1 = _mm_unpacklo_pi8(y1, zero);
- y0 = _mm_mullo_pi16(y0, y0_fraction);
- y1 = _mm_mullo_pi16(y1, y1_fraction);
- y2 = _mm_mullo_pi16(y2, y0_fraction);
- y3 = _mm_mullo_pi16(y3, y1_fraction);
- y0 = _mm_add_pi16(y0, y1);
- y2 = _mm_add_pi16(y2, y3);
- y0 = _mm_srli_pi16(y0, 8);
- y2 = _mm_srli_pi16(y2, 8);
- y0 = _mm_packs_pu16(y0, y2);
- *dest64++ = y0;
- } while (dest64 < end64);
+
+// Returns the point-sampling row scaler to use. 64-bit x86 builds always
+// get the dedicated X64 routine; 32-bit x86 builds pick SSE or MMX at run
+// time; everything else gets the plain C implementation.
+static ScaleYUVToRGB32RowProc ChooseScaleYUVToRGB32RowProc() {
+  ScaleYUVToRGB32RowProc proc = &ScaleYUVToRGB32Row_C;
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(ARCH_CPU_X86_64)
+  // Use 64-bits version if possible.
+  proc = &ScaleYUVToRGB32Row_SSE2_X64;
+#else
+  // Choose the best one on 32-bits system.
+  if (hasSSE())
+    proc = &ScaleYUVToRGB32Row_SSE;
+  else if (hasMMX())
+    proc = &ScaleYUVToRGB32Row_MMX;
+#endif
+#endif
+  return proc;
}
-#else // no MMX or SSE2
-// C version does 8 at a time to mimic MMX code
-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
- int source_width, int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
- uint8* end = ybuf + source_width;
- do {
- ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
- ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
- ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
- ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
- ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
- ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
- ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
- ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
- y0_ptr += 8;
- y1_ptr += 8;
- ybuf += 8;
- } while (ybuf < end);
+
+// Returns the linear-interpolating row scaler to use. 64-bit x86 builds
+// always get the dedicated X64 routine; 32-bit x86 builds pick SSE or MMX
+// at run time; everything else gets the plain C implementation.
+static ScaleYUVToRGB32RowProc ChooseLinearScaleYUVToRGB32RowProc() {
+  ScaleYUVToRGB32RowProc proc = &LinearScaleYUVToRGB32Row_C;
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(ARCH_CPU_X86_64)
+  // Use 64-bits version if possible.
+  proc = &LinearScaleYUVToRGB32Row_MMX_X64;
+#else
+  // 32-bits system.
+  if (hasSSE())
+    proc = &LinearScaleYUVToRGB32Row_SSE;
+  else if (hasMMX())
+    proc = &LinearScaleYUVToRGB32Row_MMX;
+#endif
+#endif
+  return proc;
}
+
+// Clears the MMX register state after SIMD code has run. Does nothing on
+// non-x86 builds or when the CPU reports no MMX support.
+void EmptyRegisterState() {
+#if defined(ARCH_CPU_X86_FAMILY)
+  // Probe the CPU on the first call only and remember the answer.
+  static bool probed = false;
+  static bool mmx_available = false;
+  if (!probed) {
+    mmx_available = hasMMX();
+    probed = true;
+  }
+  if (mmx_available)
+    _mm_empty();
#endif
+}
+// 16.16 fixed point arithmetic
+const int kFractionBits = 16;
+const int kFractionMax = 1 << kFractionBits;
+const int kFractionMask = ((1 << kFractionBits) - 1);
// Scale a frame of YUV to 32 bit ARGB.
void ScaleYUVToRGB32(const uint8* y_buf,
@@ -178,6 +121,20 @@ void ScaleYUVToRGB32(const uint8* y_buf,
YUVType yuv_type,
Rotate view_rotate,
ScaleFilter filter) {
+ static FilterYUVRowsProc filter_proc = NULL;
+ static ConvertYUVToRGB32RowProc convert_proc = NULL;
+ static ScaleYUVToRGB32RowProc scale_proc = NULL;
+ static ScaleYUVToRGB32RowProc linear_scale_proc = NULL;
+
+ if (!filter_proc)
+ filter_proc = ChooseFilterYUVRowsProc();
+ if (!convert_proc)
+ convert_proc = ChooseConvertYUVToRGB32RowProc();
+ if (!scale_proc)
+ scale_proc = ChooseScaleYUVToRGB32RowProc();
+ if (!linear_scale_proc)
+ linear_scale_proc = ChooseLinearScaleYUVToRGB32RowProc();
+
// Handle zero sized sources and destinations.
if ((yuv_type == YV12 && (source_width < 2 || source_height < 2)) ||
(yuv_type == YV16 && (source_width < 2 || source_height < 1)) ||
@@ -225,9 +182,6 @@ void ScaleYUVToRGB32(const uint8* y_buf,
int source_dx = source_width * kFractionMax / width;
int source_dy = source_height * kFractionMax / height;
-#if USE_MMX && defined(_MSC_VER)
- int source_dx_uv = source_dx;
-#endif
if ((view_rotate == ROTATE_90) ||
(view_rotate == ROTATE_270)) {
@@ -240,9 +194,6 @@ void ScaleYUVToRGB32(const uint8* y_buf,
int original_dx = source_dx;
int original_dy = source_dy;
source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits;
-#if USE_MMX && defined(_MSC_VER)
- source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits;
-#endif
source_dy = original_dx;
if (view_rotate == ROTATE_90) {
y_pitch = -1;
@@ -294,7 +245,7 @@ void ScaleYUVToRGB32(const uint8* y_buf,
if (filter & media::FILTER_BILINEAR_V) {
if (yscale_fixed != kFractionMax &&
source_y_fraction && ((source_y + 1) < source_height)) {
- FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
+ filter_proc(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
} else {
memcpy(ybuf, y0_ptr, source_width);
}
@@ -304,8 +255,8 @@ void ScaleYUVToRGB32(const uint8* y_buf,
if (yscale_fixed != kFractionMax &&
source_uv_fraction &&
(((source_y >> y_shift) + 1) < (source_height >> y_shift))) {
- FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
- FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
+ filter_proc(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
+ filter_proc(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
} else {
memcpy(ubuf, u0_ptr, uv_source_width);
memcpy(vbuf, v0_ptr, uv_source_width);
@@ -316,41 +267,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
vbuf[uv_source_width] = vbuf[uv_source_width - 1];
}
if (source_dx == kFractionMax) { // Not scaled
- FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width);
+ convert_proc(y_ptr, u_ptr, v_ptr, dest_pixel, width);
} else {
if (filter & FILTER_BILINEAR_H) {
- LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width, source_dx);
- } else {
-// Specialized scalers and rotation.
-#if USE_MMX && defined(_MSC_VER)
- if (width == (source_width * 2)) {
- DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width);
- } else if ((source_dx & kFractionMask) == 0) {
- // Scaling by integer scale factor. ie half.
- ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width,
- source_dx >> kFractionBits);
- } else if (source_dx_uv == source_dx) { // Not rotated.
- ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width, source_dx);
- } else {
- RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width,
- source_dx >> kFractionBits,
- source_dx_uv >> kFractionBits);
- }
-#else
- ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
- dest_pixel, width, source_dx);
-#endif
+ linear_scale_proc(y_ptr, u_ptr, v_ptr, dest_pixel, width, source_dx);
+ } else {
+ scale_proc(y_ptr, u_ptr, v_ptr, dest_pixel, width, source_dx);
}
}
}
- // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
- EMMS();
+
+ EmptyRegisterState();
}
void ConvertRGB32ToYUV(const uint8* rgbframe,
@@ -371,7 +298,9 @@ void ConvertRGB32ToYUV(const uint8* rgbframe,
convert_proc = &ConvertRGB32ToYUV_C;
#else
// For x86 processors, check if SSSE3 (or SSE2) is supported.
- if (hasSSE2())
+ if (hasSSSE3())
+ convert_proc = &ConvertRGB32ToYUV_SSSE3;
+ else if (hasSSE2())
convert_proc = &ConvertRGB32ToYUV_SSE2;
else
convert_proc = &ConvertRGB32ToYUV_C;
@@ -391,8 +320,21 @@ void ConvertRGB24ToYUV(const uint8* rgbframe,
int rgbstride,
int ystride,
int uvstride) {
+#if defined(ARCH_CPU_ARM_FAMILY)
ConvertRGB24ToYUV_C(rgbframe, yplane, uplane, vplane, width, height,
rgbstride, ystride, uvstride);
+#else
+ static void (*convert_proc)(const uint8*, uint8*, uint8*, uint8*,
+ int, int, int, int, int) = NULL;
+ if (!convert_proc) {
+ if (hasSSSE3())
+ convert_proc = &ConvertRGB24ToYUV_SSSE3;
+ else
+ convert_proc = &ConvertRGB24ToYUV_C;
+ }
+ convert_proc(rgbframe, yplane, uplane, vplane, width, height,
+ rgbstride, ystride, uvstride);
+#endif
}
void ConvertYUY2ToYUV(const uint8* src,
@@ -403,4 +345,34 @@ void ConvertYUY2ToYUV(const uint8* src,
int height) {
ConvertYUY2ToYUV_C(src, yplane, uplane, vplane, width, height);
}
+
+// Converts a whole frame of YUV to 32 bit ARGB, dispatching to the fastest
+// implementation available for the running CPU.
+void ConvertYUVToRGB32(const uint8* yplane,
+                       const uint8* uplane,
+                       const uint8* vplane,
+                       uint8* rgbframe,
+                       int width,
+                       int height,
+                       int ystride,
+                       int uvstride,
+                       int rgbstride,
+                       YUVType yuv_type) {
+#if defined(ARCH_CPU_X86_FAMILY)
+  // The SSE and MMX converters only exist for x86, so guard on the x86
+  // family like the Choose*Proc() helpers in this file do. The choice is
+  // made on the first call and reused afterwards.
+  static ConvertYUVToRGB32Proc convert_proc = NULL;
+  if (!convert_proc) {
+    if (hasSSE())
+      convert_proc = &ConvertYUVToRGB32_SSE;
+    else if (hasMMX())
+      convert_proc = &ConvertYUVToRGB32_MMX;
+    else
+      convert_proc = &ConvertYUVToRGB32_C;
+  }
+
+  convert_proc(yplane, uplane, vplane, rgbframe,
+               width, height, ystride, uvstride, rgbstride, yuv_type);
+#else
+  // Every other architecture (ARM included) uses the C implementation.
+  ConvertYUVToRGB32_C(yplane, uplane, vplane, rgbframe,
+                      width, height, ystride, uvstride, rgbstride, yuv_type);
+#endif
+}
+
} // namespace media
diff --git a/media/base/yuv_convert.h b/media/base/yuv_convert.h
index 20ba0aa..95b1780 100644
--- a/media/base/yuv_convert.h
+++ b/media/base/yuv_convert.h
@@ -94,6 +94,10 @@ void ConvertYUY2ToYUV(const uint8* src,
int width,
int height);
+// Empty SIMD register state after calling optimized scaler functions.
+// ScaleYUVToRGB32() does this itself; unit tests that call the SIMD row
+// functions directly call it afterwards.
+void EmptyRegisterState();
+
} // namespace media
#endif // MEDIA_BASE_YUV_CONVERT_H_
diff --git a/media/base/yuv_convert_internal.h b/media/base/yuv_convert_internal.h
index 80776aa..7be14c4 100644
--- a/media/base/yuv_convert_internal.h
+++ b/media/base/yuv_convert_internal.h
@@ -13,15 +13,15 @@
namespace media {
// SSE2 version of converting RGBA to YV12.
-extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
- uint8* yplane,
- uint8* uplane,
- uint8* vplane,
- int width,
- int height,
- int rgbstride,
- int ystride,
- int uvstride);
+void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
+ uint8* yplane,
+ uint8* uplane,
+ uint8* vplane,
+ int width,
+ int height,
+ int rgbstride,
+ int ystride,
+ int uvstride);
// This is a C reference implementation of the above routine.
// This method should only be used in unit test.
diff --git a/media/base/yuv_convert_unittest.cc b/media/base/yuv_convert_unittest.cc
index 380897e..5de3b11 100644
--- a/media/base/yuv_convert_unittest.cc
+++ b/media/base/yuv_convert_unittest.cc
@@ -8,6 +8,7 @@
#include "base/path_service.h"
#include "media/base/cpu_features.h"
#include "media/base/djb2.h"
+#include "media/base/simd/convert_yuv_to_rgb.h"
#include "media/base/yuv_convert.h"
#include "media/base/yuv_convert_internal.h"
#include "media/base/yuv_row.h"
@@ -376,17 +377,7 @@ TEST(YUVConvertTest, RGB32ToYUV_SSE2_MatchReference) {
scoped_array<uint8> yuv_converted_bytes(new uint8[kYUV12Size]);
scoped_array<uint8> yuv_reference_bytes(new uint8[kYUV12Size]);
- // Read YUV reference data from file.
- FilePath yuv_url;
- EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &yuv_url));
- yuv_url = yuv_url.Append(FILE_PATH_LITERAL("media"))
- .Append(FILE_PATH_LITERAL("test"))
- .Append(FILE_PATH_LITERAL("data"))
- .Append(FILE_PATH_LITERAL("bali_640x360_P420.yuv"));
- EXPECT_EQ(static_cast<int>(kYUV12Size),
- file_util::ReadFile(yuv_url,
- reinterpret_cast<char*>(yuv_bytes.get()),
- static_cast<int>(kYUV12Size)));
+ ReadYV12Data(&yuv_bytes);
// Convert a frame of YUV to 32 bit ARGB.
media::ConvertYUVToRGB32(
@@ -459,4 +450,241 @@ TEST(YUVConvertTest, RGB32ToYUV_SSE2_MatchReference) {
// Make sure there's no difference from the reference.
EXPECT_EQ(0, error);
}
-#endif
+
+// Checks that the MMX row converter matches the C reference byte for byte.
+TEST(YUVConvertTest, ConvertYUVToRGB32Row_MMX) {
+  if (!media::hasMMX()) {
+    LOG(WARNING) << "System not supported. Test skipped.";
+    return;
+  }
+
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  ConvertYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                         rgb_bytes_reference.get(), kWidth);
+  ConvertYUVToRGB32Row_MMX(y_plane, u_plane, v_plane,
+                           rgb_bytes_converted.get(), kWidth);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+// Checks that the SSE row converter matches the C reference byte for byte.
+TEST(YUVConvertTest, ConvertYUVToRGB32Row_SSE) {
+  if (!media::hasSSE()) {
+    LOG(WARNING) << "System not supported. Test skipped.";
+    return;
+  }
+
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  ConvertYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                         rgb_bytes_reference.get(), kWidth);
+  ConvertYUVToRGB32Row_SSE(y_plane, u_plane, v_plane,
+                           rgb_bytes_converted.get(), kWidth);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+// Checks that the MMX point-sampling row scaler matches the C reference.
+TEST(YUVConvertTest, ScaleYUVToRGB32Row_MMX) {
+  if (!media::hasMMX()) {
+    LOG(WARNING) << "System not supported. Test skipped.";
+    return;
+  }
+
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  const int kSourceDx = 80000;  // This value means a scale down.
+  ScaleYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                       rgb_bytes_reference.get(), kWidth, kSourceDx);
+  ScaleYUVToRGB32Row_MMX(y_plane, u_plane, v_plane,
+                         rgb_bytes_converted.get(), kWidth, kSourceDx);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+// Checks that the SSE point-sampling row scaler matches the C reference.
+TEST(YUVConvertTest, ScaleYUVToRGB32Row_SSE) {
+  if (!media::hasSSE()) {
+    LOG(WARNING) << "System not supported. Test skipped.";
+    return;
+  }
+
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  const int kSourceDx = 80000;  // This value means a scale down.
+  ScaleYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                       rgb_bytes_reference.get(), kWidth, kSourceDx);
+  ScaleYUVToRGB32Row_SSE(y_plane, u_plane, v_plane,
+                         rgb_bytes_converted.get(), kWidth, kSourceDx);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+// Checks that the MMX linear row scaler matches the C reference.
+TEST(YUVConvertTest, LinearScaleYUVToRGB32Row_MMX) {
+  if (!media::hasMMX()) {
+    LOG(WARNING) << "System not supported. Test skipped.";
+    return;
+  }
+
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  const int kSourceDx = 80000;  // This value means a scale down.
+  LinearScaleYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                             rgb_bytes_reference.get(), kWidth, kSourceDx);
+  LinearScaleYUVToRGB32Row_MMX(y_plane, u_plane, v_plane,
+                               rgb_bytes_converted.get(), kWidth, kSourceDx);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+// Checks that the SSE linear row scaler matches the C reference.
+TEST(YUVConvertTest, LinearScaleYUVToRGB32Row_SSE) {
+  if (!media::hasSSE()) {
+    LOG(WARNING) << "System not supported. Test skipped.";
+    return;
+  }
+
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  const int kSourceDx = 80000;  // This value means a scale down.
+  LinearScaleYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                             rgb_bytes_reference.get(), kWidth, kSourceDx);
+  LinearScaleYUVToRGB32Row_SSE(y_plane, u_plane, v_plane,
+                               rgb_bytes_converted.get(), kWidth, kSourceDx);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+#if defined(ARCH_CPU_X86_64)
+
+// Checks that the 64-bit SSE2 point-sampling row scaler matches the C
+// reference.
+TEST(YUVConvertTest, ScaleYUVToRGB32Row_SSE2_X64) {
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  const int kSourceDx = 80000;  // This value means a scale down.
+  ScaleYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                       rgb_bytes_reference.get(), kWidth, kSourceDx);
+  ScaleYUVToRGB32Row_SSE2_X64(y_plane, u_plane, v_plane,
+                              rgb_bytes_converted.get(), kWidth, kSourceDx);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+// Checks that the 64-bit MMX linear row scaler matches the C reference.
+TEST(YUVConvertTest, LinearScaleYUVToRGB32Row_MMX_X64) {
+  scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]);
+  scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]);
+  scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]);
+  ReadYV12Data(&yuv_bytes);
+
+  // Plane pointers into the loaded frame.
+  const uint8* const y_plane = yuv_bytes.get();
+  const uint8* const u_plane = y_plane + kSourceUOffset;
+  const uint8* const v_plane = y_plane + kSourceVOffset;
+
+  const int kWidth = 167;
+  const int kSourceDx = 80000;  // This value means a scale down.
+  LinearScaleYUVToRGB32Row_C(y_plane, u_plane, v_plane,
+                             rgb_bytes_reference.get(), kWidth, kSourceDx);
+  LinearScaleYUVToRGB32Row_MMX_X64(y_plane, u_plane, v_plane,
+                                   rgb_bytes_converted.get(), kWidth,
+                                   kSourceDx);
+  media::EmptyRegisterState();
+  EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(),
+                      rgb_bytes_converted.get(),
+                      kWidth * kBpp));
+}
+
+#endif // defined(ARCH_CPU_X86_64)
+
+#endif // defined(ARCH_CPU_X86_FAMILY)
diff --git a/media/base/yuv_row_posix.cc b/media/base/yuv_row_posix.cc
index 2217f38..f839de8 100644
--- a/media/base/yuv_row_posix.cc
+++ b/media/base/yuv_row_posix.cc
@@ -920,4 +920,3 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf,
#endif // USE_MMX
} // extern "C"
-
diff --git a/media/media.gyp b/media/media.gyp
index 5282afb..6ecd6af 100644
--- a/media/media.gyp
+++ b/media/media.gyp
@@ -341,7 +341,12 @@
'conditions': [
[ 'target_arch == "ia32" or target_arch == "x64"', {
'dependencies': [
- 'yuv_convert_sse2',
+ 'yuv_convert_simd_x86',
+ ],
+ }],
+ [ 'target_arch == "arm"', {
+ 'dependencies': [
+ 'yuv_convert_simd_arm',
],
}],
],
@@ -357,15 +362,45 @@
],
},
{
- 'target_name': 'yuv_convert_sse2',
+ 'target_name': 'yuv_convert_simd_x86',
'type': 'static_library',
'include_dirs': [
'..',
],
+ 'sources': [
+ 'base/yuv_convert_sse2.cc',
+ 'base/simd/convert_rgb_to_yuv_x86.cc',
+ 'base/simd/convert_rgb_to_yuv_ssse3.asm',
+ 'base/simd/convert_rgb_to_yuv_ssse3.inc',
+ 'base/simd/convert_yuv_to_rgb_c.cc',
+ 'base/simd/convert_yuv_to_rgb_x86.cc',
+ 'base/simd/convert_yuv_to_rgb_mmx.asm',
+ 'base/simd/convert_yuv_to_rgb_mmx.inc',
+ 'base/simd/convert_yuv_to_rgb_sse.asm',
+ 'base/simd/filter_yuv.h',
+ 'base/simd/filter_yuv_c.cc',
+ 'base/simd/filter_yuv_mmx.cc',
+ 'base/simd/filter_yuv_sse2.cc',
+ 'base/simd/linear_scale_yuv_to_rgb_mmx.asm',
+ 'base/simd/linear_scale_yuv_to_rgb_mmx.inc',
+ 'base/simd/linear_scale_yuv_to_rgb_sse.asm',
+ 'base/simd/scale_yuv_to_rgb_mmx.asm',
+ 'base/simd/scale_yuv_to_rgb_mmx.inc',
+ 'base/simd/scale_yuv_to_rgb_sse.asm',
+ ],
'conditions': [
+ [ 'target_arch == "x64"', {
+ # Source files optimized for X64 systems.
+ 'sources': [
+ 'base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm',
+ 'base/simd/scale_yuv_to_rgb_sse2_x64.asm',
+ ],
+ }],
[ 'os_posix == 1 and OS != "mac"', {
'cflags': [
'-msse2',
+ '-msse3',
+ '-mssse3',
],
}],
[ 'OS == "mac"', {
@@ -428,10 +463,6 @@
},
}],
],
- 'sources': [
- 'base/yuv_convert_sse2.cc',
- 'base/simd/convert_rgb_to_yuv.cc',
- ],
'variables': {
'yasm_output_path': '<(SHARED_INTERMEDIATE_DIR)/media',
},
@@ -440,6 +471,18 @@
],
},
{
+ 'target_name': 'yuv_convert_simd_arm',  # Static library of YUV conversion sources for ARM builds.
+ 'type': 'static_library',
+ 'include_dirs': [
+ '..',
+ ],
+ 'sources': [  # Lists only the *_c.cc implementations and filter_yuv.h; no .asm sources.
+ 'base/simd/convert_yuv_to_rgb_c.cc',
+ 'base/simd/filter_yuv.h',
+ 'base/simd/filter_yuv_c.cc',
+ ],
+ },
+ {
'target_name': 'ffmpeg_unittests',
'type': 'executable',
'dependencies': [