summaryrefslogtreecommitdiffstats
path: root/media/base/simd
diff options
context:
space:
mode:
authorhclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-09-14 12:40:45 +0000
committerhclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-09-14 12:40:45 +0000
commitccde716550693ceb59cc8717d4f4d4845f23d853 (patch)
tree0022aff451d5665193835d1c508a1f3303ae9cc5 /media/base/simd
parent89a4d2f772aa92a79acedf057f4036820c1fd412 (diff)
downloadchromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.zip
chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.gz
chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.bz2
Resubmit - Rewrite color space conversions suite using YASM"
I'll watch the official buildbot this time. TBR=ajwong, dhollowa BUG=None TEST=None Review URL: http://codereview.chromium.org/7891039 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@101067 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base/simd')
-rw-r--r--media/base/simd/convert_rgb_to_yuv_x86.cc101
-rw-r--r--media/base/simd/convert_yuv_to_rgb.h150
-rw-r--r--media/base/simd/convert_yuv_to_rgb_c.cc155
-rw-r--r--media/base/simd/convert_yuv_to_rgb_mmx.asm22
-rw-r--r--media/base/simd/convert_yuv_to_rgb_mmx.inc119
-rw-r--r--media/base/simd/convert_yuv_to_rgb_sse.asm23
-rw-r--r--media/base/simd/convert_yuv_to_rgb_x86.cc71
-rw-r--r--media/base/simd/filter_yuv.h29
-rw-r--r--media/base/simd/filter_yuv_c.cc29
-rw-r--r--media/base/simd/filter_yuv_mmx.cc58
-rw-r--r--media/base/simd/filter_yuv_sse2.cc49
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_mmx.asm23
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_mmx.inc166
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm142
-rw-r--r--media/base/simd/linear_scale_yuv_to_rgb_sse.asm23
-rw-r--r--media/base/simd/scale_yuv_to_rgb_mmx.asm23
-rw-r--r--media/base/simd/scale_yuv_to_rgb_mmx.inc115
-rw-r--r--media/base/simd/scale_yuv_to_rgb_sse.asm23
-rw-r--r--media/base/simd/scale_yuv_to_rgb_sse2_x64.asm110
-rw-r--r--media/base/simd/x86inc.asm8
20 files changed, 1439 insertions, 0 deletions
diff --git a/media/base/simd/convert_rgb_to_yuv_x86.cc b/media/base/simd/convert_rgb_to_yuv_x86.cc
new file mode 100644
index 0000000..2bd6930
--- /dev/null
+++ b/media/base/simd/convert_rgb_to_yuv_x86.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/convert_rgb_to_yuv.h"
+
+#include "build/build_config.h"
+#include "media/base/cpu_features.h"
+#include "media/base/simd/convert_rgb_to_yuv_ssse3.h"
+
+namespace media {
+
+void ConvertRGB32ToYUV_SSSE3(const uint8* rgbframe,
+ uint8* yplane,
+ uint8* uplane,
+ uint8* vplane,
+ int width,
+ int height,
+ int rgbstride,
+ int ystride,
+ int uvstride) {
+#ifdef ENABLE_SUBSAMPLING
+ for (; height >= 2; height -= 2) {
+ ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertARGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#else
+ for (; height >= 2; height -= 2) {
+ ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#endif
+}
+
+void ConvertRGB24ToYUV_SSSE3(const uint8* rgbframe,
+ uint8* yplane,
+ uint8* uplane,
+ uint8* vplane,
+ int width,
+ int height,
+ int rgbstride,
+ int ystride,
+ int uvstride) {
+#ifdef ENABLE_SUBSAMPLING
+ for (; height >= 2; height -= 2) {
+ ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertRGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#else
+ for (; height >= 2; height -= 2) {
+ ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width);
+ rgbframe += rgbstride;
+ yplane += ystride;
+
+ uplane += uvstride;
+ vplane += uvstride;
+ }
+
+ if (height)
+ ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#endif
+}
+
+} // namespace media
diff --git a/media/base/simd/convert_yuv_to_rgb.h b/media/base/simd/convert_yuv_to_rgb.h
new file mode 100644
index 0000000..5f3df2c6
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
+#define MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
+
+#include "base/basictypes.h"
+#include "media/base/yuv_convert.h"
+
+namespace media {
+
+typedef void (*ConvertYUVToRGB32Proc)(const uint8*,
+ const uint8*,
+ const uint8*,
+ uint8*,
+ int,
+ int,
+ int,
+ int,
+ int,
+ YUVType);
+
+void ConvertYUVToRGB32_C(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+void ConvertYUVToRGB32_SSE(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+void ConvertYUVToRGB32_MMX(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type);
+
+} // namespace media
+
+// Assembly functions are declared without namespace.
+extern "C" {
+
+typedef void (*ConvertYUVToRGB32RowProc)(const uint8*,
+ const uint8*,
+ const uint8*,
+ uint8*,
+ int);
+typedef void (*ScaleYUVToRGB32RowProc)(const uint8*,
+ const uint8*,
+ const uint8*,
+ uint8*,
+ int,
+ int);
+
+void ConvertYUVToRGB32Row_C(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width);
+
+void ConvertYUVToRGB32Row_MMX(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width);
+
+void ConvertYUVToRGB32Row_SSE(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+void LinearScaleYUVToRGB32Row_MMX_X64(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx);
+
+}
+
+#endif // MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
diff --git a/media/base/simd/convert_yuv_to_rgb_c.cc b/media/base/simd/convert_yuv_to_rgb_c.cc
new file mode 100644
index 0000000..f8e70b2
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_c.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/convert_yuv_to_rgb.h"
+// TODO(hclam): Shouldn't depend on yuv_row.h.
+#include "media/base/yuv_row.h"
+
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+ (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+// Converts one YUV sample to a 32-bit pixel written at |rgb_buf|, laid out as
+// b in bits 0-7, g in bits 8-15, r in bits 16-23 and a in bits 24-31. The
+// per-channel terms come from the kCoefficientsRgbY lookup table.
+static inline void YUVPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf) {
+  // Sum the U (256+u), V (512+v) and Y (y) table rows with saturating adds.
+  int b = kCoefficientsRgbY[256+u][0];
+  int g = kCoefficientsRgbY[256+u][1];
+  int r = kCoefficientsRgbY[256+u][2];
+  int a = kCoefficientsRgbY[256+u][3];
+
+  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
+  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+
+  b = paddsw(b, kCoefficientsRgbY[y][0]);
+  g = paddsw(g, kCoefficientsRgbY[y][1]);
+  r = paddsw(r, kCoefficientsRgbY[y][2]);
+  a = paddsw(a, kCoefficientsRgbY[y][3]);
+
+  b >>= 6;
+  g >>= 6;
+  r >>= 6;
+  a >>= 6;
+
+  // Widen alpha before shifting: 255 << 24 does not fit in a signed int.
+  *reinterpret_cast<uint32*>(rgb_buf) =
+      packuswb(b) | (packuswb(g) << 8) | (packuswb(r) << 16) |
+      (static_cast<uint32>(packuswb(a)) << 24);
+}
+
+extern "C" {
+
+void ConvertYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width; x += 2) {
+ uint8 u = u_buf[x >> 1];
+ uint8 v = v_buf[x >> 1];
+ uint8 y0 = y_buf[x];
+ YUVPixel(y0, u, v, rgb_buf);
+ if ((x + 1) < width) {
+ uint8 y1 = y_buf[x + 1];
+ YUVPixel(y1, u, v, rgb_buf + 4);
+ }
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+}
+
+// 16.16 fixed point is used. A shift by 16 isolates the integer.
+// A shift by 17 is used to further subsample the chrominance channels.
+// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
+// for 1/65536 pixel accurate interpolation.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ int x = 0;
+ for (int i = 0; i < width; i += 2) {
+ int y = y_buf[x >> 16];
+ int u = u_buf[(x >> 17)];
+ int v = v_buf[(x >> 17)];
+ YUVPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y = y_buf[x >> 16];
+ YUVPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+}
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width,
+ int source_dx) {
+ int x = 0;
+ if (source_dx >= 0x20000) {
+ x = 32768;
+ }
+ for (int i = 0; i < width; i += 2) {
+ int y0 = y_buf[x >> 16];
+ int y1 = y_buf[(x >> 16) + 1];
+ int u0 = u_buf[(x >> 17)];
+ int u1 = u_buf[(x >> 17) + 1];
+ int v0 = v_buf[(x >> 17)];
+ int v1 = v_buf[(x >> 17) + 1];
+ int y_frac = (x & 65535);
+ int uv_frac = ((x >> 1) & 65535);
+ int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+ int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+ YUVPixel(y, u, v, rgb_buf);
+ x += source_dx;
+ if ((i + 1) < width) {
+ y0 = y_buf[x >> 16];
+ y1 = y_buf[(x >> 16) + 1];
+ y_frac = (x & 65535);
+ y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
+ YUVPixel(y, u, v, rgb_buf+4);
+ x += source_dx;
+ }
+ rgb_buf += 8;
+ }
+}
+
+}
+
+namespace media {
+
+void ConvertYUVToRGB32_C(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type;
+ for (int y = 0; y < height; ++y) {
+ uint8* rgb_row = rgbframe + y * rgbstride;
+ const uint8* y_ptr = yplane + y * ystride;
+ const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;
+ const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+ ConvertYUVToRGB32Row_C(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ width);
+ }
+}
+
+} // namespace media
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.asm b/media/base/simd/convert_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..e044474
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.asm
@@ -0,0 +1,22 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+ SECTION_TEXT
+ CPU MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; extern "C" void ConvertYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width);
+%define SYMBOL ConvertYUVToRGB32Row_MMX
+%include "convert_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..b9555ce
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc
@@ -0,0 +1,119 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+
+; Non-PIC code is the fastest so use this if possible.
+%ifndef PIC
+mangle(SYMBOL):
+ %assign stack_offset 0
+ PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV
+ extern mangle(kCoefficientsRgbY)
+ jmp .convertend
+
+.convertloop:
+ movzx TEMPUd, BYTE [Uq]
+ add Uq, 1
+ movzx TEMPVd, BYTE [Vq]
+ add Vq, 1
+ movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
+ movzx TEMPUd, BYTE [Yq]
+ paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
+ movzx TEMPVd, BYTE [Yq + 1]
+ movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
+ add Yq, 2
+ movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq]
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ MOVQ [ARGBq], mm1
+ add ARGBq, 8
+
+.convertend:
+ sub WIDTHq, 2
+ jns .convertloop
+
+ ; If number of pixels is odd then compute it.
+ and WIDTHq, 1
+ jz .convertdone
+
+ movzx TEMPUd, BYTE [Uq]
+ movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
+ movzx TEMPVd, BYTE [Vq]
+ paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
+ movzx TEMPUd, BYTE [Yq]
+ movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+
+.convertdone:
+ RET
+%endif
+
+; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
+; This code is slower than the above version.
+%ifdef PIC
+mangle(SYMBOL):
+ %assign stack_offset 0
+ PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE
+
+ extern mangle(kCoefficientsRgbY)
+ LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
+
+ jmp .convertend
+
+.convertloop:
+ movzx TEMPd, BYTE [Uq]
+ movq mm0, [TABLEq + 2048 + 8 * TEMPq]
+ add Uq, 1
+
+ movzx TEMPd, BYTE [Vq]
+ paddsw mm0, [TABLEq + 4096 + 8 * TEMPq]
+ add Vq, 1
+
+ movzx TEMPd, BYTE [Yq]
+ movq mm1, [TABLEq + 8 * TEMPq]
+
+ movzx TEMPd, BYTE [Yq + 1]
+ movq mm2, [TABLEq + 8 * TEMPq]
+ add Yq, 2
+
+ ; Add UV components to Y component.
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+
+ ; Down shift and then pack.
+ psraw mm1, 6
+ psraw mm2, 6
+ packuswb mm1, mm2
+ MOVQ [ARGBq], mm1
+ add ARGBq, 8
+
+.convertend:
+ sub WIDTHq, 2
+ jns .convertloop
+
+ ; If number of pixels is odd then compute it.
+ and WIDTHq, 1
+ jz .convertdone
+
+ movzx TEMPd, BYTE [Uq]
+ movq mm0, [TABLEq + 2048 + 8 * TEMPq]
+ movzx TEMPd, BYTE [Vq]
+ paddsw mm0, [TABLEq + 4096 + 8 * TEMPq]
+ movzx TEMPd, BYTE [Yq]
+ movq mm1, [TABLEq + 8 * TEMPq]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+
+.convertdone:
+ RET
+%endif
diff --git a/media/base/simd/convert_yuv_to_rgb_sse.asm b/media/base/simd/convert_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..2f1967a
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+ SECTION_TEXT
+ CPU MMX, SSE
+
+; Use the SSE instruction movntq, which can write faster.
+%define MOVQ movntq
+
+;
+; extern "C" void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width);
+%define SYMBOL ConvertYUVToRGB32Row_SSE
+%include "convert_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/convert_yuv_to_rgb_x86.cc b/media/base/simd/convert_yuv_to_rgb_x86.cc
new file mode 100644
index 0000000..3e03ef9
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_x86.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#endif
+
+#include "media/base/cpu_features.h"
+#include "media/base/simd/convert_yuv_to_rgb.h"
+#include "media/base/yuv_convert.h"
+
+namespace media {
+
+void ConvertYUVToRGB32_MMX(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type;
+ for (int y = 0; y < height; ++y) {
+ uint8* rgb_row = rgbframe + y * rgbstride;
+ const uint8* y_ptr = yplane + y * ystride;
+ const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;
+ const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+ ConvertYUVToRGB32Row_MMX(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ width);
+ }
+
+ _mm_empty();
+}
+
+void ConvertYUVToRGB32_SSE(const uint8* yplane,
+ const uint8* uplane,
+ const uint8* vplane,
+ uint8* rgbframe,
+ int width,
+ int height,
+ int ystride,
+ int uvstride,
+ int rgbstride,
+ YUVType yuv_type) {
+ unsigned int y_shift = yuv_type;
+ for (int y = 0; y < height; ++y) {
+ uint8* rgb_row = rgbframe + y * rgbstride;
+ const uint8* y_ptr = yplane + y * ystride;
+ const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;
+ const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+ ConvertYUVToRGB32Row_SSE(y_ptr,
+ u_ptr,
+ v_ptr,
+ rgb_row,
+ width);
+ }
+
+ _mm_empty();
+}
+
+} // namespace media
diff --git a/media/base/simd/filter_yuv.h b/media/base/simd/filter_yuv.h
new file mode 100644
index 0000000..5a9cf11
--- /dev/null
+++ b/media/base/simd/filter_yuv.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_SIMD_FILTER_YUV_H_
+#define MEDIA_BASE_SIMD_FILTER_YUV_H_
+
+#include "base/basictypes.h"
+
+namespace media {
+
+typedef void (*FilterYUVRowsProc)(uint8*,
+ const uint8*,
+ const uint8*,
+ int,
+ int);
+
+void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+
+void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+
+void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction);
+
+} // namespace media
+
+#endif // MEDIA_BASE_SIMD_FILTER_YUV_H_
diff --git a/media/base/simd/filter_yuv_c.cc b/media/base/simd/filter_yuv_c.cc
new file mode 100644
index 0000000..95ae01a
--- /dev/null
+++ b/media/base/simd/filter_yuv_c.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ uint8* end = ybuf + source_width;
+ do {
+ ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
+ ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+ ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+ ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+ ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
+ ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+ ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
+ ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
+ y0_ptr += 8;
+ y1_ptr += 8;
+ ybuf += 8;
+ } while (ybuf < end);
+}
+
+} // namespace media
diff --git a/media/base/simd/filter_yuv_mmx.cc b/media/base/simd/filter_yuv_mmx.cc
new file mode 100644
index 0000000..77698dc
--- /dev/null
+++ b/media/base/simd/filter_yuv_mmx.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "build/build_config.h"
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+#if defined(COMPILER_MSVC)
+// Warning 4799 is about calling emms before the function exits.
+// We call emms at the frame level, so suppress this warning.
+#pragma warning(disable: 4799)
+#endif
+
+void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m64 zero = _mm_setzero_si64();
+ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
+
+ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+ __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+ do {
+ __m64 y0 = *y0_ptr64++;
+ __m64 y1 = *y1_ptr64++;
+ __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+ __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+ y0 = _mm_unpacklo_pi8(y0, zero);
+ y1 = _mm_unpacklo_pi8(y1, zero);
+ y0 = _mm_mullo_pi16(y0, y0_fraction);
+ y1 = _mm_mullo_pi16(y1, y1_fraction);
+ y2 = _mm_mullo_pi16(y2, y0_fraction);
+ y3 = _mm_mullo_pi16(y3, y1_fraction);
+ y0 = _mm_add_pi16(y0, y1);
+ y2 = _mm_add_pi16(y2, y3);
+ y0 = _mm_srli_pi16(y0, 8);
+ y2 = _mm_srli_pi16(y2, 8);
+ y0 = _mm_packs_pu16(y0, y2);
+ *dest64++ = y0;
+ } while (dest64 < end64);
+}
+
+#if defined(COMPILER_MSVC)
+#pragma warning(default: 4799)
+#endif
+
+} // namespace media
diff --git a/media/base/simd/filter_yuv_sse2.cc b/media/base/simd/filter_yuv_sse2.cc
new file mode 100644
index 0000000..137ac94
--- /dev/null
+++ b/media/base/simd/filter_yuv_sse2.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+ int source_width, int source_y_fraction) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
+
+ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+ do {
+ __m128i y0 = _mm_loadu_si128(y0_ptr128);
+ __m128i y1 = _mm_loadu_si128(y1_ptr128);
+ __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+ __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+ y0 = _mm_unpacklo_epi8(y0, zero);
+ y1 = _mm_unpacklo_epi8(y1, zero);
+ y0 = _mm_mullo_epi16(y0, y0_fraction);
+ y1 = _mm_mullo_epi16(y1, y1_fraction);
+ y2 = _mm_mullo_epi16(y2, y0_fraction);
+ y3 = _mm_mullo_epi16(y3, y1_fraction);
+ y0 = _mm_add_epi16(y0, y1);
+ y2 = _mm_add_epi16(y2, y3);
+ y0 = _mm_srli_epi16(y0, 8);
+ y2 = _mm_srli_epi16(y2, 8);
+ y0 = _mm_packus_epi16(y0, y2);
+ *dest128++ = y0;
+ ++y0_ptr128;
+ ++y1_ptr128;
+ } while (dest128 < end128);
+}
+
+} // namespace media
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..7f7e0e8
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+ SECTION_TEXT
+ CPU MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_MMX
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..91c06a5
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,166 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+
+mangle(SYMBOL):
+ %assign stack_offset 0
+
+ extern mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
+
+%if gprsize == 8
+%define WORD_SIZE QWORD
+%else
+%define WORD_SIZE DWORD
+%endif
+
+; Define register aliases.
+%define Xq R1q ; Current X position
+%define COMPLq R2q ; Component A value
+%define COMPLd R2d ; Component A value
+%define U_ARG_REGq R0q ; U plane address argument
+%define V_ARG_REGq R1q ; V plane address argument
+%define SOURCE_DX_ARG_REGq R3q ; Source dx argument
+%define WIDTH_ARG_REGq R2q ; Width argument
+
+%ifdef PIC
+; PIC code shares one register between COMPR, U and V, so the code must be
+; careful not to mix them up. This allows R3q to be used for the YUV table.
+%define COMPRq R0q ; Component B value
+%define COMPRd R0d ; Component B value
+%define Uq R0q ; U plane address
+%define Vq R0q ; V plane address
+%define U_PLANE WORD_SIZE [rsp + 3 * gprsize]
+%define TABLE R3q ; Address of the table
+%else
+; Non-PIC code defines.
+%define COMPRq R3q ; Component B value
+%define COMPRd R3d ; Component B value
+%define Uq R0q ; U plane address
+%define Vq R3q ; V plane address
+%define TABLE mangle(kCoefficientsRgbY)
+%endif
+
+; Defines for stack variables. These are used in both PIC and non-PIC code.
+%define V_PLANE WORD_SIZE [rsp + 2 * gprsize]
+%define SOURCE_DX WORD_SIZE [rsp + gprsize]
+%define SOURCE_WIDTH WORD_SIZE [rsp]
+
+; Handle stack variables differently for PIC and non-PIC code.
+
+%ifdef PIC
+; Define stack usage for PIC code. PIC code pushes the U plane onto the stack.
+ PUSH U_ARG_REGq
+ PUSH V_ARG_REGq
+ PUSH SOURCE_DX_ARG_REGq
+ imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx
+ PUSH WIDTH_ARG_REGq
+
+; Load the address of kCoefficientsRgbY into TABLE
+ mov TEMPq, SOURCE_DX_ARG_REGq ; Need to save source_dx first
+ LOAD_SYM TABLE, mangle(kCoefficientsRgbY)
+%define SOURCE_DX_ARG_REGq TEMPq ; Overwrite SOURCE_DX_ARG_REGq to TEMPq
+%else
+; Define stack usage. Non-PIC code just pushes 3 registers onto the stack.
+ PUSH V_ARG_REGq
+ PUSH SOURCE_DX_ARG_REGq
+ imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx
+ PUSH WIDTH_ARG_REGq
+%endif
+
+%macro EPILOGUE 0
+%ifdef PIC
+ ADD rsp, 4 * gprsize
+%else
+ ADD rsp, 3 * gprsize
+%endif
+%endmacro
+
+ xor Xq, Xq ; x = 0
+ cmp SOURCE_DX_ARG_REGq, 0x20000
+ jl .lscaleend
+ mov Xq, 0x8000 ; x = 0.5 for 1/2 or less
+ jmp .lscaleend
+
+.lscaleloop:
+%ifdef PIC
+ mov Uq, U_PLANE ; PIC code saves U_PLANE on stack.
+%endif
+
+; Define macros for scaling YUV components since they are reused.
+%macro SCALEUV 1
+ mov TEMPq, Xq
+ sar TEMPq, 0x11
+ movzx COMPLd, BYTE [%1 + TEMPq]
+ movzx COMPRd, BYTE [%1 + TEMPq + 1]
+ mov TEMPq, Xq
+ and TEMPq, 0x1fffe
+ imul COMPRq, TEMPq
+ xor TEMPq, 0x1fffe
+ imul COMPLq, TEMPq
+ add COMPLq, COMPRq
+ shr COMPLq, 17
+%endmacro
+ SCALEUV Uq ; Use the above macro to scale U
+ movq mm0, [TABLE + 2048 + 8 * COMPLq]
+
+ mov Vq, V_PLANE ; Read V address from stack
+ SCALEUV Vq ; Use the above macro to scale V
+ paddsw mm0, [TABLE + 4096 + 8 * COMPLq]
+
+%macro SCALEY 0
+ mov TEMPq, Xq
+ sar TEMPq, 0x10
+ movzx COMPLd, BYTE [Yq + TEMPq]
+ movzx COMPRd, BYTE [Yq + TEMPq + 1]
+ mov TEMPq, Xq
+ add Xq, SOURCE_DX ; Add source_dx from stack
+ and TEMPq, 0xffff
+ imul COMPRq, TEMPq
+ xor TEMPq, 0xffff
+ imul COMPLq, TEMPq
+ add COMPLq, COMPRq
+ shr COMPLq, 16
+%endmacro
+ SCALEY ; Use the above macro to scale Y1
+ movq mm1, [TABLE + 8 * COMPLq]
+
+ cmp Xq, SOURCE_WIDTH ; Compare source_width from stack
+ jge .lscalelastpixel
+
+ SCALEY ; Use the above macro to scale Y2
+ movq mm2, [TABLE + 8 * COMPLq]
+
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 0x6
+ psraw mm2, 0x6
+ packuswb mm1, mm2
+ MOVQ [ARGBq], mm1
+ add ARGBq, 0x8
+
+.lscaleend:
+ cmp Xq, SOURCE_WIDTH ; Compare source_width from stack
+ jl .lscaleloop
+ EPILOGUE
+ RET
+
+.lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+ EPILOGUE
+ RET
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm
new file mode 100644
index 0000000..db7854457
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm
@@ -0,0 +1,142 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+ SECTION_TEXT
+ CPU MMX
+
+%define SYMBOL LinearScaleYUVToRGB32Row_MMX_X64
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+
+mangle(SYMBOL):
+ %assign stack_offset 0
+ extern mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMPL
+
+%define TABLEq r10
+%define Xq r11
+%define INDEXq r12
+%define COMPRd r13d
+%define COMPRq r13
+%define FRACTIONq r14
+
+ PUSH TABLEq
+ PUSH Xq
+ PUSH INDEXq
+ PUSH COMPRq
+ PUSH FRACTIONq
+
+%macro EPILOGUE 0
+ POP FRACTIONq
+ POP COMPRq
+ POP INDEXq
+ POP Xq
+ POP TABLEq
+%endmacro
+
+ LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
+
+ imul WIDTHq, SOURCE_DXq ; source_width = width * source_dx
+ xor Xq, Xq ; x = 0
+ cmp SOURCE_DXq, 0x20000
+ jl .lscaleend
+ mov Xq, 0x8000 ; x = 0.5 for 1/2 or less
+ jmp .lscaleend
+
+.lscaleloop:
+ ; Interpolate U
+ mov INDEXq, Xq
+ sar INDEXq, 0x11
+ movzx COMPLd, BYTE [Uq + INDEXq]
+ movzx COMPRd, BYTE [Uq + INDEXq + 1]
+ mov FRACTIONq, Xq
+ and FRACTIONq, 0x1fffe
+ imul COMPRq, FRACTIONq
+ xor FRACTIONq, 0x1fffe
+ imul COMPLq, FRACTIONq
+ add COMPLq, COMPRq
+ shr COMPLq, 17
+ movq mm0, [TABLEq + 2048 + 8 * COMPLq]
+
+ ; Interpolate V
+ movzx COMPLd, BYTE [Vq + INDEXq]
+ movzx COMPRd, BYTE [Vq + INDEXq + 1]
+ ; Trick here to imul COMPL first then COMPR.
+ ; Saves two instructions. :)
+ imul COMPLq, FRACTIONq
+ xor FRACTIONq, 0x1fffe
+ imul COMPRq, FRACTIONq
+ add COMPLq, COMPRq
+ shr COMPLq, 17
+ paddsw mm0, [TABLEq + 4096 + 8 * COMPLq]
+
+ ; Interpolate first Y1.
+ lea INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel.
+ ; Xq points to current pixel.
+ mov FRACTIONq, Xq
+ sar Xq, 0x10
+ movzx COMPLd, BYTE [Yq + Xq]
+ movzx COMPRd, BYTE [Yq + Xq + 1]
+ and FRACTIONq, 0xffff
+ imul COMPRq, FRACTIONq
+ xor FRACTIONq, 0xffff
+ imul COMPLq, FRACTIONq
+ add COMPLq, COMPRq
+ shr COMPLq, 16
+ movq mm1, [TABLEq + 8 * COMPLq]
+
+ ; Interpolate Y2 if available.
+ cmp INDEXq, WIDTHq
+ jge .lscalelastpixel
+
+ lea Xq, [INDEXq + SOURCE_DXq] ; Xq points to next pixel.
+ ; INDEXq points to current pixel.
+ mov FRACTIONq, INDEXq
+ sar INDEXq, 0x10
+ movzx COMPLd, BYTE [Yq + INDEXq]
+ movzx COMPRd, BYTE [Yq + INDEXq + 1]
+ and FRACTIONq, 0xffff
+ imul COMPRq, FRACTIONq
+ xor FRACTIONq, 0xffff
+ imul COMPLq, FRACTIONq
+ add COMPLq, COMPRq
+ shr COMPLq, 16
+ movq mm2, [TABLEq + 8 * COMPLq]
+
+ paddsw mm1, mm0
+ paddsw mm2, mm0
+ psraw mm1, 0x6
+ psraw mm2, 0x6
+ packuswb mm1, mm2
+ movntq [ARGBq], mm1
+ add ARGBq, 0x8
+
+.lscaleend:
+ cmp Xq, WIDTHq
+ jl .lscaleloop
+ jmp .epilogue
+
+.lscalelastpixel:
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd [ARGBq], mm1
+
+.epilogue
+ EPILOGUE
+ RET
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_sse.asm b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..847911c
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions. It contains no instructions of its
+; own: it sets MOVQ and SYMBOL, then includes linear_scale_yuv_to_rgb_mmx.inc.
+ SECTION_TEXT
+ CPU MMX, SSE
+
+; Define MOVQ as movntq (the SSE non-temporal 64-bit store), not plain movq.
+%define MOVQ movntq
+
+; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_SSE
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.asm b/media/base/simd/scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..6a83757
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions. It contains no instructions of its own:
+; it sets MOVQ and SYMBOL, then includes scale_yuv_to_rgb_mmx.inc.
+ SECTION_TEXT
+ CPU MMX
+
+; MOVQ is the store the included code uses to write the output: plain movq.
+%define MOVQ movq
+
+; void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_MMX
+%include "scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..94c101c
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,115 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+; SYMBOL (exported name) and MOVQ (output store) come from the including file.
+mangle(SYMBOL):
+ %assign stack_offset 0
+
+ extern mangle(kCoefficientsRgbY)
+
+; Converts one row, two output pixels per loop pass. Parameters, in order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame (output; 4 bytes are written per pixel)
+; 5. Width (number of output pixels)
+; 6. Source dx (added to the source position X once per output pixel)
+; X >> 16 indexes the Y plane; X >> 17 indexes the U and V planes.
+PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
+
+%ifdef ARCH_X86_64
+%define WORD_SIZE QWORD
+%else
+%define WORD_SIZE DWORD
+%endif
+
+%ifdef PIC
+ PUSH R1q ; Width (spilled: R1 is reused for the table address below)
+%endif
+ PUSH R2q ; Source dx (spilled: R2 is reused as the position Xq)
+
+%define SOURCE_DX WORD_SIZE [rsp]
+
+; PIC code: R1 holds the table address; Width is the stack slot above Source dx.
+%ifdef PIC
+ LOAD_SYM R1q, mangle(kCoefficientsRgbY)
+%define WIDTH WORD_SIZE [rsp + gprsize]
+%define TABLE R1q
+%define Xq R2q
+
+; Non-PIC code: Width stays in R1 and the table is addressed by its symbol.
+%else
+%define WIDTH R1q
+%define TABLE mangle(kCoefficientsRgbY)
+%define Xq R2q
+%endif
+
+ ; Set Xq index to 0.
+ xor Xq, Xq
+ jmp .scaleend ; Enter at the width test so a width of 0 writes nothing.
+
+.scaleloop:
+ ; TABLE can either be a register or a symbol depending on whether this is
+ ; PIC or not.
+ mov TEMPq, Xq
+ sar TEMPq, 17 ; U/V index = X >> 17
+ movzx TEMPd, BYTE [Uq + TEMPq]
+ movq mm0, [TABLE + 2048 + 8 * TEMPq] ; 8-byte entry for U at table offset 2048
+ mov TEMPq, Xq
+ sar TEMPq, 17
+ movzx TEMPd, BYTE [Vq + TEMPq]
+ paddsw mm0, [TABLE + 4096 + 8 * TEMPq] ; mm0 = U entry + V entry (saturating add)
+ mov TEMPq, Xq
+ add Xq, SOURCE_DX ; Advance X; TEMPq keeps the position of the first pixel.
+ sar TEMPq, 16 ; Y index = X >> 16
+ movzx TEMPd, BYTE [Yq + TEMPq]
+ movq mm1, [TABLE + 8 * TEMPq] ; 8-byte entry for the first Y at table offset 0
+ mov TEMPq, Xq
+ add Xq, SOURCE_DX ; Advance X again; TEMPq keeps the second pixel position.
+ sar TEMPq, 16
+ movzx TEMPd, BYTE [Yq + TEMPq]
+ movq mm2, [TABLE + 8 * TEMPq] ; Entry for the second Y
+ paddsw mm1, mm0 ; Both pixels reuse the same U + V sum.
+ paddsw mm2, mm0
+ psraw mm1, 6 ; Arithmetic right shift of each 16-bit lane by 6.
+ psraw mm2, 6
+ packuswb mm1, mm2 ; Saturate lanes to unsigned bytes: two 4-byte pixels.
+ MOVQ QWORD [ARGBq], mm1 ; MOVQ is defined by the including file (movq or movntq).
+ add ARGBq, 8
+
+.scaleend:
+ ; WIDTH can either be a register or memory depending on whether this is
+ ; PIC or not.
+ sub WIDTH, 2 ; Two pixels per pass; loop while the result is non-negative.
+ jns .scaleloop
+
+ and WIDTH, 1 ; WIDTH is now -1 (one pixel left) or -2 (none); test bit 0.
+ jz .scaledone
+
+ mov TEMPq, Xq ; Last pixel: same lookups as above for a single Y.
+ sar TEMPq, 17
+ movzx TEMPd, BYTE [Uq + TEMPq]
+ movq mm0, [TABLE + 2048 + 8 * TEMPq]
+ mov TEMPq, Xq
+ sar TEMPq, 17
+ movzx TEMPd, BYTE [Vq + TEMPq]
+ paddsw mm0, [TABLE + 4096 + 8 * TEMPq]
+ mov TEMPq, Xq
+ sar TEMPq, 16
+ movzx TEMPd, BYTE [Yq + TEMPq]
+ movq mm1, [TABLE + 8 * TEMPq]
+ paddsw mm1, mm0
+ psraw mm1, 6
+ packuswb mm1, mm1
+ movd DWORD [ARGBq], mm1 ; Store only the single 4-byte pixel.
+
+.scaledone: ; Drop the stack slots pushed after PROLOGUE and return.
+%ifdef PIC
+ ADD rsp, 2 * gprsize
+%else
+ ADD rsp, gprsize
+%endif
+ RET
diff --git a/media/base/simd/scale_yuv_to_rgb_sse.asm b/media/base/simd/scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..5b849a6
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions. It contains no instructions of its
+; own: it sets MOVQ and SYMBOL, then includes scale_yuv_to_rgb_mmx.inc.
+ SECTION_TEXT
+ CPU MMX, SSE
+
+; MOVQ is the output store of the included code: movntq (SSE non-temporal store).
+%define MOVQ movntq
+
+; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE
+%include "scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
new file mode 100644
index 0000000..5e58146
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
@@ -0,0 +1,110 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses SSE2 instructions on XMM registers.
+;
+ SECTION_TEXT
+ CPU SSE2
+
+; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+; const uint8* u_buf,
+; const uint8* v_buf,
+; uint8* rgb_buf,
+; int width,
+; int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64
+
+ global mangle(SYMBOL) PRIVATE
+ align function_align
+
+mangle(SYMBOL):
+ %assign stack_offset 0
+ extern mangle(kCoefficientsRgbY)
+
+; Converts one row, two output pixels per loop pass. Parameters, in order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame (output; 4 bytes are written per pixel)
+; 5. Width (number of output pixels)
+; 6. Source dx (added to the source position X once per output pixel)
+; NOTE(review): WIDTHq and SOURCE_DXq are used as full 64-bit registers; confirm the upper 32 bits are well defined for these arguments.
+PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMP
+
+%define TABLEq r10
+%define Xq r11
+%define INDEXq r12
+ PUSH r10 ; r10-r12 are saved here and restored at .scaledone.
+ PUSH r11 ; NOTE(review): r10/r11 look caller-saved in the x86-64 ABIs - confirm the save is needed.
+ PUSH r12
+
+ LOAD_SYM TABLEq, mangle(kCoefficientsRgbY)
+
+ ; Set Xq index to 0.
+ xor Xq, Xq
+ jmp .scaleend ; Enter at the width test so a width of 0 writes nothing.
+
+.scaleloop:
+ ; Read UV pixels (both at index X >> 17).
+ mov INDEXq, Xq
+ sar INDEXq, 17
+ movzx COMPd, BYTE [Uq + INDEXq]
+ movq xmm0, [TABLEq + 2048 + 8 * COMPq] ; 8-byte entry for U at table offset 2048
+ movzx COMPd, BYTE [Vq + INDEXq]
+ movq xmm1, [TABLEq + 4096 + 8 * COMPq] ; 8-byte entry for V at table offset 4096
+
+ ; Read first Y pixel.
+ lea INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel.
+ sar Xq, 16 ; Xq becomes the Y index; it is rebuilt from INDEXq below.
+ movzx COMPd, BYTE [Yq + Xq]
+ paddsw xmm0, xmm1 ; Hide an ADD after memory load.
+ movq xmm1, [TABLEq + 8 * COMPq] ; 8-byte entry for Y at table offset 0
+
+ ; Read next Y pixel.
+ lea Xq, [INDEXq + SOURCE_DXq] ; Xq now points to next pixel.
+ sar INDEXq, 16
+ movzx COMPd, BYTE [Yq + INDEXq]
+ movq xmm2, [TABLEq + 8 * COMPq]
+ paddsw xmm1, xmm0 ; Both pixels reuse the same U + V sum.
+ paddsw xmm2, xmm0
+ shufps xmm1, xmm2, 0x44 ; Low qword = first pixel words; high qword = second pixel words.
+ psraw xmm1, 6 ; Arithmetic right shift of each 16-bit lane by 6.
+ packuswb xmm1, xmm1 ; Saturate the 8 words to bytes; the low 8 bytes hold both pixels.
+ movq QWORD [ARGBq], xmm1 ; Store two 4-byte pixels.
+ add ARGBq, 8
+
+.scaleend:
+ sub WIDTHq, 2 ; Two pixels per pass; loop while the result is non-negative.
+ jns .scaleloop
+
+ and WIDTHq, 1 ; WIDTHq is now -1 (one pixel left) or -2 (none); test bit 0.
+ jz .scaledone
+
+ ; Read U V components.
+ mov INDEXq, Xq
+ sar INDEXq, 17
+ movzx COMPd, BYTE [Uq + INDEXq]
+ movq xmm0, [TABLEq + 2048 + 8 * COMPq]
+ movzx COMPd, BYTE [Vq + INDEXq]
+ movq xmm1, [TABLEq + 4096 + 8 * COMPq]
+ paddsw xmm0, xmm1
+
+ ; Read one Y component.
+ mov INDEXq, Xq
+ sar INDEXq, 16
+ movzx COMPd, BYTE [Yq + INDEXq]
+ movq xmm1, [TABLEq + 8 * COMPq]
+ paddsw xmm1, xmm0
+ psraw xmm1, 6
+ packuswb xmm1, xmm1
+ movd DWORD [ARGBq], xmm1 ; Store only the single 4-byte pixel.
+
+.scaledone: ; Restore r12, r11, r10 in reverse push order and return.
+ POP r12
+ POP r11
+ POP r10
+ RET
diff --git a/media/base/simd/x86inc.asm b/media/base/simd/x86inc.asm
index 956b999..5e0ca20 100644
--- a/media/base/simd/x86inc.asm
+++ b/media/base/simd/x86inc.asm
@@ -95,11 +95,14 @@
%ifdef WIN64
%define PIC
%elifndef ARCH_X86_64
+; For Chromium we may build PIC code even on 32-bit systems.
+%ifndef CHROMIUM
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
+%endif
%ifdef PIC
default rel
%endif
@@ -947,6 +950,11 @@ AVX_INSTR pfmul, 1, 0
;=============================================================================
%ifdef CHROMIUM
+; Always build PIC code on Mac for Chromium.
+%ifdef MACHO
+%define PIC
+%endif
+
;
; LOAD_SYM %1 (reg), %2 (sym)
; Copies the address to a local symbol to the specified register.