Resubmit - Rewrite color space conversions suite using YASM

I'll watch the official buildbot this time. TBR=ajwong, dhollowa BUG=None TEST=None Review URL: http://codereview.chromium.org/7891039 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@101067 0039d316-1c4b-4281-b951-d872f2087c98
author: hclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-14 12:40:45 +0000
committer: hclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-14 12:40:45 +0000
commit: ccde716550693ceb59cc8717d4f4d4845f23d853 (patch)
tree: 0022aff451d5665193835d1c508a1f3303ae9cc5 /media/base/simd
parent: 89a4d2f772aa92a79acedf057f4036820c1fd412 (diff)
download: chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.zip
chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.gz
chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.bz2
20 files changed, 1439 insertions, 0 deletions
diff --git a/media/base/simd/convert_rgb_to_yuv_x86.cc b/media/base/simd/convert_rgb_to_yuv_x86.cc
new file mode 100644
index 0000000..2bd6930
--- /dev/null
+++ b/media/base/simd/convert_rgb_to_yuv_x86.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/convert_rgb_to_yuv.h"
+
+#include "build/build_config.h"
+#include "media/base/cpu_features.h"
+#include "media/base/simd/convert_rgb_to_yuv_ssse3.h"
+
+namespace media {
+
+void ConvertRGB32ToYUV_SSSE3(const uint8* rgbframe,  // Advanced by rgbstride after every row.
+                             uint8* yplane,  // Advanced by ystride after every row.
+                             uint8* uplane,  // Advanced by uvstride after every pair of rows.
+                             uint8* vplane,  // Advanced by uvstride after every pair of rows.
+                             int width,  // Forwarded unchanged to the row routines.
+                             int height,  // Number of rows; consumed as the loop counter.
+                             int rgbstride,
+                             int ystride,
+                             int uvstride) {
+#ifdef ENABLE_SUBSAMPLING
+  for (; height >= 2; height -= 2) {  // Process rows two at a time.
+    ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);  // First row of the pair.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    ConvertARGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width);  // Second row; same U/V row pointers.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    uplane += uvstride;  // U/V advance once per two source rows.
+    vplane += uvstride;
+  }
+
+  if (height)  // Nonzero here means one row is left over (odd height, assuming height >= 0).
+    ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#else
+  for (; height >= 2; height -= 2) {  // Process rows two at a time.
+    ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);  // First row receives the U/V pointers.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width);  // Second row is called with NULL U/V pointers.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    uplane += uvstride;  // U/V advance once per two source rows.
+    vplane += uvstride;
+  }
+
+  if (height)  // Nonzero here means one row is left over (odd height, assuming height >= 0).
+    ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#endif
+}
+
+void ConvertRGB24ToYUV_SSSE3(const uint8* rgbframe,  // Advanced by rgbstride after every row.
+                             uint8* yplane,  // Advanced by ystride after every row.
+                             uint8* uplane,  // Advanced by uvstride after every pair of rows.
+                             uint8* vplane,  // Advanced by uvstride after every pair of rows.
+                             int width,  // Forwarded unchanged to the row routines.
+                             int height,  // Number of rows; consumed as the loop counter.
+                             int rgbstride,
+                             int ystride,
+                             int uvstride) {
+#ifdef ENABLE_SUBSAMPLING
+  for (; height >= 2; height -= 2) {  // Process rows two at a time.
+    ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);  // First row of the pair.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    ConvertRGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width);  // Second row; same U/V row pointers.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    uplane += uvstride;  // U/V advance once per two source rows.
+    vplane += uvstride;
+  }
+
+  if (height)  // Nonzero here means one row is left over (odd height, assuming height >= 0).
+    ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#else
+  for (; height >= 2; height -= 2) {  // Process rows two at a time.
+    ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);  // First row receives the U/V pointers.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width);  // Second row is called with NULL U/V pointers.
+    rgbframe += rgbstride;
+    yplane += ystride;
+
+    uplane += uvstride;  // U/V advance once per two source rows.
+    vplane += uvstride;
+  }
+
+  if (height)  // Nonzero here means one row is left over (odd height, assuming height >= 0).
+    ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width);
+#endif
+}
+
+}  // namespace media
diff --git a/media/base/simd/convert_yuv_to_rgb.h b/media/base/simd/convert_yuv_to_rgb.h
new file mode 100644
index 0000000..5f3df2c6
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
+#define MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
+
+#include "base/basictypes.h"
+#include "media/base/yuv_convert.h"
+
+namespace media {
+
+typedef void (*ConvertYUVToRGB32Proc)(const uint8*,
+                                      const uint8*,
+                                      const uint8*,
+                                      uint8*,
+                                      int,
+                                      int,
+                                      int,
+                                      int,
+                                      int,
+                                      YUVType);
+
+void ConvertYUVToRGB32_C(const uint8* yplane,
+                         const uint8* uplane,
+                         const uint8* vplane,
+                         uint8* rgbframe,
+                         int width,
+                         int height,
+                         int ystride,
+                         int uvstride,
+                         int rgbstride,
+                         YUVType yuv_type);
+
+void ConvertYUVToRGB32_SSE(const uint8* yplane,
+                           const uint8* uplane,
+                           const uint8* vplane,
+                           uint8* rgbframe,
+                           int width,
+                           int height,
+                           int ystride,
+                           int uvstride,
+                           int rgbstride,
+                           YUVType yuv_type);
+
+void ConvertYUVToRGB32_MMX(const uint8* yplane,
+                           const uint8* uplane,
+                           const uint8* vplane,
+                           uint8* rgbframe,
+                           int width,
+                           int height,
+                           int ystride,
+                           int uvstride,
+                           int rgbstride,
+                           YUVType yuv_type);
+
+}  // namespace media
+
+// Assembly functions are declared without namespace.
+extern "C" {
+
+typedef void (*ConvertYUVToRGB32RowProc)(const uint8*,
+                                          const uint8*,
+                                          const uint8*,
+                                          uint8*,
+                                          int);
+typedef void (*ScaleYUVToRGB32RowProc)(const uint8*,
+                                       const uint8*,
+                                       const uint8*,
+                                       uint8*,
+                                       int,
+                                       int);
+
+void ConvertYUVToRGB32Row_C(const uint8* yplane,
+                            const uint8* uplane,
+                            const uint8* vplane,
+                            uint8* rgbframe,
+                            int width);
+
+void ConvertYUVToRGB32Row_MMX(const uint8* yplane,
+                              const uint8* uplane,
+                              const uint8* vplane,
+                              uint8* rgbframe,
+                              int width);
+
+void ConvertYUVToRGB32Row_SSE(const uint8* yplane,
+                              const uint8* uplane,
+                              const uint8* vplane,
+                              uint8* rgbframe,
+                              int width);
+
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width,
+                          int source_dx);
+
+void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx);
+
+void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width,
+                            int source_dx);
+
+void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width,
+                                 int source_dx);
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,
+                                int width,
+                                int source_dx);
+
+void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx);
+
+void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width,
+                                  int source_dx);
+
+void LinearScaleYUVToRGB32Row_MMX_X64(const uint8* y_buf,
+                                      const uint8* u_buf,
+                                      const uint8* v_buf,
+                                      uint8* rgb_buf,
+                                      int width,
+                                      int source_dx);
+
+}
+
+#endif  // MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_
diff --git a/media/base/simd/convert_yuv_to_rgb_c.cc b/media/base/simd/convert_yuv_to_rgb_c.cc
new file mode 100644
index 0000000..f8e70b2
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_c.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/convert_yuv_to_rgb.h"
+// TODO(hclam): Shouldn't depend on yuv_row.h.
+#include "media/base/yuv_row.h"
+
+#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
+#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
+    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+
+static inline void YUVPixel(uint8 y,  // Converts one (y, u, v) triple into a 4-byte pixel at rgb_buf.
+                            uint8 u,
+                            uint8 v,
+                            uint8* rgb_buf) {
+
+  int b = kCoefficientsRgbY[256+u][0];  // U rows of the table start at index 256.
+  int g = kCoefficientsRgbY[256+u][1];
+  int r = kCoefficientsRgbY[256+u][2];
+  int a = kCoefficientsRgbY[256+u][3];
+
+  b = paddsw(b, kCoefficientsRgbY[512+v][0]);  // V rows start at index 512; the sum is clamped to int16.
+  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
+  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
+  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
+
+  b = paddsw(b, kCoefficientsRgbY[y][0]);  // Y rows are indexed directly by y (0..255).
+  g = paddsw(g, kCoefficientsRgbY[y][1]);
+  r = paddsw(r, kCoefficientsRgbY[y][2]);
+  a = paddsw(a, kCoefficientsRgbY[y][3]);
+
+  b >>= 6;  // Drop the low 6 bits (presumably the table's fractional bits -- confirm against yuv_row.h).
+  g >>= 6;
+  r >>= 6;
+  a >>= 6;
+
+  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |  // Each channel is clamped to 0..255, then
+                                        (packuswb(g) << 8) |  // packed as b | g << 8 | r << 16 | a << 24
+                                        (packuswb(r) << 16) |  // and written with a single 32-bit store.
+                                        (packuswb(a) << 24);
+}
+
+extern "C" {
+
+void ConvertYUVToRGB32Row_C(const uint8* y_buf,  // Converts one row; one Y per pixel, one U/V per pixel pair.
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,  // Receives 4 bytes per pixel.
+                            int width) {
+  for (int x = 0; x < width; x += 2) {  // Two output pixels per iteration.
+    uint8 u = u_buf[x >> 1];  // The same U/V sample is used for both pixels of the pair.
+    uint8 v = v_buf[x >> 1];
+    uint8 y0 = y_buf[x];
+    YUVPixel(y0, u, v, rgb_buf);
+    if ((x + 1) < width) {  // Skip the second pixel when width is odd.
+      uint8 y1 = y_buf[x + 1];
+      YUVPixel(y1, u, v, rgb_buf + 4);
+    }
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+// 16.16 fixed point is used.  A shift by 16 isolates the integer.
+// A shift by 17 is used to further subsample the chrominance channels.
+// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
+// for 1/65536 pixel accurate interpolation.
+void ScaleYUVToRGB32Row_C(const uint8* y_buf,  // Converts one row while stepping through the source by source_dx.
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,  // Receives 4 bytes per output pixel.
+                          int width,  // Number of output pixels.
+                          int source_dx) {  // Added to the source position x after each output pixel.
+  int x = 0;  // Source position; x >> 16 indexes Y and x >> 17 indexes U/V.
+  for (int i = 0; i < width; i += 2) {  // Two output pixels per iteration.
+    int y = y_buf[x >> 16];
+    int u = u_buf[(x >> 17)];  // U/V are sampled once per pair, at the first pixel's position.
+    int v = v_buf[(x >> 17)];
+    YUVPixel(y, u, v, rgb_buf);
+    x += source_dx;
+    if ((i + 1) < width) {  // Skip the second pixel when width is odd.
+      y = y_buf[x >> 16];
+      YUVPixel(y, u, v, rgb_buf+4);
+      x += source_dx;
+    }
+    rgb_buf += 8;  // Advance 2 output pixels.
+  }
+}
+
+void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,  // Like ScaleYUVToRGB32Row_C, but blends each sample with its right neighbour.
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgb_buf,  // Receives 4 bytes per output pixel.
+                                int width,  // Number of output pixels.
+                                int source_dx) {  // Added to the source position x after each output pixel.
+  int x = 0;
+  if (source_dx >= 0x20000) {  // 0x20000 is 2.0 if source_dx is 16.16 fixed point, as the indexing below implies.
+    x = 32768;  // Start half a source pixel in (32768 / 65536).
+  }
+  for (int i = 0; i < width; i += 2) {  // Two output pixels per iteration.
+    int y0 = y_buf[x >> 16];
+    int y1 = y_buf[(x >> 16) + 1];  // NOTE(review): reads one sample past the current index -- confirm callers pad the row.
+    int u0 = u_buf[(x >> 17)];
+    int u1 = u_buf[(x >> 17) + 1];  // Same one-past read for U and V.
+    int v0 = v_buf[(x >> 17)];
+    int v1 = v_buf[(x >> 17) + 1];
+    int y_frac = (x & 65535);  // Low 16 bits of x: weight of the right-hand Y sample.
+    int uv_frac = ((x >> 1) & 65535);  // Same, at half resolution for U/V.
+    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;  // frac ^ 65535 equals 65535 - frac for a 16-bit frac.
+    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
+    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
+    YUVPixel(y, u, v, rgb_buf);
+    x += source_dx;
+    if ((i + 1) < width) {  // Skip the second pixel when width is odd.
+      y0 = y_buf[x >> 16];
+      y1 = y_buf[(x >> 16) + 1];
+      y_frac = (x & 65535);
+      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;  // Only Y is re-blended; u and v are reused from the first pixel.
+      YUVPixel(y, u, v, rgb_buf+4);
+      x += source_dx;
+    }
+    rgb_buf += 8;  // Advance 2 output pixels.
+  }
+}
+
+}
+
+namespace media {
+
+void ConvertYUVToRGB32_C(const uint8* yplane,  // Converts a whole frame by calling ConvertYUVToRGB32Row_C per row.
+                         const uint8* uplane,
+                         const uint8* vplane,
+                         uint8* rgbframe,
+                         int width,  // Passed unchanged to the row routine.
+                         int height,  // Number of rows to convert.
+                         int ystride,  // Distance between consecutive Y rows.
+                         int uvstride,  // Distance between consecutive U (and V) rows.
+                         int rgbstride,  // Distance between consecutive output rows.
+                         YUVType yuv_type) {
+  unsigned int y_shift = yuv_type;  // The enum's numeric value is used directly as the chroma row shift.
+  for (int y = 0; y < height; ++y) {
+    uint8* rgb_row = rgbframe + y * rgbstride;
+    const uint8* y_ptr = yplane + y * ystride;
+    const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;  // Chroma row index is y >> y_shift.
+    const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+    ConvertYUVToRGB32Row_C(y_ptr,
+                           u_ptr,
+                           v_ptr,
+                           rgb_row,
+                           width);
+  }
+}
+
+}  // namespace media
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.asm b/media/base/simd/convert_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..e044474
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.asm
@@ -0,0 +1,22 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+  SECTION_TEXT
+  CPU       MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; extern "C" void ConvertYUVToRGB32Row_MMX(const uint8* y_buf,
+;                                          const uint8* u_buf,
+;                                          const uint8* v_buf,
+;                                          uint8* rgb_buf,
+;                                          int width);
+%define SYMBOL ConvertYUVToRGB32Row_MMX
+%include "convert_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..b9555ce
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc
@@ -0,0 +1,119 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+; Non-PIC code is the fastest so use this if possible.
+%ifndef PIC
+mangle(SYMBOL):
+  %assign   stack_offset 0
+  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV
+  extern    mangle(kCoefficientsRgbY)
+  jmp       .convertend             ; Enter at the loop test so width < 2 skips the pair loop.
+
+.convertloop:
+  movzx     TEMPUd, BYTE [Uq]       ; TEMPU = u
+  add       Uq, 1
+  movzx     TEMPVd, BYTE [Vq]       ; TEMPV = v
+  add       Vq, 1
+  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]  ; mm0 = 8-byte table entry at offset 2048 + 8 * u
+  movzx     TEMPUd, BYTE [Yq]       ; TEMPU is reused for y0
+  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]  ; mm0 += entry at offset 4096 + 8 * v (saturating)
+  movzx     TEMPVd, BYTE [Yq + 1]   ; TEMPV is reused for y1
+  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]         ; mm1 = entry for y0
+  add       Yq, 2
+  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq]         ; mm2 = entry for y1
+  paddsw    mm1, mm0                ; pixel 0 = y0 entry + combined U/V
+  paddsw    mm2, mm0                ; pixel 1 = y1 entry + the same U/V
+  psraw     mm1, 6                  ; Arithmetic shift right by 6 on each 16-bit lane.
+  psraw     mm2, 6
+  packuswb  mm1, mm2                ; Saturate to unsigned bytes: 2 pixels x 4 bytes.
+  MOVQ      [ARGBq], mm1            ; MOVQ is %defined by the including .asm file.
+  add       ARGBq, 8
+
+.convertend:
+  sub       WIDTHq, 2
+  jns       .convertloop            ; Loop while at least 2 pixels remained before the subtraction.
+
+  ; If number of pixels is odd then compute it.
+  and       WIDTHq, 1               ; WIDTH is -2 or -1 here; bit 0 set means one pixel is left.
+  jz        .convertdone
+
+  movzx     TEMPUd, BYTE [Uq]       ; TEMPU = u
+  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]  ; mm0 = U entry
+  movzx     TEMPVd, BYTE [Vq]       ; TEMPV = v
+  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]  ; mm0 += V entry (saturating)
+  movzx     TEMPUd, BYTE [Yq]       ; TEMPU is reused for y
+  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]         ; mm1 = Y entry
+  paddsw    mm1, mm0
+  psraw     mm1, 6
+  packuswb  mm1, mm1
+  movd      [ARGBq], mm1            ; Store only the low 4 bytes (one pixel).
+
+.convertdone:
+  RET
+%endif
+
+; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
+; This code is slower than the above version.
+%ifdef PIC
+mangle(SYMBOL):
+  %assign   stack_offset 0
+  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE
+
+  extern    mangle(kCoefficientsRgbY)
+  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)  ; TABLE = address of the coefficient table.
+
+  jmp       .convertend             ; Enter at the loop test so width < 2 skips the pair loop.
+
+.convertloop:
+  movzx     TEMPd, BYTE [Uq]        ; TEMP = u
+  movq      mm0, [TABLEq + 2048 + 8 * TEMPq]   ; mm0 = 8-byte table entry at offset 2048 + 8 * u
+  add       Uq, 1
+
+  movzx     TEMPd, BYTE [Vq]        ; TEMP = v
+  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq]   ; mm0 += entry at offset 4096 + 8 * v (saturating)
+  add       Vq, 1
+
+  movzx     TEMPd, BYTE [Yq]        ; TEMP = y0
+  movq      mm1, [TABLEq + 8 * TEMPq]          ; mm1 = entry for y0
+
+  movzx     TEMPd, BYTE [Yq + 1]    ; TEMP = y1
+  movq      mm2, [TABLEq + 8 * TEMPq]          ; mm2 = entry for y1
+  add       Yq, 2
+
+  ; Add UV components to Y component.
+  paddsw    mm1, mm0
+  paddsw    mm2, mm0
+
+  ; Down shift and then pack.
+  psraw     mm1, 6
+  psraw     mm2, 6
+  packuswb  mm1, mm2                ; Saturate to unsigned bytes: 2 pixels x 4 bytes.
+  MOVQ      [ARGBq], mm1            ; MOVQ is %defined by the including .asm file.
+  add       ARGBq, 8
+
+.convertend:
+  sub       WIDTHq, 2
+  jns       .convertloop            ; Loop while at least 2 pixels remained before the subtraction.
+
+  ; If number of pixels is odd then compute it.
+  and       WIDTHq, 1               ; WIDTH is -2 or -1 here; bit 0 set means one pixel is left.
+  jz        .convertdone
+
+  movzx     TEMPd, BYTE [Uq]        ; TEMP = u
+  movq      mm0, [TABLEq + 2048 + 8 * TEMPq]   ; mm0 = U entry
+  movzx     TEMPd, BYTE [Vq]        ; TEMP = v
+  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq]   ; mm0 += V entry (saturating)
+  movzx     TEMPd, BYTE [Yq]        ; TEMP = y
+  movq      mm1, [TABLEq + 8 * TEMPq]          ; mm1 = Y entry
+  paddsw    mm1, mm0
+  psraw     mm1, 6
+  packuswb  mm1, mm1
+  movd      [ARGBq], mm1            ; Store only the low 4 bytes (one pixel).
+
+.convertdone:
+  RET
+%endif
diff --git a/media/base/simd/convert_yuv_to_rgb_sse.asm b/media/base/simd/convert_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..2f1967a
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+  SECTION_TEXT
+  CPU       MMX, SSE
+
+; Use the SSE instruction movntq, which can write faster.
+%define MOVQ movntq
+
+;
+; extern "C" void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
+;                                          const uint8* u_buf,
+;                                          const uint8* v_buf,
+;                                          uint8* rgb_buf,
+;                                          int width);
+%define SYMBOL ConvertYUVToRGB32Row_SSE
+%include "convert_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/convert_yuv_to_rgb_x86.cc b/media/base/simd/convert_yuv_to_rgb_x86.cc
new file mode 100644
index 0000000..3e03ef9
--- /dev/null
+++ b/media/base/simd/convert_yuv_to_rgb_x86.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#endif
+
+#include "media/base/cpu_features.h"
+#include "media/base/simd/convert_yuv_to_rgb.h"
+#include "media/base/yuv_convert.h"
+
+namespace media {
+
+void ConvertYUVToRGB32_MMX(const uint8* yplane,  // Converts a whole frame by calling ConvertYUVToRGB32Row_MMX per row.
+                           const uint8* uplane,
+                           const uint8* vplane,
+                           uint8* rgbframe,
+                           int width,  // Passed unchanged to the row routine.
+                           int height,  // Number of rows to convert.
+                           int ystride,  // Distance between consecutive Y rows.
+                           int uvstride,  // Distance between consecutive U (and V) rows.
+                           int rgbstride,  // Distance between consecutive output rows.
+                           YUVType yuv_type) {
+  unsigned int y_shift = yuv_type;  // The enum's numeric value is used directly as the chroma row shift.
+  for (int y = 0; y < height; ++y) {
+    uint8* rgb_row = rgbframe + y * rgbstride;
+    const uint8* y_ptr = yplane + y * ystride;
+    const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;  // Chroma row index is y >> y_shift.
+    const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+    ConvertYUVToRGB32Row_MMX(y_ptr,
+                             u_ptr,
+                             v_ptr,
+                             rgb_row,
+                             width);
+  }
+
+  _mm_empty();  // EMMS intrinsic, issued once after all rows rather than once per row.
+}
+
+void ConvertYUVToRGB32_SSE(const uint8* yplane,  // Converts a whole frame by calling ConvertYUVToRGB32Row_SSE per row.
+                           const uint8* uplane,
+                           const uint8* vplane,
+                           uint8* rgbframe,
+                           int width,  // Passed unchanged to the row routine.
+                           int height,  // Number of rows to convert.
+                           int ystride,  // Distance between consecutive Y rows.
+                           int uvstride,  // Distance between consecutive U (and V) rows.
+                           int rgbstride,  // Distance between consecutive output rows.
+                           YUVType yuv_type) {
+  unsigned int y_shift = yuv_type;  // The enum's numeric value is used directly as the chroma row shift.
+  for (int y = 0; y < height; ++y) {
+    uint8* rgb_row = rgbframe + y * rgbstride;
+    const uint8* y_ptr = yplane + y * ystride;
+    const uint8* u_ptr = uplane + (y >> y_shift) * uvstride;  // Chroma row index is y >> y_shift.
+    const uint8* v_ptr = vplane + (y >> y_shift) * uvstride;
+
+    ConvertYUVToRGB32Row_SSE(y_ptr,
+                             u_ptr,
+                             v_ptr,
+                             rgb_row,
+                             width);
+  }
+
+  _mm_empty();  // EMMS intrinsic, issued once after all rows rather than once per row.
+}
+
+}  // namespace media
diff --git a/media/base/simd/filter_yuv.h b/media/base/simd/filter_yuv.h
new file mode 100644
index 0000000..5a9cf11
--- /dev/null
+++ b/media/base/simd/filter_yuv.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MEDIA_BASE_SIMD_FILTER_YUV_H_
+#define MEDIA_BASE_SIMD_FILTER_YUV_H_
+
+#include "base/basictypes.h"
+
+namespace media {
+
+typedef void (*FilterYUVRowsProc)(uint8*,
+                                  const uint8*,
+                                  const uint8*,
+                                  int,
+                                  int);
+
+void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                     int source_width, int source_y_fraction);
+
+void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                       int source_width, int source_y_fraction);
+
+void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
+                        int source_width, int source_y_fraction);
+
+}  // namespace media
+
+#endif  // MEDIA_BASE_SIMD_FILTER_YUV_H_
diff --git a/media/base/simd/filter_yuv_c.cc b/media/base/simd/filter_yuv_c.cc
new file mode 100644
index 0000000..95ae01a
--- /dev/null
+++ b/media/base/simd/filter_yuv_c.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,  // Blends two rows byte-by-byte into ybuf.
+                     int source_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;  // Weight of the y1 row.
+  int y0_fraction = 256 - y1_fraction;  // Weight of the y0 row; the two weights sum to 256.
+  uint8* end = ybuf + source_width;
+  do {  // NOTE(review): always writes at least 8 bytes and works in groups of 8 -- confirm callers pad the buffers.
+    ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;  // Weighted sum, then divide by 256.
+    ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+    ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+    ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+    ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
+    ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+    ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
+    ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
+    y0_ptr += 8;
+    y1_ptr += 8;
+    ybuf += 8;
+  } while (ybuf < end);
+}
+
+}  // namespace media
diff --git a/media/base/simd/filter_yuv_mmx.cc b/media/base/simd/filter_yuv_mmx.cc
new file mode 100644
index 0000000..77698dc
--- /dev/null
+++ b/media/base/simd/filter_yuv_mmx.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "build/build_config.h"
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+#if defined(COMPILER_MSVC)
+// Warning 4799 is about calling emms before the function exits.
+// We call emms at the frame level, so suppress this warning.
+#pragma warning(disable: 4799)
+#endif
+
+void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,  // Blends two rows into ybuf, 8 bytes per step.
+                       int source_width, int source_y_fraction) {
+  __m64 zero = _mm_setzero_si64();
+  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);  // Weight of the y1 row in every 16-bit lane.
+  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);  // Weight of the y0 row; the two weights sum to 256.
+
+  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+  do {  // do/while: at least one 8-byte group is always processed.
+    __m64 y0 = *y0_ptr64++;
+    __m64 y1 = *y1_ptr64++;
+    __m64 y2 = _mm_unpackhi_pi8(y0, zero);  // High 4 bytes of each row widened to 16 bits.
+    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+    y0 = _mm_unpacklo_pi8(y0, zero);  // Low 4 bytes of each row widened to 16 bits.
+    y1 = _mm_unpacklo_pi8(y1, zero);
+    y0 = _mm_mullo_pi16(y0, y0_fraction);  // Scale each row by its weight.
+    y1 = _mm_mullo_pi16(y1, y1_fraction);
+    y2 = _mm_mullo_pi16(y2, y0_fraction);
+    y3 = _mm_mullo_pi16(y3, y1_fraction);
+    y0 = _mm_add_pi16(y0, y1);  // Sum the two weighted rows.
+    y2 = _mm_add_pi16(y2, y3);
+    y0 = _mm_srli_pi16(y0, 8);  // Divide by 256.
+    y2 = _mm_srli_pi16(y2, 8);
+    y0 = _mm_packs_pu16(y0, y2);  // Pack both halves back to 8 unsigned bytes.
+    *dest64++ = y0;
+  } while (dest64 < end64);
+}
+
+#if defined(COMPILER_MSVC)
+#pragma warning(default: 4799)
+#endif
+
+}  // namespace media
diff --git a/media/base/simd/filter_yuv_sse2.cc b/media/base/simd/filter_yuv_sse2.cc
new file mode 100644
index 0000000..137ac94
--- /dev/null
+++ b/media/base/simd/filter_yuv_sse2.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "media/base/simd/filter_yuv.h"
+
+namespace media {
+
+void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,  // Blends two rows into ybuf, 16 bytes per step.
+                        int source_width, int source_y_fraction) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);  // Weight of the y1 row in every 16-bit lane.
+  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);  // Weight of the y0 row; the two weights sum to 256.
+
+  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+
+  do {  // do/while: at least one 16-byte group is always processed.
+    __m128i y0 = _mm_loadu_si128(y0_ptr128);  // Unaligned loads: the source rows need no alignment.
+    __m128i y1 = _mm_loadu_si128(y1_ptr128);
+    __m128i y2 = _mm_unpackhi_epi8(y0, zero);  // High 8 bytes of each row widened to 16 bits.
+    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+    y0 = _mm_unpacklo_epi8(y0, zero);  // Low 8 bytes of each row widened to 16 bits.
+    y1 = _mm_unpacklo_epi8(y1, zero);
+    y0 = _mm_mullo_epi16(y0, y0_fraction);  // Scale each row by its weight.
+    y1 = _mm_mullo_epi16(y1, y1_fraction);
+    y2 = _mm_mullo_epi16(y2, y0_fraction);
+    y3 = _mm_mullo_epi16(y3, y1_fraction);
+    y0 = _mm_add_epi16(y0, y1);  // Sum the two weighted rows.
+    y2 = _mm_add_epi16(y2, y3);
+    y0 = _mm_srli_epi16(y0, 8);  // Divide by 256.
+    y2 = _mm_srli_epi16(y2, 8);
+    y0 = _mm_packus_epi16(y0, y2);  // Pack both halves back to 16 unsigned bytes.
+    *dest128++ = y0;  // NOTE(review): plain __m128i store -- presumably needs ybuf 16-byte aligned; confirm.
+    ++y0_ptr128;
+    ++y1_ptr128;
+  } while (dest128 < end128);
+}
+
+}  // namespace media
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..7f7e0e8
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+  SECTION_TEXT
+  CPU       MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+;                                   const uint8* u_buf,
+;                                   const uint8* v_buf,
+;                                   uint8* rgb_buf,
+;                                   int width,
+;                                   int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_MMX
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..91c06a5
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,166 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+mangle(SYMBOL):
+  %assign   stack_offset 0
+
+  extern    mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame (output, written two pixels at a time)
+; 5. Width
+; 6. Source dx (fixed-point step added to x for every output pixel)
+
+PROLOGUE  6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
+
+%if gprsize == 8
+%define     WORD_SIZE   QWORD
+%else
+%define     WORD_SIZE   DWORD
+%endif
+
+; Define register aliases.
+%define     Xq                  R1q     ; Current X position (reuses V's register)
+%define     COMPLq              R2q     ; Left sample, then the blended result
+%define     COMPLd              R2d     ; Left sample, then the blended result
+%define     U_ARG_REGq          R0q     ; U plane address argument
+%define     V_ARG_REGq          R1q     ; V plane address argument
+%define     SOURCE_DX_ARG_REGq  R3q     ; Source dx argument
+%define     WIDTH_ARG_REGq      R2q     ; Width argument
+
+%ifdef PIC
+; PIC code shares one register between COMPR, U and V, so the code below must
+; never need two of them live at once. This frees R3q to hold the YUV table.
+%define     COMPRq              R0q     ; Right sample
+%define     COMPRd              R0d     ; Right sample
+%define     Uq                  R0q     ; U plane address
+%define     Vq                  R0q     ; V plane address
+%define     U_PLANE             WORD_SIZE [rsp + 3 * gprsize]
+%define     TABLE               R3q     ; Address of the table
+%else
+; Non-PIC code defines.
+%define     COMPRq              R3q     ; Right sample
+%define     COMPRd              R3d     ; Right sample
+%define     Uq                  R0q     ; U plane address
+%define     Vq                  R3q     ; V plane address
+%define     TABLE               mangle(kCoefficientsRgbY)
+%endif
+
+; Defines for stack variables. These are used in both PIC and non-PIC code.
+%define     V_PLANE             WORD_SIZE [rsp + 2 * gprsize]
+%define     SOURCE_DX           WORD_SIZE [rsp + gprsize]
+%define     SOURCE_WIDTH        WORD_SIZE [rsp]
+
+; Handle stack variables differently for PIC and non-PIC code.
+
+%ifdef PIC
+; Define stack usage for PIC code. PIC code also pushes the U plane address.
+  PUSH      U_ARG_REGq
+  PUSH      V_ARG_REGq
+  PUSH      SOURCE_DX_ARG_REGq
+  imul      WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq  ; source_width = width * source_dx
+  PUSH      WIDTH_ARG_REGq
+
+; Load the address of kCoefficientsRgbY into TABLE
+  mov       TEMPq, SOURCE_DX_ARG_REGq    ; Need to save source_dx first
+  LOAD_SYM  TABLE, mangle(kCoefficientsRgbY)
+%define     SOURCE_DX_ARG_REGq  TEMPq   ; Overwrite SOURCE_DX_ARG_REGq to TEMPq
+%else
+; Define stack usage. Non-PIC code just pushes 3 registers onto the stack.
+  PUSH      V_ARG_REGq
+  PUSH      SOURCE_DX_ARG_REGq
+  imul      WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq  ; source_width = width * source_dx
+  PUSH      WIDTH_ARG_REGq
+%endif
+
+%macro EPILOGUE 0
+%ifdef PIC
+  ADD       rsp, 4 * gprsize
+%else
+  ADD       rsp, 3 * gprsize
+%endif
+%endmacro
+
+  xor       Xq, Xq                       ; x = 0
+  cmp       SOURCE_DX_ARG_REGq, 0x20000  ; source_dx >= 2.0 (scale of 1/2 or less)?
+  jl        .lscaleend
+  mov       Xq, 0x8000                   ; x = 0.5 for 1/2 or less
+  jmp       .lscaleend
+
+.lscaleloop:
+%ifdef PIC
+  mov       Uq, U_PLANE                  ; PIC code saves U_PLANE on stack.
+%endif
+
+; Define macros for scaling YUV components since they are reused.
+%macro SCALEUV 1
+  mov       TEMPq, Xq
+  sar       TEMPq, 0x11                  ; chroma index = x >> 17
+  movzx     COMPLd, BYTE [%1 + TEMPq]
+  movzx     COMPRd, BYTE [%1 + TEMPq + 1]
+  mov       TEMPq, Xq
+  and       TEMPq, 0x1fffe               ; weight of the right sample
+  imul      COMPRq, TEMPq
+  xor       TEMPq, 0x1fffe               ; weight of the left sample
+  imul      COMPLq, TEMPq
+  add       COMPLq, COMPRq
+  shr       COMPLq, 17                   ; blended sample ends up in COMPL
+%endmacro
+  SCALEUV   Uq                           ; Use the above macro to scale U
+  movq      mm0, [TABLE + 2048 + 8 * COMPLq]
+
+  mov       Vq, V_PLANE                  ; Read V address from stack
+  SCALEUV   Vq                           ; Use the above macro to scale V
+  paddsw    mm0, [TABLE + 4096 + 8 * COMPLq]
+
+%macro SCALEY 0
+  mov       TEMPq, Xq
+  sar       TEMPq, 0x10                  ; luma index = x >> 16
+  movzx     COMPLd, BYTE [Yq + TEMPq]
+  movzx     COMPRd, BYTE [Yq + TEMPq + 1]
+  mov       TEMPq, Xq
+  add       Xq, SOURCE_DX                 ; Add source_dx from stack
+  and       TEMPq, 0xffff                ; weight of the right sample
+  imul      COMPRq, TEMPq
+  xor       TEMPq, 0xffff                ; weight of the left sample
+  imul      COMPLq, TEMPq
+  add       COMPLq, COMPRq
+  shr       COMPLq, 16                   ; blended sample ends up in COMPL
+%endmacro
+  SCALEY                                  ; Use the above macro to scale Y1
+  movq      mm1, [TABLE + 8 * COMPLq]
+
+  cmp       Xq, SOURCE_WIDTH              ; Compare source_width from stack
+  jge       .lscalelastpixel
+
+  SCALEY                                  ; Use the above macro to scale Y2
+  movq      mm2, [TABLE + 8 * COMPLq]
+
+  paddsw    mm1, mm0
+  paddsw    mm2, mm0
+  psraw     mm1, 0x6
+  psraw     mm2, 0x6
+  packuswb  mm1, mm2
+  MOVQ      [ARGBq], mm1
+  add       ARGBq, 0x8
+
+.lscaleend:
+  cmp       Xq, SOURCE_WIDTH     ; Compare source_width from stack
+  jl        .lscaleloop
+  EPILOGUE
+  RET
+
+.lscalelastpixel:
+  paddsw    mm1, mm0
+  psraw     mm1, 6
+  packuswb  mm1, mm1
+  movd      [ARGBq], mm1                 ; only one 32-bit pixel left to store
+  EPILOGUE
+  RET
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm
new file mode 100644
index 0000000..db7854457
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm
@@ -0,0 +1,149 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions plus the SSE store movntq.
+;
+  SECTION_TEXT
+  CPU       MMX, SSE
+
+%define SYMBOL LinearScaleYUVToRGB32Row_MMX_X64
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+mangle(SYMBOL):
+  %assign   stack_offset 0
+  extern    mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+;
+; Width and source dx are assumed to be C ints, as in the other row
+; functions, so only the low 32 bits of their registers are trusted on
+; entry.
+
+PROLOGUE  6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMPL
+
+; r10 and r11 are caller-saved in both the System V and Microsoft x64 ABIs,
+; so only r12-r14 have to be preserved for the caller.
+%define     TABLEq     r10              ; Address of kCoefficientsRgbY
+%define     Xq         r11              ; Current x position (16.16)
+%define     INDEXq     r12              ; Sample index / next x position
+%define     COMPRd     r13d             ; Right-hand sample
+%define     COMPRq     r13
+%define     FRACTIONq  r14              ; Interpolation weight
+
+  PUSH      INDEXq
+  PUSH      COMPRq
+  PUSH      FRACTIONq
+
+%macro EPILOGUE 0
+  POP       FRACTIONq
+  POP       COMPRq
+  POP       INDEXq
+%endmacro
+
+  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)
+
+  ; Sign-extend the int arguments: the upper 32 bits of their registers are
+  ; not guaranteed by the calling convention, and all math below is 64-bit.
+  movsxd    WIDTHq, WIDTHd
+  movsxd    SOURCE_DXq, SOURCE_DXd
+
+  imul      WIDTHq, SOURCE_DXq           ; source_width = width * source_dx
+  xor       Xq, Xq                       ; x = 0
+  cmp       SOURCE_DXq, 0x20000
+  jl        .lscaleend
+  mov       Xq, 0x8000                   ; x = 0.5 for 1/2 or less
+  jmp       .lscaleend
+
+.lscaleloop:
+  ; Interpolate U
+  mov       INDEXq, Xq
+  sar       INDEXq, 0x11                 ; chroma index = x >> 17
+  movzx     COMPLd, BYTE [Uq + INDEXq]
+  movzx     COMPRd, BYTE [Uq + INDEXq + 1]
+  mov       FRACTIONq, Xq
+  and       FRACTIONq, 0x1fffe           ; weight of the right sample
+  imul      COMPRq, FRACTIONq
+  xor       FRACTIONq, 0x1fffe           ; weight of the left sample
+  imul      COMPLq, FRACTIONq
+  add       COMPLq, COMPRq
+  shr       COMPLq, 17
+  movq      mm0, [TABLEq + 2048 + 8 * COMPLq]
+
+  ; Interpolate V
+  movzx     COMPLd, BYTE [Vq + INDEXq]
+  movzx     COMPRd, BYTE [Vq + INDEXq + 1]
+  ; FRACTIONq still holds the left weight from the U pass, so multiply
+  ; COMPL first and then flip it back for COMPR. Saves two instructions.
+  imul      COMPLq, FRACTIONq
+  xor       FRACTIONq, 0x1fffe
+  imul      COMPRq, FRACTIONq
+  add       COMPLq, COMPRq
+  shr       COMPLq, 17
+  paddsw    mm0, [TABLEq + 4096 + 8 * COMPLq]
+
+  ; Interpolate first Y1.
+  lea       INDEXq, [Xq + SOURCE_DXq]   ; INDEXq now points to next pixel.
+                                        ; Xq points to current pixel.
+  mov       FRACTIONq, Xq
+  sar       Xq, 0x10
+  movzx     COMPLd, BYTE [Yq + Xq]
+  movzx     COMPRd, BYTE [Yq + Xq + 1]
+  and       FRACTIONq, 0xffff
+  imul      COMPRq, FRACTIONq
+  xor       FRACTIONq, 0xffff
+  imul      COMPLq, FRACTIONq
+  add       COMPLq, COMPRq
+  shr       COMPLq, 16
+  movq      mm1, [TABLEq + 8 * COMPLq]
+
+  ; Interpolate Y2 if available.
+  cmp       INDEXq, WIDTHq
+  jge       .lscalelastpixel
+
+  lea       Xq, [INDEXq + SOURCE_DXq]    ; Xq points to next pixel.
+                                         ; INDEXq points to current pixel.
+  mov       FRACTIONq, INDEXq
+  sar       INDEXq, 0x10
+  movzx     COMPLd, BYTE [Yq + INDEXq]
+  movzx     COMPRd, BYTE [Yq + INDEXq + 1]
+  and       FRACTIONq, 0xffff
+  imul      COMPRq, FRACTIONq
+  xor       FRACTIONq, 0xffff
+  imul      COMPLq, FRACTIONq
+  add       COMPLq, COMPRq
+  shr       COMPLq, 16
+  movq      mm2, [TABLEq + 8 * COMPLq]
+
+  paddsw    mm1, mm0
+  paddsw    mm2, mm0
+  psraw     mm1, 0x6
+  psraw     mm2, 0x6
+  packuswb  mm1, mm2
+  movntq    [ARGBq], mm1
+  add       ARGBq, 0x8
+
+.lscaleend:
+  cmp       Xq, WIDTHq
+  jl        .lscaleloop
+  jmp       .epilogue
+
+.lscalelastpixel:
+  paddsw    mm1, mm0
+  psraw     mm1, 6
+  packuswb  mm1, mm1
+  movd      [ARGBq], mm1
+
+.epilogue:
+  EPILOGUE
+  RET
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_sse.asm b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..847911c
--- /dev/null
+++ b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+  SECTION_TEXT
+  CPU       MMX, SSE
+
+; Use movntq (a non-temporal store) to save the output.
+%define MOVQ movntq
+
+; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+;                                   const uint8* u_buf,
+;                                   const uint8* v_buf,
+;                                   uint8* rgb_buf,
+;                                   int width,
+;                                   int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_SSE
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.asm b/media/base/simd/scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..6a83757
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions; the routine body lives in the .inc below.
+;
+  SECTION_TEXT
+  CPU       MMX
+
+; The included code stores each pixel pair via MOVQ; use a plain movq here.
+%define MOVQ movq
+
+; void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+;                             const uint8* u_buf,
+;                             const uint8* v_buf,
+;                             uint8* rgb_buf,
+;                             int width,
+;                             int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_MMX
+%include "scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..94c101c
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,110 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+mangle(SYMBOL):
+  %assign   stack_offset 0
+
+  extern    mangle(kCoefficientsRgbY)
+
+; Arguments, in order:
+;   1. Y plane
+;   2. U plane
+;   3. V plane
+;   4. ARGB frame
+;   5. Width
+;   6. Source dx
+
+PROLOGUE  6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
+
+%ifdef ARCH_X86_64
+%define     WORD_SIZE   QWORD
+%else
+%define     WORD_SIZE   DWORD
+%endif
+
+; Source dx always goes on top of the stack. PIC builds also spill the width
+; underneath it, because R1q is then reused to hold the table address.
+%ifdef PIC
+  PUSH      R1q                          ; Width
+%endif
+  PUSH      R2q                          ; Source dx
+
+%define     SOURCE_DX   WORD_SIZE [rsp]
+%define     Xq          R2q              ; Source x position
+
+%ifdef PIC
+  LOAD_SYM  R1q, mangle(kCoefficientsRgbY)
+%define     WIDTH       WORD_SIZE [rsp + gprsize]
+%define     TABLE       R1q
+%else
+%define     WIDTH       R1q
+%define     TABLE       mangle(kCoefficientsRgbY)
+%endif
+
+; mm0 = U row + V row of the table for the chroma samples at Xq >> 17.
+; Clobbers TEMP.
+%macro CHROMA_TO_MM0 0
+  mov       TEMPq, Xq
+  sar       TEMPq, 17
+  movzx     TEMPd, BYTE [Uq + TEMPq]
+  movq      mm0, [TABLE + 2048 + 8 * TEMPq]
+  mov       TEMPq, Xq
+  sar       TEMPq, 17
+  movzx     TEMPd, BYTE [Vq + TEMPq]
+  paddsw    mm0, [TABLE + 4096 + 8 * TEMPq]
+%endmacro
+
+; %1 = Y row of the table for the luma sample at Xq >> 16. Clobbers TEMP.
+; If %2 is non-zero, Xq is also advanced by SOURCE_DX.
+%macro LUMA_TO 2
+  mov       TEMPq, Xq
+%if %2
+  add       Xq, SOURCE_DX
+%endif
+  sar       TEMPq, 16
+  movzx     TEMPd, BYTE [Yq + TEMPq]
+  movq      %1, [TABLE + 8 * TEMPq]
+%endmacro
+
+  xor       Xq, Xq                       ; x = 0
+  jmp       .scaleend
+
+; Main loop: two output pixels that share one chroma lookup.
+.scaleloop:
+  CHROMA_TO_MM0
+  LUMA_TO   mm1, 1
+  LUMA_TO   mm2, 1
+  paddsw    mm1, mm0
+  paddsw    mm2, mm0
+  psraw     mm1, 6
+  psraw     mm2, 6
+  packuswb  mm1, mm2
+  MOVQ      QWORD [ARGBq], mm1
+  add       ARGBq, 8
+
+.scaleend:
+  ; WIDTH is a register in non-PIC builds and a stack slot in PIC builds.
+  sub       WIDTH, 2
+  jns       .scaleloop
+
+  and       WIDTH, 1                     ; One trailing pixel left over?
+  jz        .scaledone
+
+  CHROMA_TO_MM0
+  LUMA_TO   mm1, 0
+  paddsw    mm1, mm0
+  psraw     mm1, 6
+  packuswb  mm1, mm1
+  movd      DWORD [ARGBq], mm1
+
+.scaledone:
+%ifdef PIC
+  ADD       rsp, 2 * gprsize
+%else
+  ADD       rsp, gprsize
+%endif
+  RET
diff --git a/media/base/simd/scale_yuv_to_rgb_sse.asm b/media/base/simd/scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..5b849a6
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+  SECTION_TEXT
+  CPU       MMX, SSE
+
+; Use movntq (a non-temporal store) to save the output.
+%define MOVQ movntq
+
+; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+;                             const uint8* u_buf,
+;                             const uint8* v_buf,
+;                             uint8* rgb_buf,
+;                             int width,
+;                             int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE
+%include "scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
new file mode 100644
index 0000000..5e58146
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
@@ -0,0 +1,113 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses SSE and SSE2 instructions.
+;
+  SECTION_TEXT
+  CPU       SSE2
+
+; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+;                                  const uint8* u_buf,
+;                                  const uint8* v_buf,
+;                                  uint8* rgb_buf,
+;                                  int width,
+;                                  int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64
+
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+mangle(SYMBOL):
+  %assign   stack_offset 0
+  extern    mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE  6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMP
+
+; r10 and r11 are caller-saved in both the System V and Microsoft x64 ABIs,
+; so only r12 has to be preserved for the caller.
+%define     TABLEq   r10
+%define     Xq       r11
+%define     INDEXq   r12
+  PUSH      INDEXq
+
+  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)
+
+  ; width and source_dx are C ints: the upper 32 bits of their registers are
+  ; not guaranteed by the calling convention, and all math below is 64-bit.
+  movsxd    WIDTHq, WIDTHd
+  movsxd    SOURCE_DXq, SOURCE_DXd
+
+  ; Set Xq index to 0.
+  xor       Xq, Xq
+  jmp       .scaleend
+
+.scaleloop:
+  ; Read UV pixels.
+  mov       INDEXq, Xq
+  sar       INDEXq, 17
+  movzx     COMPd, BYTE [Uq + INDEXq]
+  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
+  movzx     COMPd, BYTE [Vq + INDEXq]
+  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]
+
+  ; Read first Y pixel.
+  lea       INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel.
+  sar       Xq, 16
+  movzx     COMPd, BYTE [Yq + Xq]
+  paddsw    xmm0, xmm1                ; Hide an ADD after memory load.
+  movq      xmm1, [TABLEq + 8 * COMPq]
+
+  ; Read next Y pixel.
+  lea       Xq, [INDEXq + SOURCE_DXq] ; Xq now points to next pixel.
+  sar       INDEXq, 16
+  movzx     COMPd, BYTE [Yq + INDEXq]
+  movq      xmm2, [TABLEq + 8 * COMPq]
+  paddsw    xmm1, xmm0
+  paddsw    xmm2, xmm0
+  shufps    xmm1, xmm2, 0x44          ; Join two pixels into one XMM register
+  psraw     xmm1, 6
+  packuswb  xmm1, xmm1
+  movq      QWORD [ARGBq], xmm1
+  add       ARGBq, 8
+
+.scaleend:
+  sub       WIDTHq, 2
+  jns       .scaleloop
+
+  and       WIDTHq, 1                 ; odd number of pixels?
+  jz        .scaledone
+
+  ; Read U V components.
+  mov       INDEXq, Xq
+  sar       INDEXq, 17
+  movzx     COMPd, BYTE [Uq + INDEXq]
+  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
+  movzx     COMPd, BYTE [Vq + INDEXq]
+  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]
+  paddsw    xmm0, xmm1
+
+  ; Read one Y component.
+  mov       INDEXq, Xq
+  sar       INDEXq, 16
+  movzx     COMPd, BYTE [Yq + INDEXq]
+  movq      xmm1, [TABLEq + 8 * COMPq]
+  paddsw    xmm1, xmm0
+  psraw     xmm1, 6
+  packuswb  xmm1, xmm1
+  movd      DWORD [ARGBq], xmm1
+
+.scaledone:
+  POP       INDEXq
+  RET
diff --git a/media/base/simd/x86inc.asm b/media/base/simd/x86inc.asm
index 956b999..5e0ca20 100644
--- a/media/base/simd/x86inc.asm
+++ b/media/base/simd/x86inc.asm
@@ -95,11 +95,14 @@
 %ifdef WIN64
     %define PIC
 %elifndef ARCH_X86_64
+; For chromium we may build PIC code even for 32 bits system.
+%ifndef CHROMIUM
 ; x86_32 doesn't require PIC.
 ; Some distros prefer shared objects to be PIC, but nothing breaks if
 ; the code contains a few textrels, so we'll skip that complexity.
     %undef PIC
 %endif
+%endif
 %ifdef PIC
     default rel
 %endif
@@ -947,6 +950,11 @@ AVX_INSTR pfmul, 1, 0
 ;=============================================================================
 
 %ifdef CHROMIUM
+; Always build PIC code on Mac for Chromium.
+%ifdef MACHO
+%define PIC
+%endif
+
 ;
 ; LOAD_SYM %1 (reg), %2 (sym)
 ; Copies the address to a local symbol to the specified register.
author	hclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-14 12:40:45 +0000
committer	hclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-14 12:40:45 +0000
commit	ccde716550693ceb59cc8717d4f4d4845f23d853 (patch)
tree	0022aff451d5665193835d1c508a1f3303ae9cc5 /media/base/simd
parent	89a4d2f772aa92a79acedf057f4036820c1fd412 (diff)
download	chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.zip chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.gz chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.bz2