diff options
28 files changed, 1916 insertions, 191 deletions
diff --git a/media/base/cpu_features.h b/media/base/cpu_features.h index c2762d8..0878385 100644 --- a/media/base/cpu_features.h +++ b/media/base/cpu_features.h @@ -10,6 +10,12 @@ namespace media { +// Returns true if CPU has MMX support. +bool hasMMX(); + +// Returns true if CPU has SSE support. +bool hasSSE(); + // Returns true if CPU has SSE2 support. bool hasSSE2(); diff --git a/media/base/cpu_features_x86.cc b/media/base/cpu_features_x86.cc index bf7d05d..4fb9304 100644 --- a/media/base/cpu_features_x86.cc +++ b/media/base/cpu_features_x86.cc @@ -48,6 +48,16 @@ static inline void getcpuid(int info_type, int info[4]) { } #endif +bool hasMMX() { + // TODO(hclam): Actually check for MMX; this currently reports true unconditionally. + return true; +} + +bool hasSSE() { + // TODO(hclam): Actually check for SSE; this currently reports true unconditionally. + return true; +} + bool hasSSE2() { #if defined(ARCH_CPU_X86_64) /* All x86_64 machines have SSE2, so don't even bother checking. */ diff --git a/media/base/simd/convert_rgb_to_yuv_x86.cc b/media/base/simd/convert_rgb_to_yuv_x86.cc new file mode 100644 index 0000000..2bd6930 --- /dev/null +++ b/media/base/simd/convert_rgb_to_yuv_x86.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "media/base/simd/convert_rgb_to_yuv.h" + +#include "build/build_config.h" +#include "media/base/cpu_features.h" +#include "media/base/simd/convert_rgb_to_yuv_ssse3.h" + +namespace media { + +void ConvertRGB32ToYUV_SSSE3(const uint8* rgbframe, + uint8* yplane, + uint8* uplane, + uint8* vplane, + int width, + int height, + int rgbstride, + int ystride, + int uvstride) { +#ifdef ENABLE_SUBSAMPLING + for (; height >= 2; height -= 2) { + ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + ConvertARGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); +#else + for (; height >= 2; height -= 2) { + ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); +#endif +} + +void ConvertRGB24ToYUV_SSSE3(const uint8* rgbframe, + uint8* yplane, + uint8* uplane, + uint8* vplane, + int width, + int height, + int rgbstride, + int ystride, + int uvstride) { +#ifdef ENABLE_SUBSAMPLING + for (; height >= 2; height -= 2) { + ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + ConvertRGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); +#else + for (; height >= 2; height -= 2) { + ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + 
yplane += ystride; + + ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); +#endif +} + +} // namespace media diff --git a/media/base/simd/convert_yuv_to_rgb.h b/media/base/simd/convert_yuv_to_rgb.h new file mode 100644 index 0000000..5f3df2c6 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb.h @@ -0,0 +1,150 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_ +#define MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_ + +#include "base/basictypes.h" +#include "media/base/yuv_convert.h" + +namespace media { + +typedef void (*ConvertYUVToRGB32Proc)(const uint8*, + const uint8*, + const uint8*, + uint8*, + int, + int, + int, + int, + int, + YUVType); + +void ConvertYUVToRGB32_C(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +void ConvertYUVToRGB32_SSE(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +void ConvertYUVToRGB32_MMX(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +} // namespace media + +// Assembly functions are declared without namespace. 
+extern "C" { + +typedef void (*ConvertYUVToRGB32RowProc)(const uint8*, + const uint8*, + const uint8*, + uint8*, + int); +typedef void (*ScaleYUVToRGB32RowProc)(const uint8*, + const uint8*, + const uint8*, + uint8*, + int, + int); + +void ConvertYUVToRGB32Row_C(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width); + +void ConvertYUVToRGB32Row_MMX(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width); + +void ConvertYUVToRGB32Row_SSE(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width); + +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_MMX_X64(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +} + +#endif // MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_ diff --git a/media/base/simd/convert_yuv_to_rgb_c.cc b/media/base/simd/convert_yuv_to_rgb_c.cc new file mode 100644 index 
0000000..f8e70b2 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_c.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "media/base/simd/convert_yuv_to_rgb.h" +// TODO(hclam): Shouldn't depend on yuv_row.h. +#include "media/base/yuv_row.h" + +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + +static inline void YUVPixel(uint8 y, + uint8 u, + uint8 v, + uint8* rgb_buf) { + // Adds the kCoefficientsRgbY rows for u (offset 256), v (offset 512) and y with 16-bit saturation, shifts right by 6, clamps to 0..255 and writes B, G, R, A from low to high byte as one 32-bit store. + int b = kCoefficientsRgbY[256+u][0]; + int g = kCoefficientsRgbY[256+u][1]; + int r = kCoefficientsRgbY[256+u][2]; + int a = kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, kCoefficientsRgbY[512+v][0]); + g = paddsw(g, kCoefficientsRgbY[512+v][1]); + r = paddsw(r, kCoefficientsRgbY[512+v][2]); + a = paddsw(a, kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, kCoefficientsRgbY[y][0]); + g = paddsw(g, kCoefficientsRgbY[y][1]); + r = paddsw(r, kCoefficientsRgbY[y][2]); + a = paddsw(a, kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (static_cast<uint32>(packuswb(a)) << 24); // Shift as unsigned: 255 << 24 does not fit in a signed int. +} + +extern "C" { + +void ConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> 1]; + uint8 v = v_buf[x >> 1]; + uint8 y0 = y_buf[x]; + YUVPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + YUVPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } +} + +// 16.16 fixed point is used. A shift by 16 isolates the integer. +// A shift by 17 is used to further subsample the chrominance channels. +// & 0xffff isolates the fixed point fraction. 
>> 2 to get the upper 2 bits, +// for 1/65536 pixel accurate interpolation. +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + int x = 0; + for (int i = 0; i < width; i += 2) { + int y = y_buf[x >> 16]; + int u = u_buf[(x >> 17)]; + int v = v_buf[(x >> 17)]; + YUVPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y = y_buf[x >> 16]; + YUVPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + int x = 0; + if (source_dx >= 0x20000) { + x = 32768; + } + for (int i = 0; i < width; i += 2) { + int y0 = y_buf[x >> 16]; + int y1 = y_buf[(x >> 16) + 1]; + int u0 = u_buf[(x >> 17)]; + int u1 = u_buf[(x >> 17) + 1]; + int v0 = v_buf[(x >> 17)]; + int v1 = v_buf[(x >> 17) + 1]; + int y_frac = (x & 65535); + int uv_frac = ((x >> 1) & 65535); + int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; + int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; + YUVPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 65535); + y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + YUVPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +} + +namespace media { + +void ConvertYUVToRGB32_C(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { + unsigned int y_shift = yuv_type; + for (int y = 0; y < height; ++y) { + uint8* rgb_row = rgbframe + y * rgbstride; + const uint8* y_ptr = yplane + y * ystride; + const uint8* u_ptr = uplane + (y >> y_shift) * uvstride; + const uint8* v_ptr = vplane + (y >> y_shift) * uvstride; + + 
ConvertYUVToRGB32Row_C(y_ptr, + u_ptr, + v_ptr, + rgb_row, + width); + } +} + +} // namespace media diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.asm b/media/base/simd/convert_yuv_to_rgb_mmx.asm new file mode 100644 index 0000000..e044474 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_mmx.asm @@ -0,0 +1,22 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. +; + SECTION_TEXT + CPU MMX + +; Use movq to save the output. +%define MOVQ movq + +; extern "C" void ConvertYUVToRGB32Row_MMX(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width); +%define SYMBOL ConvertYUVToRGB32Row_MMX +%include "convert_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc new file mode 100644 index 0000000..b9555ce --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc @@ -0,0 +1,119 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + + global mangle(SYMBOL) PRIVATE + align function_align + +; Non-PIC code is the fastest so use this if possible. 
+%ifndef PIC +mangle(SYMBOL): + %assign stack_offset 0 + PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV + extern mangle(kCoefficientsRgbY) + jmp .convertend + +.convertloop: + movzx TEMPUd, BYTE [Uq] + add Uq, 1 + movzx TEMPVd, BYTE [Vq] + add Vq, 1 + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq] + movzx TEMPUd, BYTE [Yq] + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq] + movzx TEMPVd, BYTE [Yq + 1] + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq] + add Yq, 2 + movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + MOVQ [ARGBq], mm1 + add ARGBq, 8 + +.convertend: + sub WIDTHq, 2 + jns .convertloop + + ; If number of pixels is odd then compute it. + and WIDTHq, 1 + jz .convertdone + + movzx TEMPUd, BYTE [Uq] + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq] + movzx TEMPVd, BYTE [Vq] + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq] + movzx TEMPUd, BYTE [Yq] + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + +.convertdone: + RET +%endif + +; With PIC code we need to load the address of mangle(kCoefficientsRgbY). +; This code is slower than the above version. +%ifdef PIC +mangle(SYMBOL): + %assign stack_offset 0 + PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE + + extern mangle(kCoefficientsRgbY) + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) + + jmp .convertend + +.convertloop: + movzx TEMPd, BYTE [Uq] + movq mm0, [TABLEq + 2048 + 8 * TEMPq] + add Uq, 1 + + movzx TEMPd, BYTE [Vq] + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] + add Vq, 1 + + movzx TEMPd, BYTE [Yq] + movq mm1, [TABLEq + 8 * TEMPq] + + movzx TEMPd, BYTE [Yq + 1] + movq mm2, [TABLEq + 8 * TEMPq] + add Yq, 2 + + ; Add UV components to Y component. + paddsw mm1, mm0 + paddsw mm2, mm0 + + ; Down shift and then pack. 
+ psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + MOVQ [ARGBq], mm1 + add ARGBq, 8 + +.convertend: + sub WIDTHq, 2 + jns .convertloop + + ; If number of pixels is odd then compute it. + and WIDTHq, 1 + jz .convertdone + + movzx TEMPd, BYTE [Uq] + movq mm0, [TABLEq + 2048 + 8 * TEMPq] + movzx TEMPd, BYTE [Vq] + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] + movzx TEMPd, BYTE [Yq] + movq mm1, [TABLEq + 8 * TEMPq] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + +.convertdone: + RET +%endif diff --git a/media/base/simd/convert_yuv_to_rgb_sse.asm b/media/base/simd/convert_yuv_to_rgb_sse.asm new file mode 100644 index 0000000..28d2214 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_sse.asm @@ -0,0 +1,40 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX and SSE instructions. +; + SECTION_TEXT + CPU MMX, SSE + +; Use the SSE instruction movntq to save the output; it can write faster. +%define MOVQ movntq + +; +; extern "C" void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width); +%define SYMBOL ConvertYUVToRGB32Row_SSE +%include "convert_yuv_to_rgb_mmx.inc" + +; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL ScaleYUVToRGB32Row_SSE +%include "scale_yuv_to_rgb_mmx.inc" + +; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +; NOTE(review): no %define SYMBOL / %include follows the prototype above in this file. diff --git a/media/base/simd/convert_yuv_to_rgb_x86.cc b/media/base/simd/convert_yuv_to_rgb_x86.cc new file mode 100644 index 0000000..3e03ef9 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_x86.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#endif + +#include "media/base/cpu_features.h" +#include "media/base/simd/convert_yuv_to_rgb.h" +#include "media/base/yuv_convert.h" + +namespace media { + +void ConvertYUVToRGB32_MMX(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { + unsigned int y_shift = yuv_type; + for (int y = 0; y < height; ++y) { + uint8* rgb_row = rgbframe + y * rgbstride; + const uint8* y_ptr = yplane + y * ystride; + const uint8* u_ptr = uplane + (y >> y_shift) * uvstride; + const uint8* v_ptr = vplane + (y >> y_shift) * uvstride; + + ConvertYUVToRGB32Row_MMX(y_ptr, + u_ptr, + v_ptr, + rgb_row, + width); + } + + _mm_empty(); +} + +void ConvertYUVToRGB32_SSE(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { + unsigned int y_shift = yuv_type; + for (int y = 0; y < height; ++y) { + uint8* rgb_row = rgbframe + y * rgbstride; + const uint8* y_ptr = yplane + y * ystride; + const uint8* u_ptr = uplane + (y >> y_shift) * uvstride; + const uint8* v_ptr = vplane + (y >> y_shift) * uvstride; + + ConvertYUVToRGB32Row_SSE(y_ptr, + u_ptr, + v_ptr, + rgb_row, + width); + } + + _mm_empty(); +} + +} // namespace media diff --git a/media/base/simd/filter_yuv.h b/media/base/simd/filter_yuv.h new file mode 100644 index 0000000..5a9cf11 --- /dev/null +++ b/media/base/simd/filter_yuv.h @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef MEDIA_BASE_SIMD_FILTER_YUV_H_ +#define MEDIA_BASE_SIMD_FILTER_YUV_H_ + +#include "base/basictypes.h" + +namespace media { + +typedef void (*FilterYUVRowsProc)(uint8*, + const uint8*, + const uint8*, + int, + int); + +void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); + +void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); + +void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); + +} // namespace media + +#endif // MEDIA_BASE_SIMD_FILTER_YUV_H_ diff --git a/media/base/simd/filter_yuv_c.cc b/media/base/simd/filter_yuv_c.cc new file mode 100644 index 0000000..95ae01a --- /dev/null +++ b/media/base/simd/filter_yuv_c.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "media/base/simd/filter_yuv.h" + +namespace media { +// Blends two rows into ybuf: ybuf[i] = (y0_ptr[i] * (256 - source_y_fraction) + y1_ptr[i] * source_y_fraction) >> 8. +void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + uint8* end = ybuf + source_width; + do { + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; + ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; + ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8; + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; + } while (ybuf < end); // Runs at least once and always handles whole groups of 8 bytes, so up to 7 bytes past source_width are read and written. +} + +} // namespace media diff --git a/media/base/simd/filter_yuv_mmx.cc b/media/base/simd/filter_yuv_mmx.cc new file mode 100644 index 0000000..77698dc --- /dev/null +++ b/media/base/simd/filter_yuv_mmx.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#include <emmintrin.h> +#endif + +#include "build/build_config.h" +#include "media/base/simd/filter_yuv.h" + +namespace media { + +#if defined(COMPILER_MSVC) +// Warning 4799 is about a function using MMX without calling emms before it exits. +// We call emms at the frame level, so suppress this warning. 
+#pragma warning(disable: 4799) +#endif + +void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + __m64 zero = _mm_setzero_si64(); + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); + + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); + __m64* dest64 = reinterpret_cast<__m64*>(ybuf); + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); + + do { + __m64 y0 = *y0_ptr64++; + __m64 y1 = *y1_ptr64++; + __m64 y2 = _mm_unpackhi_pi8(y0, zero); + __m64 y3 = _mm_unpackhi_pi8(y1, zero); + y0 = _mm_unpacklo_pi8(y0, zero); + y1 = _mm_unpacklo_pi8(y1, zero); + y0 = _mm_mullo_pi16(y0, y0_fraction); + y1 = _mm_mullo_pi16(y1, y1_fraction); + y2 = _mm_mullo_pi16(y2, y0_fraction); + y3 = _mm_mullo_pi16(y3, y1_fraction); + y0 = _mm_add_pi16(y0, y1); + y2 = _mm_add_pi16(y2, y3); + y0 = _mm_srli_pi16(y0, 8); + y2 = _mm_srli_pi16(y2, 8); + y0 = _mm_packs_pu16(y0, y2); + *dest64++ = y0; + } while (dest64 < end64); +} + +#if defined(COMPILER_MSVC) +#pragma warning(default: 4799) +#endif + +} // namespace media diff --git a/media/base/simd/filter_yuv_sse2.cc b/media/base/simd/filter_yuv_sse2.cc new file mode 100644 index 0000000..137ac94 --- /dev/null +++ b/media/base/simd/filter_yuv_sse2.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#include <emmintrin.h> +#endif + +#include "media/base/simd/filter_yuv.h" + +namespace media { +// Blends two rows into ybuf, 16 bytes per iteration: ybuf[i] = (y0_ptr[i] * (256 - source_y_fraction) + y1_ptr[i] * source_y_fraction) >> 8. +void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + __m128i zero = _mm_setzero_si128(); + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); + + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); + + do { + __m128i y0 = _mm_loadu_si128(y0_ptr128); + __m128i y1 = _mm_loadu_si128(y1_ptr128); + __m128i y2 = _mm_unpackhi_epi8(y0, zero); + __m128i y3 = _mm_unpackhi_epi8(y1, zero); + y0 = _mm_unpacklo_epi8(y0, zero); + y1 = _mm_unpacklo_epi8(y1, zero); + y0 = _mm_mullo_epi16(y0, y0_fraction); + y1 = _mm_mullo_epi16(y1, y1_fraction); + y2 = _mm_mullo_epi16(y2, y0_fraction); + y3 = _mm_mullo_epi16(y3, y1_fraction); + y0 = _mm_add_epi16(y0, y1); + y2 = _mm_add_epi16(y2, y3); + y0 = _mm_srli_epi16(y0, 8); + y2 = _mm_srli_epi16(y2, 8); + y0 = _mm_packus_epi16(y0, y2); + *dest128++ = y0; // NOTE(review): plain __m128i store, unlike the _mm_loadu_si128 loads above; presumably ybuf must be 16-byte aligned - confirm. + ++y0_ptr128; + ++y1_ptr128; + } while (dest128 < end128); // Runs at least once and always handles whole 16-byte groups. +} + +} // namespace media diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm new file mode 100644 index 0000000..7f7e0e8 --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. +; + SECTION_TEXT + CPU MMX + +; Use movq to save the output. 
+%define MOVQ movq + +; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL LinearScaleYUVToRGB32Row_MMX +%include "linear_scale_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc new file mode 100644 index 0000000..91c06a5 --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc @@ -0,0 +1,166 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + + global mangle(SYMBOL) PRIVATE + align function_align + +mangle(SYMBOL): + %assign stack_offset 0 + + extern mangle(kCoefficientsRgbY) + +; Parameters are in the following order: +; 1. Y plane +; 2. U plane +; 3. V plane +; 4. ARGB frame +; 5. Width +; 6. Source dx + +PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP + +%if gprsize == 8 +%define WORD_SIZE QWORD +%else +%define WORD_SIZE DWORD +%endif + +; Define register aliases. +%define Xq R1q ; Current X position +%define COMPLq R2q ; Component A value +%define COMPLd R2d ; Component A value +%define U_ARG_REGq R0q ; U plane address argument +%define V_ARG_REGq R1q ; V plane address argument +%define SOURCE_DX_ARG_REGq R3q ; Source dx argument +%define WIDTH_ARG_REGq R2q ; Width argument + +%ifdef PIC +; PIC code shared COMPR, U and V with the same register. Need to be careful in the +; code they don't mix up. This allows R3q to be used for YUV table. +%define COMPRq R0q ; Component B value +%define COMPRd R0d ; Component B value +%define Uq R0q ; U plane address +%define Vq R0q ; V plane address +%define U_PLANE WORD_SIZE [rsp + 3 * gprsize] +%define TABLE R3q ; Address of the table +%else +; Non-PIC code defines. 
+%define COMPRq R3q ; Component B value +%define COMPRd R3d ; Component B value +%define Uq R0q ; U plane address +%define Vq R3q ; V plane address +%define TABLE mangle(kCoefficientsRgbY) +%endif + +; Defines for stack variables. These are used in both PIC and non-PIC code. +%define V_PLANE WORD_SIZE [rsp + 2 * gprsize] +%define SOURCE_DX WORD_SIZE [rsp + gprsize] +%define SOURCE_WIDTH WORD_SIZE [rsp] + +; Handle stack variables differently for PIC and non-PIC code. + +%ifdef PIC +; Define stack usage for PIC code. PIC code push U plane onto stack. + PUSH U_ARG_REGq + PUSH V_ARG_REGq + PUSH SOURCE_DX_ARG_REGq + imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx + PUSH WIDTH_ARG_REGq + +; Load the address of kCoefficientsRgbY into TABLE + mov TEMPq, SOURCE_DX_ARG_REGq ; Need to save source_dx first + LOAD_SYM TABLE, mangle(kCoefficientsRgbY) +%define SOURCE_DX_ARG_REGq TEMPq ; Overwrite SOURCE_DX_ARG_REGq to TEMPq +%else +; Define stack usage. Non-PIC code just push 3 registers to stack. + PUSH V_ARG_REGq + PUSH SOURCE_DX_ARG_REGq + imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx + PUSH WIDTH_ARG_REGq +%endif + +%macro EPILOGUE 0 +%ifdef PIC + ADD rsp, 4 * gprsize +%else + ADD rsp, 3 * gprsize +%endif +%endmacro + + xor Xq, Xq ; x = 0 + cmp SOURCE_DX_ARG_REGq, 0x20000 + jl .lscaleend + mov Xq, 0x8000 ; x = 0.5 for 1/2 or less + jmp .lscaleend + +.lscaleloop: +%ifdef PIC + mov Uq, U_PLANE ; PIC code saves U_PLANE on stack. +%endif + +; Define macros for scaling YUV components since they are reused. 
+%macro SCALEUV 1 + mov TEMPq, Xq + sar TEMPq, 0x11 ; TEMP = x >> 17, the chroma sample index + movzx COMPLd, BYTE [%1 + TEMPq] + movzx COMPRd, BYTE [%1 + TEMPq + 1] + mov TEMPq, Xq + and TEMPq, 0x1fffe ; weight of the sample at index + 1 + imul COMPRq, TEMPq + xor TEMPq, 0x1fffe ; complementary weight for the sample at index + imul COMPLq, TEMPq + add COMPLq, COMPRq + shr COMPLq, 17 ; COMPL = blended sample, used as the table index +%endmacro + SCALEUV Uq ; Use the above macro to scale U + movq mm0, [TABLE + 2048 + 8 * COMPLq] + + mov Vq, V_PLANE ; Read V address from stack + SCALEUV Vq ; Use the above macro to scale V + paddsw mm0, [TABLE + 4096 + 8 * COMPLq] + +%macro SCALEY 0 + mov TEMPq, Xq + sar TEMPq, 0x10 ; TEMP = x >> 16, the luma sample index + movzx COMPLd, BYTE [Yq + TEMPq] + movzx COMPRd, BYTE [Yq + TEMPq + 1] + mov TEMPq, Xq + add Xq, SOURCE_DX ; Add source_dx from stack + and TEMPq, 0xffff ; weight of the sample at index + 1 + imul COMPRq, TEMPq + xor TEMPq, 0xffff ; complementary weight for the sample at index + imul COMPLq, TEMPq + add COMPLq, COMPRq + shr COMPLq, 16 +%endmacro + SCALEY ; Use the above macro to scale Y1 + movq mm1, [TABLE + 8 * COMPLq] + + cmp Xq, SOURCE_WIDTH ; Compare source_width from stack + jge .lscalelastpixel + + SCALEY ; Use the above macro to scale Y2 + movq mm2, [TABLE + 8 * COMPLq] + + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 0x6 + psraw mm2, 0x6 + packuswb mm1, mm2 + MOVQ [ARGBq], mm1 + add ARGBq, 0x8 + +.lscaleend: + cmp Xq, SOURCE_WIDTH ; Compare source_width from stack + jl .lscaleloop + EPILOGUE + RET + +.lscalelastpixel: + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + EPILOGUE + RET diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm new file mode 100644 index 0000000..db7854457 --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm @@ -0,0 +1,142 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. 
+; + SECTION_TEXT + CPU MMX + +%define SYMBOL LinearScaleYUVToRGB32Row_MMX_X64 + global mangle(SYMBOL) PRIVATE + align function_align + +mangle(SYMBOL): + %assign stack_offset 0 + extern mangle(kCoefficientsRgbY) + +; Parameters are in the following order: +; 1. Y plane +; 2. U plane +; 3. V plane +; 4. ARGB frame +; 5. Width +; 6. Source dx + +PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMPL + +%define TABLEq r10 +%define Xq r11 +%define INDEXq r12 +%define COMPRd r13d +%define COMPRq r13 +%define FRACTIONq r14 + + PUSH TABLEq + PUSH Xq + PUSH INDEXq + PUSH COMPRq + PUSH FRACTIONq + +%macro EPILOGUE 0 + POP FRACTIONq + POP COMPRq + POP INDEXq + POP Xq + POP TABLEq +%endmacro + + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) + + imul WIDTHq, SOURCE_DXq ; source_width = width * source_dx + xor Xq, Xq ; x = 0 + cmp SOURCE_DXq, 0x20000 + jl .lscaleend + mov Xq, 0x8000 ; x = 0.5 for 1/2 or less + jmp .lscaleend + +.lscaleloop: + ; Interpolate U + mov INDEXq, Xq + sar INDEXq, 0x11 + movzx COMPLd, BYTE [Uq + INDEXq] + movzx COMPRd, BYTE [Uq + INDEXq + 1] + mov FRACTIONq, Xq + and FRACTIONq, 0x1fffe + imul COMPRq, FRACTIONq + xor FRACTIONq, 0x1fffe + imul COMPLq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 17 + movq mm0, [TABLEq + 2048 + 8 * COMPLq] + + ; Interpolate V + movzx COMPLd, BYTE [Vq + INDEXq] + movzx COMPRd, BYTE [Vq + INDEXq + 1] + ; Trick here to imul COMPL first then COMPR. + ; Saves two instruction. :) + imul COMPLq, FRACTIONq + xor FRACTIONq, 0x1fffe + imul COMPRq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 17 + paddsw mm0, [TABLEq + 4096 + 8 * COMPLq] + + ; Interpolate first Y1. + lea INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel. + ; Xq points to current pixel. 
+ mov FRACTIONq, Xq + sar Xq, 0x10 + movzx COMPLd, BYTE [Yq + Xq] + movzx COMPRd, BYTE [Yq + Xq + 1] + and FRACTIONq, 0xffff + imul COMPRq, FRACTIONq + xor FRACTIONq, 0xffff + imul COMPLq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 16 + movq mm1, [TABLEq + 8 * COMPLq] + + ; Interpolate Y2 if available. + cmp INDEXq, WIDTHq + jge .lscalelastpixel + + lea Xq, [INDEXq + SOURCE_DXq] ; Xq points to next pixel. + ; INDEXq points to current pixel. + mov FRACTIONq, INDEXq + sar INDEXq, 0x10 + movzx COMPLd, BYTE [Yq + INDEXq] + movzx COMPRd, BYTE [Yq + INDEXq + 1] + and FRACTIONq, 0xffff + imul COMPRq, FRACTIONq + xor FRACTIONq, 0xffff + imul COMPLq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 16 + movq mm2, [TABLEq + 8 * COMPLq] + + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 0x6 + psraw mm2, 0x6 + packuswb mm1, mm2 + movntq [ARGBq], mm1 + add ARGBq, 0x8 + +.lscaleend: + cmp Xq, WIDTHq + jl .lscaleloop + jmp .epilogue + +.lscalelastpixel: ; Only Y1 is available: convert one pixel and store 4 bytes. + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + +.epilogue: ; Common exit: pop the registers pushed after PROLOGUE. + EPILOGUE + RET diff --git a/media/base/simd/linear_scale_yuv_to_rgb_sse.asm b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm new file mode 100644 index 0000000..847911c --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX and SSE instructions. +; + SECTION_TEXT + CPU MMX, SSE + +; Use movntq to save the output. 
+%define MOVQ movntq
+
+; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+;                                   const uint8* u_buf,
+;                                   const uint8* v_buf,
+;                                   uint8* rgb_buf,
+;                                   int width,
+;                                   int source_dx);
+%define SYMBOL LinearScaleYUVToRGB32Row_SSE
+%include "linear_scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.asm b/media/base/simd/scale_yuv_to_rgb_mmx.asm
new file mode 100644
index 0000000..6a83757
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.asm
@@ -0,0 +1,23 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX instructions.
+;
+  SECTION_TEXT
+  CPU       MMX
+
+; Use movq to save the output.
+%define MOVQ movq
+
+; void ScaleYUVToRGB32Row_MMX(const uint8* y_buf,
+;                             const uint8* u_buf,
+;                             const uint8* v_buf,
+;                             uint8* rgb_buf,
+;                             int width,
+;                             int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_MMX
+%include "scale_yuv_to_rgb_mmx.inc"
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc
new file mode 100644
index 0000000..94c101c
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc
@@ -0,0 +1,124 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+; Shared body of ScaleYUVToRGB32Row_MMX and ScaleYUVToRGB32Row_SSE.
+; The including file must define:
+;   SYMBOL - name of the function to emit.
+;   MOVQ   - instruction that stores two output pixels (movq or movntq).
+
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+mangle(SYMBOL):
+  %assign   stack_offset 0
+
+  extern    mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE  6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
+
+%ifdef ARCH_X86_64
+%define     WORD_SIZE   QWORD
+  ; Width and source dx are 32-bit ints; the upper halves of their 64-bit
+  ; argument registers are undefined, so sign-extend them before use.
+  movsxd    R1q, R1d
+  movsxd    R2q, R2d
+%else
+%define     WORD_SIZE   DWORD
+%endif
+
+%ifdef PIC
+  PUSH      R1q  ; Width
+%endif
+  PUSH      R2q  ; Source dx
+
+%define     SOURCE_DX   WORD_SIZE [rsp]
+
+; PIC code.
+%ifdef PIC
+  LOAD_SYM  R1q, mangle(kCoefficientsRgbY)
+%define     WIDTH       WORD_SIZE [rsp + gprsize]
+%define     TABLE       R1q
+%define     Xq          R2q
+
+; Non-PIC code.
+%else
+%define     WIDTH       R1q
+%define     TABLE       mangle(kCoefficientsRgbY)
+%define     Xq          R2q
+%endif
+
+  ; Set Xq index to 0.
+  xor       Xq, Xq
+  jmp       .scaleend
+
+.scaleloop:
+  ; TABLE can either be a register or a symbol depending on whether this
+  ; is PIC or not.
+  mov       TEMPq, Xq
+  sar       TEMPq, 17                    ; U/V index = x >> 17
+  movzx     TEMPd, BYTE [Uq + TEMPq]
+  movq      mm0, [TABLE + 2048 + 8 * TEMPq]
+  mov       TEMPq, Xq
+  sar       TEMPq, 17
+  movzx     TEMPd, BYTE [Vq + TEMPq]
+  paddsw    mm0, [TABLE + 4096 + 8 * TEMPq]
+  mov       TEMPq, Xq
+  add       Xq, SOURCE_DX
+  sar       TEMPq, 16                    ; Y index = x >> 16
+  movzx     TEMPd, BYTE [Yq + TEMPq]
+  movq      mm1, [TABLE + 8 * TEMPq]
+  mov       TEMPq, Xq
+  add       Xq, SOURCE_DX
+  sar       TEMPq, 16
+  movzx     TEMPd, BYTE [Yq + TEMPq]
+  movq      mm2, [TABLE + 8 * TEMPq]
+  paddsw    mm1, mm0
+  paddsw    mm2, mm0
+  psraw     mm1, 6
+  psraw     mm2, 6
+  packuswb  mm1, mm2                     ; saturate to bytes: two ARGB pixels
+  MOVQ      QWORD [ARGBq], mm1
+  add       ARGBq, 8
+
+.scaleend:
+  ; WIDTH can either be a register or memory depending on whether this
+  ; is PIC or not.
+  sub       WIDTH, 2
+  jns       .scaleloop
+
+  and       WIDTH, 1             ; odd number of pixels?
+  jz        .scaledone
+
+  mov       TEMPq, Xq
+  sar       TEMPq, 17
+  movzx     TEMPd, BYTE [Uq + TEMPq]
+  movq      mm0, [TABLE + 2048 + 8 * TEMPq]
+  mov       TEMPq, Xq
+  sar       TEMPq, 17
+  movzx     TEMPd, BYTE [Vq + TEMPq]
+  paddsw    mm0, [TABLE + 4096 + 8 * TEMPq]
+  mov       TEMPq, Xq
+  sar       TEMPq, 16
+  movzx     TEMPd, BYTE [Yq + TEMPq]
+  movq      mm1, [TABLE + 8 * TEMPq]
+  paddsw    mm1, mm0
+  psraw     mm1, 6
+  packuswb  mm1, mm1
+  movd      DWORD [ARGBq], mm1
+
+.scaledone:
+%ifdef PIC
+  ADD       rsp, 2 * gprsize
+%else
+  ADD       rsp, gprsize
+%endif
+  RET
diff --git a/media/base/simd/scale_yuv_to_rgb_sse.asm b/media/base/simd/scale_yuv_to_rgb_sse.asm
new file mode 100644
index 0000000..bdd5625
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse.asm
@@ -0,0 +1,31 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses MMX and SSE instructions.
+;
+  SECTION_TEXT
+  CPU       MMX, SSE
+
+; Use movntq (non-temporal store) to save the output.
+%define MOVQ movntq
+
+; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+;                             const uint8* u_buf,
+;                             const uint8* v_buf,
+;                             uint8* rgb_buf,
+;                             int width,
+;                             int source_dx);
+%define SYMBOL ScaleYUVToRGB32Row_SSE
+%include "scale_yuv_to_rgb_mmx.inc"
+
+; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
+;                                   const uint8* u_buf,
+;                                   const uint8* v_buf,
+;                                   uint8* rgb_buf,
+;                                   int width,
+;                                   int source_dx);
+
diff --git a/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
new file mode 100644
index 0000000..e021457
--- /dev/null
+++ b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
@@ -0,0 +1,122 @@
+; Copyright (c) 2011 The Chromium Authors. All rights reserved.
+; Use of this source code is governed by a BSD-style license that can be
+; found in the LICENSE file.
+
+%include "x86inc.asm"
+
+;
+; This file uses SSE2 instructions.
+;
+  SECTION_TEXT
+  CPU       SSE2
+
+; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
+;                                  const uint8* u_buf,
+;                                  const uint8* v_buf,
+;                                  uint8* rgb_buf,
+;                                  int width,
+;                                  int source_dx);
+;
+; Converts one row of YUV to ARGB with nearest-sample horizontal scaling.
+; The source position x advances by source_dx per output pixel; Y is read
+; at x >> 16 and U/V at x >> 17. Two pixels sharing one U/V pair are written
+; per iteration, followed by a single pixel when width is odd.
+%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64
+
+  global    mangle(SYMBOL) PRIVATE
+  align     function_align
+
+mangle(SYMBOL):
+  %assign   stack_offset 0
+  extern    mangle(kCoefficientsRgbY)
+
+; Parameters are in the following order:
+; 1. Y plane
+; 2. U plane
+; 3. V plane
+; 4. ARGB frame
+; 5. Width
+; 6. Source dx
+
+PROLOGUE  6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMP
+
+%define     TABLEq   r10
+%define     Xq       r11
+%define     INDEXq   r12
+  PUSH      r10
+  PUSH      r11
+  PUSH      r12
+
+  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)
+
+  ; Width and source_dx are 32-bit ints, so the upper halves of their
+  ; argument registers are undefined. Sign-extend before 64-bit arithmetic.
+  movsxd    WIDTHq, WIDTHd
+  movsxd    SOURCE_DXq, SOURCE_DXd
+
+  ; Set Xq index to 0.
+  xor       Xq, Xq
+  jmp       .scaleend
+
+.scaleloop:
+  ; Read UV pixels.
+  mov       INDEXq, Xq
+  sar       INDEXq, 17
+  movzx     COMPd, BYTE [Uq + INDEXq]
+  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
+  movzx     COMPd, BYTE [Vq + INDEXq]
+  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]
+
+  ; Read first Y pixel.
+  lea       INDEXq, [Xq + SOURCE_DXq]    ; INDEXq now points to next pixel.
+  sar       Xq, 16
+  movzx     COMPd, BYTE [Yq + Xq]
+  paddsw    xmm0, xmm1                   ; Hide an ADD after memory load.
+  movq      xmm1, [TABLEq + 8 * COMPq]
+
+  ; Read next Y pixel.
+  lea       Xq, [INDEXq + SOURCE_DXq]    ; Xq now points to next pixel.
+  sar       INDEXq, 16
+  movzx     COMPd, BYTE [Yq + INDEXq]
+  movq      xmm2, [TABLEq + 8 * COMPq]
+  paddsw    xmm1, xmm0
+  paddsw    xmm2, xmm0
+  shufps    xmm1, xmm2, 0x44             ; Join two pixels into one XMM register
+  psraw     xmm1, 6
+  packuswb  xmm1, xmm1
+  movq      QWORD [ARGBq], xmm1
+  add       ARGBq, 8
+
+.scaleend:
+  sub       WIDTHq, 2
+  jns       .scaleloop
+
+  and       WIDTHq, 1                    ; odd number of pixels?
+  jz        .scaledone
+
+  ; Read U V components.
+  mov       INDEXq, Xq
+  sar       INDEXq, 17
+  movzx     COMPd, BYTE [Uq + INDEXq]
+  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
+  movzx     COMPd, BYTE [Vq + INDEXq]
+  ; Load the 8-byte V entry with movq first: paddsw with a memory operand
+  ; reads 16 bytes and faults unless the address is 16-byte aligned.
+  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]
+  paddsw    xmm0, xmm1
+
+  ; Read one Y component.
+  mov       INDEXq, Xq
+  sar       INDEXq, 16
+  movzx     COMPd, BYTE [Yq + INDEXq]
+  movq      xmm1, [TABLEq + 8 * COMPq]
+  paddsw    xmm1, xmm0
+  psraw     xmm1, 6
+  packuswb  xmm1, xmm1
+  movd      DWORD [ARGBq], xmm1
+
+.scaledone:
+  POP       r12
+  POP       r11
+  POP       r10
+  RET
diff --git a/media/base/simd/x86inc.asm b/media/base/simd/x86inc.asm
index 956b999..5e0ca20 100644
--- a/media/base/simd/x86inc.asm
+++ b/media/base/simd/x86inc.asm
@@ -95,11 +95,14 @@
 %ifdef WIN64
     %define PIC
 %elifndef ARCH_X86_64
+; For chromium we may build PIC code even for 32 bits system.
+%ifndef CHROMIUM
 ; x86_32 doesn't require PIC.
 ; Some distros prefer shared objects to be PIC, but nothing breaks if
 ; the code contains a few textrels, so we'll skip that complexity.
%undef PIC %endif +%endif %ifdef PIC default rel %endif @@ -947,6 +950,11 @@ AVX_INSTR pfmul, 1, 0 ;============================================================================= %ifdef CHROMIUM +; Always build PIC code on Mac for Chromium. +%ifdef MACHO +%define PIC +%endif + ; ; LOAD_SYM %1 (reg), %2 (sym) ; Copies the address to a local symbol to the specified register. diff --git a/media/base/yuv_convert.cc b/media/base/yuv_convert.cc index cbf7f57..22f1a24 100644 --- a/media/base/yuv_convert.cc +++ b/media/base/yuv_convert.cc @@ -17,151 +17,94 @@ #include "media/base/yuv_convert.h" +#include "base/logging.h" #include "build/build_config.h" #include "media/base/cpu_features.h" #include "media/base/simd/convert_rgb_to_yuv.h" +#include "media/base/simd/convert_yuv_to_rgb.h" +#include "media/base/simd/filter_yuv.h" #include "media/base/yuv_convert_internal.h" #include "media/base/yuv_row.h" -#if USE_MMX +#if defined(ARCH_CPU_X86_FAMILY) #if defined(_MSC_VER) #include <intrin.h> #else +#include <emmintrin.h> #include <mmintrin.h> #endif #endif -#if USE_SSE2 -#include <emmintrin.h> -#endif - namespace media { -// 16.16 fixed point arithmetic -const int kFractionBits = 16; -const int kFractionMax = 1 << kFractionBits; -const int kFractionMask = ((1 << kFractionBits) - 1); - -// Convert a frame of YUV to 32 bit ARGB. -void ConvertYUVToRGB32(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width, - int height, - int y_pitch, - int uv_pitch, - int rgb_pitch, - YUVType yuv_type) { - unsigned int y_shift = yuv_type; - for (int y = 0; y < height; ++y) { - uint8* rgb_row = rgb_buf + y * rgb_pitch; - const uint8* y_ptr = y_buf + y * y_pitch; - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch; - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch; - - FastConvertYUVToRGB32Row(y_ptr, - u_ptr, - v_ptr, - rgb_row, - width); - } - - // MMX used for FastConvertYUVToRGB32Row requires emms instruction. 
- EMMS(); +static FilterYUVRowsProc ChooseFilterYUVRowsProc() { +#if defined(ARCH_CPU_X86_FAMILY) + if (hasSSE2()) + return &FilterYUVRows_SSE2; + if (hasMMX()) + return &FilterYUVRows_MMX; +#endif + return &FilterYUVRows_C; } -#if USE_SSE2 -// FilterRows combines two rows of the image using linear interpolation. -// SSE2 version does 16 pixels at a time - -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, - int source_width, int source_y_fraction) { - __m128i zero = _mm_setzero_si128(); - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); - - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); - - do { - __m128i y0 = _mm_loadu_si128(y0_ptr128); - __m128i y1 = _mm_loadu_si128(y1_ptr128); - __m128i y2 = _mm_unpackhi_epi8(y0, zero); - __m128i y3 = _mm_unpackhi_epi8(y1, zero); - y0 = _mm_unpacklo_epi8(y0, zero); - y1 = _mm_unpacklo_epi8(y1, zero); - y0 = _mm_mullo_epi16(y0, y0_fraction); - y1 = _mm_mullo_epi16(y1, y1_fraction); - y2 = _mm_mullo_epi16(y2, y0_fraction); - y3 = _mm_mullo_epi16(y3, y1_fraction); - y0 = _mm_add_epi16(y0, y1); - y2 = _mm_add_epi16(y2, y3); - y0 = _mm_srli_epi16(y0, 8); - y2 = _mm_srli_epi16(y2, 8); - y0 = _mm_packus_epi16(y0, y2); - *dest128++ = y0; - ++y0_ptr128; - ++y1_ptr128; - } while (dest128 < end128); +static ConvertYUVToRGB32RowProc ChooseConvertYUVToRGB32RowProc() { +#if defined(ARCH_CPU_X86_FAMILY) + if (hasSSE()) + return &ConvertYUVToRGB32Row_SSE; + if (hasMMX()) + return &ConvertYUVToRGB32Row_MMX; +#endif + return &ConvertYUVToRGB32Row_C; } -#elif USE_MMX -// MMX version does 8 pixels at a time -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, - int source_width, int source_y_fraction) { - 
__m64 zero = _mm_setzero_si64(); - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); - - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); - __m64* dest64 = reinterpret_cast<__m64*>(ybuf); - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); - - do { - __m64 y0 = *y0_ptr64++; - __m64 y1 = *y1_ptr64++; - __m64 y2 = _mm_unpackhi_pi8(y0, zero); - __m64 y3 = _mm_unpackhi_pi8(y1, zero); - y0 = _mm_unpacklo_pi8(y0, zero); - y1 = _mm_unpacklo_pi8(y1, zero); - y0 = _mm_mullo_pi16(y0, y0_fraction); - y1 = _mm_mullo_pi16(y1, y1_fraction); - y2 = _mm_mullo_pi16(y2, y0_fraction); - y3 = _mm_mullo_pi16(y3, y1_fraction); - y0 = _mm_add_pi16(y0, y1); - y2 = _mm_add_pi16(y2, y3); - y0 = _mm_srli_pi16(y0, 8); - y2 = _mm_srli_pi16(y2, 8); - y0 = _mm_packs_pu16(y0, y2); - *dest64++ = y0; - } while (dest64 < end64); + +static ScaleYUVToRGB32RowProc ChooseScaleYUVToRGB32RowProc() { +#if defined(ARCH_CPU_X86_FAMILY) +#if defined(ARCH_CPU_X86_64) + // Use 64-bits version if possible. + return &ScaleYUVToRGB32Row_SSE2_X64; +#endif + // Choose the best one on 32-bits system. 
+ if (hasSSE()) + return &ScaleYUVToRGB32Row_SSE; + if (hasMMX()) + return &ScaleYUVToRGB32Row_MMX; +#endif + return &ScaleYUVToRGB32Row_C; } -#else // no MMX or SSE2 -// C version does 8 at a time to mimic MMX code -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, - int source_width, int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - uint8* end = ybuf + source_width; - do { - ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; - ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; - ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; - ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; - ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8; - ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; - ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; - ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; - y0_ptr += 8; - y1_ptr += 8; - ybuf += 8; - } while (ybuf < end); + +static ScaleYUVToRGB32RowProc ChooseLinearScaleYUVToRGB32RowProc() { +#if defined(ARCH_CPU_X86_FAMILY) +#if defined(ARCH_CPU_X86_64) + // Use 64-bits version if possible. + return &LinearScaleYUVToRGB32Row_MMX_X64; +#endif + // 32-bits system. + if (hasSSE()) + return &LinearScaleYUVToRGB32Row_SSE; + if (hasMMX()) + return &LinearScaleYUVToRGB32Row_MMX; +#endif + return &LinearScaleYUVToRGB32Row_C; } + +// Empty SIMD registers state after using them. +void EmptyRegisterState() { +#if defined(ARCH_CPU_X86_FAMILY) + static bool checked = false; + static bool has_mmx = false; + if (!checked) { + has_mmx = hasMMX(); + checked = true; + } + if (has_mmx) + _mm_empty(); #endif +} +// 16.16 fixed point arithmetic +const int kFractionBits = 16; +const int kFractionMax = 1 << kFractionBits; +const int kFractionMask = ((1 << kFractionBits) - 1); // Scale a frame of YUV to 32 bit ARGB. 
void ScaleYUVToRGB32(const uint8* y_buf, @@ -178,6 +121,20 @@ void ScaleYUVToRGB32(const uint8* y_buf, YUVType yuv_type, Rotate view_rotate, ScaleFilter filter) { + static FilterYUVRowsProc filter_proc = NULL; + static ConvertYUVToRGB32RowProc convert_proc = NULL; + static ScaleYUVToRGB32RowProc scale_proc = NULL; + static ScaleYUVToRGB32RowProc linear_scale_proc = NULL; + + if (!filter_proc) + filter_proc = ChooseFilterYUVRowsProc(); + if (!convert_proc) + convert_proc = ChooseConvertYUVToRGB32RowProc(); + if (!scale_proc) + scale_proc = ChooseScaleYUVToRGB32RowProc(); + if (!linear_scale_proc) + linear_scale_proc = ChooseLinearScaleYUVToRGB32RowProc(); + // Handle zero sized sources and destinations. if ((yuv_type == YV12 && (source_width < 2 || source_height < 2)) || (yuv_type == YV16 && (source_width < 2 || source_height < 1)) || @@ -225,9 +182,6 @@ void ScaleYUVToRGB32(const uint8* y_buf, int source_dx = source_width * kFractionMax / width; int source_dy = source_height * kFractionMax / height; -#if USE_MMX && defined(_MSC_VER) - int source_dx_uv = source_dx; -#endif if ((view_rotate == ROTATE_90) || (view_rotate == ROTATE_270)) { @@ -240,9 +194,6 @@ void ScaleYUVToRGB32(const uint8* y_buf, int original_dx = source_dx; int original_dy = source_dy; source_dx = ((original_dy >> kFractionBits) * y_pitch) << kFractionBits; -#if USE_MMX && defined(_MSC_VER) - source_dx_uv = ((original_dy >> kFractionBits) * uv_pitch) << kFractionBits; -#endif source_dy = original_dx; if (view_rotate == ROTATE_90) { y_pitch = -1; @@ -294,7 +245,7 @@ void ScaleYUVToRGB32(const uint8* y_buf, if (filter & media::FILTER_BILINEAR_V) { if (yscale_fixed != kFractionMax && source_y_fraction && ((source_y + 1) < source_height)) { - FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); + filter_proc(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); } else { memcpy(ybuf, y0_ptr, source_width); } @@ -304,8 +255,8 @@ void ScaleYUVToRGB32(const uint8* y_buf, if (yscale_fixed != 
kFractionMax && source_uv_fraction && (((source_y >> y_shift) + 1) < (source_height >> y_shift))) { - FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction); - FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction); + filter_proc(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction); + filter_proc(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction); } else { memcpy(ubuf, u0_ptr, uv_source_width); memcpy(vbuf, v0_ptr, uv_source_width); @@ -316,41 +267,17 @@ void ScaleYUVToRGB32(const uint8* y_buf, vbuf[uv_source_width] = vbuf[uv_source_width - 1]; } if (source_dx == kFractionMax) { // Not scaled - FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width); + convert_proc(y_ptr, u_ptr, v_ptr, dest_pixel, width); } else { if (filter & FILTER_BILINEAR_H) { - LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width, source_dx); - } else { -// Specialized scalers and rotation. -#if USE_MMX && defined(_MSC_VER) - if (width == (source_width * 2)) { - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width); - } else if ((source_dx & kFractionMask) == 0) { - // Scaling by integer scale factor. ie half. - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width, - source_dx >> kFractionBits); - } else if (source_dx_uv == source_dx) { // Not rotated. - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width, source_dx); - } else { - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width, - source_dx >> kFractionBits, - source_dx_uv >> kFractionBits); - } -#else - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, - dest_pixel, width, source_dx); -#endif + linear_scale_proc(y_ptr, u_ptr, v_ptr, dest_pixel, width, source_dx); + } else { + scale_proc(y_ptr, u_ptr, v_ptr, dest_pixel, width, source_dx); } } } - // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. 
- EMMS(); + + EmptyRegisterState(); } void ConvertRGB32ToYUV(const uint8* rgbframe, @@ -371,7 +298,9 @@ void ConvertRGB32ToYUV(const uint8* rgbframe, convert_proc = &ConvertRGB32ToYUV_C; #else // For x86 processors, check if SSSE3 (or SSE2) is supported. - if (hasSSE2()) + if (hasSSSE3()) + convert_proc = &ConvertRGB32ToYUV_SSSE3; + else if (hasSSE2()) convert_proc = &ConvertRGB32ToYUV_SSE2; else convert_proc = &ConvertRGB32ToYUV_C; @@ -391,8 +320,21 @@ void ConvertRGB24ToYUV(const uint8* rgbframe, int rgbstride, int ystride, int uvstride) { +#if defined(ARCH_CPU_ARM_FAMILY) ConvertRGB24ToYUV_C(rgbframe, yplane, uplane, vplane, width, height, rgbstride, ystride, uvstride); +#else + static void (*convert_proc)(const uint8*, uint8*, uint8*, uint8*, + int, int, int, int, int) = NULL; + if (!convert_proc) { + if (hasSSSE3()) + convert_proc = &ConvertRGB24ToYUV_SSSE3; + else + convert_proc = &ConvertRGB24ToYUV_C; + } + convert_proc(rgbframe, yplane, uplane, vplane, width, height, + rgbstride, ystride, uvstride); +#endif } void ConvertYUY2ToYUV(const uint8* src, @@ -403,4 +345,34 @@ void ConvertYUY2ToYUV(const uint8* src, int height) { ConvertYUY2ToYUV_C(src, yplane, uplane, vplane, width, height); } + +void ConvertYUVToRGB32(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { +#if defined(ARCH_CPU_ARM_FAMILY) + ConvertYUVToRGB32_C(yplane, uplane, vplane, rgbframe, + width, height, ystride, uvstride, rgbstride, yuv_type); +#else + static ConvertYUVToRGB32Proc convert_proc = NULL; + if (!convert_proc) { + if (hasSSE()) + convert_proc = &ConvertYUVToRGB32_SSE; + else if (hasMMX()) + convert_proc = &ConvertYUVToRGB32_MMX; + else + convert_proc = &ConvertYUVToRGB32_C; + } + + convert_proc(yplane, uplane, vplane, rgbframe, + width, height, ystride, uvstride, rgbstride, yuv_type); +#endif +} + } // namespace media diff --git 
a/media/base/yuv_convert.h b/media/base/yuv_convert.h index 20ba0aa..95b1780 100644 --- a/media/base/yuv_convert.h +++ b/media/base/yuv_convert.h @@ -94,6 +94,10 @@ void ConvertYUY2ToYUV(const uint8* src, int width, int height); +// Empty SIMD register state after calling optimized scaler functions. +// This method is only used in unit test after calling SIMD functions. +void EmptyRegisterState(); + } // namespace media #endif // MEDIA_BASE_YUV_CONVERT_H_ diff --git a/media/base/yuv_convert_internal.h b/media/base/yuv_convert_internal.h index 80776aa..7be14c4 100644 --- a/media/base/yuv_convert_internal.h +++ b/media/base/yuv_convert_internal.h @@ -13,15 +13,15 @@ namespace media { // SSE2 version of converting RGBA to YV12. -extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe, - uint8* yplane, - uint8* uplane, - uint8* vplane, - int width, - int height, - int rgbstride, - int ystride, - int uvstride); +void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe, + uint8* yplane, + uint8* uplane, + uint8* vplane, + int width, + int height, + int rgbstride, + int ystride, + int uvstride); // This is a C reference implementation of the above routine. // This method should only be used in unit test. diff --git a/media/base/yuv_convert_unittest.cc b/media/base/yuv_convert_unittest.cc index 380897e..5de3b11 100644 --- a/media/base/yuv_convert_unittest.cc +++ b/media/base/yuv_convert_unittest.cc @@ -8,6 +8,7 @@ #include "base/path_service.h" #include "media/base/cpu_features.h" #include "media/base/djb2.h" +#include "media/base/simd/convert_yuv_to_rgb.h" #include "media/base/yuv_convert.h" #include "media/base/yuv_convert_internal.h" #include "media/base/yuv_row.h" @@ -376,17 +377,7 @@ TEST(YUVConvertTest, RGB32ToYUV_SSE2_MatchReference) { scoped_array<uint8> yuv_converted_bytes(new uint8[kYUV12Size]); scoped_array<uint8> yuv_reference_bytes(new uint8[kYUV12Size]); - // Read YUV reference data from file. 
- FilePath yuv_url; - EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &yuv_url)); - yuv_url = yuv_url.Append(FILE_PATH_LITERAL("media")) - .Append(FILE_PATH_LITERAL("test")) - .Append(FILE_PATH_LITERAL("data")) - .Append(FILE_PATH_LITERAL("bali_640x360_P420.yuv")); - EXPECT_EQ(static_cast<int>(kYUV12Size), - file_util::ReadFile(yuv_url, - reinterpret_cast<char*>(yuv_bytes.get()), - static_cast<int>(kYUV12Size))); + ReadYV12Data(&yuv_bytes); // Convert a frame of YUV to 32 bit ARGB. media::ConvertYUVToRGB32( @@ -459,4 +450,241 @@ TEST(YUVConvertTest, RGB32ToYUV_SSE2_MatchReference) { // Make sure there's no difference from the reference. EXPECT_EQ(0, error); } -#endif + +TEST(YUVConvertTest, ConvertYUVToRGB32Row_MMX) { + if (!media::hasMMX()) { + LOG(WARNING) << "System not supported. Test skipped."; + return; + } + + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + ConvertYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth); + ConvertYUVToRGB32Row_MMX(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +TEST(YUVConvertTest, ConvertYUVToRGB32Row_SSE) { + if (!media::hasSSE()) { + LOG(WARNING) << "System not supported. 
Test skipped."; + return; + } + + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + ConvertYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth); + ConvertYUVToRGB32Row_SSE(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +TEST(YUVConvertTest, ScaleYUVToRGB32Row_MMX) { + if (!media::hasMMX()) { + LOG(WARNING) << "System not supported. Test skipped."; + return; + } + + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + const int kSourceDx = 80000; // This value means a scale down. + ScaleYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth, + kSourceDx); + ScaleYUVToRGB32Row_MMX(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth, + kSourceDx); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +TEST(YUVConvertTest, ScaleYUVToRGB32Row_SSE) { + if (!media::hasSSE()) { + LOG(WARNING) << "System not supported. 
Test skipped."; + return; + } + + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + const int kSourceDx = 80000; // This value means a scale down. + ScaleYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth, + kSourceDx); + ScaleYUVToRGB32Row_SSE(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth, + kSourceDx); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +TEST(YUVConvertTest, LinearScaleYUVToRGB32Row_MMX) { + if (!media::hasMMX()) { + LOG(WARNING) << "System not supported. Test skipped."; + return; + } + + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + const int kSourceDx = 80000; // This value means a scale down. + LinearScaleYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth, + kSourceDx); + LinearScaleYUVToRGB32Row_MMX(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth, + kSourceDx); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +TEST(YUVConvertTest, LinearScaleYUVToRGB32Row_SSE) { + if (!media::hasSSE()) { + LOG(WARNING) << "System not supported. 
Test skipped."; + return; + } + + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + const int kSourceDx = 80000; // This value means a scale down. + LinearScaleYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth, + kSourceDx); + LinearScaleYUVToRGB32Row_SSE(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth, + kSourceDx); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +#if defined(ARCH_CPU_X86_64) + +TEST(YUVConvertTest, ScaleYUVToRGB32Row_SSE2_X64) { + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + const int kSourceDx = 80000; // This value means a scale down. 
+ ScaleYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth, + kSourceDx); + ScaleYUVToRGB32Row_SSE2_X64(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth, + kSourceDx); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +TEST(YUVConvertTest, LinearScaleYUVToRGB32Row_MMX_X64) { + scoped_array<uint8> yuv_bytes(new uint8[kYUV12Size]); + scoped_array<uint8> rgb_bytes_reference(new uint8[kRGBSize]); + scoped_array<uint8> rgb_bytes_converted(new uint8[kRGBSize]); + ReadYV12Data(&yuv_bytes); + + const int kWidth = 167; + const int kSourceDx = 80000; // This value means a scale down. + LinearScaleYUVToRGB32Row_C(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_reference.get(), + kWidth, + kSourceDx); + LinearScaleYUVToRGB32Row_MMX_X64(yuv_bytes.get(), + yuv_bytes.get() + kSourceUOffset, + yuv_bytes.get() + kSourceVOffset, + rgb_bytes_converted.get(), + kWidth, + kSourceDx); + media::EmptyRegisterState(); + EXPECT_EQ(0, memcmp(rgb_bytes_reference.get(), + rgb_bytes_converted.get(), + kWidth * kBpp)); +} + +#endif // defined(ARCH_CPU_X86_64) + +#endif // defined(ARCH_CPU_X86_FAMILY) diff --git a/media/base/yuv_row_posix.cc b/media/base/yuv_row_posix.cc index 2217f38..f839de8 100644 --- a/media/base/yuv_row_posix.cc +++ b/media/base/yuv_row_posix.cc @@ -920,4 +920,3 @@ void LinearScaleYUVToRGB32Row(const uint8* y_buf, #endif // USE_MMX } // extern "C" - diff --git a/media/media.gyp b/media/media.gyp index 5282afb..6ecd6af 100644 --- a/media/media.gyp +++ b/media/media.gyp @@ -341,7 +341,12 @@ 'conditions': [ [ 'target_arch == "ia32" or target_arch == "x64"', { 'dependencies': [ - 'yuv_convert_sse2', + 'yuv_convert_simd_x86', + ], + }], + [ 'target_arch == "arm"', { + 
'dependencies': [ + 'yuv_convert_simd_arm', ], }], ], @@ -357,15 +362,45 @@ ], }, { - 'target_name': 'yuv_convert_sse2', + 'target_name': 'yuv_convert_simd_x86', 'type': 'static_library', 'include_dirs': [ '..', ], + 'sources': [ + 'base/yuv_convert_sse2.cc', + 'base/simd/convert_rgb_to_yuv_x86.cc', + 'base/simd/convert_rgb_to_yuv_ssse3.asm', + 'base/simd/convert_rgb_to_yuv_ssse3.inc', + 'base/simd/convert_yuv_to_rgb_c.cc', + 'base/simd/convert_yuv_to_rgb_x86.cc', + 'base/simd/convert_yuv_to_rgb_mmx.asm', + 'base/simd/convert_yuv_to_rgb_mmx.inc', + 'base/simd/convert_yuv_to_rgb_sse.asm', + 'base/simd/filter_yuv.h', + 'base/simd/filter_yuv_c.cc', + 'base/simd/filter_yuv_mmx.cc', + 'base/simd/filter_yuv_sse2.cc', + 'base/simd/linear_scale_yuv_to_rgb_mmx.asm', + 'base/simd/linear_scale_yuv_to_rgb_mmx.inc', + 'base/simd/linear_scale_yuv_to_rgb_sse.asm', + 'base/simd/scale_yuv_to_rgb_mmx.asm', + 'base/simd/scale_yuv_to_rgb_mmx.inc', + 'base/simd/scale_yuv_to_rgb_sse.asm', + ], 'conditions': [ + [ 'target_arch == "x64"', { + # Source files optimized for X64 systems. + 'sources': [ + 'base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm', + 'base/simd/scale_yuv_to_rgb_sse2_x64.asm', + ], + }], [ 'os_posix == 1 and OS != "mac"', { 'cflags': [ '-msse2', + '-msse3', + '-mssse3', ], }], [ 'OS == "mac"', { @@ -428,10 +463,6 @@ }, }], ], - 'sources': [ - 'base/yuv_convert_sse2.cc', - 'base/simd/convert_rgb_to_yuv.cc', - ], 'variables': { 'yasm_output_path': '<(SHARED_INTERMEDIATE_DIR)/media', }, @@ -440,6 +471,18 @@ ], }, { + 'target_name': 'yuv_convert_simd_arm', + 'type': 'static_library', + 'include_dirs': [ + '..', + ], + 'sources': [ + 'base/simd/convert_yuv_to_rgb_c.cc', + 'base/simd/filter_yuv.h', + 'base/simd/filter_yuv_c.cc', + ], + }, + { 'target_name': 'ffmpeg_unittests', 'type': 'executable', 'dependencies': [ |