diff options
author | hclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-09-14 12:40:45 +0000 |
---|---|---|
committer | hclam@chromium.org <hclam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-09-14 12:40:45 +0000 |
commit | ccde716550693ceb59cc8717d4f4d4845f23d853 (patch) | |
tree | 0022aff451d5665193835d1c508a1f3303ae9cc5 /media/base/simd | |
parent | 89a4d2f772aa92a79acedf057f4036820c1fd412 (diff) | |
download | chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.zip chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.gz chromium_src-ccde716550693ceb59cc8717d4f4d4845f23d853.tar.bz2 |
Resubmit - "Rewrite color space conversions suite using YASM"
I'll watch the official buildbot this time.
TBR=ajwong, dhollowa
BUG=None
TEST=None
Review URL: http://codereview.chromium.org/7891039
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@101067 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base/simd')
20 files changed, 1439 insertions, 0 deletions
diff --git a/media/base/simd/convert_rgb_to_yuv_x86.cc b/media/base/simd/convert_rgb_to_yuv_x86.cc new file mode 100644 index 0000000..2bd6930 --- /dev/null +++ b/media/base/simd/convert_rgb_to_yuv_x86.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "media/base/simd/convert_rgb_to_yuv.h" + +#include "build/build_config.h" +#include "media/base/cpu_features.h" +#include "media/base/simd/convert_rgb_to_yuv_ssse3.h" + +namespace media { + +void ConvertRGB32ToYUV_SSSE3(const uint8* rgbframe, + uint8* yplane, + uint8* uplane, + uint8* vplane, + int width, + int height, + int rgbstride, + int ystride, + int uvstride) { +#ifdef ENABLE_SUBSAMPLING + for (; height >= 2; height -= 2) { + ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + ConvertARGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertARGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); +#else + for (; height >= 2; height -= 2) { + ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertARGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); +#endif +} + +void ConvertRGB24ToYUV_SSSE3(const uint8* rgbframe, + uint8* yplane, + uint8* uplane, + uint8* vplane, + int width, + int height, + int rgbstride, + int ystride, + int uvstride) { +#ifdef ENABLE_SUBSAMPLING + for (; height >= 2; height -= 2) { + ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + 
+ ConvertRGBToYUVOdd_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertRGBToYUVEven_SSSE3(rgbframe, yplane, uplane, vplane, width); +#else + for (; height >= 2; height -= 2) { + ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); + rgbframe += rgbstride; + yplane += ystride; + + ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, NULL, NULL, width); + rgbframe += rgbstride; + yplane += ystride; + + uplane += uvstride; + vplane += uvstride; + } + + if (height) + ConvertRGBToYUVRow_SSSE3(rgbframe, yplane, uplane, vplane, width); +#endif +} + +} // namespace media diff --git a/media/base/simd/convert_yuv_to_rgb.h b/media/base/simd/convert_yuv_to_rgb.h new file mode 100644 index 0000000..5f3df2c6 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb.h @@ -0,0 +1,150 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_ +#define MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_ + +#include "base/basictypes.h" +#include "media/base/yuv_convert.h" + +namespace media { + +typedef void (*ConvertYUVToRGB32Proc)(const uint8*, + const uint8*, + const uint8*, + uint8*, + int, + int, + int, + int, + int, + YUVType); + +void ConvertYUVToRGB32_C(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +void ConvertYUVToRGB32_SSE(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +void ConvertYUVToRGB32_MMX(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + +} // namespace media + +// Assembly functions are declared without namespace. 
+extern "C" { + +typedef void (*ConvertYUVToRGB32RowProc)(const uint8*, + const uint8*, + const uint8*, + uint8*, + int); +typedef void (*ScaleYUVToRGB32RowProc)(const uint8*, + const uint8*, + const uint8*, + uint8*, + int, + int); + +void ConvertYUVToRGB32Row_C(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width); + +void ConvertYUVToRGB32Row_MMX(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width); + +void ConvertYUVToRGB32Row_SSE(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width); + +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +void LinearScaleYUVToRGB32Row_MMX_X64(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx); + +} + +#endif // MEDIA_BASE_SIMD_CONVERT_YUV_TO_RGB_H_ diff --git a/media/base/simd/convert_yuv_to_rgb_c.cc b/media/base/simd/convert_yuv_to_rgb_c.cc new file mode 100644 index 
0000000..f8e70b2 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_c.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "media/base/simd/convert_yuv_to_rgb.h" +// TODO(hclam): Shouldn't depend on yuv_row.h. +#include "media/base/yuv_row.h" + +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + +static inline void YUVPixel(uint8 y, + uint8 u, + uint8 v, + uint8* rgb_buf) { + + int b = kCoefficientsRgbY[256+u][0]; + int g = kCoefficientsRgbY[256+u][1]; + int r = kCoefficientsRgbY[256+u][2]; + int a = kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, kCoefficientsRgbY[512+v][0]); + g = paddsw(g, kCoefficientsRgbY[512+v][1]); + r = paddsw(r, kCoefficientsRgbY[512+v][2]); + a = paddsw(a, kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, kCoefficientsRgbY[y][0]); + g = paddsw(g, kCoefficientsRgbY[y][1]); + r = paddsw(r, kCoefficientsRgbY[y][2]); + a = paddsw(a, kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | + (packuswb(g) << 8) | + (packuswb(r) << 16) | + (packuswb(a) << 24); +} + +extern "C" { + +void ConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> 1]; + uint8 v = v_buf[x >> 1]; + uint8 y0 = y_buf[x]; + YUVPixel(y0, u, v, rgb_buf); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + YUVPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } +} + +// 16.16 fixed point is used. A shift by 16 isolates the integer. +// A shift by 17 is used to further subsample the chrominence channels. +// & 0xffff isolates the fixed point fraction. 
>> 2 to get the upper 2 bits, +// for 1/65536 pixel accurate interpolation. +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + int x = 0; + for (int i = 0; i < width; i += 2) { + int y = y_buf[x >> 16]; + int u = u_buf[(x >> 17)]; + int v = v_buf[(x >> 17)]; + YUVPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y = y_buf[x >> 16]; + YUVPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int source_dx) { + int x = 0; + if (source_dx >= 0x20000) { + x = 32768; + } + for (int i = 0; i < width; i += 2) { + int y0 = y_buf[x >> 16]; + int y1 = y_buf[(x >> 16) + 1]; + int u0 = u_buf[(x >> 17)]; + int u1 = u_buf[(x >> 17) + 1]; + int v0 = v_buf[(x >> 17)]; + int v1 = v_buf[(x >> 17) + 1]; + int y_frac = (x & 65535); + int uv_frac = ((x >> 1) & 65535); + int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; + int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; + YUVPixel(y, u, v, rgb_buf); + x += source_dx; + if ((i + 1) < width) { + y0 = y_buf[x >> 16]; + y1 = y_buf[(x >> 16) + 1]; + y_frac = (x & 65535); + y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; + YUVPixel(y, u, v, rgb_buf+4); + x += source_dx; + } + rgb_buf += 8; + } +} + +} + +namespace media { + +void ConvertYUVToRGB32_C(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { + unsigned int y_shift = yuv_type; + for (int y = 0; y < height; ++y) { + uint8* rgb_row = rgbframe + y * rgbstride; + const uint8* y_ptr = yplane + y * ystride; + const uint8* u_ptr = uplane + (y >> y_shift) * uvstride; + const uint8* v_ptr = vplane + (y >> y_shift) * uvstride; + + 
ConvertYUVToRGB32Row_C(y_ptr, + u_ptr, + v_ptr, + rgb_row, + width); + } +} + +} // namespace media diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.asm b/media/base/simd/convert_yuv_to_rgb_mmx.asm new file mode 100644 index 0000000..e044474 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_mmx.asm @@ -0,0 +1,22 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. +; + SECTION_TEXT + CPU MMX + +; Use movq to save the output. +%define MOVQ movq + +; extern "C" void ConvertYUVToRGB32Row_MMX(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width); +%define SYMBOL ConvertYUVToRGB32Row_MMX +%include "convert_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc new file mode 100644 index 0000000..b9555ce --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc @@ -0,0 +1,119 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + + global mangle(SYMBOL) PRIVATE + align function_align + +; Non-PIC code is the fastest so use this if possible. 
+%ifndef PIC +mangle(SYMBOL): + %assign stack_offset 0 + PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV + extern mangle(kCoefficientsRgbY) + jmp .convertend + +.convertloop: + movzx TEMPUd, BYTE [Uq] + add Uq, 1 + movzx TEMPVd, BYTE [Vq] + add Vq, 1 + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq] + movzx TEMPUd, BYTE [Yq] + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq] + movzx TEMPVd, BYTE [Yq + 1] + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq] + add Yq, 2 + movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + MOVQ [ARGBq], mm1 + add ARGBq, 8 + +.convertend: + sub WIDTHq, 2 + jns .convertloop + + ; If number of pixels is odd then compute it. + and WIDTHq, 1 + jz .convertdone + + movzx TEMPUd, BYTE [Uq] + movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq] + movzx TEMPVd, BYTE [Vq] + paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq] + movzx TEMPUd, BYTE [Yq] + movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + +.convertdone: + RET +%endif + +; With PIC code we need to load the address of mangle(kCoefficientsRgbY). +; This code is slower than the above version. +%ifdef PIC +mangle(SYMBOL): + %assign stack_offset 0 + PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE + + extern mangle(kCoefficientsRgbY) + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) + + jmp .convertend + +.convertloop: + movzx TEMPd, BYTE [Uq] + movq mm0, [TABLEq + 2048 + 8 * TEMPq] + add Uq, 1 + + movzx TEMPd, BYTE [Vq] + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] + add Vq, 1 + + movzx TEMPd, BYTE [Yq] + movq mm1, [TABLEq + 8 * TEMPq] + + movzx TEMPd, BYTE [Yq + 1] + movq mm2, [TABLEq + 8 * TEMPq] + add Yq, 2 + + ; Add UV components to Y component. + paddsw mm1, mm0 + paddsw mm2, mm0 + + ; Down shift and then pack. 
+ psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + MOVQ [ARGBq], mm1 + add ARGBq, 8 + +.convertend: + sub WIDTHq, 2 + jns .convertloop + + ; If number of pixels is odd then compute it. + and WIDTHq, 1 + jz .convertdone + + movzx TEMPd, BYTE [Uq] + movq mm0, [TABLEq + 2048 + 8 * TEMPq] + movzx TEMPd, BYTE [Vq] + paddsw mm0, [TABLEq + 4096 + 8 * TEMPq] + movzx TEMPd, BYTE [Yq] + movq mm1, [TABLEq + 8 * TEMPq] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + +.convertdone: + RET +%endif diff --git a/media/base/simd/convert_yuv_to_rgb_sse.asm b/media/base/simd/convert_yuv_to_rgb_sse.asm new file mode 100644 index 0000000..2f1967a --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_sse.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX and SSE instructions. +; + SECTION_TEXT + CPU MMX, SSE + +; Use SSE instruction movntq can write faster. +%define MOVQ movntq + +; +; extern "C" void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width); +%define SYMBOL ConvertYUVToRGB32Row_SSE +%include "convert_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/convert_yuv_to_rgb_x86.cc b/media/base/simd/convert_yuv_to_rgb_x86.cc new file mode 100644 index 0000000..3e03ef9 --- /dev/null +++ b/media/base/simd/convert_yuv_to_rgb_x86.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#endif + +#include "media/base/cpu_features.h" +#include "media/base/simd/convert_yuv_to_rgb.h" +#include "media/base/yuv_convert.h" + +namespace media { + +void ConvertYUVToRGB32_MMX(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { + unsigned int y_shift = yuv_type; + for (int y = 0; y < height; ++y) { + uint8* rgb_row = rgbframe + y * rgbstride; + const uint8* y_ptr = yplane + y * ystride; + const uint8* u_ptr = uplane + (y >> y_shift) * uvstride; + const uint8* v_ptr = vplane + (y >> y_shift) * uvstride; + + ConvertYUVToRGB32Row_MMX(y_ptr, + u_ptr, + v_ptr, + rgb_row, + width); + } + + _mm_empty(); +} + +void ConvertYUVToRGB32_SSE(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int width, + int height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type) { + unsigned int y_shift = yuv_type; + for (int y = 0; y < height; ++y) { + uint8* rgb_row = rgbframe + y * rgbstride; + const uint8* y_ptr = yplane + y * ystride; + const uint8* u_ptr = uplane + (y >> y_shift) * uvstride; + const uint8* v_ptr = vplane + (y >> y_shift) * uvstride; + + ConvertYUVToRGB32Row_SSE(y_ptr, + u_ptr, + v_ptr, + rgb_row, + width); + } + + _mm_empty(); +} + +} // namespace media diff --git a/media/base/simd/filter_yuv.h b/media/base/simd/filter_yuv.h new file mode 100644 index 0000000..5a9cf11 --- /dev/null +++ b/media/base/simd/filter_yuv.h @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef MEDIA_BASE_SIMD_FILTER_YUV_H_ +#define MEDIA_BASE_SIMD_FILTER_YUV_H_ + +#include "base/basictypes.h" + +namespace media { + +typedef void (*FilterYUVRowsProc)(uint8*, + const uint8*, + const uint8*, + int, + int); + +void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); + +void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); + +void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction); + +} // namespace media + +#endif // MEDIA_BASE_SIMD_FILTER_YUV_H_ diff --git a/media/base/simd/filter_yuv_c.cc b/media/base/simd/filter_yuv_c.cc new file mode 100644 index 0000000..95ae01a --- /dev/null +++ b/media/base/simd/filter_yuv_c.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "media/base/simd/filter_yuv.h" + +namespace media { + +void FilterYUVRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + uint8* end = ybuf + source_width; + do { + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; + ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; + ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8; + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; + } while (ybuf < end); +} + +} // namespace media diff --git a/media/base/simd/filter_yuv_mmx.cc b/media/base/simd/filter_yuv_mmx.cc new file mode 100644 index 0000000..77698dc --- /dev/null +++ b/media/base/simd/filter_yuv_mmx.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#include <emmintrin.h> +#endif + +#include "build/build_config.h" +#include "media/base/simd/filter_yuv.h" + +namespace media { + +#if defined(COMPILER_MSVC) +// Warning 4799 is about calling emms before the function exits. +// We calls emms in a frame level so suppress this warning. 
+#pragma warning(disable: 4799) +#endif + +void FilterYUVRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + __m64 zero = _mm_setzero_si64(); + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); + + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); + __m64* dest64 = reinterpret_cast<__m64*>(ybuf); + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); + + do { + __m64 y0 = *y0_ptr64++; + __m64 y1 = *y1_ptr64++; + __m64 y2 = _mm_unpackhi_pi8(y0, zero); + __m64 y3 = _mm_unpackhi_pi8(y1, zero); + y0 = _mm_unpacklo_pi8(y0, zero); + y1 = _mm_unpacklo_pi8(y1, zero); + y0 = _mm_mullo_pi16(y0, y0_fraction); + y1 = _mm_mullo_pi16(y1, y1_fraction); + y2 = _mm_mullo_pi16(y2, y0_fraction); + y3 = _mm_mullo_pi16(y3, y1_fraction); + y0 = _mm_add_pi16(y0, y1); + y2 = _mm_add_pi16(y2, y3); + y0 = _mm_srli_pi16(y0, 8); + y2 = _mm_srli_pi16(y2, 8); + y0 = _mm_packs_pu16(y0, y2); + *dest64++ = y0; + } while (dest64 < end64); +} + +#if defined(COMPILER_MSVC) +#pragma warning(default: 4799) +#endif + +} // namespace media diff --git a/media/base/simd/filter_yuv_sse2.cc b/media/base/simd/filter_yuv_sse2.cc new file mode 100644 index 0000000..137ac94 --- /dev/null +++ b/media/base/simd/filter_yuv_sse2.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#include <emmintrin.h> +#endif + +#include "media/base/simd/filter_yuv.h" + +namespace media { + +void FilterYUVRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, + int source_width, int source_y_fraction) { + __m128i zero = _mm_setzero_si128(); + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); + + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); + + do { + __m128i y0 = _mm_loadu_si128(y0_ptr128); + __m128i y1 = _mm_loadu_si128(y1_ptr128); + __m128i y2 = _mm_unpackhi_epi8(y0, zero); + __m128i y3 = _mm_unpackhi_epi8(y1, zero); + y0 = _mm_unpacklo_epi8(y0, zero); + y1 = _mm_unpacklo_epi8(y1, zero); + y0 = _mm_mullo_epi16(y0, y0_fraction); + y1 = _mm_mullo_epi16(y1, y1_fraction); + y2 = _mm_mullo_epi16(y2, y0_fraction); + y3 = _mm_mullo_epi16(y3, y1_fraction); + y0 = _mm_add_epi16(y0, y1); + y2 = _mm_add_epi16(y2, y3); + y0 = _mm_srli_epi16(y0, 8); + y2 = _mm_srli_epi16(y2, 8); + y0 = _mm_packus_epi16(y0, y2); + *dest128++ = y0; + ++y0_ptr128; + ++y1_ptr128; + } while (dest128 < end128); +} + +} // namespace media diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm new file mode 100644 index 0000000..7f7e0e8 --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. +; + SECTION_TEXT + CPU MMX + +; Use movq to save the output. 
+%define MOVQ movq + +; void LinearScaleYUVToRGB32Row_MMX(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL LinearScaleYUVToRGB32Row_MMX +%include "linear_scale_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc new file mode 100644 index 0000000..91c06a5 --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc @@ -0,0 +1,166 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + + global mangle(SYMBOL) PRIVATE + align function_align + +mangle(SYMBOL): + %assign stack_offset 0 + + extern mangle(kCoefficientsRgbY) + +; Parameters are in the following order: +; 1. Y plane +; 2. U plane +; 3. V plane +; 4. ARGB frame +; 5. Width +; 6. Source dx + +PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP + +%if gprsize == 8 +%define WORD_SIZE QWORD +%else +%define WORD_SIZE DWORD +%endif + +; Define register aliases. +%define Xq R1q ; Current X position +%define COMPLq R2q ; Component A value +%define COMPLd R2d ; Component A value +%define U_ARG_REGq R0q ; U plane address argument +%define V_ARG_REGq R1q ; V plane address argument +%define SOURCE_DX_ARG_REGq R3q ; Source dx argument +%define WIDTH_ARG_REGq R2q ; Width argument + +%ifdef PIC +; PIC code shared COMPR, U and V with the same register. Need to be careful in the +; code they don't mix up. This allows R3q to be used for YUV table. +%define COMPRq R0q ; Component B value +%define COMPRd R0d ; Component B value +%define Uq R0q ; U plane address +%define Vq R0q ; V plane address +%define U_PLANE WORD_SIZE [rsp + 3 * gprsize] +%define TABLE R3q ; Address of the table +%else +; Non-PIC code defines. 
+%define COMPRq R3q ; Component B value +%define COMPRd R3d ; Component B value +%define Uq R0q ; U plane address +%define Vq R3q ; V plane address +%define TABLE mangle(kCoefficientsRgbY) +%endif + +; Defines for stack variables. These are used in both PIC and non-PIC code. +%define V_PLANE WORD_SIZE [rsp + 2 * gprsize] +%define SOURCE_DX WORD_SIZE [rsp + gprsize] +%define SOURCE_WIDTH WORD_SIZE [rsp] + +; Handle stack variables differently for PIC and non-PIC code. + +%ifdef PIC +; Define stack usage for PIC code. PIC code push U plane onto stack. + PUSH U_ARG_REGq + PUSH V_ARG_REGq + PUSH SOURCE_DX_ARG_REGq + imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx + PUSH WIDTH_ARG_REGq + +; Load the address of kCoefficientsRgbY into TABLE + mov TEMPq, SOURCE_DX_ARG_REGq ; Need to save source_dx first + LOAD_SYM TABLE, mangle(kCoefficientsRgbY) +%define SOURCE_DX_ARG_REGq TEMPq ; Overwrite SOURCE_DX_ARG_REGq to TEMPq +%else +; Define stack usage. Non-PIC code just push 3 registers to stack. + PUSH V_ARG_REGq + PUSH SOURCE_DX_ARG_REGq + imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx + PUSH WIDTH_ARG_REGq +%endif + +%macro EPILOGUE 0 +%ifdef PIC + ADD rsp, 4 * gprsize +%else + ADD rsp, 3 * gprsize +%endif +%endmacro + + xor Xq, Xq ; x = 0 + cmp SOURCE_DX_ARG_REGq, 0x20000 + jl .lscaleend + mov Xq, 0x8000 ; x = 0.5 for 1/2 or less + jmp .lscaleend + +.lscaleloop: +%ifdef PIC + mov Uq, U_PLANE ; PIC code saves U_PLANE on stack. +%endif + +; Define macros for scaling YUV components since they are reused. 
+%macro SCALEUV 1 + mov TEMPq, Xq + sar TEMPq, 0x11 + movzx COMPLd, BYTE [%1 + TEMPq] + movzx COMPRd, BYTE [%1 + TEMPq + 1] + mov TEMPq, Xq + and TEMPq, 0x1fffe + imul COMPRq, TEMPq + xor TEMPq, 0x1fffe + imul COMPLq, TEMPq + add COMPLq, COMPRq + shr COMPLq, 17 +%endmacro + SCALEUV Uq ; Use the above macro to scale U + movq mm0, [TABLE + 2048 + 8 * COMPLq] + + mov Vq, V_PLANE ; Read V address from stack + SCALEUV Vq ; Use the above macro to scale V + paddsw mm0, [TABLE + 4096 + 8 * COMPLq] + +%macro SCALEY 0 + mov TEMPq, Xq + sar TEMPq, 0x10 + movzx COMPLd, BYTE [Yq + TEMPq] + movzx COMPRd, BYTE [Yq + TEMPq + 1] + mov TEMPq, Xq + add Xq, SOURCE_DX ; Add source_dx from stack + and TEMPq, 0xffff + imul COMPRq, TEMPq + xor TEMPq, 0xffff + imul COMPLq, TEMPq + add COMPLq, COMPRq + shr COMPLq, 16 +%endmacro + SCALEY ; Use the above macro to scale Y1 + movq mm1, [TABLE + 8 * COMPLq] + + cmp Xq, SOURCE_WIDTH ; Compare source_width from stack + jge .lscalelastpixel + + SCALEY ; Use the above macro to sacle Y2 + movq mm2, [TABLE + 8 * COMPLq] + + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 0x6 + psraw mm2, 0x6 + packuswb mm1, mm2 + MOVQ [ARGBq], mm1 + add ARGBq, 0x8 + +.lscaleend: + cmp Xq, SOURCE_WIDTH ; Compare source_width from stack + jl .lscaleloop + EPILOGUE + RET + +.lscalelastpixel: + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + EPILOGUE + RET diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm new file mode 100644 index 0000000..db7854457 --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx_x64.asm @@ -0,0 +1,142 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. 
+; + SECTION_TEXT + CPU MMX + +%define SYMBOL LinearScaleYUVToRGB32Row_MMX_X64 + global mangle(SYMBOL) PRIVATE + align function_align + +mangle(SYMBOL): + %assign stack_offset 0 + extern mangle(kCoefficientsRgbY) + +; Parameters are in the following order: +; 1. Y plane +; 2. U plane +; 3. V plane +; 4. ARGB frame +; 5. Width +; 6. Source dx + +PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMPL + +%define TABLEq r10 +%define Xq r11 +%define INDEXq r12 +%define COMPRd r13d +%define COMPRq r13 +%define FRACTIONq r14 + + PUSH TABLEq + PUSH Xq + PUSH INDEXq + PUSH COMPRq + PUSH FRACTIONq + +%macro EPILOGUE 0 + POP FRACTIONq + POP COMPRq + POP INDEXq + POP Xq + POP TABLEq +%endmacro + + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) + + imul WIDTHq, SOURCE_DXq ; source_width = width * source_dx + xor Xq, Xq ; x = 0 + cmp SOURCE_DXq, 0x20000 + jl .lscaleend + mov Xq, 0x8000 ; x = 0.5 for 1/2 or less + jmp .lscaleend + +.lscaleloop: + ; Interpolate U + mov INDEXq, Xq + sar INDEXq, 0x11 + movzx COMPLd, BYTE [Uq + INDEXq] + movzx COMPRd, BYTE [Uq + INDEXq + 1] + mov FRACTIONq, Xq + and FRACTIONq, 0x1fffe + imul COMPRq, FRACTIONq + xor FRACTIONq, 0x1fffe + imul COMPLq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 17 + movq mm0, [TABLEq + 2048 + 8 * COMPLq] + + ; Interpolate V + movzx COMPLd, BYTE [Vq + INDEXq] + movzx COMPRd, BYTE [Vq + INDEXq + 1] + ; Trick here to imul COMPL first then COMPR. + ; Saves two instruction. :) + imul COMPLq, FRACTIONq + xor FRACTIONq, 0x1fffe + imul COMPRq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 17 + paddsw mm0, [TABLEq + 4096 + 8 * COMPLq] + + ; Interpolate first Y1. + lea INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel. + ; Xq points to current pixel. 
+ mov FRACTIONq, Xq + sar Xq, 0x10 + movzx COMPLd, BYTE [Yq + Xq] + movzx COMPRd, BYTE [Yq + Xq + 1] + and FRACTIONq, 0xffff + imul COMPRq, FRACTIONq + xor FRACTIONq, 0xffff + imul COMPLq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 16 + movq mm1, [TABLEq + 8 * COMPLq] + + ; Interpolate Y2 if available. + cmp INDEXq, WIDTHq + jge .lscalelastpixel + + lea Xq, [INDEXq + SOURCE_DXq] ; Xq points to next pixel. + ; INDEXq points to current pixel. + mov FRACTIONq, INDEXq + sar INDEXq, 0x10 + movzx COMPLd, BYTE [Yq + INDEXq] + movzx COMPRd, BYTE [Yq + INDEXq + 1] + and FRACTIONq, 0xffff + imul COMPRq, FRACTIONq + xor FRACTIONq, 0xffff + imul COMPLq, FRACTIONq + add COMPLq, COMPRq + shr COMPLq, 16 + movq mm2, [TABLEq + 8 * COMPLq] + + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 0x6 + psraw mm2, 0x6 + packuswb mm1, mm2 + movntq [ARGBq], mm1 + add ARGBq, 0x8 + +.lscaleend: + cmp Xq, WIDTHq + jl .lscaleloop + jmp .epilogue + +.lscalelastpixel: + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ARGBq], mm1 + +.epilogue + EPILOGUE + RET diff --git a/media/base/simd/linear_scale_yuv_to_rgb_sse.asm b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm new file mode 100644 index 0000000..847911c --- /dev/null +++ b/media/base/simd/linear_scale_yuv_to_rgb_sse.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX and SSE instructions. +; + SECTION_TEXT + CPU MMX, SSE + +; Use movq to save the output. 
+%define MOVQ movntq + +; void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL LinearScaleYUVToRGB32Row_SSE +%include "linear_scale_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.asm b/media/base/simd/scale_yuv_to_rgb_mmx.asm new file mode 100644 index 0000000..6a83757 --- /dev/null +++ b/media/base/simd/scale_yuv_to_rgb_mmx.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX instructions. +; + SECTION_TEXT + CPU MMX + +; Use movq to save the output. +%define MOVQ movq + +; void ScaleYUVToRGB32Row_MMX(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL ScaleYUVToRGB32Row_MMX +%include "scale_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc new file mode 100644 index 0000000..94c101c --- /dev/null +++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc @@ -0,0 +1,115 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + + global mangle(SYMBOL) PRIVATE + align function_align + +mangle(SYMBOL): + %assign stack_offset 0 + + extern mangle(kCoefficientsRgbY) + +; Shared body for the including .asm files, which define SYMBOL and MOVQ. +; Steps through the source row by Source dx without interpolation and stores +; two packed 32-bit pixels per loop iteration. +; +; Parameters are in the following order: +; 1. Y plane +; 2. U plane +; 3. V plane +; 4. ARGB frame +; 5. Width +; 6. Source dx + +PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP + +%ifdef ARCH_X86_64 +%define WORD_SIZE QWORD +%else +%define WORD_SIZE DWORD +%endif + +%ifdef PIC + PUSH R1q ; Width (spilled so that R1 can hold the table address) +%endif + PUSH R2q ; Source dx (spilled so that R2 can hold the x position) + +%define SOURCE_DX WORD_SIZE [rsp] + +; PIC code. 
+%ifdef PIC + LOAD_SYM R1q, mangle(kCoefficientsRgbY) +%define WIDTH WORD_SIZE [rsp + gprsize] +%define TABLE R1q +%define Xq R2q + +; Non-PIC code. +%else +%define WIDTH R1q +%define TABLE mangle(kCoefficientsRgbY) +%define Xq R2q +%endif + + ; Set Xq index to 0. + xor Xq, Xq + jmp .scaleend + +.scaleloop: + ; TABLE can be either a register or a symbol, depending on whether this is + ; PIC or not. + mov TEMPq, Xq + sar TEMPq, 17 + movzx TEMPd, BYTE [Uq + TEMPq] + movq mm0, [TABLE + 2048 + 8 * TEMPq] + mov TEMPq, Xq + sar TEMPq, 17 + movzx TEMPd, BYTE [Vq + TEMPq] + paddsw mm0, [TABLE + 4096 + 8 * TEMPq] + mov TEMPq, Xq + add Xq, SOURCE_DX + sar TEMPq, 16 + movzx TEMPd, BYTE [Yq + TEMPq] + movq mm1, [TABLE + 8 * TEMPq] + mov TEMPq, Xq + add Xq, SOURCE_DX + sar TEMPq, 16 + movzx TEMPd, BYTE [Yq + TEMPq] + movq mm2, [TABLE + 8 * TEMPq] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + MOVQ QWORD [ARGBq], mm1 + add ARGBq, 8 + +.scaleend: + ; WIDTH can be either a register or memory, depending on whether this is + ; PIC or not. + sub WIDTH, 2 + jns .scaleloop + + and WIDTH, 1 ; odd number of pixels? + jz .scaledone + + mov TEMPq, Xq + sar TEMPq, 17 + movzx TEMPd, BYTE [Uq + TEMPq] + movq mm0, [TABLE + 2048 + 8 * TEMPq] + mov TEMPq, Xq + sar TEMPq, 17 + movzx TEMPd, BYTE [Vq + TEMPq] + paddsw mm0, [TABLE + 4096 + 8 * TEMPq] + mov TEMPq, Xq + sar TEMPq, 16 + movzx TEMPd, BYTE [Yq + TEMPq] + movq mm1, [TABLE + 8 * TEMPq] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd DWORD [ARGBq], mm1 + +.scaledone: +%ifdef PIC + ADD rsp, 2 * gprsize +%else + ADD rsp, gprsize +%endif + RET diff --git a/media/base/simd/scale_yuv_to_rgb_sse.asm b/media/base/simd/scale_yuv_to_rgb_sse.asm new file mode 100644 index 0000000..5b849a6 --- /dev/null +++ b/media/base/simd/scale_yuv_to_rgb_sse.asm @@ -0,0 +1,23 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. 
+; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses MMX and SSE instructions. +; + SECTION_TEXT + CPU MMX, SSE + +; Use movntq to save the output. +%define MOVQ movntq + +; void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL ScaleYUVToRGB32Row_SSE +%include "scale_yuv_to_rgb_mmx.inc" diff --git a/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm new file mode 100644 index 0000000..5e58146 --- /dev/null +++ b/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm @@ -0,0 +1,110 @@ +; Copyright (c) 2011 The Chromium Authors. All rights reserved. +; Use of this source code is governed by a BSD-style license that can be +; found in the LICENSE file. + +%include "x86inc.asm" + +; +; This file uses SSE2 instructions. +; + SECTION_TEXT + CPU SSE2 + +; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf, +; const uint8* u_buf, +; const uint8* v_buf, +; uint8* rgb_buf, +; int width, +; int source_dx); +%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64 + + global mangle(SYMBOL) PRIVATE + align function_align + +mangle(SYMBOL): + %assign stack_offset 0 + extern mangle(kCoefficientsRgbY) + +; Parameters are in the following order: +; 1. Y plane +; 2. U plane +; 3. V plane +; 4. ARGB frame +; 5. Width +; 6. Source dx + +PROLOGUE 6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMP + +%define TABLEq r10 +%define Xq r11 +%define INDEXq r12 + PUSH r10 + PUSH r11 + PUSH r12 + + LOAD_SYM TABLEq, mangle(kCoefficientsRgbY) + + ; Set Xq index to 0. + xor Xq, Xq + jmp .scaleend + +.scaleloop: + ; Read UV pixels. + mov INDEXq, Xq + sar INDEXq, 17 + movzx COMPd, BYTE [Uq + INDEXq] + movq xmm0, [TABLEq + 2048 + 8 * COMPq] + movzx COMPd, BYTE [Vq + INDEXq] + movq xmm1, [TABLEq + 4096 + 8 * COMPq] + + ; Read first Y pixel. 
+ lea INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel. + sar Xq, 16 + movzx COMPd, BYTE [Yq + Xq] + paddsw xmm0, xmm1 ; Hide an ADD after the memory load. + movq xmm1, [TABLEq + 8 * COMPq] + + ; Read next Y pixel. + lea Xq, [INDEXq + SOURCE_DXq] ; Xq now points to next pixel. + sar INDEXq, 16 + movzx COMPd, BYTE [Yq + INDEXq] + movq xmm2, [TABLEq + 8 * COMPq] + paddsw xmm1, xmm0 + paddsw xmm2, xmm0 + shufps xmm1, xmm2, 0x44 ; Join two pixels into one XMM register + psraw xmm1, 6 + packuswb xmm1, xmm1 + movq QWORD [ARGBq], xmm1 + add ARGBq, 8 + +.scaleend: + sub WIDTHq, 2 + jns .scaleloop + + and WIDTHq, 1 ; odd number of pixels? NOTE(review): the prototype above declares int width / int source_dx, but WIDTHq and SOURCE_DXq are used as full 64-bit registers - confirm the upper halves are sign-extended by callers. + jz .scaledone + + ; Read U V components. + mov INDEXq, Xq + sar INDEXq, 17 + movzx COMPd, BYTE [Uq + INDEXq] + movq xmm0, [TABLEq + 2048 + 8 * COMPq] + movzx COMPd, BYTE [Vq + INDEXq] + movq xmm1, [TABLEq + 4096 + 8 * COMPq] + paddsw xmm0, xmm1 + + ; Read one Y component. + mov INDEXq, Xq + sar INDEXq, 16 + movzx COMPd, BYTE [Yq + INDEXq] + movq xmm1, [TABLEq + 8 * COMPq] + paddsw xmm1, xmm0 + psraw xmm1, 6 + packuswb xmm1, xmm1 + movd DWORD [ARGBq], xmm1 + +.scaledone: + POP r12 + POP r11 + POP r10 + RET diff --git a/media/base/simd/x86inc.asm b/media/base/simd/x86inc.asm index 956b999..5e0ca20 100644 --- a/media/base/simd/x86inc.asm +++ b/media/base/simd/x86inc.asm @@ -95,11 +95,14 @@ %ifdef WIN64 %define PIC %elifndef ARCH_X86_64 +; For Chromium we may build PIC code even on 32-bit systems. +%ifndef CHROMIUM ; x86_32 doesn't require PIC. ; Some distros prefer shared objects to be PIC, but nothing breaks if ; the code contains a few textrels, so we'll skip that complexity. %undef PIC %endif +%endif %ifdef PIC default rel %endif @@ -947,6 +950,11 @@ AVX_INSTR pfmul, 1, 0 ;============================================================================= %ifdef CHROMIUM +; Always build PIC code on Mac for Chromium. 
+%ifdef MACHO +%define PIC +%endif + ; ; LOAD_SYM %1 (reg), %2 (sym) ; Copies the address to a local symbol to the specified register. |