diff options
author | rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-05-01 00:28:15 +0000 |
---|---|---|
committer | rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-05-01 00:28:15 +0000 |
commit | b45fee76a0c12d22b77d932f33b89f49e79f9844 (patch) | |
tree | 203afa571d78777d5cf44b78718bf0317b12e37e /media | |
parent | 747883d7334d9fcd4e2b5902f94b949b58075ca2 (diff) | |
download | chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.zip chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.gz chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.bz2 |
Remove non-PIC specializations of media SIMD YUV conversion routines.
This change is made with an eye towards supporting different color ranges in YUV conversion (namely, YUVJ420P). Supporting them will require passing in a pointer to a conversion table, so the non-PIC code, which hardcodes the table address, is problematic. (It is likely possible to code around this and keep some of the slight perf gain of the non-PIC code, but doing so would require even more ugly, difficult-to-maintain code.)
This will cause a small performance regression on platforms where PIC is not required (as far as I know, only 32-bit Windows).
The nearest-neighbor scaling routines take the biggest perf hit (up to 20%), but they are not currently used anywhere. The rest (straight conversion and bilinear scaling) showed no appreciable performance hit in my tests.
BUG=172898
Review URL: https://codereview.chromium.org/245103003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@267390 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- | media/base/simd/convert_yuv_to_rgb_mmx.inc | 55 | ||||
-rw-r--r-- | media/base/simd/convert_yuva_to_argb_mmx.inc | 82 | ||||
-rw-r--r-- | media/base/simd/linear_scale_yuv_to_rgb_mmx.inc | 33 | ||||
-rw-r--r-- | media/base/simd/scale_yuv_to_rgb_mmx.inc | 19 |
4 files changed, 3 insertions, 186 deletions
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc index e38794a..f143574 100644 --- a/media/base/simd/convert_yuv_to_rgb_mmx.inc +++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc @@ -7,60 +7,6 @@ EXPORT SYMBOL align function_align -; Non-PIC code is the fastest so use this if possible. -%ifndef PIC -mangle(SYMBOL): - %assign stack_offset 0 - PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV - extern mangle(kCoefficientsRgbY) - jmp .convertend - -.convertloop: - movzx TEMPUd, BYTE [Uq] - add Uq, 1 - movzx TEMPVd, BYTE [Vq] - add Vq, 1 - movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq] - movzx TEMPUd, BYTE [Yq] - paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq] - movzx TEMPVd, BYTE [Yq + 1] - movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq] - add Yq, 2 - movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - MOVQ [ARGBq], mm1 - add ARGBq, 8 - -.convertend: - sub WIDTHq, 2 - jns .convertloop - - ; If number of pixels is odd then compute it. - and WIDTHq, 1 - jz .convertdone - - movzx TEMPUd, BYTE [Uq] - movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq] - movzx TEMPVd, BYTE [Vq] - paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq] - movzx TEMPUd, BYTE [Yq] - movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq] - paddsw mm1, mm0 - psraw mm1, 6 - packuswb mm1, mm1 - movd [ARGBq], mm1 - -.convertdone: - RET -%endif - -; With PIC code we need to load the address of mangle(kCoefficientsRgbY). -; This code is slower than the above version. 
-%ifdef PIC mangle(SYMBOL): %assign stack_offset 0 PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE @@ -118,4 +64,3 @@ mangle(SYMBOL): .convertdone: RET -%endif diff --git a/media/base/simd/convert_yuva_to_argb_mmx.inc b/media/base/simd/convert_yuva_to_argb_mmx.inc index bcafb38..2e9e62d 100644 --- a/media/base/simd/convert_yuva_to_argb_mmx.inc +++ b/media/base/simd/convert_yuva_to_argb_mmx.inc @@ -7,87 +7,6 @@ EXPORT SYMBOL align function_align -; Non-PIC code is the fastest so use this if possible. -%ifndef PIC -mangle(SYMBOL): - %assign stack_offset 0 - PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP - extern mangle(kCoefficientsRgbY) - jmp .convertend - -.convertloop: - movzx TEMPd, BYTE [Uq] - movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] - add Uq, 1 - movzx TEMPd, BYTE [Vq] - paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] - add Vq, 1 - movzx TEMPd, BYTE [Yq] - movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] - movzx TEMPd, BYTE [Yq + 1] - movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq] - add Yq, 2 - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - - ; Multiply ARGB by alpha value. - movq mm0, mm1 - pxor mm2, mm2 - punpcklbw mm0, mm2 - punpckhbw mm1, mm2 - movzx TEMPd, BYTE [Aq] - movq mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] - pmullw mm0, mm2 - psrlw mm0, 8 - movzx TEMPd, BYTE [Aq + 1] - movq mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] - add Aq, 2 - pmullw mm1, mm2 - psrlw mm1, 8 - packuswb mm0, mm1 - - MOVQ [ARGBq], mm0 - add ARGBq, 8 - -.convertend: - sub WIDTHq, 2 - jns .convertloop - - ; If number of pixels is odd then compute it. 
- and WIDTHq, 1 - jz .convertdone - - movzx TEMPd, BYTE [Uq] - movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq] - movzx TEMPd, BYTE [Vq] - paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq] - movzx TEMPd, BYTE [Yq] - movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq] - paddsw mm1, mm0 - psraw mm1, 6 - packuswb mm1, mm1 - - ; Multiply ARGB by alpha value. - pxor mm0, mm0 - punpcklbw mm1, mm0 - movzx TEMPd, BYTE [Aq] - movq mm0, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq] - pmullw mm1, mm0 - psrlw mm1, 8 - packuswb mm1, mm1 - - movd [ARGBq], mm1 - -.convertdone: - RET -%endif - -; With PIC code we need to load the address of mangle(kCoefficientsRgbY). -; This code is slower than the above version. -%ifdef PIC mangle(SYMBOL): %assign stack_offset 0 PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP @@ -173,4 +92,3 @@ mangle(SYMBOL): .convertdone: POP TABLEq RET -%endif
\ No newline at end of file diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc index 493e9b3..dce591d 100644 --- a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc +++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc @@ -37,33 +37,19 @@ PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP %define SOURCE_DX_ARG_REGq R3q ; Source dx argument %define WIDTH_ARG_REGq R2q ; Width argument -%ifdef PIC -; PIC code shared COMPR, U and V with the same register. Need to be careful in the -; code they don't mix up. This allows R3q to be used for YUV table. %define COMPRq R0q ; Component B value %define COMPRd R0d ; Component B value %define Uq R0q ; U plane address %define Vq R0q ; V plane address %define U_PLANE WORD_SIZE [rsp + 3 * gprsize] %define TABLE R3q ; Address of the table -%else -; Non-PIC code defines. -%define COMPRq R3q ; Component B value -%define COMPRd R3d ; Component B value -%define Uq R0q ; U plane address -%define Vq R3q ; V plane address -%define TABLE mangle(kCoefficientsRgbY) -%endif -; Defines for stack variables. These are used in both PIC and non-PIC code. +; Defines for stack variables. %define V_PLANE WORD_SIZE [rsp + 2 * gprsize] %define SOURCE_DX WORD_SIZE [rsp + gprsize] %define SOURCE_WIDTH WORD_SIZE [rsp] -; Handle stack variables differently for PIC and non-PIC code. - -%ifdef PIC -; Define stack usage for PIC code. PIC code push U plane onto stack. +; Define stack usage. PUSH U_ARG_REGq PUSH V_ARG_REGq PUSH SOURCE_DX_ARG_REGq @@ -74,20 +60,9 @@ PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP mov TEMPq, SOURCE_DX_ARG_REGq ; Need to save source_dx first LOAD_SYM TABLE, mangle(kCoefficientsRgbY) %define SOURCE_DX_ARG_REGq TEMPq ; Overwrite SOURCE_DX_ARG_REGq to TEMPq -%else -; Define stack usage. Non-PIC code just push 3 registers to stack. 
- PUSH V_ARG_REGq - PUSH SOURCE_DX_ARG_REGq - imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx - PUSH WIDTH_ARG_REGq -%endif %macro EPILOGUE 0 -%ifdef PIC ADD rsp, 4 * gprsize -%else - ADD rsp, 3 * gprsize -%endif %endmacro xor Xq, Xq ; x = 0 @@ -97,9 +72,7 @@ PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP jmp .lscaleend .lscaleloop: -%ifdef PIC - mov Uq, U_PLANE ; PIC code saves U_PLANE on stack. -%endif + mov Uq, U_PLANE ; Define macros for scaling YUV components since they are reused. %macro SCALEUV 1 diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc index 2026390..a599b0c 100644 --- a/media/base/simd/scale_yuv_to_rgb_mmx.inc +++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc @@ -28,34 +28,21 @@ PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP %define WORD_SIZE DWORD %endif -%ifdef PIC PUSH R1q ; Width -%endif PUSH R2q ; Source dx %define SOURCE_DX WORD_SIZE [rsp] -; PIC code. -%ifdef PIC LOAD_SYM R1q, mangle(kCoefficientsRgbY) %define WIDTH WORD_SIZE [rsp + gprsize] %define TABLE R1q %define Xq R2q -; Non-PIC code. -%else -%define WIDTH R1q -%define TABLE mangle(kCoefficientsRgbY) -%define Xq R2q -%endif - ; Set Xq index to 0. xor Xq, Xq jmp .scaleend .scaleloop: - ; TABLE can either be a register or a symbol depending on this is - ; PIC or not. mov TEMPq, Xq sar TEMPq, 17 movzx TEMPd, BYTE [Uq + TEMPq] @@ -83,8 +70,6 @@ PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP add ARGBq, 8 .scaleend: - ; WIDTH can either be a register or memory depending on this is - ; PIC or not. sub WIDTH, 2 jns .scaleloop @@ -109,9 +94,5 @@ PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP movd DWORD [ARGBq], mm1 .scaledone: -%ifdef PIC ADD rsp, 2 * gprsize -%else - ADD rsp, gprsize -%endif RET |