summaryrefslogtreecommitdiffstats
path: root/media
diff options
context:
space:
mode:
author: rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2014-05-01 00:28:15 +0000
committer: rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2014-05-01 00:28:15 +0000
commit: b45fee76a0c12d22b77d932f33b89f49e79f9844 (patch)
tree: 203afa571d78777d5cf44b78718bf0317b12e37e /media
parent: 747883d7334d9fcd4e2b5902f94b949b58075ca2 (diff)
downloadchromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.zip
chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.gz
chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.bz2
Remove non-PIC specializations of media SIMD YUV conversion routines.
This change is made with an eye towards supporting different color ranges in YUV conversion (namely, YUVJ420P). That will require passing in a pointer to a conversion table, so the non-PIC code, which hardcodes the table address, is problematic. (It is likely possible to code around this and keep some of the slight perf gain of the non-PIC code, but doing so would require even more ugly, difficult-to-maintain code.)

This will cause a small performance regression on platforms where PIC code is not required (32-bit Windows only, as far as I know). The nearest-neighbor scaling routines take the biggest perf hit (up to 20%), but are currently not actually used anywhere. The rest (straight conversion and bilinear scaling) didn't show an appreciable performance hit in my tests.

BUG=172898
Review URL: https://codereview.chromium.org/245103003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@267390 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- media/base/simd/convert_yuv_to_rgb_mmx.inc | 55
-rw-r--r-- media/base/simd/convert_yuva_to_argb_mmx.inc | 82
-rw-r--r-- media/base/simd/linear_scale_yuv_to_rgb_mmx.inc | 33
-rw-r--r-- media/base/simd/scale_yuv_to_rgb_mmx.inc | 19
4 files changed, 3 insertions, 186 deletions
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc
index e38794a..f143574 100644
--- a/media/base/simd/convert_yuv_to_rgb_mmx.inc
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc
@@ -7,60 +7,6 @@
EXPORT SYMBOL
align function_align
-; Non-PIC code is the fastest so use this if possible.
-%ifndef PIC
-mangle(SYMBOL):
- %assign stack_offset 0
- PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV
- extern mangle(kCoefficientsRgbY)
- jmp .convertend
-
-.convertloop:
- movzx TEMPUd, BYTE [Uq]
- add Uq, 1
- movzx TEMPVd, BYTE [Vq]
- add Vq, 1
- movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
- movzx TEMPUd, BYTE [Yq]
- paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
- movzx TEMPVd, BYTE [Yq + 1]
- movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
- add Yq, 2
- movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- MOVQ [ARGBq], mm1
- add ARGBq, 8
-
-.convertend:
- sub WIDTHq, 2
- jns .convertloop
-
- ; If number of pixels is odd then compute it.
- and WIDTHq, 1
- jz .convertdone
-
- movzx TEMPUd, BYTE [Uq]
- movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
- movzx TEMPVd, BYTE [Vq]
- paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
- movzx TEMPUd, BYTE [Yq]
- movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
- paddsw mm1, mm0
- psraw mm1, 6
- packuswb mm1, mm1
- movd [ARGBq], mm1
-
-.convertdone:
- RET
-%endif
-
-; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
-; This code is slower than the above version.
-%ifdef PIC
mangle(SYMBOL):
%assign stack_offset 0
PROLOGUE 5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE
@@ -118,4 +64,3 @@ mangle(SYMBOL):
.convertdone:
RET
-%endif
diff --git a/media/base/simd/convert_yuva_to_argb_mmx.inc b/media/base/simd/convert_yuva_to_argb_mmx.inc
index bcafb38..2e9e62d 100644
--- a/media/base/simd/convert_yuva_to_argb_mmx.inc
+++ b/media/base/simd/convert_yuva_to_argb_mmx.inc
@@ -7,87 +7,6 @@
EXPORT SYMBOL
align function_align
-; Non-PIC code is the fastest so use this if possible.
-%ifndef PIC
-mangle(SYMBOL):
- %assign stack_offset 0
- PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
- extern mangle(kCoefficientsRgbY)
- jmp .convertend
-
-.convertloop:
- movzx TEMPd, BYTE [Uq]
- movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
- add Uq, 1
- movzx TEMPd, BYTE [Vq]
- paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
- add Vq, 1
- movzx TEMPd, BYTE [Yq]
- movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
- movzx TEMPd, BYTE [Yq + 1]
- movq mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
- add Yq, 2
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
-
- ; Multiply ARGB by alpha value.
- movq mm0, mm1
- pxor mm2, mm2
- punpcklbw mm0, mm2
- punpckhbw mm1, mm2
- movzx TEMPd, BYTE [Aq]
- movq mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
- pmullw mm0, mm2
- psrlw mm0, 8
- movzx TEMPd, BYTE [Aq + 1]
- movq mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
- add Aq, 2
- pmullw mm1, mm2
- psrlw mm1, 8
- packuswb mm0, mm1
-
- MOVQ [ARGBq], mm0
- add ARGBq, 8
-
-.convertend:
- sub WIDTHq, 2
- jns .convertloop
-
- ; If number of pixels is odd then compute it.
- and WIDTHq, 1
- jz .convertdone
-
- movzx TEMPd, BYTE [Uq]
- movq mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
- movzx TEMPd, BYTE [Vq]
- paddsw mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
- movzx TEMPd, BYTE [Yq]
- movq mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
- paddsw mm1, mm0
- psraw mm1, 6
- packuswb mm1, mm1
-
- ; Multiply ARGB by alpha value.
- pxor mm0, mm0
- punpcklbw mm1, mm0
- movzx TEMPd, BYTE [Aq]
- movq mm0, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
- pmullw mm1, mm0
- psrlw mm1, 8
- packuswb mm1, mm1
-
- movd [ARGBq], mm1
-
-.convertdone:
- RET
-%endif
-
-; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
-; This code is slower than the above version.
-%ifdef PIC
mangle(SYMBOL):
%assign stack_offset 0
PROLOGUE 6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
@@ -173,4 +92,3 @@ mangle(SYMBOL):
.convertdone:
POP TABLEq
RET
-%endif \ No newline at end of file
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
index 493e9b3..dce591d 100644
--- a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
@@ -37,33 +37,19 @@ PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
%define SOURCE_DX_ARG_REGq R3q ; Source dx argument
%define WIDTH_ARG_REGq R2q ; Width argument
-%ifdef PIC
-; PIC code shared COMPR, U and V with the same register. Need to be careful in the
-; code they don't mix up. This allows R3q to be used for YUV table.
%define COMPRq R0q ; Component B value
%define COMPRd R0d ; Component B value
%define Uq R0q ; U plane address
%define Vq R0q ; V plane address
%define U_PLANE WORD_SIZE [rsp + 3 * gprsize]
%define TABLE R3q ; Address of the table
-%else
-; Non-PIC code defines.
-%define COMPRq R3q ; Component B value
-%define COMPRd R3d ; Component B value
-%define Uq R0q ; U plane address
-%define Vq R3q ; V plane address
-%define TABLE mangle(kCoefficientsRgbY)
-%endif
-; Defines for stack variables. These are used in both PIC and non-PIC code.
+; Defines for stack variables.
%define V_PLANE WORD_SIZE [rsp + 2 * gprsize]
%define SOURCE_DX WORD_SIZE [rsp + gprsize]
%define SOURCE_WIDTH WORD_SIZE [rsp]
-; Handle stack variables differently for PIC and non-PIC code.
-
-%ifdef PIC
-; Define stack usage for PIC code. PIC code push U plane onto stack.
+; Define stack usage.
PUSH U_ARG_REGq
PUSH V_ARG_REGq
PUSH SOURCE_DX_ARG_REGq
@@ -74,20 +60,9 @@ PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
mov TEMPq, SOURCE_DX_ARG_REGq ; Need to save source_dx first
LOAD_SYM TABLE, mangle(kCoefficientsRgbY)
%define SOURCE_DX_ARG_REGq TEMPq ; Overwrite SOURCE_DX_ARG_REGq to TEMPq
-%else
-; Define stack usage. Non-PIC code just push 3 registers to stack.
- PUSH V_ARG_REGq
- PUSH SOURCE_DX_ARG_REGq
- imul WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq ; source_width = width * source_dx
- PUSH WIDTH_ARG_REGq
-%endif
%macro EPILOGUE 0
-%ifdef PIC
ADD rsp, 4 * gprsize
-%else
- ADD rsp, 3 * gprsize
-%endif
%endmacro
xor Xq, Xq ; x = 0
@@ -97,9 +72,7 @@ PROLOGUE 6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
jmp .lscaleend
.lscaleloop:
-%ifdef PIC
- mov Uq, U_PLANE ; PIC code saves U_PLANE on stack.
-%endif
+ mov Uq, U_PLANE
; Define macros for scaling YUV components since they are reused.
%macro SCALEUV 1
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc
index 2026390..a599b0c 100644
--- a/media/base/simd/scale_yuv_to_rgb_mmx.inc
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc
@@ -28,34 +28,21 @@ PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
%define WORD_SIZE DWORD
%endif
-%ifdef PIC
PUSH R1q ; Width
-%endif
PUSH R2q ; Source dx
%define SOURCE_DX WORD_SIZE [rsp]
-; PIC code.
-%ifdef PIC
LOAD_SYM R1q, mangle(kCoefficientsRgbY)
%define WIDTH WORD_SIZE [rsp + gprsize]
%define TABLE R1q
%define Xq R2q
-; Non-PIC code.
-%else
-%define WIDTH R1q
-%define TABLE mangle(kCoefficientsRgbY)
-%define Xq R2q
-%endif
-
; Set Xq index to 0.
xor Xq, Xq
jmp .scaleend
.scaleloop:
- ; TABLE can either be a register or a symbol depending on this is
- ; PIC or not.
mov TEMPq, Xq
sar TEMPq, 17
movzx TEMPd, BYTE [Uq + TEMPq]
@@ -83,8 +70,6 @@ PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
add ARGBq, 8
.scaleend:
- ; WIDTH can either be a register or memory depending on this is
- ; PIC or not.
sub WIDTH, 2
jns .scaleloop
@@ -109,9 +94,5 @@ PROLOGUE 6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
movd DWORD [ARGBq], mm1
.scaledone:
-%ifdef PIC
ADD rsp, 2 * gprsize
-%else
- ADD rsp, gprsize
-%endif
RET