Remove non-PIC specializations of media SIMD YUV conversion routines.

This change is made with an eye towards supporting different color ranges in YUV conversion (namely, YUVJ420P). Supporting them will require passing in a pointer to a conversion table, so the non-PIC code, which hardcodes the table address, is problematic (it is likely possible to code around this and keep some of the slight perf gain of the non-PIC code, but doing so would require even more ugly, difficult-to-maintain code). Removing the non-PIC code will cause a small performance regression on platforms where PIC code is not required (32-bit Windows only, as far as I know). The nearest-neighbor scaling routines take the biggest perf hit (up to 20%), but are not currently used anywhere. The rest (straight conversion and bilinear scaling) didn't show an appreciable performance hit in my tests. BUG=172898 Review URL: https://codereview.chromium.org/245103003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@267390 0039d316-1c4b-4281-b951-d872f2087c98
author: rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2014-05-01 00:28:15 +0000
committer: rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2014-05-01 00:28:15 +0000
commit: b45fee76a0c12d22b77d932f33b89f49e79f9844 (patch)
tree: 203afa571d78777d5cf44b78718bf0317b12e37e /media
parent: 747883d7334d9fcd4e2b5902f94b949b58075ca2 (diff)
download: chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.zip
chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.gz
chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.bz2
4 files changed, 3 insertions, 186 deletions
diff --git a/media/base/simd/convert_yuv_to_rgb_mmx.inc b/media/base/simd/convert_yuv_to_rgb_mmx.inc
index e38794a..f143574 100644
--- a/media/base/simd/convert_yuv_to_rgb_mmx.inc
+++ b/media/base/simd/convert_yuv_to_rgb_mmx.inc
@@ -7,60 +7,6 @@
   EXPORT    SYMBOL
   align     function_align
 
-; Non-PIC code is the fastest so use this if possible.
-%ifndef PIC
-mangle(SYMBOL):
-  %assign   stack_offset 0
-  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV
-  extern    mangle(kCoefficientsRgbY)
-  jmp       .convertend
-
-.convertloop:
-  movzx     TEMPUd, BYTE [Uq]
-  add       Uq, 1
-  movzx     TEMPVd, BYTE [Vq]
-  add       Vq, 1
-  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
-  movzx     TEMPUd, BYTE [Yq]
-  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
-  movzx     TEMPVd, BYTE [Yq + 1]
-  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
-  add       Yq, 2
-  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq]
-  paddsw    mm1, mm0
-  paddsw    mm2, mm0
-  psraw     mm1, 6
-  psraw     mm2, 6
-  packuswb  mm1, mm2
-  MOVQ      [ARGBq], mm1
-  add       ARGBq, 8
-
-.convertend:
-  sub       WIDTHq, 2
-  jns       .convertloop
-
-  ; If number of pixels is odd then compute it.
-  and       WIDTHq, 1
-  jz        .convertdone
-
-  movzx     TEMPUd, BYTE [Uq]
-  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
-  movzx     TEMPVd, BYTE [Vq]
-  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
-  movzx     TEMPUd, BYTE [Yq]
-  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
-  paddsw    mm1, mm0
-  psraw     mm1, 6
-  packuswb  mm1, mm1
-  movd      [ARGBq], mm1
-
-.convertdone:
-  RET
-%endif
-
-; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
-; This code is slower than the above version.
-%ifdef PIC
 mangle(SYMBOL):
   %assign   stack_offset 0
   PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE
@@ -118,4 +64,3 @@ mangle(SYMBOL):
 
 .convertdone:
   RET
-%endif
diff --git a/media/base/simd/convert_yuva_to_argb_mmx.inc b/media/base/simd/convert_yuva_to_argb_mmx.inc
index bcafb38..2e9e62d 100644
--- a/media/base/simd/convert_yuva_to_argb_mmx.inc
+++ b/media/base/simd/convert_yuva_to_argb_mmx.inc
@@ -7,87 +7,6 @@
   EXPORT    SYMBOL
   align     function_align
 
-; Non-PIC code is the fastest so use this if possible.
-%ifndef PIC
-mangle(SYMBOL):
-  %assign   stack_offset 0
-  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
-  extern    mangle(kCoefficientsRgbY)
-  jmp       .convertend
-
-.convertloop:
-  movzx     TEMPd, BYTE [Uq]
-  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
-  add       Uq, 1
-  movzx     TEMPd, BYTE [Vq]
-  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
-  add       Vq, 1
-  movzx     TEMPd, BYTE [Yq]
-  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
-  movzx     TEMPd, BYTE [Yq + 1]
-  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
-  add       Yq, 2
-  paddsw    mm1, mm0
-  paddsw    mm2, mm0
-  psraw     mm1, 6
-  psraw     mm2, 6
-  packuswb  mm1, mm2
-
-  ; Multiply ARGB by alpha value.
-  movq      mm0, mm1
-  pxor      mm2, mm2
-  punpcklbw mm0, mm2
-  punpckhbw mm1, mm2
-  movzx     TEMPd, BYTE [Aq]
-  movq      mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
-  pmullw    mm0, mm2
-  psrlw     mm0, 8
-  movzx     TEMPd, BYTE [Aq + 1]
-  movq      mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
-  add       Aq, 2
-  pmullw    mm1, mm2
-  psrlw     mm1, 8
-  packuswb  mm0, mm1
-
-  MOVQ      [ARGBq], mm0
-  add       ARGBq, 8
-
-.convertend:
-  sub       WIDTHq, 2
-  jns       .convertloop
-
-  ; If number of pixels is odd then compute it.
-  and       WIDTHq, 1
-  jz        .convertdone
-
-  movzx     TEMPd, BYTE [Uq]
-  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
-  movzx     TEMPd, BYTE [Vq]
-  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
-  movzx     TEMPd, BYTE [Yq]
-  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
-  paddsw    mm1, mm0
-  psraw     mm1, 6
-  packuswb  mm1, mm1
-
-  ; Multiply ARGB by alpha value.
-  pxor      mm0, mm0
-  punpcklbw mm1, mm0
-  movzx     TEMPd, BYTE [Aq]
-  movq      mm0, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
-  pmullw    mm1, mm0
-  psrlw     mm1, 8
-  packuswb  mm1, mm1
-
-  movd      [ARGBq], mm1
-
-.convertdone:
-  RET
-%endif
-
-; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
-; This code is slower than the above version.
-%ifdef PIC
 mangle(SYMBOL):
   %assign   stack_offset 0
   PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
@@ -173,4 +92,3 @@ mangle(SYMBOL):
 .convertdone:
   POP       TABLEq
   RET
-%endif
-\ No newline at end of file
diff --git a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
index 493e9b3..dce591d 100644
--- a/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
+++ b/media/base/simd/linear_scale_yuv_to_rgb_mmx.inc
@@ -37,33 +37,19 @@ PROLOGUE  6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
 %define     SOURCE_DX_ARG_REGq  R3q     ; Source dx argument
 %define     WIDTH_ARG_REGq      R2q     ; Width argument
 
-%ifdef PIC
-; PIC code shared COMPR, U and V with the same register. Need to be careful in the
-; code they don't mix up. This allows R3q to be used for YUV table.
 %define     COMPRq              R0q     ; Component B value
 %define     COMPRd              R0d     ; Component B value
 %define     Uq                  R0q     ; U plane address
 %define     Vq                  R0q     ; V plane address
 %define     U_PLANE             WORD_SIZE [rsp + 3 * gprsize]
 %define     TABLE               R3q     ; Address of the table
-%else
-; Non-PIC code defines.
-%define     COMPRq              R3q     ; Component B value
-%define     COMPRd              R3d     ; Component B value
-%define     Uq                  R0q     ; U plane address
-%define     Vq                  R3q     ; V plane address
-%define     TABLE               mangle(kCoefficientsRgbY)
-%endif
 
-; Defines for stack variables. These are used in both PIC and non-PIC code.
+; Defines for stack variables.
 %define     V_PLANE             WORD_SIZE [rsp + 2 * gprsize]
 %define     SOURCE_DX           WORD_SIZE [rsp + gprsize]
 %define     SOURCE_WIDTH        WORD_SIZE [rsp]
 
-; Handle stack variables differently for PIC and non-PIC code.
-
-%ifdef PIC
-; Define stack usage for PIC code. PIC code push U plane onto stack.
+; Define stack usage.
   PUSH      U_ARG_REGq
   PUSH      V_ARG_REGq
   PUSH      SOURCE_DX_ARG_REGq
@@ -74,20 +60,9 @@ PROLOGUE  6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
   mov       TEMPq, SOURCE_DX_ARG_REGq    ; Need to save source_dx first
   LOAD_SYM  TABLE, mangle(kCoefficientsRgbY)
 %define     SOURCE_DX_ARG_REGq  TEMPq   ; Overwrite SOURCE_DX_ARG_REGq to TEMPq
-%else
-; Define stack usage. Non-PIC code just push 3 registers to stack.
-  PUSH      V_ARG_REGq
-  PUSH      SOURCE_DX_ARG_REGq
-  imul      WIDTH_ARG_REGq, SOURCE_DX_ARG_REGq  ; source_width = width * source_dx
-  PUSH      WIDTH_ARG_REGq
-%endif
 
 %macro EPILOGUE 0
-%ifdef PIC
   ADD       rsp, 4 * gprsize
-%else
-  ADD       rsp, 3 * gprsize
-%endif
 %endmacro
 
   xor       Xq, Xq                       ; x = 0
@@ -97,9 +72,7 @@ PROLOGUE  6, 7, 3, Y, R0, R1, ARGB, R2, R3, TEMP
   jmp       .lscaleend
 
 .lscaleloop:
-%ifdef PIC
-  mov       Uq, U_PLANE                  ; PIC code saves U_PLANE on stack.
-%endif
+  mov       Uq, U_PLANE
 
 ; Define macros for scaling YUV components since they are reused.
 %macro SCALEUV 1
diff --git a/media/base/simd/scale_yuv_to_rgb_mmx.inc b/media/base/simd/scale_yuv_to_rgb_mmx.inc
index 2026390..a599b0c 100644
--- a/media/base/simd/scale_yuv_to_rgb_mmx.inc
+++ b/media/base/simd/scale_yuv_to_rgb_mmx.inc
@@ -28,34 +28,21 @@ PROLOGUE  6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
 %define     WORD_SIZE   DWORD
 %endif
 
-%ifdef PIC
   PUSH      R1q  ; Width
-%endif
   PUSH      R2q  ; Source dx
 
 %define     SOURCE_DX   WORD_SIZE [rsp]
 
-; PIC code.
-%ifdef PIC
   LOAD_SYM  R1q, mangle(kCoefficientsRgbY)
 %define     WIDTH       WORD_SIZE [rsp + gprsize]
 %define     TABLE       R1q
 %define     Xq           R2q
 
-; Non-PIC code.
-%else
-%define     WIDTH       R1q
-%define     TABLE       mangle(kCoefficientsRgbY)
-%define     Xq           R2q
-%endif
-
   ; Set Xq index to 0.
   xor       Xq, Xq
   jmp       .scaleend
 
 .scaleloop:
-  ; TABLE can either be a register or a symbol depending on this is
-  ; PIC or not.
   mov       TEMPq, Xq
   sar       TEMPq, 17
   movzx     TEMPd, BYTE [Uq + TEMPq]
@@ -83,8 +70,6 @@ PROLOGUE  6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
   add       ARGBq, 8
 
 .scaleend:
-  ; WIDTH can either be a register or memory depending on this is
-  ; PIC or not.
   sub       WIDTH, 2
   jns       .scaleloop
 
@@ -109,9 +94,5 @@ PROLOGUE  6, 7, 3, Y, U, V, ARGB, R1, R2, TEMP
   movd      DWORD [ARGBq], mm1
 
 .scaledone:
-%ifdef PIC
   ADD       rsp, 2 * gprsize
-%else
-  ADD       rsp, gprsize
-%endif
   RET
author	rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2014-05-01 00:28:15 +0000
committer	rileya@chromium.org <rileya@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2014-05-01 00:28:15 +0000
commit	b45fee76a0c12d22b77d932f33b89f49e79f9844 (patch)
tree	203afa571d78777d5cf44b78718bf0317b12e37e /media
parent	747883d7334d9fcd4e2b5902f94b949b58075ca2 (diff)
download	chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.zip chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.gz chromium_src-b45fee76a0c12d22b77d932f33b89f49e79f9844.tar.bz2