summaryrefslogtreecommitdiffstats
path: root/media/base/simd/scale_yuv_to_rgb_sse2_x64.asm
blob: bdb80c1a2f9124804ab31389d3567a5f55936010 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "third_party/x86inc/x86inc.asm"

;
; This file uses SSE and SSE2 instructions (XMM registers only).
;
  SECTION_TEXT
  CPU       SSE2

; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
;                                  const uint8* u_buf,
;                                  const uint8* v_buf,
;                                  uint8* rgb_buf,
;                                  ptrdiff_t width,
;                                  ptrdiff_t source_dx);
;
; Produces `width` output pixels of 4 bytes each at rgb_buf from one row of
; Y, U and V samples, stepping through the source with a fixed-point
; increment.
;
; Source position:
;   X starts at 0 and advances by source_dx per output pixel. The Y sample
;   index is X >> 16 and the U/V sample index is X >> 17, i.e. X carries 16
;   fractional bits and the U/V index is half the Y index. Samples are picked
;   by truncation; no interpolation is performed.
;
; Conversion:
;   Each sample byte indexes an 8-byte entry (four signed 16-bit words) in
;   kCoefficientsRgbY: Y entries start at offset 0, U entries at offset 2048
;   and V entries at offset 4096. The three entries are summed with signed
;   saturation, shifted right arithmetically by 6 and packed to bytes with
;   unsigned saturation.
;
; Output:
;   The main loop stores two pixels (8 bytes) per iteration; a final odd
;   pixel is stored as 4 bytes. Nothing is written when width is 0.
;
; Registers:
;   Named arguments and COMP are assigned by PROLOGUE; TABLE, X and INDEX are
;   pinned to r10, r11 and r12 and saved/restored around the body.
;   xmm0-xmm2 are used as scratch.
%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64

  ; PRIVATE is not defined in this file (presumably a symbol-visibility
  ; attribute supplied by the include or the build) - TODO confirm.
  global    mangle(SYMBOL) PRIVATE
  align     function_align

mangle(SYMBOL):
  %assign   stack_offset 0
  extern    mangle(kCoefficientsRgbY)

; Parameters are in the following order:
; 1. Y plane
; 2. U plane
; 3. V plane
; 4. ARGB frame
; 5. Width
; 6. Source dx

; x86inc PROLOGUE: 6 arguments, 7 general-purpose registers, 3 XMM registers.
; The first six names bind to the arguments above; COMP names the seventh
; register and is used as scratch for the current sample byte.
PROLOGUE  6, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, COMP

; Extra working registers, pinned by hand rather than allocated by PROLOGUE.
; NOTE(review): this assumes the included x86inc does not map any of the named
; arguments or COMP onto r10-r12 on the target ABI - verify against the
; register table of the x86inc version in use.
%define     TABLEq   r10
%define     Xq       r11
%define     INDEXq   r12
  ; r12 is callee-saved in both the System V and Microsoft x64 ABIs, so it
  ; must be preserved. r10/r11 are caller-saved in both; saving them is
  ; conservative. Three pushes here are matched by three pops at .scaledone.
  PUSH      r10
  PUSH      r11
  PUSH      r12

  ; TABLEq = address of kCoefficientsRgbY. LOAD_SYM is not defined in this
  ; file (presumably provided by the include).
  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)

  ; Set Xq index to 0.
  xor       Xq, Xq
  ; Enter at the loop test so that widths below 2 skip the two-pixel body.
  jmp       .scaleend

; Main loop: two output pixels per iteration, sharing one U/V sample pair.
; On entry Xq holds the fixed-point source position of the first pixel.
.scaleloop:
  ; Read UV pixels.
  mov       INDEXq, Xq
  sar       INDEXq, 17                ; U/V index = X >> 17.
  movzx     COMPd, BYTE [Uq + INDEXq]
  movq      xmm0, [TABLEq + 2048 + 8 * COMPq] ; U table entry.
  movzx     COMPd, BYTE [Vq + INDEXq]
  movq      xmm1, [TABLEq + 4096 + 8 * COMPq] ; V table entry.

  ; Read first Y pixel.
  ; Xq is consumed by the shift below; the position of the second pixel is
  ; computed into INDEXq first so it is not lost.
  lea       INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to next pixel.
  sar       Xq, 16                    ; Y index = X >> 16.
  movzx     COMPd, BYTE [Yq + Xq]
  paddsw    xmm0, xmm1		      ; Hide an ADD after memory load; xmm0 = U + V.
  movq      xmm1, [TABLEq + 8 * COMPq] ; Y table entry for the first pixel.

  ;  Read next Y pixel.
  ; Xq is rebuilt from INDEXq (position of the second pixel) before INDEXq is
  ; in turn consumed by its shift.
  lea       Xq, [INDEXq + SOURCE_DXq] ; Xq now points to next pixel.
  sar       INDEXq, 16
  movzx     COMPd, BYTE [Yq + INDEXq]
  movq      xmm2, [TABLEq + 8 * COMPq] ; Y table entry for the second pixel.
  paddsw    xmm1, xmm0                ; First pixel  = Y0 + (U + V).
  paddsw    xmm2, xmm0                ; Second pixel = Y1 + (U + V).
  ; 0x44 selects the low 64 bits of xmm1 followed by the low 64 bits of xmm2.
  shufps    xmm1, xmm2, 0x44          ; Join two pixels into one XMM register
  psraw     xmm1, 6                   ; Arithmetic shift of each 16-bit word.
  packuswb  xmm1, xmm1                ; Words -> bytes with unsigned saturation.
  movq      QWORD [ARGBq], xmm1       ; Store two 4-byte pixels.
  add       ARGBq, 8

.scaleend:
  ; Loop while at least two pixels remain (WIDTH - 2 stays non-negative).
  sub       WIDTHq, 2
  jns       .scaleloop

  ; For a non-negative starting width, WIDTHq is now -2 (even width, nothing
  ; left) or -1 (odd width, one pixel left); bit 0 distinguishes the two.
  ; NOTE(review): a negative odd width would also fall into the single-pixel
  ; path below - callers are assumed to pass width >= 0.
  and       WIDTHq, 1                 ; odd number of pixels?
  jz        .scaledone

  ; Tail: convert the single remaining pixel at position Xq.
  ; Read U V components.
  mov       INDEXq, Xq
  sar       INDEXq, 17
  movzx     COMPd, BYTE [Uq + INDEXq]
  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
  movzx     COMPd, BYTE [Vq + INDEXq]
  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]
  paddsw    xmm0, xmm1

  ; Read one Y component.
  mov       INDEXq, Xq
  sar       INDEXq, 16
  movzx     COMPd, BYTE [Yq + INDEXq]
  movq      xmm1, [TABLEq + 8 * COMPq]
  paddsw    xmm1, xmm0
  psraw     xmm1, 6
  packuswb  xmm1, xmm1
  movd      DWORD [ARGBq], xmm1       ; Store one 4-byte pixel.

.scaledone:
  ; Restore in reverse order of the pushes above.
  POP       r12
  POP       r11
  POP       r10
  RET