summaryrefslogtreecommitdiffstats
path: root/media/base/simd/convert_yuv_to_rgb_mmx.inc
blob: b9555cee50a69c82662a0b678743d7e07b8535a9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

  ; Export the routine and align its entry point. SYMBOL, mangle(), PRIVATE and
  ; function_align are not defined in this file; presumably the including .asm
  ; file defines SYMBOL (the exported function name) and the shared macro
  ; headers supply the rest -- confirm against the includer.
  global    mangle(SYMBOL) PRIVATE
  align     function_align

; Non-PIC code is the fastest so use this if possible.
;
; Converts one row of planar Y/U/V bytes to 4-byte output pixels using the
; lookup table kCoefficientsRgbY, which is addressed directly by symbol here.
;
; Named registers (declared by PROLOGUE below): Y, U, V, ARGB, WIDTH are the
; five inputs; TEMPU and TEMPV are scratch index registers.
;   Y     - pointer to luma bytes; two are consumed per loop iteration
;   U, V  - pointers to chroma bytes; one of each is consumed per iteration
;   ARGB  - output pointer; 8 bytes (two pixels) are written per iteration
;   WIDTH - number of output pixels
;
; Table layout as used by this code: every entry is 8 bytes (four 16-bit
; words) and is indexed by a byte value, so each sub-table spans 256 * 8 =
; 2048 bytes. Offset 0 is indexed by Y, offset 2048 by U, offset 4096 by V.
;
; NOTE(review): no emms is visible in this routine; presumably the caller
; clears the MMX state before any x87 use -- confirm.
%ifndef PIC
mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, TEMPU, TEMPV
  extern    mangle(kCoefficientsRgbY)
  ; Enter at the loop test so that WIDTH < 2 never runs the two-pixel body.
  jmp       .convertend

  ; Two-pixel body: both pixels share one U/V sample. TEMPU/TEMPV first hold
  ; the chroma indexes and are then reused for the two luma indexes; loads are
  ; interleaved with the arithmetic.
.convertloop:
  movzx     TEMPUd, BYTE [Uq]
  add       Uq, 1
  movzx     TEMPVd, BYTE [Vq]
  add       Vq, 1
  ; mm0 = U entry + V entry (signed saturating add on each 16-bit word).
  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
  movzx     TEMPUd, BYTE [Yq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
  movzx     TEMPVd, BYTE [Yq + 1]
  ; mm1 / mm2 = table entries for the first and second luma byte.
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
  add       Yq, 2
  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPVq]
  ; Add the shared chroma contribution to each pixel.
  paddsw    mm1, mm0
  paddsw    mm2, mm0
  ; Arithmetic shift right by 6 (presumably the table holds fixed-point values
  ; with 6 fractional bits -- confirm against the table definition).
  psraw     mm1, 6
  psraw     mm2, 6
  ; Pack the eight words into eight unsigned-saturated bytes: the first pixel
  ; lands in the low 4 bytes, the second in the high 4 bytes.
  packuswb  mm1, mm2
  ; MOVQ (uppercase) is not defined in this file; presumably the includer
  ; defines it to select the 8-byte store instruction -- confirm.
  MOVQ      [ARGBq], mm1
  add       ARGBq, 8

  ; Loop test: take two pixels off WIDTH and continue while the result is
  ; still non-negative (sign flag clear).
.convertend:
  sub       WIDTHq, 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.
  ; WIDTH is now -1 (odd count) or -2 (even count), so its low bit tells
  ; whether one pixel is left over.
  and       WIDTHq, 1
  jz        .convertdone

  ; Single trailing pixel: same lookup and arithmetic with one luma byte, and
  ; only 4 bytes are stored. The pointers are not advanced afterwards.
  movzx     TEMPUd, BYTE [Uq]
  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPUq]
  movzx     TEMPVd, BYTE [Vq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPVq]
  movzx     TEMPUd, BYTE [Yq]
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPUq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1
  movd      [ARGBq], mm1

.convertdone:
  RET
%endif

; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
; This code is slower than the above version.
;
; Same conversion as the non-PIC variant: one row of planar Y/U/V bytes is
; turned into 4-byte output pixels via the kCoefficientsRgbY lookup table.
; The difference is that the table base is first loaded into a register
; (TABLE), which leaves a single scratch index register (TEMP).
;
; Named registers (declared by PROLOGUE below): Y, U, V, ARGB, WIDTH are the
; five inputs; TEMP is the scratch index and TABLE holds the table base.
;
; Table layout as used by this code: every entry is 8 bytes and is indexed by
; a byte value, so each sub-table spans 256 * 8 = 2048 bytes. Offset 0 is
; indexed by Y, offset 2048 by U, offset 4096 by V.
;
; NOTE(review): no emms is visible in this routine; presumably the caller
; clears the MMX state before any x87 use -- confirm.
%ifdef PIC
mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, TEMP, TABLE

  ; LOAD_SYM is defined outside this file; it is used here to place the
  ; address of the coefficient table in TABLEq.
  extern    mangle(kCoefficientsRgbY)
  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)

  ; Enter at the loop test so that WIDTH < 2 never runs the two-pixel body.
  jmp       .convertend

  ; Two-pixel body: both pixels share one U/V sample.
.convertloop:
  ; mm0 = table entry for the U byte.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + 2048 + 8 * TEMPq]
  add       Uq, 1

  ; mm0 += table entry for the V byte (signed saturating add per 16-bit word).
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq]
  add       Vq, 1

  ; mm1 = table entry for the first luma byte.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]

  ; mm2 = table entry for the second luma byte.
  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [TABLEq + 8 * TEMPq]
  add       Yq, 2

  ; Add UV components to Y component.
  paddsw    mm1, mm0
  paddsw    mm2, mm0

  ; Down shift and then pack.
  ; The shift is arithmetic (presumably the table holds fixed-point values
  ; with 6 fractional bits -- confirm). packuswb saturates each word to an
  ; unsigned byte; the first pixel lands in the low 4 bytes, the second in
  ; the high 4 bytes.
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2
  ; MOVQ (uppercase) is not defined in this file; presumably the includer
  ; defines it to select the 8-byte store instruction -- confirm.
  MOVQ      [ARGBq], mm1
  add       ARGBq, 8

  ; Loop test: take two pixels off WIDTH and continue while the result is
  ; still non-negative (sign flag clear).
.convertend:
  sub       WIDTHq, 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.
  ; WIDTH is now -1 (odd count) or -2 (even count), so its low bit tells
  ; whether one pixel is left over.
  and       WIDTHq, 1
  jz        .convertdone

  ; Single trailing pixel: same lookup and arithmetic with one luma byte, and
  ; only 4 bytes are stored. The pointers are not advanced afterwards.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + 2048 + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1
  movd      [ARGBq], mm1

.convertdone:
  RET
%endif