media/base/yuv_convert_sse2.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "media/base/yuv_convert.h"
#include "media/base/yuv_convert_internal.h"
#include "media/base/yuv_row.h"

#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif

namespace media {

// Per-lane offsets added after the fixed-point shift to move the signed
// intermediate YUV values into their unsigned output range. The eight 16-bit
// lanes cover two pixels; counting up from the lowest lane each pixel is laid
// out as (Y, unused, U, V), so Y receives +16, the unused lane receives 0 and
// U and V each receive +128.
SIMD_ALIGNED(const int16 kYuvOffset[8]) = {16, 0, 128, 128, 16, 0, 128, 128};

// Converts two vertically adjacent rows of 32-bit pixels into two rows of Y
// samples plus one row each of U and V samples, where every U/V sample is the
// average over a 2x2 block of source pixels.
//
// |rgb_buf_1|, |rgb_buf_2|: the upper and lower source rows. Each pixel is
//     four bytes consumed in B, G, R, A order; the A byte is skipped.
// |y_buf_1|, |y_buf_2|: destination Y rows, one byte per source pixel.
// |u_buf|, |v_buf|: destination chroma rows, one byte per two source columns.
// |width|: number of pixels per row. Pixels are consumed in pairs, so a
//     trailing odd pixel is left unconverted.
void FastConvertRGB32ToYUVRow(const uint8* rgb_buf_1,
                              const uint8* rgb_buf_2,
                              uint8* y_buf_1,
                              uint8* y_buf_2,
                              uint8* u_buf,
                              uint8* v_buf,
                              int width) {
// The coefficient table is viewed as an array of 64-bit entries, with the R
// entries first, the G entries starting at index 256 and the B entries at
// index 512. A 64-bit build has registers to spare, so it keeps a pointer to
// each sub-table. A 32-bit build keeps only the base pointer and recomputes
// the other two on every use to save registers.
// NOTE(review): the 64-bit branch applies the +256/+512 offset before the
// cast and the 32-bit branch applies it after. The two agree only if each
// element of kCoefficientsYuvR is 8 bytes wide -- confirm against its
// declaration.
#if defined(ARCH_CPU_X86_64)
  const uint64* r_table = reinterpret_cast<uint64*>(kCoefficientsYuvR);
  const uint64* g_table = reinterpret_cast<uint64*>(kCoefficientsYuvR + 256);
  const uint64* b_table = reinterpret_cast<uint64*>(kCoefficientsYuvR + 512);
#define R_TABLE r_table
#define G_TABLE g_table
#define B_TABLE b_table
#else // On 32-bits system we compute the offset.
  const uint64* r_table = reinterpret_cast<uint64*>(kCoefficientsYuvR);
#define R_TABLE r_table
#define G_TABLE (r_table + 256)
#define B_TABLE (r_table + 512)
#endif

  // Y samples are written two at a time through 16-bit pointers.
  // NOTE(review): this assumes unaligned 16-bit stores into the uint8 output
  // rows are acceptable on the target -- confirm.
  uint16* y_row_1 = reinterpret_cast<uint16*>(y_buf_1);
  uint16* y_row_2 = reinterpret_cast<uint16*>(y_buf_2);
  // Aligned 128-bit load; kYuvOffset is declared with SIMD_ALIGNED
  // (presumably 16-byte alignment -- confirm against the macro definition).
  __m128i offset = _mm_load_si128(reinterpret_cast<const __m128i*>(kYuvOffset));

  // Each iteration converts a 2x2 block: pixels a, b from the first row and
  // pixels c, d from the second row.
  for (; width > 1; width -= 2) {
// MSVC generates very different intrinsic code for reading that affects
// performance significantly so we define two different ways.
//
// MSVC variant: READ_RGB loads a whole pixel into |pixel| and advances the
// pointer by four bytes; READ_B/G/R then only mask and shift |pixel|, and
// READ_A expands to nothing.
//
// Other compilers: READ_RGB expands to nothing and each of READ_B/G/R/A
// consumes exactly one byte from the pointer, so the macros must be invoked
// in B, G, R, A order for every pixel.
#if defined(COMPILER_MSVC)
uint32 pixel;
#define READ_RGB(buf) pixel = *reinterpret_cast<const uint32*>(buf); buf += 4;
#define READ_B(buf) (pixel & 0xFF)
#define READ_G(buf) ((pixel >> 8) & 0xFF)
#define READ_R(buf) ((pixel >> 16) & 0xFF)
#define READ_A(buf)
#else
#define READ_RGB(buf)
#define READ_B(buf) (*buf++)
#define READ_G(buf) (*buf++)
#define READ_R(buf) (*buf++)
#define READ_A(buf) (buf++)
#endif
    // Load the first pixel (a) in the first row. Each channel value indexes
    // its table and the 64-bit entry lands in the low half of a register.
    READ_RGB(rgb_buf_1);
    __m128i b_comp_a = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(B_TABLE + READ_B(rgb_buf_1)));
    __m128i g_comp_a = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(G_TABLE + READ_G(rgb_buf_1)));
    __m128i r_comp_a = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(R_TABLE + READ_R(rgb_buf_1)));
    READ_A(rgb_buf_1);

    // Load the second pixel (b) in the first row.
    READ_RGB(rgb_buf_1);
    __m128i b_comp_b = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(B_TABLE + READ_B(rgb_buf_1)));
    __m128i g_comp_b = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(G_TABLE + READ_G(rgb_buf_1)));
    __m128i r_comp_b = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(R_TABLE + READ_R(rgb_buf_1)));
    READ_A(rgb_buf_1);

    // Pack two pixels into one register: pixel a in the low 64 bits and
    // pixel b in the high 64 bits.
    __m128i b_comp_ab = _mm_unpacklo_epi64(b_comp_a, b_comp_b);
    __m128i g_comp_ab = _mm_unpacklo_epi64(g_comp_a, g_comp_b);
    __m128i r_comp_ab = _mm_unpacklo_epi64(r_comp_a, r_comp_b);

    // Add the coefficients together with saturating 16-bit adds.
    //                  127                  0
    // |yuv_ab| will be (Vb Ub 0 Yb Va Ua 0 Ya).
    __m128i yuv_ab = _mm_adds_epi16(r_comp_ab,
                                    _mm_adds_epi16(g_comp_ab, b_comp_ab));

    // Right shift 6 bits to perform divide by 64 and then add the offset.
    // The shift is arithmetic, so negative U/V sums keep their sign.
    yuv_ab = _mm_srai_epi16(yuv_ab, 6);
    yuv_ab = _mm_adds_epi16(yuv_ab, offset);

    // Do a shuffle to obtain (Vb Ub Va Ua 0 Yb 0 Ya): the two middle 32-bit
    // lanes are swapped so that the Y lanes share the low 64 bits and the
    // U/V lanes share the high 64 bits.
    yuv_ab = _mm_shuffle_epi32(yuv_ab, 0xD8);

    // Load the first pixel (c) in the second row.
    READ_RGB(rgb_buf_2);
    __m128i b_comp_c = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(B_TABLE + READ_B(rgb_buf_2)));
    __m128i g_comp_c = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(G_TABLE + READ_G(rgb_buf_2)));
    __m128i r_comp_c = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(R_TABLE + READ_R(rgb_buf_2)));
    READ_A(rgb_buf_2);

    // Load the second pixel (d) in the second row.
    READ_RGB(rgb_buf_2);
    __m128i b_comp_d = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(B_TABLE + READ_B(rgb_buf_2)));
    __m128i g_comp_d = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(G_TABLE + READ_G(rgb_buf_2)));
    __m128i r_comp_d = _mm_loadl_epi64(
        reinterpret_cast<const __m128i*>(R_TABLE + READ_R(rgb_buf_2)));
    READ_A(rgb_buf_2);

    // Pack two pixels into one register: pixel c in the low 64 bits and
    // pixel d in the high 64 bits.
    __m128i b_comp_cd = _mm_unpacklo_epi64(b_comp_c, b_comp_d);
    __m128i g_comp_cd = _mm_unpacklo_epi64(g_comp_c, g_comp_d);
    __m128i r_comp_cd = _mm_unpacklo_epi64(r_comp_c, r_comp_d);

    // Add the coefficients together with saturating 16-bit adds.
    //                   127                  0
    // |yuv_cd| will be (Vd Ud 0 Yd Vc Uc 0 Yc).
    __m128i yuv_cd = _mm_adds_epi16(r_comp_cd,
                                    _mm_adds_epi16(g_comp_cd, b_comp_cd));

    // Right shift 6 bits to perform divide by 64 and then add the offset.
    yuv_cd = _mm_srai_epi16(yuv_cd, 6);
    yuv_cd = _mm_adds_epi16(yuv_cd, offset);

    // Do a shuffle to obtain (Vd Ud Vc Uc 0 Yd 0 Yc).
    yuv_cd = _mm_shuffle_epi32(yuv_cd, 0xD8);

    // This will form (0 Yd 0 Yc 0 Yb 0 Ya), i.e. one Y value per 32-bit lane.
    __m128i y_comp = _mm_unpacklo_epi64(yuv_ab, yuv_cd);

    // Pack to 8-bits Y values: narrow 32 -> 16 bits and then 16 -> 8 bits
    // (the second pack saturates to the unsigned byte range), leaving the
    // bytes Ya Yb Yc Yd at the bottom of the register.
    y_comp = _mm_packs_epi32(y_comp, y_comp);
    y_comp = _mm_packus_epi16(y_comp, y_comp);

    // And then store. 16-bit lane 0 carries (Yb Ya) for the first row and
    // lane 1 carries (Yd Yc) for the second row, so each store writes two
    // adjacent Y bytes.
    *y_row_1++ = _mm_extract_epi16(y_comp, 0);
    *y_row_2++ = _mm_extract_epi16(y_comp, 1);

    // Average vertically (a with c, b with d), then copy the high 64 bits,
    // which hold the U/V lanes, into the low 64 bits.
    __m128i uv_comp = _mm_avg_epu16(yuv_ab, yuv_cd);
    uv_comp = _mm_unpackhi_epi64(uv_comp, uv_comp);

    // Bring the b/d column's U/V pair into the lowest 32-bit lane and average
    // horizontally with the a/c column's pair, completing the 2x2 average.
    __m128i uv_comp_2 = _mm_shuffle_epi32(uv_comp, 0x01);
    uv_comp = _mm_avg_epu16(uv_comp, uv_comp_2);

    // Pack to 8-bits then store. After packing, the lowest 16 bits hold U in
    // the low byte and V in the high byte.
    uv_comp = _mm_packus_epi16(uv_comp, uv_comp);
    int uv = _mm_extract_epi16(uv_comp, 0);

    *u_buf++ = uv & 0xff;
    *v_buf++ = uv >> 8;
  }
}

// Remove the helper macros defined inside FastConvertRGB32ToYUVRow so they do
// not leak into the rest of this translation unit.
#undef R_TABLE
#undef G_TABLE
#undef B_TABLE
#undef READ_RGB
#undef READ_R
#undef READ_G
#undef READ_B
#undef READ_A

// Converts a frame of 32-bit pixels into separate Y, U and V planes by
// handing FastConvertRGB32ToYUVRow two source rows at a time. Each call fills
// two rows of the Y plane and one row each of the U and V planes.
//
// |rgbframe|: first byte of the source frame.
// |yplane|, |uplane|, |vplane|: first byte of each destination plane.
// |width|: frame width in pixels. It is rounded down to a multiple of 2, so
//     the last column of an odd-width frame is not converted.
// |height|: frame height in rows. A non-positive value converts nothing.
// |rgbstride|, |ystride|, |uvstride|: distance in bytes between the starts of
//     consecutive rows in the source frame, the Y plane and the U/V planes.
extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
                                   uint8* yplane,
                                   uint8* uplane,
                                   uint8* vplane,
                                   int width,
                                   int height,
                                   int rgbstride,
                                   int ystride,
                                   int uvstride) {
  // Make sure |width| is a multiple of 2.
  width = (width / 2) * 2;

  // Convert every complete pair of rows.
  int rows_left = height;
  for (; rows_left > 1; rows_left -= 2) {
    FastConvertRGB32ToYUVRow(rgbframe,
                             rgbframe + rgbstride,
                             yplane,
                             yplane + ystride,
                             uplane,
                             vplane,
                             width);
    rgbframe += 2 * rgbstride;
    yplane += 2 * ystride;
    uplane += uvstride;
    vplane += uvstride;
  }

  // An odd |height| leaves one unpaired row. Pairing it with the row below
  // would read and write one row past the end of the frame, so the row is
  // paired with itself instead: its Y samples are written (twice, with
  // identical values) and its chroma is averaged over that row alone.
  if (rows_left == 1) {
    FastConvertRGB32ToYUVRow(rgbframe,
                             rgbframe,
                             yplane,
                             yplane,
                             uplane,
                             vplane,
                             width);
  }
}

}  // namespace media