// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "media/base/yuv_convert.h" #include "media/base/yuv_convert_internal.h" #include "media/base/yuv_row.h" #if defined(_MSC_VER) #include #else #include #include #endif namespace media { // This is the final offset for the conversion from signed yuv values to // unsigned values. It is arranged so that offset of 16 is applied to Y // components and 128 is added to UV components for 2 pixels. SIMD_ALIGNED(const int16 kYuvOffset[8]) = {16, 0, 128, 128, 16, 0, 128, 128}; void FastConvertRGB32ToYUVRow(const uint8* rgb_buf_1, const uint8* rgb_buf_2, uint8* y_buf_1, uint8* y_buf_2, uint8* u_buf, uint8* v_buf, int width) { // 64-bits system has more registers and so saving the pointers to the tables // will be faster. But on 32-bits system we want to save registers so calculate // the offset each time. #if defined(ARCH_CPU_X86_64) const uint64* r_table = reinterpret_cast(kCoefficientsYuvR); const uint64* g_table = reinterpret_cast(kCoefficientsYuvR + 256); const uint64* b_table = reinterpret_cast(kCoefficientsYuvR + 512); #define R_TABLE r_table #define G_TABLE g_table #define B_TABLE b_table #else // On 32-bits system we compute the offset. const uint64* r_table = reinterpret_cast(kCoefficientsYuvR); #define R_TABLE r_table #define G_TABLE (r_table + 256) #define B_TABLE (r_table + 512) #endif uint16* y_row_1 = reinterpret_cast(y_buf_1); uint16* y_row_2 = reinterpret_cast(y_buf_2); __m128i offset = _mm_load_si128(reinterpret_cast(kYuvOffset)); for (; width > 1; width -= 2) { // MSVC generates very different intrinsic code for reading that affects // performance significantly so we define two different ways. #if defined(COMPILER_MSVC) uint32 pixel; #define READ_RGB(buf) pixel = *reinterpret_cast(buf); buf += 4; #define READ_B(buf) (pixel & 0xFF) #define READ_G(buf) ((pixel >> 8) & 0xFF) #define READ_R(buf) ((pixel >> 16) & 0xFF) #define READ_A(buf) #else #define READ_RGB(buf) #define READ_B(buf) (*buf++) #define READ_G(buf) (*buf++) #define READ_R(buf) (*buf++) #define READ_A(buf) (buf++) #endif // Load the first pixel (a) in the first row. READ_RGB(rgb_buf_1); __m128i b_comp_a = _mm_loadl_epi64( reinterpret_cast(B_TABLE + READ_B(rgb_buf_1))); __m128i g_comp_a = _mm_loadl_epi64( reinterpret_cast(G_TABLE + READ_G(rgb_buf_1))); __m128i r_comp_a = _mm_loadl_epi64( reinterpret_cast(R_TABLE + READ_R(rgb_buf_1))); READ_A(rgb_buf_1); // Load the second pixel (b) in the first row. READ_RGB(rgb_buf_1); __m128i b_comp_b = _mm_loadl_epi64( reinterpret_cast(B_TABLE + READ_B(rgb_buf_1))); __m128i g_comp_b = _mm_loadl_epi64( reinterpret_cast(G_TABLE + READ_G(rgb_buf_1))); __m128i r_comp_b = _mm_loadl_epi64( reinterpret_cast(R_TABLE + READ_R(rgb_buf_1))); READ_A(rgb_buf_1); // Pack two pixels into one register. __m128i b_comp_ab = _mm_unpacklo_epi64(b_comp_a, b_comp_b); __m128i g_comp_ab = _mm_unpacklo_epi64(g_comp_a, g_comp_b); __m128i r_comp_ab = _mm_unpacklo_epi64(r_comp_a, r_comp_b); // Add the coefficients together. // 127 0 // |yuv_ab| will be (Vb Ub 0 Yb Va Ua 0 Ya). __m128i yuv_ab = _mm_adds_epi16(r_comp_ab, _mm_adds_epi16(g_comp_ab, b_comp_ab)); // Right shift 6 bits to perform divide by 64 and then add the offset. yuv_ab = _mm_srai_epi16(yuv_ab, 6); yuv_ab = _mm_adds_epi16(yuv_ab, offset); // Do a shuffle to obtain (Vb Ub Va Ua 0 Yb 0 Ya). yuv_ab = _mm_shuffle_epi32(yuv_ab, 0xD8); // Load the first pixel (c) in the second row. READ_RGB(rgb_buf_2); __m128i b_comp_c = _mm_loadl_epi64( reinterpret_cast(B_TABLE + READ_B(rgb_buf_2))); __m128i g_comp_c = _mm_loadl_epi64( reinterpret_cast(G_TABLE + READ_G(rgb_buf_2))); __m128i r_comp_c = _mm_loadl_epi64( reinterpret_cast(R_TABLE + READ_R(rgb_buf_2))); READ_A(rgb_buf_2); // Load the second pixel (d) in the second row. READ_RGB(rgb_buf_2); __m128i b_comp_d = _mm_loadl_epi64( reinterpret_cast(B_TABLE + READ_B(rgb_buf_2))); __m128i g_comp_d = _mm_loadl_epi64( reinterpret_cast(G_TABLE + READ_G(rgb_buf_2))); __m128i r_comp_d = _mm_loadl_epi64( reinterpret_cast(R_TABLE + READ_R(rgb_buf_2))); READ_A(rgb_buf_2); // Pack two pixels into one register. __m128i b_comp_cd = _mm_unpacklo_epi64(b_comp_c, b_comp_d); __m128i g_comp_cd = _mm_unpacklo_epi64(g_comp_c, g_comp_d); __m128i r_comp_cd = _mm_unpacklo_epi64(r_comp_c, r_comp_d); // Add the coefficients together. // 127 0 // |yuv_cd| will be (Vd Ud 0 Yd Vc Uc 0 Yc). __m128i yuv_cd = _mm_adds_epi16(r_comp_cd, _mm_adds_epi16(g_comp_cd, b_comp_cd)); // Right shift 6 bits to perform divide by 64 and then add the offset. yuv_cd = _mm_srai_epi16(yuv_cd, 6); yuv_cd = _mm_adds_epi16(yuv_cd, offset); // Do a shuffle to obtain (Vd Ud Vc Uc 0 Yd 0 Yc). yuv_cd = _mm_shuffle_epi32(yuv_cd, 0xD8); // This will form (0 Yd 0 Yc 0 Yb 0 Ya). __m128i y_comp = _mm_unpacklo_epi64(yuv_ab, yuv_cd); // Pack to 8-bits Y values. y_comp = _mm_packs_epi32(y_comp, y_comp); y_comp = _mm_packus_epi16(y_comp, y_comp); // And then store. *y_row_1++ = _mm_extract_epi16(y_comp, 0); *y_row_2++ = _mm_extract_epi16(y_comp, 1); // Find the average between pixel a and c then move to lower 64bit. __m128i uv_comp = _mm_avg_epu16(yuv_ab, yuv_cd); uv_comp = _mm_unpackhi_epi64(uv_comp, uv_comp); // Shuffle and then find average again. __m128i uv_comp_2 = _mm_shuffle_epi32(uv_comp, 0x01); uv_comp = _mm_avg_epu16(uv_comp, uv_comp_2); // Pack to 8-bits then store. uv_comp = _mm_packus_epi16(uv_comp, uv_comp); int uv = _mm_extract_epi16(uv_comp, 0); *u_buf++ = uv & 0xff; *v_buf++ = uv >> 8; } } #undef R_TABLE #undef G_TABLE #undef B_TABLE #undef READ_RGB #undef READ_R #undef READ_G #undef READ_B #undef READ_A extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe, uint8* yplane, uint8* uplane, uint8* vplane, int width, int height, int rgbstride, int ystride, int uvstride) { // Make sure |width| is a multiple of 2. width = (width / 2) * 2; for (int i = 0; i < height; i += 2) { FastConvertRGB32ToYUVRow(rgbframe, rgbframe + rgbstride, yplane, yplane + ystride, uplane, vplane, width); rgbframe += 2 * rgbstride; yplane += 2 * ystride; uplane += uvstride; vplane += uvstride; } } } // namespace media