diff options
author | fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-04-21 23:52:58 +0000 |
---|---|---|
committer | fbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-04-21 23:52:58 +0000 |
commit | a32abb551892ef6a9987a5372b3c9896adb7a55e (patch) | |
tree | f87b249aca0c5cd5e0efb8ea01623c32f7392fa2 /media | |
parent | 75a0fc40f52f44c7c11324b840895c716cf1752e (diff) | |
download | chromium_src-a32abb551892ef6a9987a5372b3c9896adb7a55e.zip chromium_src-a32abb551892ef6a9987a5372b3c9896adb7a55e.tar.gz chromium_src-a32abb551892ef6a9987a5372b3c9896adb7a55e.tar.bz2 |
Speed up vertical filtering using v = a+(b-a)*x formula
BUG=42064
TEST=unittests should still pass
Review URL: http://codereview.chromium.org/1733004
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@45265 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- | media/base/yuv_convert.cc | 77 | ||||
-rw-r--r-- | media/base/yuv_convert.h | 6 | ||||
-rw-r--r-- | media/tools/scaler_bench/scaler_bench.cc | 6 |
3 files changed, 50 insertions, 39 deletions
diff --git a/media/base/yuv_convert.cc b/media/base/yuv_convert.cc index 569f8b8..0f6170d 100644 --- a/media/base/yuv_convert.cc +++ b/media/base/yuv_convert.cc @@ -70,14 +70,12 @@ void ConvertYUVToRGB32(const uint8* y_buf, #if USE_SSE2 // FilterRows combines two rows of the image using linear interpolation. -// SSE2 version blends 8 pixels at a time. +// Blends 8 pixels at a time. static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, int source_width, int source_y_fraction) { __m128i zero = _mm_setzero_si128(); __m128i y1_fraction = _mm_set1_epi16( static_cast<uint16>(source_y_fraction >> 8)); - __m128i y0_fraction = _mm_set1_epi16( - static_cast<uint16>(256 - (source_y_fraction >> 8))); uint8* end = ybuf + source_width; if (ybuf < end) { @@ -86,64 +84,69 @@ static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, __m128i y1 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y1_ptr)); y0 = _mm_unpacklo_epi8(y0, zero); y1 = _mm_unpacklo_epi8(y1, zero); - y0 = _mm_mullo_epi16(y0, y0_fraction); + y1 = _mm_sub_epi16(y1, y0); y1 = _mm_mullo_epi16(y1, y1_fraction); - y0 = _mm_add_epi16(y0, y1); // 8.8 fixed point result - y0 = _mm_srli_epi16(y0, 8); - y0 = _mm_packus_epi16(y0, y0); - _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y0); + y1 = _mm_srai_epi16(y1, 8); + y1 = _mm_add_epi16(y1, y0); + y1 = _mm_packus_epi16(y1, y1); + _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y1); y0_ptr += 8; y1_ptr += 8; ybuf += 8; } while (ybuf < end); } } - #elif USE_MMX -// MMX version blends 4 pixels at a time. static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, int source_width, int source_y_fraction) { __m64 zero = _mm_setzero_si64(); __m64 y1_fraction = _mm_set1_pi16( - static_cast<int16>(source_y_fraction >> 8)); - __m64 y0_fraction = _mm_set1_pi16( - static_cast<int16>(256 - (source_y_fraction >> 8))); + static_cast<int16>(source_y_fraction >> 8)); uint8* end = ybuf + source_width; if (ybuf < end) { do { - __m64 y0 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y0_ptr)); - __m64 y1 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y1_ptr)); - y0 = _mm_unpacklo_pi8(y0, zero); - y1 = _mm_unpacklo_pi8(y1, zero); - y0 = _mm_mullo_pi16(y0, y0_fraction); + __m64 y2 = *reinterpret_cast<const __m64 *>(y0_ptr); + __m64 y3 = *reinterpret_cast<const __m64 *>(y1_ptr); + __m64 y0 = _mm_unpacklo_pi8(y2, zero); + __m64 y1 = _mm_unpacklo_pi8(y3, zero); + y2 = _mm_unpackhi_pi8(y2, zero); + y3 = _mm_unpackhi_pi8(y3, zero); + y1 = _mm_sub_pi16(y1, y0); + y3 = _mm_sub_pi16(y3, y2); y1 = _mm_mullo_pi16(y1, y1_fraction); - y0 = _mm_add_pi16(y0, y1); // 8.8 fixed point result - y0 = _mm_srli_pi16(y0, 8); - y0 = _mm_packs_pu16(y0, y0); - *reinterpret_cast<int *>(ybuf) = _mm_cvtsi64_si32(y0); - y0_ptr += 4; - y1_ptr += 4; - ybuf += 4; + y3 = _mm_mullo_pi16(y3, y1_fraction); + y1 = _mm_srai_pi16(y1, 8); + y3 = _mm_srai_pi16(y3, 8); + y1 = _mm_add_pi16(y1, y0); + y3 = _mm_add_pi16(y3, y2); + y0 = _mm_packs_pu16(y1, y3); + *reinterpret_cast<__m64 *>(ybuf) = y0; + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; } while (ybuf < end); } } #else // no MMX or SSE2 -// C version blends 4 pixels at a time. + static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, int source_width, int source_y_fraction) { - int y1_fraction = source_y_fraction >> 8; - int y0_fraction = 256 - (source_y_fraction >> 8); + int y1_fraction = (source_y_fraction >> 8); uint8* end = ybuf + source_width; if (ybuf < end) { do { - ybuf[0] = (y0_ptr[0] * (y0_fraction) + y1_ptr[0] * (y1_fraction)) >> 8; - ybuf[1] = (y0_ptr[1] * (y0_fraction) + y1_ptr[1] * (y1_fraction)) >> 8; - ybuf[2] = (y0_ptr[2] * (y0_fraction) + y1_ptr[2] * (y1_fraction)) >> 8; - ybuf[3] = (y0_ptr[3] * (y0_fraction) + y1_ptr[3] * (y1_fraction)) >> 8; - y0_ptr += 4; - y1_ptr += 4; - ybuf += 4; + ybuf[0] = y0_ptr[0] + (((y1_ptr[0] - y0_ptr[0]) * y1_fraction) >> 8); + ybuf[1] = y0_ptr[1] + (((y1_ptr[1] - y0_ptr[1]) * y1_fraction) >> 8); + ybuf[2] = y0_ptr[2] + (((y1_ptr[2] - y0_ptr[2]) * y1_fraction) >> 8); + ybuf[3] = y0_ptr[3] + (((y1_ptr[3] - y0_ptr[3]) * y1_fraction) >> 8); + ybuf[4] = y0_ptr[4] + (((y1_ptr[4] - y0_ptr[4]) * y1_fraction) >> 8); + ybuf[5] = y0_ptr[5] + (((y1_ptr[5] - y0_ptr[5]) * y1_fraction) >> 8); + ybuf[6] = y0_ptr[6] + (((y1_ptr[6] - y0_ptr[6]) * y1_fraction) >> 8); + ybuf[7] = y0_ptr[7] + (((y1_ptr[7] - y0_ptr[7]) * y1_fraction) >> 8); + y0_ptr += 8; + y1_ptr += 8; + ybuf += 8; } while (ybuf < end); } } @@ -261,7 +264,7 @@ void ScaleYUVToRGB32(const uint8* y_buf, const uint8* v_ptr = v0_ptr; // Apply vertical filtering if necessary. // TODO(fbarchard): Remove memcpy when not necessary. - if (filter == media::FILTER_BILINEAR) { + if (filter & media::FILTER_BILINEAR_V) { if (yscale_fixed != kFractionMax && source_y_fraction && ((source_y + 1) < source_height)) { FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); @@ -289,7 +292,7 @@ void ScaleYUVToRGB32(const uint8* y_buf, FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, dest_pixel, width); } else { - if (filter == FILTER_BILINEAR) + if (filter & FILTER_BILINEAR_H) LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, dest_pixel, width, source_dx); else { diff --git a/media/base/yuv_convert.h b/media/base/yuv_convert.h index d55337d..24a2c4e 100644 --- a/media/base/yuv_convert.h +++ b/media/base/yuv_convert.h @@ -31,8 +31,10 @@ enum Rotate { // Filter affects how scaling looks. enum ScaleFilter { - FILTER_NONE, // No filter (point sampled). - FILTER_BILINEAR, // Bilinear filter. + FILTER_NONE = 0, // No filter (point sampled). + FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. + FILTER_BILINEAR_V = 2, // Bilinear vertical filter. + FILTER_BILINEAR = 3, // Bilinear filter. }; // Convert a frame of YUV to 32 bit ARGB. diff --git a/media/tools/scaler_bench/scaler_bench.cc b/media/tools/scaler_bench/scaler_bench.cc index a50efa0..60417ad 100644 --- a/media/tools/scaler_bench/scaler_bench.cc +++ b/media/tools/scaler_bench/scaler_bench.cc @@ -200,6 +200,12 @@ int main(int argc, const char** argv) { << "ms/frame" << std::endl; std::cout << "No filtering: " << BenchmarkFilter(media::FILTER_NONE) << "ms/frame" << std::endl; + std::cout << "Bilinear Vertical: " + << BenchmarkFilter(media::FILTER_BILINEAR_V) + << "ms/frame" << std::endl; + std::cout << "Bilinear Horizontal: " + << BenchmarkFilter(media::FILTER_BILINEAR_H) + << "ms/frame" << std::endl; std::cout << "Bilinear: " << BenchmarkFilter(media::FILTER_BILINEAR) << "ms/frame" << std::endl; |