summaryrefslogtreecommitdiffstats
path: root/media
diff options
context:
space:
mode:
authorfbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-04-23 10:14:29 +0000
committerfbarchard@chromium.org <fbarchard@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-04-23 10:14:29 +0000
commit85654d072ae2f7054c812375673c75606a4f8468 (patch)
tree05cbf65f575a770e46005b317048a8262a9e0960 /media
parent8b52773c63eac51c13060ad93ef775933a327550 (diff)
downloadchromium_src-85654d072ae2f7054c812375673c75606a4f8468.zip
chromium_src-85654d072ae2f7054c812375673c75606a4f8468.tar.gz
chromium_src-85654d072ae2f7054c812375673c75606a4f8468.tar.bz2
Vertical Scaler better pipelined for Atom
BUG=42064 TEST=sse2 version of scaling should be faster on Atom. No quality change. Review URL: http://codereview.chromium.org/1700010 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@45431 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r--media/base/yuv_convert.cc166
-rw-r--r--media/base/yuv_convert.h6
-rw-r--r--media/tools/scaler_bench/scaler_bench.cc10
3 files changed, 105 insertions, 77 deletions
diff --git a/media/base/yuv_convert.cc b/media/base/yuv_convert.cc
index 569f8b8..bea0e50 100644
--- a/media/base/yuv_convert.cc
+++ b/media/base/yuv_convert.cc
@@ -70,85 +70,96 @@ void ConvertYUVToRGB32(const uint8* y_buf,
#if USE_SSE2
// FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version blends 8 pixels at a time.
+// SSE2 version does 16 pixels at a time
+
static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
int source_width, int source_y_fraction) {
__m128i zero = _mm_setzero_si128();
- __m128i y1_fraction = _mm_set1_epi16(
- static_cast<uint16>(source_y_fraction >> 8));
- __m128i y0_fraction = _mm_set1_epi16(
- static_cast<uint16>(256 - (source_y_fraction >> 8)));
+ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
+ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
- uint8* end = ybuf + source_width;
- if (ybuf < end) {
- do {
- __m128i y0 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y0_ptr));
- __m128i y1 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y1_ptr));
- y0 = _mm_unpacklo_epi8(y0, zero);
- y1 = _mm_unpacklo_epi8(y1, zero);
- y0 = _mm_mullo_epi16(y0, y0_fraction);
- y1 = _mm_mullo_epi16(y1, y1_fraction);
- y0 = _mm_add_epi16(y0, y1); // 8.8 fixed point result
- y0 = _mm_srli_epi16(y0, 8);
- y0 = _mm_packus_epi16(y0, y0);
- _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y0);
- y0_ptr += 8;
- y1_ptr += 8;
- ybuf += 8;
- } while (ybuf < end);
- }
-}
+ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
+ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
+ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
+ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
+ do {
+ __m128i y0 = _mm_loadu_si128(y0_ptr128);
+ __m128i y1 = _mm_loadu_si128(y1_ptr128);
+ __m128i y2 = _mm_unpackhi_epi8(y0, zero);
+ __m128i y3 = _mm_unpackhi_epi8(y1, zero);
+ y0 = _mm_unpacklo_epi8(y0, zero);
+ y1 = _mm_unpacklo_epi8(y1, zero);
+ y0 = _mm_mullo_epi16(y0, y0_fraction);
+ y1 = _mm_mullo_epi16(y1, y1_fraction);
+ y2 = _mm_mullo_epi16(y2, y0_fraction);
+ y3 = _mm_mullo_epi16(y3, y1_fraction);
+ y0 = _mm_add_epi16(y0, y1);
+ y2 = _mm_add_epi16(y2, y3);
+ y0 = _mm_srli_epi16(y0, 8);
+ y2 = _mm_srli_epi16(y2, 8);
+ y0 = _mm_packus_epi16(y0, y2);
+ *dest128++ = y0;
+ ++y0_ptr128;
+ ++y1_ptr128;
+ } while (dest128 < end128);
+}
#elif USE_MMX
-// MMX version blends 4 pixels at a time.
+// MMX version does 8 pixels at a time
static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
int source_width, int source_y_fraction) {
__m64 zero = _mm_setzero_si64();
- __m64 y1_fraction = _mm_set1_pi16(
- static_cast<int16>(source_y_fraction >> 8));
- __m64 y0_fraction = _mm_set1_pi16(
- static_cast<int16>(256 - (source_y_fraction >> 8)));
+ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
+ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
- uint8* end = ybuf + source_width;
- if (ybuf < end) {
- do {
- __m64 y0 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y0_ptr));
- __m64 y1 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y1_ptr));
- y0 = _mm_unpacklo_pi8(y0, zero);
- y1 = _mm_unpacklo_pi8(y1, zero);
- y0 = _mm_mullo_pi16(y0, y0_fraction);
- y1 = _mm_mullo_pi16(y1, y1_fraction);
- y0 = _mm_add_pi16(y0, y1); // 8.8 fixed point result
- y0 = _mm_srli_pi16(y0, 8);
- y0 = _mm_packs_pu16(y0, y0);
- *reinterpret_cast<int *>(ybuf) = _mm_cvtsi64_si32(y0);
- y0_ptr += 4;
- y1_ptr += 4;
- ybuf += 4;
- } while (ybuf < end);
- }
+ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
+ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
+ __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
+ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
+
+ do {
+ __m64 y0 = *y0_ptr64++;
+ __m64 y1 = *y1_ptr64++;
+ __m64 y2 = _mm_unpackhi_pi8(y0, zero);
+ __m64 y3 = _mm_unpackhi_pi8(y1, zero);
+ y0 = _mm_unpacklo_pi8(y0, zero);
+ y1 = _mm_unpacklo_pi8(y1, zero);
+ y0 = _mm_mullo_pi16(y0, y0_fraction);
+ y1 = _mm_mullo_pi16(y1, y1_fraction);
+ y2 = _mm_mullo_pi16(y2, y0_fraction);
+ y3 = _mm_mullo_pi16(y3, y1_fraction);
+ y0 = _mm_add_pi16(y0, y1);
+ y2 = _mm_add_pi16(y2, y3);
+ y0 = _mm_srli_pi16(y0, 8);
+ y2 = _mm_srli_pi16(y2, 8);
+ y0 = _mm_packs_pu16(y0, y2);
+ *dest64++ = y0;
+ } while (dest64 < end64);
}
#else // no MMX or SSE2
-// C version blends 4 pixels at a time.
+// C version does 8 at a time to mimic MMX code
static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
int source_width, int source_y_fraction) {
- int y1_fraction = source_y_fraction >> 8;
- int y0_fraction = 256 - (source_y_fraction >> 8);
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
uint8* end = ybuf + source_width;
- if (ybuf < end) {
- do {
- ybuf[0] = (y0_ptr[0] * (y0_fraction) + y1_ptr[0] * (y1_fraction)) >> 8;
- ybuf[1] = (y0_ptr[1] * (y0_fraction) + y1_ptr[1] * (y1_fraction)) >> 8;
- ybuf[2] = (y0_ptr[2] * (y0_fraction) + y1_ptr[2] * (y1_fraction)) >> 8;
- ybuf[3] = (y0_ptr[3] * (y0_fraction) + y1_ptr[3] * (y1_fraction)) >> 8;
- y0_ptr += 4;
- y1_ptr += 4;
- ybuf += 4;
- } while (ybuf < end);
- }
+ do {
+ ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
+ ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
+ ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
+ ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
+ ybuf[4] = (y0_ptr[4] * y0_fraction + y1_ptr[4] * y1_fraction) >> 8;
+ ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
+ ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
+ ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
+ y0_ptr += 8;
+ y1_ptr += 8;
+ ybuf += 8;
+ } while (ybuf < end);
}
#endif
+
// Scale a frame of YUV to 32 bit ARGB.
void ScaleYUVToRGB32(const uint8* y_buf,
const uint8* u_buf,
@@ -164,10 +175,13 @@ void ScaleYUVToRGB32(const uint8* y_buf,
YUVType yuv_type,
Rotate view_rotate,
ScaleFilter filter) {
- const int kFilterBufferSize = 8192;
+ // 4096 allows 3 buffers to fit in 12k.
+ // Helps performance on CPU with 16K L1 cache.
+  // Large enough for 3840x2160 and 30" displays which are 2560x1600.
+ const int kFilterBufferSize = 4096;
// Disable filtering if the screen is too big (to avoid buffer overflows).
// This should never happen to regular users: they don't have monitors
- // wider than 8192 pixels.
+ // wider than 4096 pixels.
// TODO(fbarchard): Allow rotated videos to filter.
if (source_width > kFilterBufferSize || view_rotate)
filter = FILTER_NONE;
@@ -230,13 +244,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
}
}
- // Need padding because FilterRows() may write up to 15 extra pixels
+ // Need padding because FilterRows() will write 1 to 16 extra pixels
// after the end for SSE2 version.
- uint8 ybuf[kFilterBufferSize + 16];
- uint8 ubuf[kFilterBufferSize / 2 + 16];
- uint8 vbuf[kFilterBufferSize / 2 + 16];
+ uint8 yuvbuf[16 + kFilterBufferSize * 3 + 16];
+ uint8* ybuf =
+ reinterpret_cast<uint8*>(reinterpret_cast<uintptr_t>(yuvbuf + 15) & ~15);
+ uint8* ubuf = ybuf + kFilterBufferSize;
+ uint8* vbuf = ubuf + kFilterBufferSize;
// TODO(fbarchard): Fixed point math is off by 1 on negatives.
int yscale_fixed = (source_height << kFractionBits) / height;
+
+ // TODO(fbarchard): Split this into separate function for better efficiency.
for (int y = 0; y < height; ++y) {
uint8* dest_pixel = rgb_buf + y * rgb_pitch;
int source_y_subpixel = (y * yscale_fixed);
@@ -253,15 +271,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
const uint8* v0_ptr = v_buf + (source_y >> y_shift) * uv_pitch;
const uint8* v1_ptr = v0_ptr + uv_pitch;
- int source_y_fraction = source_y_subpixel & kFractionMask;
- int source_uv_fraction = (source_y_subpixel >> y_shift) & kFractionMask;
+ // vertical scaler uses 16.8 fixed point
+ int source_y_fraction = (source_y_subpixel & kFractionMask) >> 8;
+ int source_uv_fraction =
+ ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
const uint8* y_ptr = y0_ptr;
const uint8* u_ptr = u0_ptr;
const uint8* v_ptr = v0_ptr;
// Apply vertical filtering if necessary.
// TODO(fbarchard): Remove memcpy when not necessary.
- if (filter == media::FILTER_BILINEAR) {
+ if (filter & media::FILTER_BILINEAR_V) {
if (yscale_fixed != kFractionMax &&
source_y_fraction && ((source_y + 1) < source_height)) {
FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
@@ -289,10 +309,10 @@ void ScaleYUVToRGB32(const uint8* y_buf,
FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
dest_pixel, width);
} else {
- if (filter == FILTER_BILINEAR)
+ if (filter & FILTER_BILINEAR_H) {
LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
dest_pixel, width, source_dx);
- else {
+ } else {
// Specialized scalers and rotation.
#if USE_MMX && defined(_MSC_VER)
if (width == (source_width * 2)) {
@@ -303,7 +323,7 @@ void ScaleYUVToRGB32(const uint8* y_buf,
ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
dest_pixel, width,
source_dx >> kFractionBits);
- } else if (source_dx_uv == source_dx) { // Not rotated.
+ } else if (source_dx_uv == source_dx) { // Not rotated.
ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
dest_pixel, width, source_dx);
} else {
diff --git a/media/base/yuv_convert.h b/media/base/yuv_convert.h
index d55337d..24a2c4e 100644
--- a/media/base/yuv_convert.h
+++ b/media/base/yuv_convert.h
@@ -31,8 +31,10 @@ enum Rotate {
// Filter affects how scaling looks.
enum ScaleFilter {
- FILTER_NONE, // No filter (point sampled).
- FILTER_BILINEAR, // Bilinear filter.
+ FILTER_NONE = 0, // No filter (point sampled).
+ FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
+ FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
+ FILTER_BILINEAR = 3, // Bilinear filter.
};
// Convert a frame of YUV to 32 bit ARGB.
diff --git a/media/tools/scaler_bench/scaler_bench.cc b/media/tools/scaler_bench/scaler_bench.cc
index a50efa0..749e1a6 100644
--- a/media/tools/scaler_bench/scaler_bench.cc
+++ b/media/tools/scaler_bench/scaler_bench.cc
@@ -165,13 +165,13 @@ int main(int argc, const char** argv) {
source_height = 0;
}
- std::string dest_width_param(cmd_line->GetSwitchValueASCII("dst-w"));
+ std::string dest_width_param(cmd_line->GetSwitchValueASCII("dest-w"));
if (!dest_width_param.empty() &&
!StringToInt(dest_width_param, &dest_width)) {
dest_width = 0;
}
- std::string dest_height_param(cmd_line->GetSwitchValueASCII("dst-h"));
+ std::string dest_height_param(cmd_line->GetSwitchValueASCII("dest-h"));
if (!dest_height_param.empty() &&
!StringToInt(dest_height_param, &dest_height)) {
dest_height = 0;
@@ -200,6 +200,12 @@ int main(int argc, const char** argv) {
<< "ms/frame" << std::endl;
std::cout << "No filtering: " << BenchmarkFilter(media::FILTER_NONE)
<< "ms/frame" << std::endl;
+ std::cout << "Bilinear Vertical: "
+ << BenchmarkFilter(media::FILTER_BILINEAR_V)
+ << "ms/frame" << std::endl;
+ std::cout << "Bilinear Horizontal: "
+ << BenchmarkFilter(media::FILTER_BILINEAR_H)
+ << "ms/frame" << std::endl;
std::cout << "Bilinear: " << BenchmarkFilter(media::FILTER_BILINEAR)
<< "ms/frame" << std::endl;