diff options
-rw-r--r-- | media/base/simd/sinc_resampler_sse.cc | 48 | ||||
-rw-r--r-- | media/base/simd/vector_math_sse.cc | 27 | ||||
-rw-r--r-- | media/base/sinc_resampler.cc | 89 | ||||
-rw-r--r-- | media/base/sinc_resampler.h | 35 | ||||
-rw-r--r-- | media/base/sinc_resampler_unittest.cc | 11 | ||||
-rw-r--r-- | media/base/vector_math.cc | 26 | ||||
-rw-r--r-- | media/base/vector_math_testing.h | 4 | ||||
-rw-r--r-- | media/base/vector_math_unittest.cc | 8 | ||||
-rw-r--r-- | media/media.gyp | 37 |
9 files changed, 175 insertions, 110 deletions
diff --git a/media/base/simd/sinc_resampler_sse.cc b/media/base/simd/sinc_resampler_sse.cc new file mode 100644 index 0000000..f0aec1c --- /dev/null +++ b/media/base/simd/sinc_resampler_sse.cc @@ -0,0 +1,48 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "media/base/sinc_resampler.h" + +#include <xmmintrin.h> + +namespace media { + +float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m128 m_input; + __m128 m_sums1 = _mm_setzero_ps(); + __m128 m_sums2 = _mm_setzero_ps(); + + // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling + // these loops hurt performance in local testing. + if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_loadu_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } else { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_load_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } + + // Linearly interpolate the two "convolutions". + m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); + m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); + m_sums1 = _mm_add_ps(m_sums1, m_sums2); + + // Sum components together. + float result; + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); + _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( + m_sums2, m_sums2, 1))); + + return result; +} + +} // namespace media diff --git a/media/base/simd/vector_math_sse.cc b/media/base/simd/vector_math_sse.cc new file mode 100644 index 0000000..5cc2df9 --- /dev/null +++ b/media/base/simd/vector_math_sse.cc @@ -0,0 +1,27 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "media/base/vector_math_testing.h" + +#include <xmmintrin.h> // NOLINT + +namespace media { +namespace vector_math { + +void FMAC_SSE(const float src[], float scale, int len, float dest[]) { + const int rem = len % 4; + const int last_index = len - rem; + __m128 m_scale = _mm_set_ps1(scale); + for (int i = 0; i < last_index; i += 4) { + _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i), + _mm_mul_ps(_mm_load_ps(src + i), m_scale))); + } + + // Handle any remaining values that wouldn't fit in an SSE pass. + for (int i = last_index; i < len; ++i) + dest[i] += src[i] * scale; +} + +} // namespace vector_math +} // namespace media diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc index d836fc7..09ff49d 100644 --- a/media/base/sinc_resampler.cc +++ b/media/base/sinc_resampler.cc @@ -40,11 +40,6 @@ #include "base/cpu.h" #include "base/logging.h" -#include "build/build_config.h" - -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -#include <xmmintrin.h> -#endif #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) #include <arm_neon.h> @@ -52,33 +47,6 @@ namespace media { -namespace { - -enum { - // The kernel size can be adjusted for quality (higher is better) at the - // expense of performance. Must be a multiple of 32. - // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. - kKernelSize = 32, - - // The number of destination frames generated per processing pass. Affects - // how often and for how much SincResampler calls back for input. Must be - // greater than kKernelSize. - kBlockSize = 512, - - // The kernel offset count is used for interpolation and is the number of - // sub-sample kernel shifts. Can be adjusted for quality (higher is better) - // at the expense of allocating more memory. - kKernelOffsetCount = 32, - kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), - - // The size (in samples) of the internal buffer used by the resampler. - kBufferSize = kBlockSize + kKernelSize -}; - -} // namespace - -const int SincResampler::kMaximumLookAheadSize = kBufferSize; - SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) : io_sample_rate_ratio_(io_sample_rate_ratio), virtual_source_idx_(0), @@ -222,7 +190,7 @@ void SincResampler::Resample(float* destination, int frames) { } } -int SincResampler::ChunkSize() { +int SincResampler::ChunkSize() const { return kBlockSize / io_sample_rate_ratio_; } @@ -235,14 +203,23 @@ void SincResampler::Flush() { float SincResampler::Convolve(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { + // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true + // so long as kKernelSize is a multiple of 16. + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); + // Rely on function level static initialization to keep ConvolveProc selection // thread safe. typedef float (*ConvolveProc)(const float* src, const float* k1, const float* k2, double kernel_interpolation_factor); -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#if defined(ARCH_CPU_X86_FAMILY) +#if defined(__SSE__) + static const ConvolveProc kConvolveProc = Convolve_SSE; +#else static const ConvolveProc kConvolveProc = base::CPU().has_sse() ? Convolve_SSE : Convolve_C; +#endif #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) static const ConvolveProc kConvolveProc = Convolve_NEON; #else @@ -271,50 +248,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1, + kernel_interpolation_factor * sum2; } -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true - // so long as kKernelSize is a multiple of 16. - DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); - DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); - - __m128 m_input; - __m128 m_sums1 = _mm_setzero_ps(); - __m128 m_sums2 = _mm_setzero_ps(); - - // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling - // these loops hurt performance in local testing. - if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_loadu_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } else { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_load_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } - - // Linearly interpolate the two "convolutions". - m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); - m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); - m_sums1 = _mm_add_ps(m_sums1, m_sums2); - - // Sum components together. - float result; - m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); - _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( - m_sums2, m_sums2, 1))); - - return result; -} -#endif - #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, const float* k2, diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h index a1d3cf7..f4eaf5f 100644 --- a/media/base/sinc_resampler.h +++ b/media/base/sinc_resampler.h @@ -9,6 +9,7 @@ #include "base/gtest_prod_util.h" #include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" +#include "build/build_config.h" #include "media/base/media_export.h" namespace media { @@ -16,9 +17,30 @@ namespace media { // SincResampler is a high-quality single-channel sample-rate converter. class MEDIA_EXPORT SincResampler { public: - // The maximum number of samples that may be requested from the callback ahead - // of the current position in the stream. - static const int kMaximumLookAheadSize; + enum { + // The kernel size can be adjusted for quality (higher is better) at the + // expense of performance. Must be a multiple of 32. + // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. + kKernelSize = 32, + + // The number of destination frames generated per processing pass. Affects + // how often and for how much SincResampler calls back for input. Must be + // greater than kKernelSize. + kBlockSize = 512, + + // The kernel offset count is used for interpolation and is the number of + // sub-sample kernel shifts. Can be adjusted for quality (higher is better) + // at the expense of allocating more memory. + kKernelOffsetCount = 32, + kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), + + // The size (in samples) of the internal buffer used by the resampler. + kBufferSize = kBlockSize + kKernelSize, + + // The maximum number of samples that may be requested from the callback + // ahead of the current position in the stream. + kMaximumLookAheadSize = kBufferSize + }; // Callback type for providing more data into the resampler. Expects |frames| // of data to be rendered into |destination|; zero padded if not enough frames @@ -36,7 +58,7 @@ class MEDIA_EXPORT SincResampler { // The maximum size in frames that guarantees Resample() will only make a // single call to |read_cb_| for more data. - int ChunkSize(); + int ChunkSize() const; // Flush all buffered data and reset internal indices. void Flush(); @@ -55,15 +77,18 @@ class MEDIA_EXPORT SincResampler { const float* k2, double kernel_interpolation_factor); static float Convolve_C(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); +#if defined(ARCH_CPU_X86_FAMILY) static float Convolve_SSE(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) static float Convolve_NEON(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); +#endif // The ratio of input / output sample rates. - double io_sample_rate_ratio_; + const double io_sample_rate_ratio_; // An index on the source input buffer with sub-sample precision. It must be // double precision to avoid drift. diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc index 0f718f2..b7aaec4 100644 --- a/media/base/sinc_resampler_unittest.cc +++ b/media/base/sinc_resampler_unittest.cc @@ -10,6 +10,7 @@ #include "base/bind.h" #include "base/bind_helpers.h" #include "base/command_line.h" +#include "base/cpu.h" #include "base/logging.h" #include "base/string_number_conversions.h" #include "base/strings/stringize_macros.h" @@ -98,7 +99,7 @@ TEST(SincResamplerTest, Flush) { } // Define platform independent function name for Convolve* tests. -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#if defined(ARCH_CPU_X86_FAMILY) #define CONVOLVE_FUNC Convolve_SSE #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) #define CONVOLVE_FUNC Convolve_NEON @@ -109,6 +110,10 @@ TEST(SincResamplerTest, Flush) { // will be tested by the parameterized SincResampler tests below. #if defined(CONVOLVE_FUNC) TEST(SincResamplerTest, Convolve) { +#if defined(ARCH_CPU_X86_FAMILY) + ASSERT_TRUE(base::CPU().has_sse()); +#endif + // Initialize a dummy resampler. MockSource mock_source; SincResampler resampler( @@ -171,6 +176,10 @@ TEST(SincResamplerTest, ConvolveBenchmark) { printf("Convolve_C took %.2fms.\n", total_time_c_ms); #if defined(CONVOLVE_FUNC) +#if defined(ARCH_CPU_X86_FAMILY) + ASSERT_TRUE(base::CPU().has_sse()); +#endif + // Benchmark with unaligned input pointer. start = base::TimeTicks::HighResNow(); for (int j = 0; j < convolve_iterations; ++j) { diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc index edd95cd..96f94d9 100644 --- a/media/base/vector_math.cc +++ b/media/base/vector_math.cc @@ -7,11 +7,6 @@ #include "base/cpu.h" #include "base/logging.h" -#include "build/build_config.h" - -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -#include <xmmintrin.h> -#endif namespace media { namespace vector_math { @@ -25,9 +20,13 @@ void FMAC(const float src[], float scale, int len, float dest[]) { // selection thread safe. typedef void (*VectorFMACProc)(const float src[], float scale, int len, float dest[]); -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#if defined(ARCH_CPU_X86_FAMILY) +#if defined(__SSE__) + static const VectorFMACProc kVectorFMACProc = FMAC_SSE; +#else static const VectorFMACProc kVectorFMACProc = base::CPU().has_sse() ? FMAC_SSE : FMAC_C; +#endif #else static const VectorFMACProc kVectorFMACProc = FMAC_C; #endif @@ -40,20 +39,5 @@ void FMAC_C(const float src[], float scale, int len, float dest[]) { dest[i] += src[i] * scale; } -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -void FMAC_SSE(const float src[], float scale, int len, float dest[]) { - __m128 m_scale = _mm_set_ps1(scale); - int rem = len % 4; - for (int i = 0; i < len - rem; i += 4) { - _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i), - _mm_mul_ps(_mm_load_ps(src + i), m_scale))); - } - - // Handle any remaining values that wouldn't fit in an SSE pass. - if (rem) - FMAC_C(src + len - rem, scale, rem, dest + len - rem); -} -#endif - } // namespace vector_math } // namespace media diff --git a/media/base/vector_math_testing.h b/media/base/vector_math_testing.h index d364b74..503ca6a 100644 --- a/media/base/vector_math_testing.h +++ b/media/base/vector_math_testing.h @@ -5,6 +5,7 @@ #ifndef MEDIA_BASE_VECTOR_MATH_TESTING_H_ #define MEDIA_BASE_VECTOR_MATH_TESTING_H_ +#include "build/build_config.h" #include "media/base/media_export.h" namespace media { @@ -13,8 +14,11 @@ namespace vector_math { // Optimized versions of FMAC() function exposed for testing. See vector_math.h // for details. MEDIA_EXPORT void FMAC_C(const float src[], float scale, int len, float dest[]); + +#if defined(ARCH_CPU_X86_FAMILY) MEDIA_EXPORT void FMAC_SSE(const float src[], float scale, int len, float dest[]); +#endif } // namespace vector_math } // namespace media diff --git a/media/base/vector_math_unittest.cc b/media/base/vector_math_unittest.cc index 153378e..e64c7c9 100644 --- a/media/base/vector_math_unittest.cc +++ b/media/base/vector_math_unittest.cc @@ -7,6 +7,7 @@ #include <cmath> #include "base/command_line.h" +#include "base/cpu.h" #include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" #include "base/string_number_conversions.h" @@ -90,8 +91,9 @@ TEST_F(VectorMathTest, FMAC) { VerifyOutput(kResult); } -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#if defined(ARCH_CPU_X86_FAMILY) { + ASSERT_TRUE(base::CPU().has_sse()); SCOPED_TRACE("FMAC_SSE"); FillTestVectors(kInputFillValue, kOutputFillValue); vector_math::FMAC_SSE( @@ -118,7 +120,9 @@ TEST_F(VectorMathTest, FMACBenchmark) { double total_time_c_ms = (TimeTicks::HighResNow() - start).InMillisecondsF(); printf("FMAC_C took %.2fms.\n", total_time_c_ms); -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#if defined(ARCH_CPU_X86_FAMILY) + ASSERT_TRUE(base::CPU().has_sse()); + // Benchmark FMAC_SSE() with unaligned size. ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment / sizeof(float)), 0U); diff --git a/media/media.gyp b/media/media.gyp index 526c089..0b369ea 100644 --- a/media/media.gyp +++ b/media/media.gyp @@ -683,7 +683,7 @@ 'message': 'Generating Pulse stubs for dynamic loading.', }, ], - 'conditions': [ + 'conditions': [ # Linux/Solaris need libdl for dlopen() and friends. ['OS == "linux" or OS == "solaris"', { 'link_settings': { @@ -811,6 +811,12 @@ '../build/linux/system.gyp:gtk', ], }], + # ios check is necessary due to http://crbug.com/172682. + ['OS != "ios" and (target_arch == "ia32" or target_arch == "x64")', { + 'dependencies': [ + 'media_sse', + ], + }], ], 'target_conditions': [ ['OS == "ios"', { @@ -1018,12 +1024,15 @@ 'audio/audio_low_latency_input_output_unittest.cc', ], }], - [ 'target_arch=="ia32" or target_arch=="x64"', { + ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', { 'sources': [ 'base/simd/convert_rgb_to_yuv_unittest.cc', ], + 'dependencies': [ + 'media_sse', + ], }], - [ 'screen_capture_supported == 0', { + ['screen_capture_supported == 0', { 'sources/': [ ['exclude', '^video/capture/screen/'], ], @@ -1610,5 +1619,27 @@ }, # end of target differ_block_sse2 ], }], + # ios check is necessary due to http://crbug.com/172682. + ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', { + 'targets': [ + { + 'target_name': 'media_sse', + 'type': 'static_library', + 'cflags': [ + '-msse', + ], + 'include_dirs': [ + '..', + ], + 'defines': [ + 'MEDIA_IMPLEMENTATION', + ], + 'sources': [ + 'base/simd/sinc_resampler_sse.cc', + 'base/simd/vector_math_sse.cc', + ], + }, # end of target media_sse + ], + }], ], } |