diff options
Diffstat (limited to 'media')
-rw-r--r-- | media/base/audio_renderer_mixer_unittest.cc | 6 |
-rw-r--r-- | media/base/sinc_resampler.cc | 129 |
-rw-r--r-- | media/base/sinc_resampler.h | 21 |
-rw-r--r-- | media/base/sinc_resampler_unittest.cc | 108 |
4 files changed, 227 insertions, 37 deletions
diff --git a/media/base/audio_renderer_mixer_unittest.cc b/media/base/audio_renderer_mixer_unittest.cc index 31b25ca..51d06e0 100644 --- a/media/base/audio_renderer_mixer_unittest.cc +++ b/media/base/audio_renderer_mixer_unittest.cc @@ -410,12 +410,12 @@ TEST_P(AudioRendererMixerTest, OnRenderError) { INSTANTIATE_TEST_CASE_P( AudioRendererMixerTest, AudioRendererMixerTest, testing::Values( // No resampling. - std::tr1::make_tuple(44100, 44100, 0.000000477), + std::tr1::make_tuple(44100, 44100, 0.00000048), // Upsampling. - std::tr1::make_tuple(44100, 48000, 0.0329405), + std::tr1::make_tuple(44100, 48000, 0.033), // Downsampling. - std::tr1::make_tuple(48000, 41000, 0.0410239))); + std::tr1::make_tuple(48000, 41000, 0.042))); } // namespace media diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc index 88e6204..8723185 100644 --- a/media/base/sinc_resampler.cc +++ b/media/base/sinc_resampler.cc @@ -36,15 +36,19 @@ #include "media/base/sinc_resampler.h" +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#include <xmmintrin.h> +#endif #include <cmath> +#include "base/cpu.h" #include "base/logging.h" namespace media { enum { // The kernel size can be adjusted for quality (higher is better) at the - // expense of performance. Must be an even number. + // expense of performance. Must be a multiple of 32. // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. kKernelSize = 32, @@ -68,10 +72,11 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) virtual_source_idx_(0), buffer_primed_(false), read_cb_(read_cb), - // TODO(dalecurtis): When we switch to AVX/SSE optimization, we'll need to - // allocate with 32-byte alignment and ensure they're sized % 32 bytes. - kernel_storage_(new float[kKernelStorageSize]), - input_buffer_(new float[kBufferSize]), + // Create input buffers with a 16-byte alignment for SSE optimizations. 
+ kernel_storage_(static_cast<float*>( + base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), + input_buffer_(static_cast<float*>( + base::AlignedAlloc(sizeof(float) * kBufferSize, 16))), // Setup various region pointers in the buffer (see diagram above). r0_(input_buffer_.get() + kKernelSize / 2), r1_(input_buffer_.get()), @@ -79,7 +84,10 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) r3_(r0_ + kBlockSize - kKernelSize / 2), r4_(r0_ + kBlockSize), r5_(r0_ + kKernelSize / 2) { - DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!"; + // Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes + // r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of + // input_buffer_ being 16-byte aligned. + DCHECK_EQ(kKernelSize % 32, 0) << "kKernelSize must be a multiple of 32!"; DCHECK_GT(kBlockSize, kKernelSize) << "kBlockSize must be greater than kKernelSize!"; // Basic sanity checks to ensure buffer regions are laid out correctly: @@ -143,7 +151,7 @@ void SincResampler::InitializeKernel() { * cos(4.0 * M_PI * x); // Window the sinc() function and store at the correct offset. - kernel_storage_[i + offset_idx * kKernelSize] = sinc * window; + kernel_storage_.get()[i + offset_idx * kKernelSize] = sinc * window; } } } @@ -168,36 +176,18 @@ void SincResampler::Resample(float* destination, int frames) { double virtual_offset_idx = subsample_remainder * kKernelOffsetCount; int offset_idx = static_cast<int>(virtual_offset_idx); + // We'll compute "convolutions" for the two kernels which straddle + // |virtual_source_idx_|. float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; float* k2 = k1 + kKernelSize; // Initialize input pointer based on quantized |virtual_source_idx_|. float* input_ptr = r1_ + source_idx; - // We'll compute "convolutions" for the two kernels which straddle - // |virtual_source_idx_|. 
- float sum1 = 0; - float sum2 = 0; - // Figure out how much to weight each kernel's "convolution". double kernel_interpolation_factor = virtual_offset_idx - offset_idx; - - // Generate a single output sample. - int n = kKernelSize; - float input; - // TODO(dalecurtis): For initial commit, I've ripped out all the SSE - // optimizations, these definitely need to go back in before release. - while (n--) { - input = *input_ptr++; - sum1 += input * *k1++; - sum2 += input * *k2++; - } - - // Linearly interpolate the two "convolutions". - double result = (1.0 - kernel_interpolation_factor) * sum1 - + kernel_interpolation_factor * sum2; - - *destination++ = result; + *destination++ = Convolve( + input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. virtual_source_idx_ += io_sample_rate_ratio_; @@ -224,4 +214,85 @@ int SincResampler::ChunkSize() { return kBlockSize / io_sample_rate_ratio_; } +float SincResampler::Convolve(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + // Rely on function level static initialization to keep ConvolveProc selection + // thread safe. + typedef float (*ConvolveProc)(const float* src, const float* k1, + const float* k2, + double kernel_interpolation_factor); +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) + static const ConvolveProc kConvolveProc = + base::CPU().has_sse() ? Convolve_SSE : Convolve_C; +#else + static const ConvolveProc kConvolveProc = Convolve_C; +#endif + + return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); +} + +float SincResampler::Convolve_C(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + float sum1 = 0; + float sum2 = 0; + + // Generate a single output sample. Unrolling this loop hurt performance in + // local testing. 
+ int n = kKernelSize; + while (n--) { + sum1 += *input_ptr * *k1++; + sum2 += *input_ptr++ * *k2++; + } + + // Linearly interpolate the two "convolutions". + return (1.0 - kernel_interpolation_factor) * sum1 + + kernel_interpolation_factor * sum2; +} + +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true + // so long as kKernelSize is a multiple of 16. + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); + + __m128 m_input; + __m128 m_sums1 = _mm_setzero_ps(); + __m128 m_sums2 = _mm_setzero_ps(); + + // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling + // these loops hurt performance in local testing. + if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_loadu_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } else { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_load_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } + + // Linearly interpolate the two "convolutions". + m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); + m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); + m_sums1 = _mm_add_ps(m_sums1, m_sums2); + + // Sum components together. 
+ float result; + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); + _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( + m_sums2, m_sums2, 1))); + + return result; +} +#endif + } // namespace media diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h index 58f5c2d..4c55eab 100644 --- a/media/base/sinc_resampler.h +++ b/media/base/sinc_resampler.h @@ -6,7 +6,10 @@ #define MEDIA_BASE_SINC_RESAMPLER_H_ #include "base/callback.h" +#include "base/gtest_prod_util.h" +#include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" +#include "build/build_config.h" #include "media/base/media_export.h" namespace media { @@ -33,8 +36,22 @@ class MEDIA_EXPORT SincResampler { int ChunkSize(); private: + FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve); + FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark); + void InitializeKernel(); + // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are + // linearly interpolated using |kernel_interpolation_factor|. The underlying + // implementation is chosen at run time based on SSE support. + static float Convolve(const float* input_ptr, const float* k1, + const float* k2, double kernel_interpolation_factor); + static float Convolve_C(const float* input_ptr, const float* k1, + const float* k2, double kernel_interpolation_factor); + static float Convolve_SSE(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor); + // The ratio of input / output sample rates. double io_sample_rate_ratio_; @@ -51,10 +68,10 @@ class MEDIA_EXPORT SincResampler { // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize. // The kernel offsets are sub-sample shifts of a windowed sinc shifted from // 0.0 to 1.0 sample. - scoped_array<float> kernel_storage_; + scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> kernel_storage_; // Data from the source is copied into this buffer for each processing pass. 
- scoped_array<float> input_buffer_; + scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_buffer_; // Pointers to the various regions inside |input_buffer_|. See the diagram at // the top of the .cc file for more information. diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc index 9bf7ba4..fa358e1 100644 --- a/media/base/sinc_resampler_unittest.cc +++ b/media/base/sinc_resampler_unittest.cc @@ -9,9 +9,10 @@ #include "base/bind.h" #include "base/bind_helpers.h" +#include "base/command_line.h" #include "base/logging.h" -#include "base/memory/scoped_ptr.h" -#include "base/stringprintf.h" +#include "base/string_number_conversions.h" +#include "base/time.h" #include "media/base/sinc_resampler.h" #include "testing/gmock/include/gmock/gmock.h" #include "testing/gtest/include/gtest/gtest.h" @@ -20,6 +21,12 @@ using testing::_; namespace media { +static const double kSampleRateRatio = 192000.0 / 44100.0; +static const double kKernelInterpolationFactor = 0.5; + +// Command line switch for runtime adjustment of ConvolveBenchmark iterations. +static const char kConvolveIterations[] = "convolve-iterations"; + // Helper class to ensure ChunkedResample() functions properly. class MockSource { public: @@ -33,7 +40,6 @@ TEST(SincResamplerTest, ChunkedResample) { // Choose a high ratio of input to output samples which will result in quick // exhaustion of SincResampler's internal buffers. - static const double kSampleRateRatio = 192000.0 / 44100.0; SincResampler resampler( kSampleRateRatio, base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source))); @@ -52,6 +58,102 @@ TEST(SincResamplerTest, ChunkedResample) { resampler.Resample(resampled_destination.get(), max_chunk_size); } +// Ensure various optimized Convolve() methods return the same value. Only run +// this test if other optimized methods exist, otherwise the default Convolve() +// will be tested by the parameterized SincResampler tests below. 
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +TEST(SincResamplerTest, Convolve) { + // Initialize a dummy resampler. + MockSource mock_source; + SincResampler resampler( + kSampleRateRatio, + base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source))); + + // Convolve_SSE() is slightly more precise than Convolve_C(), so comparison + // must be done using an epsilon. + static const double kEpsilon = 0.00000005; + + // Use a kernel from SincResampler as input and kernel data, this has the + // benefit of already being properly sized and aligned for Convolve_SSE(). + double result = resampler.Convolve_C( + resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + double result2 = resampler.Convolve_SSE( + resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + EXPECT_NEAR(result2, result, kEpsilon); + + // Test Convolve_SSE() w/ unaligned input pointer. + result = resampler.Convolve_C( + resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + result2 = resampler.Convolve_SSE( + resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + EXPECT_NEAR(result2, result, kEpsilon); +} +#endif + +// Benchmark for the various Convolve() methods. Make sure to build with +// branding=Chrome so that DCHECKs are compiled out when benchmarking. Original +// benchmarks were run with --convolve-iterations=50000000. +TEST(SincResamplerTest, ConvolveBenchmark) { + // Initialize a dummy resampler. + MockSource mock_source; + SincResampler resampler( + kSampleRateRatio, + base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source))); + + // Retrieve benchmark iterations from command line. 
+ int convolve_iterations = 10; + std::string iterations(CommandLine::ForCurrentProcess()->GetSwitchValueASCII( + kConvolveIterations)); + if (!iterations.empty()) + base::StringToInt(iterations, &convolve_iterations); + + printf("Benchmarking %d iterations:\n", convolve_iterations); + + // Benchmark Convolve_C(). + base::TimeTicks start = base::TimeTicks::HighResNow(); + for (int i = 0; i < convolve_iterations; ++i) { + resampler.Convolve_C( + resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + } + double total_time_c_ms = + (base::TimeTicks::HighResNow() - start).InMillisecondsF(); + printf("Convolve_C took %.2fms.\n", total_time_c_ms); + +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) + // Benchmark Convolve_SSE() with unaligned input pointer. + start = base::TimeTicks::HighResNow(); + for (int j = 0; j < convolve_iterations; ++j) { + resampler.Convolve_SSE( + resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + } + double total_time_sse_unaligned_ms = + (base::TimeTicks::HighResNow() - start).InMillisecondsF(); + printf("Convolve_SSE (unaligned) took %.2fms; which is %.2fx faster than" + " Convolve_C.\n", total_time_sse_unaligned_ms, + total_time_c_ms / total_time_sse_unaligned_ms); + + // Benchmark Convolve_SSE() with aligned input pointer. 
+ start = base::TimeTicks::HighResNow(); + for (int j = 0; j < convolve_iterations; ++j) { + resampler.Convolve_SSE( + resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), + resampler.kernel_storage_.get(), kKernelInterpolationFactor); + } + double total_time_sse_aligned_ms = + (base::TimeTicks::HighResNow() - start).InMillisecondsF(); + printf("Convolve_SSE (aligned) took %.2fms; which is %.2fx faster than" + " Convolve_C and %.2fx faster than Convolve_SSE (unaligned).\n", + total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms, + total_time_sse_unaligned_ms / total_time_sse_aligned_ms); +#endif +} + // Fake audio source for testing the resampler. Generates a sinusoidal linear // chirp (http://en.wikipedia.org/wiki/Chirp) which can be tuned to stress the // resampler for the specific sample rate conversion being used. |