summary refs log tree commit diff stats
path: root/media
diff options
context:
space:
mode:
Diffstat (limited to 'media')
-rw-r--r-- media/base/audio_renderer_mixer_unittest.cc 6
-rw-r--r-- media/base/sinc_resampler.cc 129
-rw-r--r-- media/base/sinc_resampler.h 21
-rw-r--r-- media/base/sinc_resampler_unittest.cc 108
4 files changed, 227 insertions, 37 deletions
diff --git a/media/base/audio_renderer_mixer_unittest.cc b/media/base/audio_renderer_mixer_unittest.cc
index 31b25ca..51d06e0 100644
--- a/media/base/audio_renderer_mixer_unittest.cc
+++ b/media/base/audio_renderer_mixer_unittest.cc
@@ -410,12 +410,12 @@ TEST_P(AudioRendererMixerTest, OnRenderError) {
INSTANTIATE_TEST_CASE_P(
AudioRendererMixerTest, AudioRendererMixerTest, testing::Values(
// No resampling.
- std::tr1::make_tuple(44100, 44100, 0.000000477),
+ std::tr1::make_tuple(44100, 44100, 0.00000048),
// Upsampling.
- std::tr1::make_tuple(44100, 48000, 0.0329405),
+ std::tr1::make_tuple(44100, 48000, 0.033),
// Downsampling.
- std::tr1::make_tuple(48000, 41000, 0.0410239)));
+ std::tr1::make_tuple(48000, 41000, 0.042)));
} // namespace media
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc
index 88e6204..8723185 100644
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -36,15 +36,19 @@
#include "media/base/sinc_resampler.h"
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
#include <cmath>
+#include "base/cpu.h"
#include "base/logging.h"
namespace media {
enum {
// The kernel size can be adjusted for quality (higher is better) at the
- // expense of performance. Must be an even number.
+ // expense of performance. Must be a multiple of 32.
// TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
kKernelSize = 32,
@@ -68,10 +72,11 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
virtual_source_idx_(0),
buffer_primed_(false),
read_cb_(read_cb),
- // TODO(dalecurtis): When we switch to AVX/SSE optimization, we'll need to
- // allocate with 32-byte alignment and ensure they're sized % 32 bytes.
- kernel_storage_(new float[kKernelStorageSize]),
- input_buffer_(new float[kBufferSize]),
+ // Allocate kernel and input storage with 16-byte alignment for SSE usage.
+ kernel_storage_(static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),
+ input_buffer_(static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * kBufferSize, 16))),
// Setup various region pointers in the buffer (see diagram above).
r0_(input_buffer_.get() + kKernelSize / 2),
r1_(input_buffer_.get()),
@@ -79,7 +84,10 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
r3_(r0_ + kBlockSize - kKernelSize / 2),
r4_(r0_ + kBlockSize),
r5_(r0_ + kKernelSize / 2) {
- DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!";
+ // Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes
+ // r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of
+ // input_buffer_ being 16-byte aligned.
+ DCHECK_EQ(kKernelSize % 32, 0) << "kKernelSize must be a multiple of 32!";
DCHECK_GT(kBlockSize, kKernelSize)
<< "kBlockSize must be greater than kKernelSize!";
// Basic sanity checks to ensure buffer regions are laid out correctly:
@@ -143,7 +151,7 @@ void SincResampler::InitializeKernel() {
* cos(4.0 * M_PI * x);
// Window the sinc() function and store at the correct offset.
- kernel_storage_[i + offset_idx * kKernelSize] = sinc * window;
+ kernel_storage_.get()[i + offset_idx * kKernelSize] = sinc * window;
}
}
}
@@ -168,36 +176,18 @@ void SincResampler::Resample(float* destination, int frames) {
double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;
int offset_idx = static_cast<int>(virtual_offset_idx);
+ // We'll compute "convolutions" for the two kernels which straddle
+ // |virtual_source_idx_|.
float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
float* k2 = k1 + kKernelSize;
// Initialize input pointer based on quantized |virtual_source_idx_|.
float* input_ptr = r1_ + source_idx;
- // We'll compute "convolutions" for the two kernels which straddle
- // |virtual_source_idx_|.
- float sum1 = 0;
- float sum2 = 0;
-
// Figure out how much to weight each kernel's "convolution".
double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
-
- // Generate a single output sample.
- int n = kKernelSize;
- float input;
- // TODO(dalecurtis): For initial commit, I've ripped out all the SSE
- // optimizations, these definitely need to go back in before release.
- while (n--) {
- input = *input_ptr++;
- sum1 += input * *k1++;
- sum2 += input * *k2++;
- }
-
- // Linearly interpolate the two "convolutions".
- double result = (1.0 - kernel_interpolation_factor) * sum1
- + kernel_interpolation_factor * sum2;
-
- *destination++ = result;
+ *destination++ = Convolve(
+ input_ptr, k1, k2, kernel_interpolation_factor);
// Advance the virtual index.
virtual_source_idx_ += io_sample_rate_ratio_;
@@ -224,4 +214,85 @@ int SincResampler::ChunkSize() {
return kBlockSize / io_sample_rate_ratio_;
}
+float SincResampler::Convolve(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor) {
+ // Rely on function level static initialization to select the ConvolveProc once
+ // (intended to be thread safe): SSE if compiled in and the CPU has it, else C.
+ typedef float (*ConvolveProc)(const float* src, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor);
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+ static const ConvolveProc kConvolveProc =
+ base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#else
+ static const ConvolveProc kConvolveProc = Convolve_C;
+#endif
+
+ return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
+}
+
+float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor) {
+ float sum1 = 0;
+ float sum2 = 0;
+
+ // Generate a single output sample: dot kKernelSize inputs with each kernel.
+ // Unrolling this loop hurt performance in local testing.
+ int n = kKernelSize;
+ while (n--) {
+ sum1 += *input_ptr * *k1++;
+ sum2 += *input_ptr++ * *k2++;
+ }
+
+ // Linearly interpolate the two "convolutions"; double result narrows to float.
+ return (1.0 - kernel_interpolation_factor) * sum1
+ + kernel_interpolation_factor * sum2;
+}
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor) {
+ // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
+ // as kernel_storage_ is 16-byte aligned and kKernelSize is a multiple of 32.
+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
+ __m128 m_input;
+ __m128 m_sums1 = _mm_setzero_ps();
+ __m128 m_sums2 = _mm_setzero_ps();
+
+ // Use loadu when |input_ptr| is not 16-byte aligned, load otherwise. Unrolling
+ // these loops hurt performance in local testing.
+ if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
+ for (int i = 0; i < kKernelSize; i += 4) {
+ m_input = _mm_loadu_ps(input_ptr + i);
+ m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+ m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+ }
+ } else {
+ for (int i = 0; i < kKernelSize; i += 4) {
+ m_input = _mm_load_ps(input_ptr + i);
+ m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+ m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+ }
+ }
+
+ // Linearly interpolate the two "convolutions" lane by lane (4 partial sums).
+ m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
+ m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
+ m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+ // Sum the four lanes: fold upper pair onto lower, then add lanes 0 and 1.
+ float result;
+ m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+ _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
+ m_sums2, m_sums2, 1)));
+
+ return result;
+}
+#endif  // defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+
} // namespace media
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index 58f5c2d..4c55eab 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -6,7 +6,10 @@
#define MEDIA_BASE_SINC_RESAMPLER_H_
#include "base/callback.h"
+#include "base/gtest_prod_util.h"
+#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
+#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
@@ -33,8 +36,22 @@ class MEDIA_EXPORT SincResampler {
int ChunkSize();
private:
+ FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve);
+ FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark);
+
void InitializeKernel();
+ // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are
+ // linearly interpolated using |kernel_interpolation_factor|. The underlying
+ // implementation is chosen at run time based on SSE support.
+ static float Convolve(const float* input_ptr, const float* k1,
+ const float* k2, double kernel_interpolation_factor);
+ static float Convolve_C(const float* input_ptr, const float* k1,
+ const float* k2, double kernel_interpolation_factor);
+ static float Convolve_SSE(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor);
+
// The ratio of input / output sample rates.
double io_sample_rate_ratio_;
@@ -51,10 +68,10 @@ class MEDIA_EXPORT SincResampler {
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from
// 0.0 to 1.0 sample.
- scoped_array<float> kernel_storage_;
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> kernel_storage_;
// Data from the source is copied into this buffer for each processing pass.
- scoped_array<float> input_buffer_;
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_buffer_;
// Pointers to the various regions inside |input_buffer_|. See the diagram at
// the top of the .cc file for more information.
diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc
index 9bf7ba4..fa358e1 100644
--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -9,9 +9,10 @@
#include "base/bind.h"
#include "base/bind_helpers.h"
+#include "base/command_line.h"
#include "base/logging.h"
-#include "base/memory/scoped_ptr.h"
-#include "base/stringprintf.h"
+#include "base/string_number_conversions.h"
+#include "base/time.h"
#include "media/base/sinc_resampler.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
@@ -20,6 +21,12 @@ using testing::_;
namespace media {
+static const double kSampleRateRatio = 192000.0 / 44100.0;
+static const double kKernelInterpolationFactor = 0.5;
+
+// Command line switch for runtime adjustment of ConvolveBenchmark iterations.
+static const char kConvolveIterations[] = "convolve-iterations";
+
// Helper class to ensure ChunkedResample() functions properly.
class MockSource {
public:
@@ -33,7 +40,6 @@ TEST(SincResamplerTest, ChunkedResample) {
// Choose a high ratio of input to output samples which will result in quick
// exhaustion of SincResampler's internal buffers.
- static const double kSampleRateRatio = 192000.0 / 44100.0;
SincResampler resampler(
kSampleRateRatio,
base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
@@ -52,6 +58,102 @@ TEST(SincResamplerTest, ChunkedResample) {
resampler.Resample(resampled_destination.get(), max_chunk_size);
}
+// Ensure various optimized Convolve() methods return the same value. Only run
+// this test if other optimized methods exist, otherwise the default Convolve()
+// will be tested by the parameterized SincResampler tests below.
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+TEST(SincResamplerTest, Convolve) {
+ // Initialize a dummy resampler; only its kernel storage is used below.
+ MockSource mock_source;
+ SincResampler resampler(
+ kSampleRateRatio,
+ base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
+
+ // Convolve_SSE() sums in a different order and interpolates in float, so its
+ // result can differ slightly from Convolve_C(); compare using an epsilon.
+ static const double kEpsilon = 0.00000005;
+
+ // Use a kernel from SincResampler as input and kernel data; this has the
+ // benefit of already being properly sized and aligned for Convolve_SSE().
+ double result = resampler.Convolve_C(
+ resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ double result2 = resampler.Convolve_SSE(
+ resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ EXPECT_NEAR(result2, result, kEpsilon);
+
+ // Test Convolve_SSE() w/ unaligned input pointer (offset by one float).
+ result = resampler.Convolve_C(
+ resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ result2 = resampler.Convolve_SSE(
+ resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ EXPECT_NEAR(result2, result, kEpsilon);
+}
+#endif
+
+// Benchmark for the various Convolve() methods. Make sure to build with
+// branding=Chrome so that DCHECKs are compiled out when benchmarking. Original
+// benchmarks were run with --convolve-iterations=50000000.
+TEST(SincResamplerTest, ConvolveBenchmark) {
+ // Initialize a dummy resampler; only its kernel storage is used below.
+ MockSource mock_source;
+ SincResampler resampler(
+ kSampleRateRatio,
+ base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
+
+ // Iterations come from the command line (default 10); parse result unchecked.
+ int convolve_iterations = 10;
+ std::string iterations(CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
+ kConvolveIterations));
+ if (!iterations.empty())
+ base::StringToInt(iterations, &convolve_iterations);
+
+ printf("Benchmarking %d iterations:\n", convolve_iterations);
+
+ // Benchmark Convolve_C(); this time is the baseline for the ratios below.
+ base::TimeTicks start = base::TimeTicks::HighResNow();
+ for (int i = 0; i < convolve_iterations; ++i) {
+ resampler.Convolve_C(
+ resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ }
+ double total_time_c_ms =
+ (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+ printf("Convolve_C took %.2fms.\n", total_time_c_ms);
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+ // Benchmark Convolve_SSE() with unaligned input pointer (offset by one float).
+ start = base::TimeTicks::HighResNow();
+ for (int j = 0; j < convolve_iterations; ++j) {
+ resampler.Convolve_SSE(
+ resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ }
+ double total_time_sse_unaligned_ms =
+ (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+ printf("Convolve_SSE (unaligned) took %.2fms; which is %.2fx faster than"
+ " Convolve_C.\n", total_time_sse_unaligned_ms,
+ total_time_c_ms / total_time_sse_unaligned_ms);
+
+ // Benchmark Convolve_SSE() with aligned input pointer.
+ start = base::TimeTicks::HighResNow();
+ for (int j = 0; j < convolve_iterations; ++j) {
+ resampler.Convolve_SSE(
+ resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+ resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+ }
+ double total_time_sse_aligned_ms =
+ (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+ printf("Convolve_SSE (aligned) took %.2fms; which is %.2fx faster than"
+ " Convolve_C and %.2fx faster than Convolve_SSE (unaligned).\n",
+ total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms,
+ total_time_sse_unaligned_ms / total_time_sse_aligned_ms);
+#endif
+}
+
// Fake audio source for testing the resampler. Generates a sinusoidal linear
// chirp (http://en.wikipedia.org/wiki/Chirp) which can be tuned to stress the
// resampler for the specific sample rate conversion being used.