Add SSE optimizations to SincResampler.

These are not the same optimizations as in the WebKit version of SincResampler. The WebKit version focuses on aligning the input vector, resulting in at worst two unaligned loads on each kernel index, or 2 * kKernelSize / 4 unaligned loads per call. Instead I chose to focus on keeping the kernel vectors aligned and eating at worst a single unaligned load on the input vector, or kKernelSize / 4 unaligned loads per call. Performance results from SincResamplerTest.ConvolveBenchmark: clang version 3.2 (trunk 159409): Convolve_C took 2100ms for 50000000 iterations. Convolve_SSE (aligned) took 677ms for 50000000 iterations. Convolve_SSE (unaligned) took 717ms for 50000000 iterations. gcc (Ubuntu 4.4.3-4ubuntu5.1) 4.4.3: Convolve_C took 2183ms for 50000000 iterations. Convolve_SSE (aligned) took 806ms for 50000000 iterations. Convolve_SSE (unaligned) took 844ms for 50000000 iterations. For reference, the original WebKit optimizations: clang version 3.2 (trunk 159409): Convolve_C took 2132ms for 50000000 iterations. Convolve_SSE (aligned) took 1146ms for 50000000 iterations. Convolve_SSE (unaligned) took 1797ms for 50000000 iterations. gcc (Ubuntu 4.4.3-4ubuntu5.1) 4.4.3: Convolve_C took 2209ms for 50000000 iterations. Convolve_SSE (aligned) took 1450ms for 50000000 iterations. Convolve_SSE (unaligned) took 4415ms for 50000000 iterations. In summary, SSE provides a ~2.6x to ~3x speedup on GCC and clang respectively. BUG=133637 TEST=media_unittests + SincResampler/* tests. Review URL: https://chromiumcodereview.appspot.com/10803003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@149569 0039d316-1c4b-4281-b951-d872f2087c98
author: dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2012-08-02 02:59:42 +0000
committer: dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2012-08-02 02:59:42 +0000
commit: fce61dcbdd3fb31839ba056b204bf903580a163b (patch)
tree: 5a8c37cfe0867ea14e0e37be2a7ce381a4ac2e27 /media
parent: 3770d6951c0db5ef047e77bcaee1b5bd9864f952 (diff)
download: chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.zip
chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.tar.gz
chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.tar.bz2
4 files changed, 227 insertions, 37 deletions
diff --git a/media/base/audio_renderer_mixer_unittest.cc b/media/base/audio_renderer_mixer_unittest.cc
index 31b25ca..51d06e0 100644
--- a/media/base/audio_renderer_mixer_unittest.cc
+++ b/media/base/audio_renderer_mixer_unittest.cc
@@ -410,12 +410,12 @@ TEST_P(AudioRendererMixerTest, OnRenderError) {
 INSTANTIATE_TEST_CASE_P(
     AudioRendererMixerTest, AudioRendererMixerTest, testing::Values(
         // No resampling.
-        std::tr1::make_tuple(44100, 44100, 0.000000477),
+        std::tr1::make_tuple(44100, 44100, 0.00000048),
 
         // Upsampling.
-        std::tr1::make_tuple(44100, 48000, 0.0329405),
+        std::tr1::make_tuple(44100, 48000, 0.033),
 
         // Downsampling.
-        std::tr1::make_tuple(48000, 41000, 0.0410239)));
+        std::tr1::make_tuple(48000, 41000, 0.042)));
 
 }  // namespace media
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc
index 88e6204..8723185 100644
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -36,15 +36,19 @@
 
 #include "media/base/sinc_resampler.h"
 
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
 #include <cmath>
 
+#include "base/cpu.h"
 #include "base/logging.h"
 
 namespace media {
 
 enum {
   // The kernel size can be adjusted for quality (higher is better) at the
-  // expense of performance.  Must be an even number.
+  // expense of performance.  Must be a multiple of 32.
   // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
   kKernelSize = 32,
 
@@ -68,10 +72,11 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
       virtual_source_idx_(0),
       buffer_primed_(false),
       read_cb_(read_cb),
-      // TODO(dalecurtis): When we switch to AVX/SSE optimization, we'll need to
-      // allocate with 32-byte alignment and ensure they're sized % 32 bytes.
-      kernel_storage_(new float[kKernelStorageSize]),
-      input_buffer_(new float[kBufferSize]),
+      // Create input buffers with a 16-byte alignment for SSE optimizations.
+      kernel_storage_(static_cast<float*>(
+          base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),
+      input_buffer_(static_cast<float*>(
+          base::AlignedAlloc(sizeof(float) * kBufferSize, 16))),
       // Setup various region pointers in the buffer (see diagram above).
       r0_(input_buffer_.get() + kKernelSize / 2),
       r1_(input_buffer_.get()),
@@ -79,7 +84,10 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
       r3_(r0_ + kBlockSize - kKernelSize / 2),
       r4_(r0_ + kBlockSize),
       r5_(r0_ + kKernelSize / 2) {
-  DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!";
+  // Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes
+  // r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of
+  // input_buffer_ being 16-byte aligned.
+  DCHECK_EQ(kKernelSize % 32, 0) << "kKernelSize must be a multiple of 32!";
   DCHECK_GT(kBlockSize, kKernelSize)
       << "kBlockSize must be greater than kKernelSize!";
   // Basic sanity checks to ensure buffer regions are laid out correctly:
@@ -143,7 +151,7 @@ void SincResampler::InitializeKernel() {
           * cos(4.0 * M_PI * x);
 
       // Window the sinc() function and store at the correct offset.
-      kernel_storage_[i + offset_idx * kKernelSize] = sinc * window;
+      kernel_storage_.get()[i + offset_idx * kKernelSize] = sinc * window;
     }
   }
 }
@@ -168,36 +176,18 @@ void SincResampler::Resample(float* destination, int frames) {
       double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;
       int offset_idx = static_cast<int>(virtual_offset_idx);
 
+      // We'll compute "convolutions" for the two kernels which straddle
+      // |virtual_source_idx_|.
       float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
       float* k2 = k1 + kKernelSize;
 
       // Initialize input pointer based on quantized |virtual_source_idx_|.
       float* input_ptr = r1_ + source_idx;
 
-      // We'll compute "convolutions" for the two kernels which straddle
-      // |virtual_source_idx_|.
-      float sum1 = 0;
-      float sum2 = 0;
-
       // Figure out how much to weight each kernel's "convolution".
       double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
-
-      // Generate a single output sample.
-      int n = kKernelSize;
-      float input;
-      // TODO(dalecurtis): For initial commit, I've ripped out all the SSE
-      // optimizations, these definitely need to go back in before release.
-      while (n--) {
-        input = *input_ptr++;
-        sum1 += input * *k1++;
-        sum2 += input * *k2++;
-      }
-
-      // Linearly interpolate the two "convolutions".
-      double result = (1.0 - kernel_interpolation_factor) * sum1
-          + kernel_interpolation_factor * sum2;
-
-      *destination++ = result;
+      *destination++ = Convolve(
+          input_ptr, k1, k2, kernel_interpolation_factor);
 
       // Advance the virtual index.
       virtual_source_idx_ += io_sample_rate_ratio_;
@@ -224,4 +214,85 @@ int SincResampler::ChunkSize() {
   return kBlockSize / io_sample_rate_ratio_;
 }
 
+float SincResampler::Convolve(const float* input_ptr, const float* k1,
+                              const float* k2,
+                              double kernel_interpolation_factor) {
+  // Rely on function level static initialization to keep ConvolveProc selection
+  // thread safe.
+  typedef float (*ConvolveProc)(const float* src, const float* k1,
+                                const float* k2,
+                                double kernel_interpolation_factor);
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+  static const ConvolveProc kConvolveProc =
+      base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#else
+  static const ConvolveProc kConvolveProc = Convolve_C;
+#endif
+
+  return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
+}
+
+float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
+                                const float* k2,
+                                double kernel_interpolation_factor) {
+  float sum1 = 0;
+  float sum2 = 0;
+
+  // Generate a single output sample.  Unrolling this loop hurt performance in
+  // local testing.
+  int n = kKernelSize;
+  while (n--) {
+    sum1 += *input_ptr * *k1++;
+    sum2 += *input_ptr++ * *k2++;
+  }
+
+  // Linearly interpolate the two "convolutions".
+  return (1.0 - kernel_interpolation_factor) * sum1
+      + kernel_interpolation_factor * sum2;
+}
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
+                                  const float* k2,
+                                  double kernel_interpolation_factor) {
+  // Ensure |k1|, |k2| are 16-byte aligned for SSE usage.  Should always be true
+  // so long as kKernelSize is a multiple of 16.
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
+  __m128 m_input;
+  __m128 m_sums1 = _mm_setzero_ps();
+  __m128 m_sums2 = _mm_setzero_ps();
+
+  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
+  // these loops hurt performance in local testing.
+  if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_loadu_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  } else {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_load_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
+  m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
+  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+  // Sum components together.
+  float result;
+  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+  _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
+      m_sums2, m_sums2, 1)));
+
+  return result;
+}
+#endif
+
 }  // namespace media
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index 58f5c2d..4c55eab 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -6,7 +6,10 @@
 #define MEDIA_BASE_SINC_RESAMPLER_H_
 
 #include "base/callback.h"
+#include "base/gtest_prod_util.h"
+#include "base/memory/aligned_memory.h"
 #include "base/memory/scoped_ptr.h"
+#include "build/build_config.h"
 #include "media/base/media_export.h"
 
 namespace media {
@@ -33,8 +36,22 @@ class MEDIA_EXPORT SincResampler {
   int ChunkSize();
 
  private:
+  FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve);
+  FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark);
+
   void InitializeKernel();
 
+  // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are
+  // linearly interpolated using |kernel_interpolation_factor|.  The underlying
+  // implementation is chosen at run time based on SSE support.
+  static float Convolve(const float* input_ptr, const float* k1,
+                        const float* k2, double kernel_interpolation_factor);
+  static float Convolve_C(const float* input_ptr, const float* k1,
+                          const float* k2, double kernel_interpolation_factor);
+  static float Convolve_SSE(const float* input_ptr, const float* k1,
+                            const float* k2,
+                            double kernel_interpolation_factor);
+
   // The ratio of input / output sample rates.
   double io_sample_rate_ratio_;
 
@@ -51,10 +68,10 @@ class MEDIA_EXPORT SincResampler {
   // Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
   // The kernel offsets are sub-sample shifts of a windowed sinc shifted from
   // 0.0 to 1.0 sample.
-  scoped_array<float> kernel_storage_;
+  scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> kernel_storage_;
 
   // Data from the source is copied into this buffer for each processing pass.
-  scoped_array<float> input_buffer_;
+  scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_buffer_;
 
   // Pointers to the various regions inside |input_buffer_|.  See the diagram at
   // the top of the .cc file for more information.
diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc
index 9bf7ba4..fa358e1 100644
--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -9,9 +9,10 @@
 
 #include "base/bind.h"
 #include "base/bind_helpers.h"
+#include "base/command_line.h"
 #include "base/logging.h"
-#include "base/memory/scoped_ptr.h"
-#include "base/stringprintf.h"
+#include "base/string_number_conversions.h"
+#include "base/time.h"
 #include "media/base/sinc_resampler.h"
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
@@ -20,6 +21,12 @@ using testing::_;
 
 namespace media {
 
+static const double kSampleRateRatio = 192000.0 / 44100.0;
+static const double kKernelInterpolationFactor = 0.5;
+
+// Command line switch for runtime adjustment of ConvolveBenchmark iterations.
+static const char kConvolveIterations[] = "convolve-iterations";
+
 // Helper class to ensure ChunkedResample() functions properly.
 class MockSource {
  public:
@@ -33,7 +40,6 @@ TEST(SincResamplerTest, ChunkedResample) {
 
   // Choose a high ratio of input to output samples which will result in quick
   // exhaustion of SincResampler's internal buffers.
-  static const double kSampleRateRatio = 192000.0 / 44100.0;
   SincResampler resampler(
       kSampleRateRatio,
       base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
@@ -52,6 +58,102 @@ TEST(SincResamplerTest, ChunkedResample) {
   resampler.Resample(resampled_destination.get(), max_chunk_size);
 }
 
+// Ensure various optimized Convolve() methods return the same value.  Only run
+// this test if other optimized methods exist, otherwise the default Convolve()
+// will be tested by the parameterized SincResampler tests below.
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+TEST(SincResamplerTest, Convolve) {
+  // Initialize a dummy resampler.
+  MockSource mock_source;
+  SincResampler resampler(
+      kSampleRateRatio,
+      base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
+
+  // Convolve_SSE() is slightly more precise than Convolve_C(), so comparison
+  // must be done using an epsilon.
+  static const double kEpsilon = 0.00000005;
+
+  // Use a kernel from SincResampler as input and kernel data, this has the
+  // benefit of already being properly sized and aligned for Convolve_SSE().
+  double result = resampler.Convolve_C(
+      resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+      resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  double result2 = resampler.Convolve_SSE(
+      resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+      resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  EXPECT_NEAR(result2, result, kEpsilon);
+
+  // Test Convolve_SSE() w/ unaligned input pointer.
+  result = resampler.Convolve_C(
+      resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
+      resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  result2 = resampler.Convolve_SSE(
+      resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
+      resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  EXPECT_NEAR(result2, result, kEpsilon);
+}
+#endif
+
+// Benchmark for the various Convolve() methods.  Make sure to build with
+// branding=Chrome so that DCHECKs are compiled out when benchmarking.  Original
+// benchmarks were run with --convolve-iterations=50000000.
+TEST(SincResamplerTest, ConvolveBenchmark) {
+  // Initialize a dummy resampler.
+  MockSource mock_source;
+  SincResampler resampler(
+      kSampleRateRatio,
+      base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
+
+  // Retrieve benchmark iterations from command line.
+  int convolve_iterations = 10;
+  std::string iterations(CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
+      kConvolveIterations));
+  if (!iterations.empty())
+    base::StringToInt(iterations, &convolve_iterations);
+
+  printf("Benchmarking %d iterations:\n", convolve_iterations);
+
+  // Benchmark Convolve_C().
+  base::TimeTicks start = base::TimeTicks::HighResNow();
+  for (int i = 0; i < convolve_iterations; ++i) {
+    resampler.Convolve_C(
+        resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+        resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  }
+  double total_time_c_ms =
+      (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+  printf("Convolve_C took %.2fms.\n", total_time_c_ms);
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+  // Benchmark Convolve_SSE() with unaligned input pointer.
+  start = base::TimeTicks::HighResNow();
+  for (int j = 0; j < convolve_iterations; ++j) {
+    resampler.Convolve_SSE(
+        resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
+        resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  }
+  double total_time_sse_unaligned_ms =
+      (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+  printf("Convolve_SSE (unaligned) took %.2fms; which is %.2fx faster than"
+         " Convolve_C.\n", total_time_sse_unaligned_ms,
+         total_time_c_ms / total_time_sse_unaligned_ms);
+
+  // Benchmark Convolve_SSE() with aligned input pointer.
+  start = base::TimeTicks::HighResNow();
+  for (int j = 0; j < convolve_iterations; ++j) {
+    resampler.Convolve_SSE(
+        resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
+        resampler.kernel_storage_.get(), kKernelInterpolationFactor);
+  }
+  double total_time_sse_aligned_ms =
+      (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+  printf("Convolve_SSE (aligned) took %.2fms; which is %.2fx faster than"
+         " Convolve_C and %.2fx faster than Convolve_SSE (unaligned).\n",
+         total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms,
+         total_time_sse_unaligned_ms / total_time_sse_aligned_ms);
+#endif
+}
+
 // Fake audio source for testing the resampler.  Generates a sinusoidal linear
 // chirp (http://en.wikipedia.org/wiki/Chirp) which can be tuned to stress the
 // resampler for the specific sample rate conversion being used.
author	dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2012-08-02 02:59:42 +0000
committer	dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2012-08-02 02:59:42 +0000
commit	fce61dcbdd3fb31839ba056b204bf903580a163b (patch)
tree	5a8c37cfe0867ea14e0e37be2a7ce381a4ac2e27 /media
parent	3770d6951c0db5ef047e77bcaee1b5bd9864f952 (diff)
download	chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.zip chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.tar.gz chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.tar.bz2