Add ARM NEON intrinsic optimizations for SincResampler.

On an exynos board these yielded an ~2.3x speedup: Benchmarking 50000000 iterations: Convolve_C took 5682.71ms. Convolve_NEON(unaligned) took 2451.18ms; which is 2.32x faster than Convolve_C. Convolve_NEON (aligned) took 2397.01ms; which is 2.37x faster than Convolve_C and 1.02x faster than Convolve_NEON (unaligned). BUG=none TEST=try bot, fischman. Review URL: https://codereview.chromium.org/10960023 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@158870 0039d316-1c4b-4281-b951-d872f2087c98
author: dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2012-09-26 20:12:44 +0000
committer: dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2012-09-26 20:12:44 +0000
commit: 176316bd19b6b2472444b23a9b7069b6cd69c33e (patch)
tree: 7ffd421756475a5bce034e3220f3425492887300 /media
parent: 8e7ea1d6b76da317a70949868176a5d795994a2f (diff)
download: chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.zip
chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.gz
chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.bz2
4 files changed, 94 insertions, 28 deletions
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc
index 869af1b..9352fe3 100644
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -36,13 +36,19 @@
 
 #include "media/base/sinc_resampler.h"
 
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif
 #include <cmath>
 
 #include "base/cpu.h"
 #include "base/logging.h"
+#include "build/build_config.h"
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+#include <arm_neon.h>
+#endif
 
 namespace media {
 
@@ -231,6 +237,8 @@ float SincResampler::Convolve(const float* input_ptr, const float* k1,
 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
   static const ConvolveProc kConvolveProc =
       base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+  static const ConvolveProc kConvolveProc = Convolve_NEON;
 #else
   static const ConvolveProc kConvolveProc = Convolve_C;
 #endif
@@ -301,4 +309,33 @@ float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
 }
 #endif
 
+#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
+                                   const float* k2,
+                                   double kernel_interpolation_factor) {
+  float32x4_t m_input;
+  float32x4_t m_sums1 = vmovq_n_f32(0);  // Four partial sums for |k1|.
+  float32x4_t m_sums2 = vmovq_n_f32(0);  // Four partial sums for |k2|.
+
+  const float* upper = input_ptr + kKernelSize;
+  for (; input_ptr < upper; ) {  // NOTE(review): assumes kKernelSize % 4 == 0.
+    m_input = vld1q_f32(input_ptr);  // Load the next four input samples.
+    input_ptr += 4;
+    m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));  // sums1 += in * k1
+    k1 += 4;
+    m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));  // sums2 += in * k2
+    k2 += 4;
+  }
+
+  // Blend lane-wise: sums1 * (1 - factor) + sums2 * factor.
+  m_sums1 = vmlaq_f32(
+      vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
+      m_sums2, vmovq_n_f32(kernel_interpolation_factor));
+
+  // Fold the four lanes into one scalar: high + low halves, then pairwise add.
+  float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
+  return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
+}
+#endif
+
 }  // namespace media
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index ef2f176..604192f 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -9,7 +9,6 @@
 #include "base/gtest_prod_util.h"
 #include "base/memory/aligned_memory.h"
 #include "base/memory/scoped_ptr.h"
-#include "build/build_config.h"
 #include "media/base/media_export.h"
 
 namespace media {
@@ -45,8 +44,9 @@ class MEDIA_EXPORT SincResampler {
   void InitializeKernel();
 
   // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are
-  // linearly interpolated using |kernel_interpolation_factor|.  The underlying
-  // implementation is chosen at run time based on SSE support.
+  // linearly interpolated using |kernel_interpolation_factor|.  On x86, the
+  // underlying implementation is chosen at run time based on SSE support.  On
+  // ARM, NEON support is chosen at compile time based on compilation flags.
   static float Convolve(const float* input_ptr, const float* k1,
                         const float* k2, double kernel_interpolation_factor);
   static float Convolve_C(const float* input_ptr, const float* k1,
@@ -54,6 +54,9 @@ class MEDIA_EXPORT SincResampler {
   static float Convolve_SSE(const float* input_ptr, const float* k1,
                             const float* k2,
                             double kernel_interpolation_factor);
+  static float Convolve_NEON(const float* input_ptr, const float* k1,
+                             const float* k2,
+                             double kernel_interpolation_factor);
 
   // The ratio of input / output sample rates.
   double io_sample_rate_ratio_;
diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc
index 77a963e..59a9f81 100644
--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -12,7 +12,9 @@
 #include "base/command_line.h"
 #include "base/logging.h"
 #include "base/string_number_conversions.h"
+#include "base/stringize_macros.h"
 #include "base/time.h"
+#include "build/build_config.h"
 #include "media/base/sinc_resampler.h"
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
@@ -38,7 +40,10 @@ ACTION(ClearBuffer) {
 }
 
 ACTION(FillBuffer) {
-  memset(arg0, 1, arg1 * sizeof(float));
+  // memset() writes bytes, so each IEEE-754 float becomes 0x40404040 (~3.0039),
+  // not 64.  The byte value is arbitrary but resamples to something
+  // representable on all platforms; with kSampleRateRatio it becomes 1.81219.
+  memset(arg0, 64, arg1 * sizeof(float));
 }
 
 // Test requesting multiples of ChunkSize() frames results in the proper number
@@ -89,13 +94,20 @@ TEST(SincResamplerTest, Flush) {
       .Times(1).WillOnce(ClearBuffer());
   resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2);
   for (int i = 0; i < resampler.ChunkSize() / 2; ++i)
-    ASSERT_EQ(resampled_destination[i], 0);
+    ASSERT_FLOAT_EQ(resampled_destination[i], 0);
 }
 
+// Define platform independent function name for Convolve* tests.
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#define CONVOLVE_FUNC Convolve_SSE
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+#define CONVOLVE_FUNC Convolve_NEON
+#endif
+
 // Ensure various optimized Convolve() methods return the same value.  Only run
 // this test if other optimized methods exist, otherwise the default Convolve()
 // will be tested by the parameterized SincResampler tests below.
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(CONVOLVE_FUNC)
 TEST(SincResamplerTest, Convolve) {
   // Initialize a dummy resampler.
   MockSource mock_source;
@@ -103,8 +115,8 @@ TEST(SincResamplerTest, Convolve) {
       kSampleRateRatio,
       base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
 
-  // Convolve_SSE() is slightly more precise than Convolve_C(), so comparison
-  // must be done using an epsilon.
+  // The optimized Convolve methods are slightly more precise than Convolve_C(),
+  // so comparison must be done using an epsilon.
   static const double kEpsilon = 0.00000005;
 
   // Use a kernel from SincResampler as input and kernel data, this has the
@@ -112,16 +124,16 @@ TEST(SincResamplerTest, Convolve) {
   double result = resampler.Convolve_C(
       resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
-  double result2 = resampler.Convolve_SSE(
+  double result2 = resampler.CONVOLVE_FUNC(
       resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   EXPECT_NEAR(result2, result, kEpsilon);
 
-  // Test Convolve_SSE() w/ unaligned input pointer.
+  // Test Convolve() w/ unaligned input pointer.
   result = resampler.Convolve_C(
       resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
-  result2 = resampler.Convolve_SSE(
+  result2 = resampler.CONVOLVE_FUNC(
       resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   EXPECT_NEAR(result2, result, kEpsilon);
@@ -158,36 +170,40 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
       (base::TimeTicks::HighResNow() - start).InMillisecondsF();
   printf("Convolve_C took %.2fms.\n", total_time_c_ms);
 
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-  // Benchmark Convolve_SSE() with unaligned input pointer.
+#if defined(CONVOLVE_FUNC)
+  // Benchmark with unaligned input pointer.
   start = base::TimeTicks::HighResNow();
   for (int j = 0; j < convolve_iterations; ++j) {
-    resampler.Convolve_SSE(
+    resampler.CONVOLVE_FUNC(
         resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
         resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   }
-  double total_time_sse_unaligned_ms =
+  double total_time_optimized_unaligned_ms =
       (base::TimeTicks::HighResNow() - start).InMillisecondsF();
-  printf("Convolve_SSE (unaligned) took %.2fms; which is %.2fx faster than"
-         " Convolve_C.\n", total_time_sse_unaligned_ms,
-         total_time_c_ms / total_time_sse_unaligned_ms);
+  printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx "
+         "faster than Convolve_C.\n", total_time_optimized_unaligned_ms,
+         total_time_c_ms / total_time_optimized_unaligned_ms);
 
-  // Benchmark Convolve_SSE() with aligned input pointer.
+  // Benchmark with aligned input pointer.
   start = base::TimeTicks::HighResNow();
   for (int j = 0; j < convolve_iterations; ++j) {
-    resampler.Convolve_SSE(
+    resampler.CONVOLVE_FUNC(
         resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
         resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   }
-  double total_time_sse_aligned_ms =
+  double total_time_optimized_aligned_ms =
       (base::TimeTicks::HighResNow() - start).InMillisecondsF();
-  printf("Convolve_SSE (aligned) took %.2fms; which is %.2fx faster than"
-         " Convolve_C and %.2fx faster than Convolve_SSE (unaligned).\n",
-         total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms,
-         total_time_sse_unaligned_ms / total_time_sse_aligned_ms);
+  printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx "
+         "faster than Convolve_C and %.2fx faster than "
+         STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n",
+         total_time_optimized_aligned_ms,
+         total_time_c_ms / total_time_optimized_aligned_ms,
+         total_time_optimized_unaligned_ms / total_time_optimized_aligned_ms);
 #endif
 }
 
+#undef CONVOLVE_FUNC
+
 // Fake audio source for testing the resampler.  Generates a sinusoidal linear
 // chirp (http://en.wikipedia.org/wiki/Chirp) which can be tuned to stress the
 // resampler for the specific sample rate conversion being used.
diff --git a/media/media.gyp b/media/media.gyp
index 02d9ca5..9603c2d 100644
--- a/media/media.gyp
+++ b/media/media.gyp
@@ -327,6 +327,11 @@
         ],
       },
       'conditions': [
+        ['arm_neon == 1', {
+          'defines': [
+            'USE_NEON'
+          ],
+        }],
         ['OS != "ios"', {
           'dependencies': [
             '../base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations',
@@ -648,6 +653,11 @@
         'webm/webm_parser_unittest.cc',
       ],
       'conditions': [
+        ['arm_neon == 1', {
+          'defines': [
+            'USE_NEON'
+          ],
+        }],
         ['OS != "ios"', {
           'dependencies': [
             'shared_memory_support',
author	dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2012-09-26 20:12:44 +0000
committer	dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2012-09-26 20:12:44 +0000
commit	176316bd19b6b2472444b23a9b7069b6cd69c33e (patch)
tree	7ffd421756475a5bce034e3220f3425492887300 /media
parent	8e7ea1d6b76da317a70949868176a5d795994a2f (diff)
download	chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.zip chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.gz chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.bz2