9 files changed, 175 insertions, 110 deletions
diff --git a/media/base/simd/sinc_resampler_sse.cc b/media/base/simd/sinc_resampler_sse.cc
new file mode 100644
index 0000000..f0aec1c
--- /dev/null
+++ b/media/base/simd/sinc_resampler_sse.cc
@@ -0,0 +1,48 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/sinc_resampler.h"
+
+#include <xmmintrin.h>
+
+namespace media {
+
+float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
+                                  const float* k2,
+                                  double kernel_interpolation_factor) {
+  __m128 m_input;
+  __m128 m_sums1 = _mm_setzero_ps();
+  __m128 m_sums2 = _mm_setzero_ps();
+
+  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
+  // these loops hurt performance in local testing.
+  if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_loadu_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  } else {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_load_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
+  m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
+  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+  // Sum components together.
+  float result;
+  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+  _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
+      m_sums2, m_sums2, 1)));
+
+  return result;
+}
+
+}  // namespace media
diff --git a/media/base/simd/vector_math_sse.cc b/media/base/simd/vector_math_sse.cc
new file mode 100644
index 0000000..5cc2df9
--- /dev/null
+++ b/media/base/simd/vector_math_sse.cc
@@ -0,0 +1,27 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/vector_math_testing.h"
+
+#include <xmmintrin.h>  // NOLINT
+
+namespace media {
+namespace vector_math {
+
+void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
+  const int rem = len % 4;
+  const int last_index = len - rem;
+  __m128 m_scale = _mm_set_ps1(scale);
+  for (int i = 0; i < last_index; i += 4) {
+    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
+                 _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
+  }
+
+  // Handle any remaining values that wouldn't fit in an SSE pass.
+  for (int i = last_index; i < len; ++i)
+    dest[i] += src[i] * scale;
+}
+
+}  // namespace vector_math
+}  // namespace media
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc
index d836fc7..09ff49d 100644
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -40,11 +40,6 @@
 
 #include "base/cpu.h"
 #include "base/logging.h"
-#include "build/build_config.h"
-
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif
 
 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
 #include <arm_neon.h>
@@ -52,33 +47,6 @@
 
 namespace media {
 
-namespace {
-
-enum {
-  // The kernel size can be adjusted for quality (higher is better) at the
-  // expense of performance.  Must be a multiple of 32.
-  // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
-  kKernelSize = 32,
-
-  // The number of destination frames generated per processing pass.  Affects
-  // how often and for how much SincResampler calls back for input.  Must be
-  // greater than kKernelSize.
-  kBlockSize = 512,
-
-  // The kernel offset count is used for interpolation and is the number of
-  // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
-  // at the expense of allocating more memory.
-  kKernelOffsetCount = 32,
-  kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
-
-  // The size (in samples) of the internal buffer used by the resampler.
-  kBufferSize = kBlockSize + kKernelSize
-};
-
-}  // namespace
-
-const int SincResampler::kMaximumLookAheadSize = kBufferSize;
-
 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
     : io_sample_rate_ratio_(io_sample_rate_ratio),
       virtual_source_idx_(0),
@@ -222,7 +190,7 @@ void SincResampler::Resample(float* destination, int frames) {
   }
 }
 
-int SincResampler::ChunkSize() {
+int SincResampler::ChunkSize() const {
   return kBlockSize / io_sample_rate_ratio_;
 }
 
@@ -235,14 +203,23 @@ void SincResampler::Flush() {
 float SincResampler::Convolve(const float* input_ptr, const float* k1,
                               const float* k2,
                               double kernel_interpolation_factor) {
+  // Ensure |k1|, |k2| are 16-byte aligned for SSE usage.  Should always be true
+  // so long as kKernelSize is a multiple of 16.
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
   // Rely on function level static initialization to keep ConvolveProc selection
   // thread safe.
   typedef float (*ConvolveProc)(const float* src, const float* k1,
                                 const float* k2,
                                 double kernel_interpolation_factor);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(__SSE__)
+  static const ConvolveProc kConvolveProc = Convolve_SSE;
+#else
   static const ConvolveProc kConvolveProc =
       base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#endif
 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
   static const ConvolveProc kConvolveProc = Convolve_NEON;
 #else
@@ -271,50 +248,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
       + kernel_interpolation_factor * sum2;
 }
 
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
-                                  const float* k2,
-                                  double kernel_interpolation_factor) {
-  // Ensure |k1|, |k2| are 16-byte aligned for SSE usage.  Should always be true
-  // so long as kKernelSize is a multiple of 16.
-  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
-  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
-
-  __m128 m_input;
-  __m128 m_sums1 = _mm_setzero_ps();
-  __m128 m_sums2 = _mm_setzero_ps();
-
-  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
-  // these loops hurt performance in local testing.
-  if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
-    for (int i = 0; i < kKernelSize; i += 4) {
-      m_input = _mm_loadu_ps(input_ptr + i);
-      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
-      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
-    }
-  } else {
-    for (int i = 0; i < kKernelSize; i += 4) {
-      m_input = _mm_load_ps(input_ptr + i);
-      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
-      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
-    }
-  }
-
-  // Linearly interpolate the two "convolutions".
-  m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
-  m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
-  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
-
-  // Sum components together.
-  float result;
-  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
-  _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
-      m_sums2, m_sums2, 1)));
-
-  return result;
-}
-#endif
-
 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
                                    const float* k2,
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index a1d3cf7..f4eaf5f 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -9,6 +9,7 @@
 #include "base/gtest_prod_util.h"
 #include "base/memory/aligned_memory.h"
 #include "base/memory/scoped_ptr.h"
+#include "build/build_config.h"
 #include "media/base/media_export.h"
 
 namespace media {
@@ -16,9 +17,30 @@ namespace media {
 // SincResampler is a high-quality single-channel sample-rate converter.
 class MEDIA_EXPORT SincResampler {
  public:
-  // The maximum number of samples that may be requested from the callback ahead
-  // of the current position in the stream.
-  static const int kMaximumLookAheadSize;
+  enum {
+    // The kernel size can be adjusted for quality (higher is better) at the
+    // expense of performance.  Must be a multiple of 32.
+    // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
+    kKernelSize = 32,
+
+    // The number of destination frames generated per processing pass.  Affects
+    // how often and for how much SincResampler calls back for input.  Must be
+    // greater than kKernelSize.
+    kBlockSize = 512,
+
+    // The kernel offset count is used for interpolation and is the number of
+    // sub-sample kernel shifts.  Can be adjusted for quality (higher is better)
+    // at the expense of allocating more memory.
+    kKernelOffsetCount = 32,
+    kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
+
+    // The size (in samples) of the internal buffer used by the resampler.
+    kBufferSize = kBlockSize + kKernelSize,
+
+    // The maximum number of samples that may be requested from the callback
+    // ahead of the current position in the stream.
+    kMaximumLookAheadSize = kBufferSize
+  };
 
   // Callback type for providing more data into the resampler.  Expects |frames|
   // of data to be rendered into |destination|; zero padded if not enough frames
@@ -36,7 +58,7 @@ class MEDIA_EXPORT SincResampler {
 
   // The maximum size in frames that guarantees Resample() will only make a
   // single call to |read_cb_| for more data.
-  int ChunkSize();
+  int ChunkSize() const;
 
   // Flush all buffered data and reset internal indices.
   void Flush();
@@ -55,15 +77,18 @@ class MEDIA_EXPORT SincResampler {
                         const float* k2, double kernel_interpolation_factor);
   static float Convolve_C(const float* input_ptr, const float* k1,
                           const float* k2, double kernel_interpolation_factor);
+#if defined(ARCH_CPU_X86_FAMILY)
   static float Convolve_SSE(const float* input_ptr, const float* k1,
                             const float* k2,
                             double kernel_interpolation_factor);
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
   static float Convolve_NEON(const float* input_ptr, const float* k1,
                              const float* k2,
                              double kernel_interpolation_factor);
+#endif
 
   // The ratio of input / output sample rates.
-  double io_sample_rate_ratio_;
+  const double io_sample_rate_ratio_;
 
   // An index on the source input buffer with sub-sample precision.  It must be
   // double precision to avoid drift.
diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc
index 0f718f2..b7aaec4 100644
--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -10,6 +10,7 @@
 #include "base/bind.h"
 #include "base/bind_helpers.h"
 #include "base/command_line.h"
+#include "base/cpu.h"
 #include "base/logging.h"
 #include "base/string_number_conversions.h"
 #include "base/strings/stringize_macros.h"
@@ -98,7 +99,7 @@ TEST(SincResamplerTest, Flush) {
 }
 
 // Define platform independent function name for Convolve* tests.
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
 #define CONVOLVE_FUNC Convolve_SSE
 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
 #define CONVOLVE_FUNC Convolve_NEON
@@ -109,6 +110,10 @@ TEST(SincResamplerTest, Flush) {
 // will be tested by the parameterized SincResampler tests below.
 #if defined(CONVOLVE_FUNC)
 TEST(SincResamplerTest, Convolve) {
+#if defined(ARCH_CPU_X86_FAMILY)
+  ASSERT_TRUE(base::CPU().has_sse());
+#endif
+
   // Initialize a dummy resampler.
   MockSource mock_source;
   SincResampler resampler(
@@ -171,6 +176,10 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
   printf("Convolve_C took %.2fms.\n", total_time_c_ms);
 
 #if defined(CONVOLVE_FUNC)
+#if defined(ARCH_CPU_X86_FAMILY)
+  ASSERT_TRUE(base::CPU().has_sse());
+#endif
+
   // Benchmark with unaligned input pointer.
   start = base::TimeTicks::HighResNow();
   for (int j = 0; j < convolve_iterations; ++j) {
diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc
index edd95cd..96f94d9 100644
--- a/media/base/vector_math.cc
+++ b/media/base/vector_math.cc
@@ -7,11 +7,6 @@
 
 #include "base/cpu.h"
 #include "base/logging.h"
-#include "build/build_config.h"
-
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif
 
 namespace media {
 namespace vector_math {
@@ -25,9 +20,13 @@ void FMAC(const float src[], float scale, int len, float dest[]) {
   // selection thread safe.
   typedef void (*VectorFMACProc)(const float src[], float scale, int len,
                                  float dest[]);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(__SSE__)
+  static const VectorFMACProc kVectorFMACProc = FMAC_SSE;
+#else
   static const VectorFMACProc kVectorFMACProc =
       base::CPU().has_sse() ? FMAC_SSE : FMAC_C;
+#endif
 #else
   static const VectorFMACProc kVectorFMACProc = FMAC_C;
 #endif
@@ -40,20 +39,5 @@ void FMAC_C(const float src[], float scale, int len, float dest[]) {
     dest[i] += src[i] * scale;
 }
 
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
-  __m128 m_scale = _mm_set_ps1(scale);
-  int rem = len % 4;
-  for (int i = 0; i < len - rem; i += 4) {
-    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
-                 _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
-  }
-
-  // Handle any remaining values that wouldn't fit in an SSE pass.
-  if (rem)
-    FMAC_C(src + len - rem, scale, rem, dest + len - rem);
-}
-#endif
-
 }  // namespace vector_math
 }  // namespace media
diff --git a/media/base/vector_math_testing.h b/media/base/vector_math_testing.h
index d364b74..503ca6a 100644
--- a/media/base/vector_math_testing.h
+++ b/media/base/vector_math_testing.h
@@ -5,6 +5,7 @@
 #ifndef MEDIA_BASE_VECTOR_MATH_TESTING_H_
 #define MEDIA_BASE_VECTOR_MATH_TESTING_H_
 
+#include "build/build_config.h"
 #include "media/base/media_export.h"
 
 namespace media {
@@ -13,8 +14,11 @@ namespace vector_math {
 // Optimized versions of FMAC() function exposed for testing.  See vector_math.h
 // for details.
 MEDIA_EXPORT void FMAC_C(const float src[], float scale, int len, float dest[]);
+
+#if defined(ARCH_CPU_X86_FAMILY)
 MEDIA_EXPORT void FMAC_SSE(const float src[], float scale, int len,
                            float dest[]);
+#endif
 
 }  // namespace vector_math
 }  // namespace media
diff --git a/media/base/vector_math_unittest.cc b/media/base/vector_math_unittest.cc
index 153378e..e64c7c9 100644
--- a/media/base/vector_math_unittest.cc
+++ b/media/base/vector_math_unittest.cc
@@ -7,6 +7,7 @@
 #include <cmath>
 
 #include "base/command_line.h"
+#include "base/cpu.h"
 #include "base/memory/aligned_memory.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/string_number_conversions.h"
@@ -90,8 +91,9 @@ TEST_F(VectorMathTest, FMAC) {
     VerifyOutput(kResult);
   }
 
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
   {
+    ASSERT_TRUE(base::CPU().has_sse());
     SCOPED_TRACE("FMAC_SSE");
     FillTestVectors(kInputFillValue, kOutputFillValue);
     vector_math::FMAC_SSE(
@@ -118,7 +120,9 @@ TEST_F(VectorMathTest, FMACBenchmark) {
   double total_time_c_ms = (TimeTicks::HighResNow() - start).InMillisecondsF();
   printf("FMAC_C took %.2fms.\n", total_time_c_ms);
 
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+  ASSERT_TRUE(base::CPU().has_sse());
+
   // Benchmark FMAC_SSE() with unaligned size.
   ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment /
             sizeof(float)), 0U);
diff --git a/media/media.gyp b/media/media.gyp
index 526c089..0b369ea 100644
--- a/media/media.gyp
+++ b/media/media.gyp
@@ -683,7 +683,7 @@
                   'message': 'Generating Pulse stubs for dynamic loading.',
                 },
               ],
-              'conditions': [ 
+              'conditions': [
                 # Linux/Solaris need libdl for dlopen() and friends.
                 ['OS == "linux" or OS == "solaris"', {
                   'link_settings': {
@@ -811,6 +811,12 @@
             '../build/linux/system.gyp:gtk',
           ],
         }],
+        # ios check is necessary due to http://crbug.com/172682.
+        ['OS != "ios" and (target_arch == "ia32" or target_arch == "x64")', {
+          'dependencies': [
+            'media_sse',
+          ],
+        }],
       ],
       'target_conditions': [
         ['OS == "ios"', {
@@ -1018,12 +1024,15 @@
             'audio/audio_low_latency_input_output_unittest.cc',
           ],
         }],
-        [ 'target_arch=="ia32" or target_arch=="x64"', {
+        ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
           'sources': [
             'base/simd/convert_rgb_to_yuv_unittest.cc',
           ],
+          'dependencies': [
+            'media_sse',
+          ],
         }],
-        [ 'screen_capture_supported == 0', {
+        ['screen_capture_supported == 0', {
           'sources/': [
             ['exclude', '^video/capture/screen/'],
           ],
@@ -1610,5 +1619,27 @@
         }, # end of target differ_block_sse2
       ],
     }],
+    # ios check is necessary due to http://crbug.com/172682.
+    ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
+      'targets': [
+        {
+          'target_name': 'media_sse',
+          'type': 'static_library',
+          'cflags': [
+            '-msse',
+          ],
+          'include_dirs': [
+            '..',
+          ],
+          'defines': [
+            'MEDIA_IMPLEMENTATION',
+          ],
+          'sources': [
+            'base/simd/sinc_resampler_sse.cc',
+            'base/simd/vector_math_sse.cc',
+          ],
+        }, # end of target media_sse
+      ],
+    }],
   ],
 }