1 files changed, 56 insertions, 10 deletions
diff --git a/media/base/audio_renderer_mixer.cc b/media/base/audio_renderer_mixer.cc
index 6d23faa..1ca2f39 100644
--- a/media/base/audio_renderer_mixer.cc
+++ b/media/base/audio_renderer_mixer.cc
@@ -4,9 +4,15 @@
 
 #include "media/base/audio_renderer_mixer.h"
 
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
 #include "base/bind.h"
 #include "base/bind_helpers.h"
+#include "base/cpu.h"
 #include "base/logging.h"
+#include "base/memory/aligned_memory.h"
 #include "media/audio/audio_util.h"
 #include "media/base/limits.h"
 
@@ -42,7 +48,7 @@ AudioRendererMixer::~AudioRendererMixer() {
 
   // Clean up |mixer_input_audio_data_|.
   for (size_t i = 0; i < mixer_input_audio_data_.size(); ++i)
-    delete [] mixer_input_audio_data_[i];
+    base::AlignedFree(mixer_input_audio_data_[i]);
   mixer_input_audio_data_.clear();
 
   // Ensures that all mixer inputs have stopped themselves prior to destruction
@@ -84,11 +90,12 @@ void AudioRendererMixer::ProvideInput(const std::vector<float*>& audio_data,
   // Allocate staging area for each mixer input's audio data on first call.  We
   // won't know how much to allocate until here because of resampling.
   if (mixer_input_audio_data_.size() == 0) {
-    // TODO(dalecurtis): If we switch to AVX/SSE optimization, we'll need to
-    // allocate these on 32-byte boundaries and ensure they're sized % 32 bytes.
     mixer_input_audio_data_.reserve(audio_data.size());
-    for (size_t i = 0; i < audio_data.size(); ++i)
-      mixer_input_audio_data_.push_back(new float[number_of_frames]);
+    for (size_t i = 0; i < audio_data.size(); ++i) {
+      // Allocate audio data with a 16-byte alignment for SSE optimizations.
+      mixer_input_audio_data_.push_back(static_cast<float*>(
+          base::AlignedAlloc(sizeof(float) * number_of_frames, 16)));
+    }
     mixer_input_audio_data_size_ = number_of_frames;
   }
 
@@ -120,12 +127,9 @@ void AudioRendererMixer::ProvideInput(const std::vector<float*>& audio_data,
       continue;
 
     // Volume adjust and mix each mixer input into |audio_data| after rendering.
-    // TODO(dalecurtis): Optimize with NEON/SSE/AVX vector_fmac from FFmpeg.
     for (size_t j = 0; j < audio_data.size(); ++j) {
-      float* dest = audio_data[j];
-      float* source = mixer_input_audio_data_[j];
-      for (int k = 0; k < frames_filled; ++k)
-        dest[k] += source[k] * static_cast<float>(volume);
+       VectorFMAC(
+           mixer_input_audio_data_[j], volume, frames_filled, audio_data[j]);
     }
 
     // No need to clamp values as InterleaveFloatToInt() will take care of this
@@ -143,4 +147,46 @@ void AudioRendererMixer::OnRenderError() {
   }
 }
 
+void AudioRendererMixer::VectorFMAC(const float src[], float scale, int len,
+                                    float dest[]) {
+  // Rely on function level static initialization to keep VectorFMACProc
+  // selection thread safe.
+  typedef void (*VectorFMACProc)(const float src[], float scale, int len,
+                                 float dest[]);
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+  static const VectorFMACProc kVectorFMACProc =
+      base::CPU().has_sse() ? VectorFMAC_SSE : VectorFMAC_C;
+#else
+  static const VectorFMACProc kVectorFMACProc = VectorFMAC_C;
+#endif
+
+  return kVectorFMACProc(src, scale, len, dest);
+}
+
+void AudioRendererMixer::VectorFMAC_C(const float src[], float scale, int len,
+                                      float dest[]) {
+  for (int i = 0; i < len; ++i)
+    dest[i] += src[i] * scale;
+}
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+void AudioRendererMixer::VectorFMAC_SSE(const float src[], float scale, int len,
+                                        float dest[]) {
+  // Ensure |src| and |dest| are 16-byte aligned.
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & 0x0F);
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & 0x0F);
+
+  __m128 m_scale = _mm_set_ps1(scale);
+  int rem = len % 4;
+  for (int i = 0; i < len - rem; i += 4) {
+    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
+                 _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
+  }
+
+  // Handle any remaining values that wouldn't fit in an SSE pass.
+  if (rem)
+    VectorFMAC_C(src + len - rem, scale, rem, dest + len - rem);
+}
+#endif
+
 }  // namespace media