summaryrefslogtreecommitdiffstats
path: root/media
diff options
context:
space:
mode:
authordalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-08-02 05:58:48 +0000
committerdalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-08-02 05:58:48 +0000
commit215d9ab6c9d936b745ad9b1acc89893639d5c770 (patch)
tree0bd02e0b471d527ea330a51a61d4c2638adaf2ee /media
parentc95310270a265b4bc3c3e6a5799285317d8653a7 (diff)
downloadchromium_src-215d9ab6c9d936b745ad9b1acc89893639d5c770.zip
chromium_src-215d9ab6c9d936b745ad9b1acc89893639d5c770.tar.gz
chromium_src-215d9ab6c9d936b745ad9b1acc89893639d5c770.tar.bz2
Optimizes the FMAC operation with SSE. Performance is on par with
FFmpeg's vector_fmac function, without the hassle of 32-byte alignment and over-allocation. VectorFMAC_SSE requires 16-byte alignment of the source and dest vectors. The size does not need to be a multiple of 4, though. Performance results from AudioRendererMixerTest.VectorFMACBenchmark: Benchmarking 200000 iterations: VectorFMAC_C took 2030.73ms. VectorFMAC_SSE (unaligned size) took 598.33ms; which is 3.39x faster than VectorFMAC_C. VectorFMAC_SSE (aligned size) took 597.71ms; which is 3.40x faster than VectorFMAC_C and 1.00x faster than VectorFMAC_SSE (unaligned size). BUG=133637 TEST=media_unittests + AudioRendererMixer/* tests. Review URL: https://chromiumcodereview.appspot.com/10802005 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@149581 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r--media/audio/audio_device_thread.cc8
-rw-r--r--media/base/audio_renderer_mixer.cc66
-rw-r--r--media/base/audio_renderer_mixer.h12
-rw-r--r--media/base/audio_renderer_mixer_unittest.cc135
4 files changed, 208 insertions, 13 deletions
diff --git a/media/audio/audio_device_thread.cc b/media/audio/audio_device_thread.cc
index 92831ab..83eb0c4 100644
--- a/media/audio/audio_device_thread.cc
+++ b/media/audio/audio_device_thread.cc
@@ -8,6 +8,7 @@
#include "base/bind.h"
#include "base/logging.h"
+#include "base/memory/aligned_memory.h"
#include "base/message_loop.h"
#include "base/threading/platform_thread.h"
#include "base/threading/thread_restrictions.h"
@@ -189,7 +190,7 @@ AudioDeviceThread::Callback::Callback(
AudioDeviceThread::Callback::~Callback() {
for (size_t i = 0; i < audio_data_.size(); ++i)
- delete [] audio_data_[i];
+ base::AlignedFree(audio_data_[i]);
}
void AudioDeviceThread::Callback::InitializeOnAudioThread() {
@@ -198,10 +199,11 @@ void AudioDeviceThread::Callback::InitializeOnAudioThread() {
MapSharedMemory();
DCHECK(shared_memory_.memory() != NULL);
+ // Allocate buffer with a 16-byte alignment to allow SSE optimizations.
audio_data_.reserve(audio_parameters_.channels());
for (int i = 0; i < audio_parameters_.channels(); ++i) {
- float* channel_data = new float[audio_parameters_.frames_per_buffer()];
- audio_data_.push_back(channel_data);
+ audio_data_.push_back(static_cast<float*>(base::AlignedAlloc(
+ sizeof(float) * audio_parameters_.frames_per_buffer(), 16)));
}
}
diff --git a/media/base/audio_renderer_mixer.cc b/media/base/audio_renderer_mixer.cc
index 6d23faa..1ca2f39 100644
--- a/media/base/audio_renderer_mixer.cc
+++ b/media/base/audio_renderer_mixer.cc
@@ -4,9 +4,15 @@
#include "media/base/audio_renderer_mixer.h"
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
#include "base/bind.h"
#include "base/bind_helpers.h"
+#include "base/cpu.h"
#include "base/logging.h"
+#include "base/memory/aligned_memory.h"
#include "media/audio/audio_util.h"
#include "media/base/limits.h"
@@ -42,7 +48,7 @@ AudioRendererMixer::~AudioRendererMixer() {
// Clean up |mixer_input_audio_data_|.
for (size_t i = 0; i < mixer_input_audio_data_.size(); ++i)
- delete [] mixer_input_audio_data_[i];
+ base::AlignedFree(mixer_input_audio_data_[i]);
mixer_input_audio_data_.clear();
// Ensures that all mixer inputs have stopped themselves prior to destruction
@@ -84,11 +90,12 @@ void AudioRendererMixer::ProvideInput(const std::vector<float*>& audio_data,
// Allocate staging area for each mixer input's audio data on first call. We
// won't know how much to allocate until here because of resampling.
if (mixer_input_audio_data_.size() == 0) {
- // TODO(dalecurtis): If we switch to AVX/SSE optimization, we'll need to
- // allocate these on 32-byte boundaries and ensure they're sized % 32 bytes.
mixer_input_audio_data_.reserve(audio_data.size());
- for (size_t i = 0; i < audio_data.size(); ++i)
- mixer_input_audio_data_.push_back(new float[number_of_frames]);
+ for (size_t i = 0; i < audio_data.size(); ++i) {
+ // Allocate audio data with a 16-byte alignment for SSE optimizations.
+ mixer_input_audio_data_.push_back(static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * number_of_frames, 16)));
+ }
mixer_input_audio_data_size_ = number_of_frames;
}
@@ -120,12 +127,9 @@ void AudioRendererMixer::ProvideInput(const std::vector<float*>& audio_data,
continue;
// Volume adjust and mix each mixer input into |audio_data| after rendering.
- // TODO(dalecurtis): Optimize with NEON/SSE/AVX vector_fmac from FFmpeg.
for (size_t j = 0; j < audio_data.size(); ++j) {
- float* dest = audio_data[j];
- float* source = mixer_input_audio_data_[j];
- for (int k = 0; k < frames_filled; ++k)
- dest[k] += source[k] * static_cast<float>(volume);
+ VectorFMAC(
+ mixer_input_audio_data_[j], volume, frames_filled, audio_data[j]);
}
// No need to clamp values as InterleaveFloatToInt() will take care of this
@@ -143,4 +147,46 @@ void AudioRendererMixer::OnRenderError() {
}
}
+void AudioRendererMixer::VectorFMAC(const float src[], float scale, int len,
+ float dest[]) {
+ // Select the implementation once through a function-level static; the
+ // selection relies on that static initialization being thread safe.
+ typedef void (*VectorFMACProc)(const float src[], float scale, int len,
+ float dest[]);
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+ static const VectorFMACProc kVectorFMACProc =
+ base::CPU().has_sse() ? VectorFMAC_SSE : VectorFMAC_C;  // Runtime CPU check.
+#else
+ static const VectorFMACProc kVectorFMACProc = VectorFMAC_C;  // Non-SSE build.
+#endif
+
+ return kVectorFMACProc(src, scale, len, dest);  // Dispatch to chosen version.
+}
+
+void AudioRendererMixer::VectorFMAC_C(const float src[], float scale, int len,
+ float dest[]) {
+ for (int i = 0; i < len; ++i)  // Plain scalar loop over all |len| elements.
+ dest[i] += src[i] * scale;  // Accumulate the scaled source into |dest|.
+}
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+void AudioRendererMixer::VectorFMAC_SSE(const float src[], float scale, int len,
+ float dest[]) {
+ // The aligned load/store intrinsics used below need 16-byte aligned buffers.
+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & 0x0F);
+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & 0x0F);
+
+ __m128 m_scale = _mm_set_ps1(scale);  // |scale| replicated into all 4 lanes.
+ int rem = len % 4;  // Trailing elements that do not fill a 4-float vector.
+ for (int i = 0; i < len - rem; i += 4) {  // Four floats per iteration.
+ _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
+ _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
+ }
+
+ // Run the scalar version over the last |rem| elements, if there are any.
+ if (rem)
+ VectorFMAC_C(src + len - rem, scale, rem, dest + len - rem);
+}
+#endif
+
} // namespace media
diff --git a/media/base/audio_renderer_mixer.h b/media/base/audio_renderer_mixer.h
index 76e68bc..d293799 100644
--- a/media/base/audio_renderer_mixer.h
+++ b/media/base/audio_renderer_mixer.h
@@ -8,6 +8,7 @@
#include <set>
#include <vector>
+#include "base/gtest_prod_util.h"
#include "base/synchronization/lock.h"
#include "media/base/audio_renderer_mixer_input.h"
#include "media/base/audio_renderer_sink.h"
@@ -33,6 +34,9 @@ class MEDIA_EXPORT AudioRendererMixer
void RemoveMixerInput(const scoped_refptr<AudioRendererMixerInput>& input);
private:
+ FRIEND_TEST_ALL_PREFIXES(AudioRendererMixerTest, VectorFMAC);
+ FRIEND_TEST_ALL_PREFIXES(AudioRendererMixerTest, VectorFMACBenchmark);
+
// AudioRendererSink::RenderCallback implementation.
virtual int Render(const std::vector<float*>& audio_data,
int number_of_frames,
@@ -45,6 +49,14 @@ class MEDIA_EXPORT AudioRendererMixer
void ProvideInput(const std::vector<float*>& audio_data,
int number_of_frames);
+ // Computes dest[i] += src[i] * scale for the first |len| elements.
+ static void VectorFMAC(const float src[], float scale, int len, float dest[]);
+ static void VectorFMAC_C(const float src[], float scale, int len,
+ float dest[]);  // Scalar version.
+ // SSE version; requires |src| and |dest| to be 16-byte aligned.
+ static void VectorFMAC_SSE(const float src[], float scale, int len,
+ float dest[]);
+
// Output sink for this mixer.
scoped_refptr<AudioRendererSink> audio_sink_;
diff --git a/media/base/audio_renderer_mixer_unittest.cc b/media/base/audio_renderer_mixer_unittest.cc
index 51d06e0..8394c01 100644
--- a/media/base/audio_renderer_mixer_unittest.cc
+++ b/media/base/audio_renderer_mixer_unittest.cc
@@ -8,8 +8,11 @@
#include "base/bind.h"
#include "base/bind_helpers.h"
+#include "base/command_line.h"
+#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/scoped_vector.h"
+#include "base/string_number_conversions.h"
#include "media/base/audio_renderer_mixer.h"
#include "media/base/audio_renderer_mixer_input.h"
#include "media/base/fake_audio_render_callback.h"
@@ -28,10 +31,142 @@ static const int kBitsPerChannel = 16;
static const ChannelLayout kChannelLayout = CHANNEL_LAYOUT_STEREO;
static const int kHighLatencyBufferSize = 8192;
static const int kLowLatencyBufferSize = 256;
+static const int kSampleRate = 48000;
// Number of full sine wave cycles for each Render() call.
static const int kSineCycles = 4;
+// Command line switch for runtime adjustment of VectorFMACBenchmark iterations.
+static const char kVectorFMACIterations[] = "vector-fmac-iterations";
+
+// VectorFMAC test inputs; expected per-element result is 1.0 * 0.5 + 3.0.
+static const float kScale = 0.5;
+static const float kInputFillValue = 1.0;
+static const float kOutputFillValue = 3.0;
+
+// Check VectorFMAC_C() and (on SSE builds) VectorFMAC_SSE() give equal results.
+TEST(AudioRendererMixerTest, VectorFMAC) {
+ // Build a mixer around a mock sink that expects one Start() and one Stop().
+ scoped_refptr<MockAudioRendererSink> sink = new MockAudioRendererSink();
+ EXPECT_CALL(*sink, Start());
+ EXPECT_CALL(*sink, Stop());
+ AudioParameters params(
+ AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, kSampleRate,
+ kBitsPerChannel, kHighLatencyBufferSize);
+ AudioRendererMixer mixer(params, params, sink);
+
+ // Allocate 16-byte aligned input and output vectors.
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_vector(
+ static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16)));
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> output_vector(
+ static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16)));
+
+ // Fill both vectors with known values, then check the C version's output.
+ std::fill(input_vector.get(), input_vector.get() + kHighLatencyBufferSize,
+ kInputFillValue);
+ std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize,
+ kOutputFillValue);
+ mixer.VectorFMAC_C(
+ input_vector.get(), kScale, kHighLatencyBufferSize, output_vector.get());
+ for(int i = 0; i < kHighLatencyBufferSize; ++i) {
+ ASSERT_FLOAT_EQ(output_vector.get()[i],
+ kInputFillValue * kScale + kOutputFillValue);
+ }
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+ // Restore the output fill value (input is unchanged) and repeat with SSE.
+ std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize,
+ kOutputFillValue);
+ mixer.VectorFMAC_SSE(
+ input_vector.get(), kScale, kHighLatencyBufferSize, output_vector.get());
+ for(int i = 0; i < kHighLatencyBufferSize; ++i) {
+ ASSERT_FLOAT_EQ(output_vector.get()[i],
+ kInputFillValue * kScale + kOutputFillValue);
+ }
+#endif
+}
+
+// Benchmark for the various VectorFMAC() methods. Make sure to build with
+// branding=Chrome so that DCHECKs are compiled out when benchmarking. Original
+// benchmarks were run with --vector-fmac-iterations=200000.
+TEST(AudioRendererMixerTest, VectorFMACBenchmark) {
+ // Build a mixer around a mock sink that expects one Start() and one Stop().
+ scoped_refptr<MockAudioRendererSink> sink = new MockAudioRendererSink();
+ EXPECT_CALL(*sink, Start());
+ EXPECT_CALL(*sink, Stop());
+ AudioParameters params(
+ AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, kSampleRate,
+ kBitsPerChannel, kHighLatencyBufferSize);
+ AudioRendererMixer mixer(params, params, sink);
+
+ // Allocate 16-byte aligned input and output vectors.
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_vector(
+ static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16)));
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> output_vector(
+ static_cast<float*>(
+ base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16)));
+
+ // Iteration count defaults to 10; the command line switch overrides it.
+ int vector_fmac_iterations = 10;
+ std::string iterations(CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
+ kVectorFMACIterations));
+ if (!iterations.empty())
+ base::StringToInt(iterations, &vector_fmac_iterations);
+
+ printf("Benchmarking %d iterations:\n", vector_fmac_iterations);
+
+ // Benchmark VectorFMAC_C().
+ std::fill(input_vector.get(), input_vector.get() + kHighLatencyBufferSize,
+ kInputFillValue);
+ std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize,
+ kOutputFillValue);
+ base::TimeTicks start = base::TimeTicks::HighResNow();
+ for (int i = 0; i < vector_fmac_iterations; ++i) {
+ mixer.VectorFMAC_C(input_vector.get(), static_cast<float>(M_PI),
+ kHighLatencyBufferSize, output_vector.get());
+ }
+ double total_time_c_ms =
+ (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+ printf("VectorFMAC_C took %.2fms.\n", total_time_c_ms);
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+ // Benchmark VectorFMAC_SSE() with unaligned size; i.e., size % 4 != 0.
+ ASSERT_NE((kHighLatencyBufferSize - 1) % 4, 0);
+ std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize,
+ kOutputFillValue);
+ start = base::TimeTicks::HighResNow();
+ for (int j = 0; j < vector_fmac_iterations; ++j) {
+ mixer.VectorFMAC_SSE(input_vector.get(), static_cast<float>(M_PI),
+ kHighLatencyBufferSize - 1, output_vector.get());
+ }
+ double total_time_sse_unaligned_ms =
+ (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+ printf("VectorFMAC_SSE (unaligned size) took %.2fms; which is %.2fx faster"
+ " than VectorFMAC_C.\n", total_time_sse_unaligned_ms,
+ total_time_c_ms / total_time_sse_unaligned_ms);
+
+ // Benchmark VectorFMAC_SSE() with aligned size; i.e., size % 4 == 0.
+ ASSERT_EQ(kHighLatencyBufferSize % 4, 0);
+ std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize,
+ kOutputFillValue);
+ start = base::TimeTicks::HighResNow();
+ for (int j = 0; j < vector_fmac_iterations; ++j) {
+ mixer.VectorFMAC_SSE(input_vector.get(), static_cast<float>(M_PI),
+ kHighLatencyBufferSize, output_vector.get());
+ }
+ double total_time_sse_aligned_ms =
+ (base::TimeTicks::HighResNow() - start).InMillisecondsF();
+ printf("VectorFMAC_SSE (aligned size) took %.2fms; which is %.2fx faster than"
+ " VectorFMAC_C and %.2fx faster than VectorFMAC_SSE (unaligned size)."
+ "\n",
+ total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms,
+ total_time_sse_unaligned_ms / total_time_sse_aligned_ms);
+#endif
+}
+
// Tuple of <input sampling rate, output sampling rate, epsilon>.
typedef std::tr1::tuple<int, int, double> AudioRendererMixerTestData;
class AudioRendererMixerTest