diff options
author | dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-08-02 05:58:48 +0000 |
---|---|---|
committer | dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-08-02 05:58:48 +0000 |
commit | 215d9ab6c9d936b745ad9b1acc89893639d5c770 (patch) | |
tree | 0bd02e0b471d527ea330a51a61d4c2638adaf2ee /media | |
parent | c95310270a265b4bc3c3e6a5799285317d8653a7 (diff) | |
download | chromium_src-215d9ab6c9d936b745ad9b1acc89893639d5c770.zip chromium_src-215d9ab6c9d936b745ad9b1acc89893639d5c770.tar.gz chromium_src-215d9ab6c9d936b745ad9b1acc89893639d5c770.tar.bz2 |
Optimizes the FMAC operation with SSE. Performance is on par with
FFmpeg's vector_fmac function, without the hassle of 32-byte alignment
and over-allocation.
VectorFMAC_SSE requires 16-byte alignment of the source and dest
vectors. The length does not need to be a multiple of 4, though; any
remaining elements are handled by VectorFMAC_C.
Performance results from AudioRendererMixerTest.VectorFMACBenchmark:
Benchmarking 200000 iterations:
VectorFMAC_C took 2030.73ms.
VectorFMAC_SSE (unaligned size) took 598.33ms; which is 3.39x faster than VectorFMAC_C.
VectorFMAC_SSE (aligned size) took 597.71ms; which is 3.40x faster than VectorFMAC_C and 1.00x faster than VectorFMAC_SSE (unaligned size).
BUG=133637
TEST=media_unittests + AudioRendererMixer/* tests.
Review URL: https://chromiumcodereview.appspot.com/10802005
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@149581 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- | media/audio/audio_device_thread.cc | 8 | ||||
-rw-r--r-- | media/base/audio_renderer_mixer.cc | 66 | ||||
-rw-r--r-- | media/base/audio_renderer_mixer.h | 12 | ||||
-rw-r--r-- | media/base/audio_renderer_mixer_unittest.cc | 135 |
4 files changed, 208 insertions, 13 deletions
diff --git a/media/audio/audio_device_thread.cc b/media/audio/audio_device_thread.cc index 92831ab..83eb0c4 100644 --- a/media/audio/audio_device_thread.cc +++ b/media/audio/audio_device_thread.cc @@ -8,6 +8,7 @@ #include "base/bind.h" #include "base/logging.h" +#include "base/memory/aligned_memory.h" #include "base/message_loop.h" #include "base/threading/platform_thread.h" #include "base/threading/thread_restrictions.h" @@ -189,7 +190,7 @@ AudioDeviceThread::Callback::Callback( AudioDeviceThread::Callback::~Callback() { for (size_t i = 0; i < audio_data_.size(); ++i) - delete [] audio_data_[i]; + base::AlignedFree(audio_data_[i]); } void AudioDeviceThread::Callback::InitializeOnAudioThread() { @@ -198,10 +199,11 @@ void AudioDeviceThread::Callback::InitializeOnAudioThread() { MapSharedMemory(); DCHECK(shared_memory_.memory() != NULL); + // Allocate buffer with a 16-byte alignment to allow SSE optimizations. audio_data_.reserve(audio_parameters_.channels()); for (int i = 0; i < audio_parameters_.channels(); ++i) { - float* channel_data = new float[audio_parameters_.frames_per_buffer()]; - audio_data_.push_back(channel_data); + audio_data_.push_back(static_cast<float*>(base::AlignedAlloc( + sizeof(float) * audio_parameters_.frames_per_buffer(), 16))); } } diff --git a/media/base/audio_renderer_mixer.cc b/media/base/audio_renderer_mixer.cc index 6d23faa..1ca2f39 100644 --- a/media/base/audio_renderer_mixer.cc +++ b/media/base/audio_renderer_mixer.cc @@ -4,9 +4,15 @@ #include "media/base/audio_renderer_mixer.h" +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#include <xmmintrin.h> +#endif + #include "base/bind.h" #include "base/bind_helpers.h" +#include "base/cpu.h" #include "base/logging.h" +#include "base/memory/aligned_memory.h" #include "media/audio/audio_util.h" #include "media/base/limits.h" @@ -42,7 +48,7 @@ AudioRendererMixer::~AudioRendererMixer() { // Clean up |mixer_input_audio_data_|. 
for (size_t i = 0; i < mixer_input_audio_data_.size(); ++i) - delete [] mixer_input_audio_data_[i]; + base::AlignedFree(mixer_input_audio_data_[i]); mixer_input_audio_data_.clear(); // Ensures that all mixer inputs have stopped themselves prior to destruction @@ -84,11 +90,12 @@ void AudioRendererMixer::ProvideInput(const std::vector<float*>& audio_data, // Allocate staging area for each mixer input's audio data on first call. We // won't know how much to allocate until here because of resampling. if (mixer_input_audio_data_.size() == 0) { - // TODO(dalecurtis): If we switch to AVX/SSE optimization, we'll need to - // allocate these on 32-byte boundaries and ensure they're sized % 32 bytes. mixer_input_audio_data_.reserve(audio_data.size()); - for (size_t i = 0; i < audio_data.size(); ++i) - mixer_input_audio_data_.push_back(new float[number_of_frames]); + for (size_t i = 0; i < audio_data.size(); ++i) { + // Allocate audio data with a 16-byte alignment for SSE optimizations. + mixer_input_audio_data_.push_back(static_cast<float*>( + base::AlignedAlloc(sizeof(float) * number_of_frames, 16))); + } mixer_input_audio_data_size_ = number_of_frames; } @@ -120,12 +127,9 @@ void AudioRendererMixer::ProvideInput(const std::vector<float*>& audio_data, continue; // Volume adjust and mix each mixer input into |audio_data| after rendering. - // TODO(dalecurtis): Optimize with NEON/SSE/AVX vector_fmac from FFmpeg. 
for (size_t j = 0; j < audio_data.size(); ++j) { - float* dest = audio_data[j]; - float* source = mixer_input_audio_data_[j]; - for (int k = 0; k < frames_filled; ++k) - dest[k] += source[k] * static_cast<float>(volume); + VectorFMAC( + mixer_input_audio_data_[j], volume, frames_filled, audio_data[j]); } // No need to clamp values as InterleaveFloatToInt() will take care of this @@ -143,4 +147,46 @@ void AudioRendererMixer::OnRenderError() { } } +void AudioRendererMixer::VectorFMAC(const float src[], float scale, int len, + float dest[]) { + // Rely on function level static initialization to keep VectorFMACProc + // selection thread safe. + typedef void (*VectorFMACProc)(const float src[], float scale, int len, + float dest[]); +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) + static const VectorFMACProc kVectorFMACProc = + base::CPU().has_sse() ? VectorFMAC_SSE : VectorFMAC_C; +#else + static const VectorFMACProc kVectorFMACProc = VectorFMAC_C; +#endif + + return kVectorFMACProc(src, scale, len, dest); +} + +void AudioRendererMixer::VectorFMAC_C(const float src[], float scale, int len, + float dest[]) { + for (int i = 0; i < len; ++i) + dest[i] += src[i] * scale; +} + +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +void AudioRendererMixer::VectorFMAC_SSE(const float src[], float scale, int len, + float dest[]) { + // Ensure |src| and |dest| are 16-byte aligned. + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & 0x0F); + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & 0x0F); + + __m128 m_scale = _mm_set_ps1(scale); + int rem = len % 4; + for (int i = 0; i < len - rem; i += 4) { + _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i), + _mm_mul_ps(_mm_load_ps(src + i), m_scale))); + } + + // Handle any remaining values that wouldn't fit in an SSE pass. 
+ if (rem) + VectorFMAC_C(src + len - rem, scale, rem, dest + len - rem); +} +#endif + } // namespace media diff --git a/media/base/audio_renderer_mixer.h b/media/base/audio_renderer_mixer.h index 76e68bc..d293799 100644 --- a/media/base/audio_renderer_mixer.h +++ b/media/base/audio_renderer_mixer.h @@ -8,6 +8,7 @@ #include <set> #include <vector> +#include "base/gtest_prod_util.h" #include "base/synchronization/lock.h" #include "media/base/audio_renderer_mixer_input.h" #include "media/base/audio_renderer_sink.h" @@ -33,6 +34,9 @@ class MEDIA_EXPORT AudioRendererMixer void RemoveMixerInput(const scoped_refptr<AudioRendererMixerInput>& input); private: + FRIEND_TEST_ALL_PREFIXES(AudioRendererMixerTest, VectorFMAC); + FRIEND_TEST_ALL_PREFIXES(AudioRendererMixerTest, VectorFMACBenchmark); + // AudioRendererSink::RenderCallback implementation. virtual int Render(const std::vector<float*>& audio_data, int number_of_frames, @@ -45,6 +49,14 @@ class MEDIA_EXPORT AudioRendererMixer void ProvideInput(const std::vector<float*>& audio_data, int number_of_frames); + // Multiply each element of |src| (up to |len|) by |scale| and add to |dest|. + static void VectorFMAC(const float src[], float scale, int len, float dest[]); + static void VectorFMAC_C(const float src[], float scale, int len, + float dest[]); + // SSE optimized VectorFMAC, requires |src|, |dest| to be 16-byte aligned. + static void VectorFMAC_SSE(const float src[], float scale, int len, + float dest[]); + // Output sink for this mixer. 
scoped_refptr<AudioRendererSink> audio_sink_; diff --git a/media/base/audio_renderer_mixer_unittest.cc b/media/base/audio_renderer_mixer_unittest.cc index 51d06e0..8394c01 100644 --- a/media/base/audio_renderer_mixer_unittest.cc +++ b/media/base/audio_renderer_mixer_unittest.cc @@ -8,8 +8,11 @@ #include "base/bind.h" #include "base/bind_helpers.h" +#include "base/command_line.h" +#include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" #include "base/memory/scoped_vector.h" +#include "base/string_number_conversions.h" #include "media/base/audio_renderer_mixer.h" #include "media/base/audio_renderer_mixer_input.h" #include "media/base/fake_audio_render_callback.h" @@ -28,10 +31,142 @@ static const int kBitsPerChannel = 16; static const ChannelLayout kChannelLayout = CHANNEL_LAYOUT_STEREO; static const int kHighLatencyBufferSize = 8192; static const int kLowLatencyBufferSize = 256; +static const int kSampleRate = 48000; // Number of full sine wave cycles for each Render() call. static const int kSineCycles = 4; +// Command line switch for runtime adjustment of VectorFMACBenchmark iterations. +static const char kVectorFMACIterations[] = "vector-fmac-iterations"; + +// Test parameters for VectorFMAC tests. +static const float kScale = 0.5; +static const float kInputFillValue = 1.0; +static const float kOutputFillValue = 3.0; + +// Ensure various optimized VectorFMAC() methods return the same value. +TEST(AudioRendererMixerTest, VectorFMAC) { + // Initialize a dummy mixer. + scoped_refptr<MockAudioRendererSink> sink = new MockAudioRendererSink(); + EXPECT_CALL(*sink, Start()); + EXPECT_CALL(*sink, Stop()); + AudioParameters params( + AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, kSampleRate, + kBitsPerChannel, kHighLatencyBufferSize); + AudioRendererMixer mixer(params, params, sink); + + // Initialize input and output vectors. 
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_vector( + static_cast<float*>( + base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16))); + scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> output_vector( + static_cast<float*>( + base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16))); + + // Setup input and output vectors. + std::fill(input_vector.get(), input_vector.get() + kHighLatencyBufferSize, + kInputFillValue); + std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize, + kOutputFillValue); + mixer.VectorFMAC_C( + input_vector.get(), kScale, kHighLatencyBufferSize, output_vector.get()); + for(int i = 0; i < kHighLatencyBufferSize; ++i) { + ASSERT_FLOAT_EQ(output_vector.get()[i], + kInputFillValue * kScale + kOutputFillValue); + } + +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) + // Reset vectors, and try with SSE. + std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize, + kOutputFillValue); + mixer.VectorFMAC_SSE( + input_vector.get(), kScale, kHighLatencyBufferSize, output_vector.get()); + for(int i = 0; i < kHighLatencyBufferSize; ++i) { + ASSERT_FLOAT_EQ(output_vector.get()[i], + kInputFillValue * kScale + kOutputFillValue); + } +#endif +} + +// Benchmark for the various VectorFMAC() methods. Make sure to build with +// branding=Chrome so that DCHECKs are compiled out when benchmarking. Original +// benchmarks were run with --vector-fmac-iterations=200000. +TEST(AudioRendererMixerTest, VectorFMACBenchmark) { + // Initialize a dummy mixer. + scoped_refptr<MockAudioRendererSink> sink = new MockAudioRendererSink(); + EXPECT_CALL(*sink, Start()); + EXPECT_CALL(*sink, Stop()); + AudioParameters params( + AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, kSampleRate, + kBitsPerChannel, kHighLatencyBufferSize); + AudioRendererMixer mixer(params, params, sink); + + // Initialize input and output vectors. 
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_vector( + static_cast<float*>( + base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16))); + scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> output_vector( + static_cast<float*>( + base::AlignedAlloc(sizeof(float) * kHighLatencyBufferSize, 16))); + + // Retrieve benchmark iterations from command line. + int vector_fmac_iterations = 10; + std::string iterations(CommandLine::ForCurrentProcess()->GetSwitchValueASCII( + kVectorFMACIterations)); + if (!iterations.empty()) + base::StringToInt(iterations, &vector_fmac_iterations); + + printf("Benchmarking %d iterations:\n", vector_fmac_iterations); + + // Benchmark VectorFMAC_C(). + std::fill(input_vector.get(), input_vector.get() + kHighLatencyBufferSize, + kInputFillValue); + std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize, + kOutputFillValue); + base::TimeTicks start = base::TimeTicks::HighResNow(); + for (int i = 0; i < vector_fmac_iterations; ++i) { + mixer.VectorFMAC_C(input_vector.get(), static_cast<float>(M_PI), + kHighLatencyBufferSize, output_vector.get()); + } + double total_time_c_ms = + (base::TimeTicks::HighResNow() - start).InMillisecondsF(); + printf("VectorFMAC_C took %.2fms.\n", total_time_c_ms); + +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) + // Benchmark VectorFMAC_SSE() with unaligned size; I.e., size % 4 != 0. 
+ ASSERT_NE((kHighLatencyBufferSize - 1) % 4, 0); + std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize, + kOutputFillValue); + start = base::TimeTicks::HighResNow(); + for (int j = 0; j < vector_fmac_iterations; ++j) { + mixer.VectorFMAC_SSE(input_vector.get(), M_PI, kHighLatencyBufferSize - 1, + output_vector.get()); + } + double total_time_sse_unaligned_ms = + (base::TimeTicks::HighResNow() - start).InMillisecondsF(); + printf("VectorFMAC_SSE (unaligned size) took %.2fms; which is %.2fx faster" + " than VectorFMAC_C.\n", total_time_sse_unaligned_ms, + total_time_c_ms / total_time_sse_unaligned_ms); + + // Benchmark VectorFMAC_SSE() with aligned size; I.e., size % 4 == 0. + ASSERT_EQ(kHighLatencyBufferSize % 4, 0); + std::fill(output_vector.get(), output_vector.get() + kHighLatencyBufferSize, + kOutputFillValue); + start = base::TimeTicks::HighResNow(); + for (int j = 0; j < vector_fmac_iterations; ++j) { + mixer.VectorFMAC_SSE(input_vector.get(), M_PI, kHighLatencyBufferSize, + output_vector.get()); + } + double total_time_sse_aligned_ms = + (base::TimeTicks::HighResNow() - start).InMillisecondsF(); + printf("VectorFMAC_SSE (aligned size) took %.2fms; which is %.2fx faster than" + " VectorFMAC_C and %.2fx faster than VectorFMAC_SSE (unaligned size)." + "\n", + total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms, + total_time_sse_unaligned_ms / total_time_sse_aligned_ms); +#endif +} + // Tuple of <input sampling rate, output sampling rate, epsilon>. typedef std::tr1::tuple<int, int, double> AudioRendererMixerTestData; class AudioRendererMixerTest |