diff options
author | dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-09-26 20:12:44 +0000 |
---|---|---|
committer | dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-09-26 20:12:44 +0000 |
commit | 176316bd19b6b2472444b23a9b7069b6cd69c33e (patch) | |
tree | 7ffd421756475a5bce034e3220f3425492887300 /media | |
parent | 8e7ea1d6b76da317a70949868176a5d795994a2f (diff) | |
download | chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.zip chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.gz chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.bz2 |
Add ARM NEON intrinsic optimizations for SincResampler.
On an Exynos board, these yielded an ~2.3x speedup:
Benchmarking 50000000 iterations:
Convolve_C took 5682.71ms.
Convolve_NEON(unaligned) took 2451.18ms; which is 2.32x faster than Convolve_C.
Convolve_NEON (aligned) took 2397.01ms; which is 2.37x faster than Convolve_C and 1.02x faster than Convolve_NEON (unaligned).
BUG=none
TEST=try bot, fischman.
Review URL: https://codereview.chromium.org/10960023
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@158870 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- | media/base/sinc_resampler.cc | 43 | ||||
-rw-r--r-- | media/base/sinc_resampler.h | 9 | ||||
-rw-r--r-- | media/base/sinc_resampler_unittest.cc | 60 | ||||
-rw-r--r-- | media/media.gyp | 10 |
4 files changed, 94 insertions, 28 deletions
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc index 869af1b..9352fe3 100644 --- a/media/base/sinc_resampler.cc +++ b/media/base/sinc_resampler.cc @@ -36,13 +36,19 @@ #include "media/base/sinc_resampler.h" -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -#include <xmmintrin.h> -#endif #include <cmath> #include "base/cpu.h" #include "base/logging.h" +#include "build/build_config.h" + +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#include <xmmintrin.h> +#endif + +#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +#include <arm_neon.h> +#endif namespace media { @@ -231,6 +237,8 @@ float SincResampler::Convolve(const float* input_ptr, const float* k1, #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) static const ConvolveProc kConvolveProc = base::CPU().has_sse() ? Convolve_SSE : Convolve_C; +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) + static const ConvolveProc kConvolveProc = Convolve_NEON; #else static const ConvolveProc kConvolveProc = Convolve_C; #endif @@ -301,4 +309,33 @@ float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, } #endif +#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + float32x4_t m_input; + float32x4_t m_sums1 = vmovq_n_f32(0); + float32x4_t m_sums2 = vmovq_n_f32(0); + + const float* upper = input_ptr + kKernelSize; + for (; input_ptr < upper; ) { + m_input = vld1q_f32(input_ptr); + input_ptr += 4; + m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); + k1 += 4; + m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); + k2 += 4; + } + + // Linearly interpolate the two "convolutions". + m_sums1 = vmlaq_f32( + vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), + m_sums2, vmovq_n_f32(kernel_interpolation_factor)); + + // Sum components together. 
+ float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); + return vget_lane_f32(vpadd_f32(m_half, m_half), 0); +} +#endif + } // namespace media diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h index ef2f176..604192f 100644 --- a/media/base/sinc_resampler.h +++ b/media/base/sinc_resampler.h @@ -9,7 +9,6 @@ #include "base/gtest_prod_util.h" #include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" -#include "build/build_config.h" #include "media/base/media_export.h" namespace media { @@ -45,8 +44,9 @@ class MEDIA_EXPORT SincResampler { void InitializeKernel(); // Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are - // linearly interpolated using |kernel_interpolation_factor|. The underlying - // implementation is chosen at run time based on SSE support. + // linearly interpolated using |kernel_interpolation_factor|. On x86, the + // underlying implementation is chosen at run time based on SSE support. On + // ARM, NEON support is chosen at compile time based on compilation flags. static float Convolve(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); static float Convolve_C(const float* input_ptr, const float* k1, @@ -54,6 +54,9 @@ class MEDIA_EXPORT SincResampler { static float Convolve_SSE(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); + static float Convolve_NEON(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor); // The ratio of input / output sample rates. 
double io_sample_rate_ratio_; diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc index 77a963e..59a9f81 100644 --- a/media/base/sinc_resampler_unittest.cc +++ b/media/base/sinc_resampler_unittest.cc @@ -12,7 +12,9 @@ #include "base/command_line.h" #include "base/logging.h" #include "base/string_number_conversions.h" +#include "base/stringize_macros.h" #include "base/time.h" +#include "build/build_config.h" #include "media/base/sinc_resampler.h" #include "testing/gmock/include/gmock/gmock.h" #include "testing/gtest/include/gtest/gtest.h" @@ -38,7 +40,10 @@ ACTION(ClearBuffer) { } ACTION(FillBuffer) { - memset(arg0, 1, arg1 * sizeof(float)); + // Value chosen arbitrarily such that SincResampler resamples it to something + // easily representable on all platforms; e.g., using kSampleRateRatio this + // becomes 1.81219. + memset(arg0, 64, arg1 * sizeof(float)); } // Test requesting multiples of ChunkSize() frames results in the proper number @@ -89,13 +94,20 @@ TEST(SincResamplerTest, Flush) { .Times(1).WillOnce(ClearBuffer()); resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2); for (int i = 0; i < resampler.ChunkSize() / 2; ++i) - ASSERT_EQ(resampled_destination[i], 0); + ASSERT_FLOAT_EQ(resampled_destination[i], 0); } +// Define platform independent function name for Convolve* tests. +#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#define CONVOLVE_FUNC Convolve_SSE +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +#define CONVOLVE_FUNC Convolve_NEON +#endif + // Ensure various optimized Convolve() methods return the same value. Only run // this test if other optimized methods exist, otherwise the default Convolve() // will be tested by the parameterized SincResampler tests below. -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) +#if defined(CONVOLVE_FUNC) TEST(SincResamplerTest, Convolve) { // Initialize a dummy resampler. 
MockSource mock_source; @@ -103,8 +115,8 @@ TEST(SincResamplerTest, Convolve) { kSampleRateRatio, base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source))); - // Convolve_SSE() is slightly more precise than Convolve_C(), so comparison - // must be done using an epsilon. + // The optimized Convolve methods are slightly more precise than Convolve_C(), + // so comparison must be done using an epsilon. static const double kEpsilon = 0.00000005; // Use a kernel from SincResampler as input and kernel data, this has the @@ -112,16 +124,16 @@ TEST(SincResamplerTest, Convolve) { double result = resampler.Convolve_C( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); - double result2 = resampler.Convolve_SSE( + double result2 = resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); EXPECT_NEAR(result2, result, kEpsilon); - // Test Convolve_SSE() w/ unaligned input pointer. + // Test Convolve() w/ unaligned input pointer. result = resampler.Convolve_C( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); - result2 = resampler.Convolve_SSE( + result2 = resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); EXPECT_NEAR(result2, result, kEpsilon); @@ -158,36 +170,40 @@ TEST(SincResamplerTest, ConvolveBenchmark) { (base::TimeTicks::HighResNow() - start).InMillisecondsF(); printf("Convolve_C took %.2fms.\n", total_time_c_ms); -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) - // Benchmark Convolve_SSE() with unaligned input pointer. +#if defined(CONVOLVE_FUNC) + // Benchmark with unaligned input pointer. 
start = base::TimeTicks::HighResNow(); for (int j = 0; j < convolve_iterations; ++j) { - resampler.Convolve_SSE( + resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); } - double total_time_sse_unaligned_ms = + double total_time_optimized_unaligned_ms = (base::TimeTicks::HighResNow() - start).InMillisecondsF(); - printf("Convolve_SSE (unaligned) took %.2fms; which is %.2fx faster than" - " Convolve_C.\n", total_time_sse_unaligned_ms, - total_time_c_ms / total_time_sse_unaligned_ms); + printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx " + "faster than Convolve_C.\n", total_time_optimized_unaligned_ms, + total_time_c_ms / total_time_optimized_unaligned_ms); - // Benchmark Convolve_SSE() with aligned input pointer. + // Benchmark with aligned input pointer. start = base::TimeTicks::HighResNow(); for (int j = 0; j < convolve_iterations; ++j) { - resampler.Convolve_SSE( + resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); } - double total_time_sse_aligned_ms = + double total_time_optimized_aligned_ms = (base::TimeTicks::HighResNow() - start).InMillisecondsF(); - printf("Convolve_SSE (aligned) took %.2fms; which is %.2fx faster than" - " Convolve_C and %.2fx faster than Convolve_SSE (unaligned).\n", - total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms, - total_time_sse_unaligned_ms / total_time_sse_aligned_ms); + printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx " + "faster than Convolve_C and %.2fx faster than " + STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n", + total_time_optimized_aligned_ms, + total_time_c_ms / total_time_optimized_aligned_ms, + total_time_optimized_unaligned_ms / total_time_optimized_aligned_ms); #endif } +#undef CONVOLVE_FUNC + // Fake audio source for testing the resampler. 
Generates a sinusoidal linear // chirp (http://en.wikipedia.org/wiki/Chirp) which can be tuned to stress the // resampler for the specific sample rate conversion being used. diff --git a/media/media.gyp b/media/media.gyp index 02d9ca5..9603c2d 100644 --- a/media/media.gyp +++ b/media/media.gyp @@ -327,6 +327,11 @@ ], }, 'conditions': [ + ['arm_neon == 1', { + 'defines': [ + 'USE_NEON' + ], + }], ['OS != "ios"', { 'dependencies': [ '../base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations', @@ -648,6 +653,11 @@ 'webm/webm_parser_unittest.cc', ], 'conditions': [ + ['arm_neon == 1', { + 'defines': [ + 'USE_NEON' + ], + }], ['OS != "ios"', { 'dependencies': [ 'shared_memory_support', |