summary | refs | log | tree | commit | diff | stats
path: root/media
diff options
context:
space:
mode:
author dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2012-09-26 20:12:44 +0000
committer dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2012-09-26 20:12:44 +0000
commit 176316bd19b6b2472444b23a9b7069b6cd69c33e (patch)
tree 7ffd421756475a5bce034e3220f3425492887300 /media
parent 8e7ea1d6b76da317a70949868176a5d795994a2f (diff)
download chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.zip
chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.gz
chromium_src-176316bd19b6b2472444b23a9b7069b6cd69c33e.tar.bz2
Add ARM NEON intrinsic optimizations for SincResampler.
On an exynos board these yielded an ~2.3x speedup: Benchmarking 50000000 iterations: Convolve_C took 5682.71ms. Convolve_NEON(unaligned) took 2451.18ms; which is 2.32x faster than Convolve_C. Convolve_NEON (aligned) took 2397.01ms; which is 2.37x faster than Convolve_C and 1.02x faster than Convolve_NEON (unaligned). BUG=none TEST=try bot, fischman. Review URL: https://codereview.chromium.org/10960023 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@158870 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media')
-rw-r--r-- media/base/sinc_resampler.cc 43
-rw-r--r-- media/base/sinc_resampler.h 9
-rw-r--r-- media/base/sinc_resampler_unittest.cc 60
-rw-r--r-- media/media.gyp 10
4 files changed, 94 insertions, 28 deletions
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc
index 869af1b..9352fe3 100644
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -36,13 +36,19 @@
#include "media/base/sinc_resampler.h"
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif
#include <cmath>
#include "base/cpu.h"
#include "base/logging.h"
+#include "build/build_config.h"
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+#include <arm_neon.h>
+#endif
namespace media {
@@ -231,6 +237,8 @@ float SincResampler::Convolve(const float* input_ptr, const float* k1,
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
static const ConvolveProc kConvolveProc =
base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+ static const ConvolveProc kConvolveProc = Convolve_NEON;
#else
static const ConvolveProc kConvolveProc = Convolve_C;
#endif
@@ -301,4 +309,33 @@ float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
}
#endif
+#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor) {
+ float32x4_t m_input;  // Four consecutive input samples for the current step.
+ float32x4_t m_sums1 = vmovq_n_f32(0);  // Per-lane running sums of input * k1.
+ float32x4_t m_sums2 = vmovq_n_f32(0);  // Per-lane running sums of input * k2.
+
+ const float* upper = input_ptr + kKernelSize;  // One past the last sample read.
+ for (; input_ptr < upper; ) {  // NOTE(review): steps by 4 floats; presumably kKernelSize is a multiple of 4 - confirm.
+ m_input = vld1q_f32(input_ptr);  // Load 4 input samples.
+ input_ptr += 4;
+ m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));  // m_sums1 += m_input * k1[0..3]
+ k1 += 4;
+ m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));  // m_sums2 += m_input * k2[0..3]
+ k2 += 4;
+ }
+
+ // Linearly interpolate the two "convolutions" lane by lane:
+ m_sums1 = vmlaq_f32(  // m_sums1 * (1 - factor) + m_sums2 * factor
+ vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),  // The double factor is narrowed to float here.
+ m_sums2, vmovq_n_f32(kernel_interpolation_factor));
+
+ // Sum the four lanes together: fold high and low halves, then add the pair.
+ float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));  // {s0 + s2, s1 + s3}
+ return vget_lane_f32(vpadd_f32(m_half, m_half), 0);  // Lane 0 holds the total of all four lanes.
+}
+#endif
+
} // namespace media
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index ef2f176..604192f 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -9,7 +9,6 @@
#include "base/gtest_prod_util.h"
#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
-#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
@@ -45,8 +44,9 @@ class MEDIA_EXPORT SincResampler {
void InitializeKernel();
// Compute convolution of |k1| and |k2| over |input_ptr|, resultant sums are
- // linearly interpolated using |kernel_interpolation_factor|. The underlying
- // implementation is chosen at run time based on SSE support.
+ // linearly interpolated using |kernel_interpolation_factor|. On x86, the
+ // underlying implementation is chosen at run time based on SSE support. On
+ // ARM, NEON support is chosen at compile time based on compilation flags.
static float Convolve(const float* input_ptr, const float* k1,
const float* k2, double kernel_interpolation_factor);
static float Convolve_C(const float* input_ptr, const float* k1,
@@ -54,6 +54,9 @@ class MEDIA_EXPORT SincResampler {
static float Convolve_SSE(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
+ static float Convolve_NEON(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor);
// The ratio of input / output sample rates.
double io_sample_rate_ratio_;
diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc
index 77a963e..59a9f81 100644
--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -12,7 +12,9 @@
#include "base/command_line.h"
#include "base/logging.h"
#include "base/string_number_conversions.h"
+#include "base/stringize_macros.h"
#include "base/time.h"
+#include "build/build_config.h"
#include "media/base/sinc_resampler.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
@@ -38,7 +40,10 @@ ACTION(ClearBuffer) {
}
ACTION(FillBuffer) {
- memset(arg0, 1, arg1 * sizeof(float));
+ // Value chosen arbitrarily such that SincResampler resamples it to something
+ // easily representable on all platforms; e.g., using kSampleRateRatio this
+ // becomes 1.81219.
+ memset(arg0, 64, arg1 * sizeof(float));
}
// Test requesting multiples of ChunkSize() frames results in the proper number
@@ -89,13 +94,20 @@ TEST(SincResamplerTest, Flush) {
.Times(1).WillOnce(ClearBuffer());
resampler.Resample(resampled_destination.get(), resampler.ChunkSize() / 2);
for (int i = 0; i < resampler.ChunkSize() / 2; ++i)
- ASSERT_EQ(resampled_destination[i], 0);
+ ASSERT_FLOAT_EQ(resampled_destination[i], 0);
}
+// Define platform independent function name for Convolve* tests.
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#define CONVOLVE_FUNC Convolve_SSE
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+#define CONVOLVE_FUNC Convolve_NEON
+#endif
+
// Ensure various optimized Convolve() methods return the same value. Only run
// this test if other optimized methods exist, otherwise the default Convolve()
// will be tested by the parameterized SincResampler tests below.
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(CONVOLVE_FUNC)
TEST(SincResamplerTest, Convolve) {
// Initialize a dummy resampler.
MockSource mock_source;
@@ -103,8 +115,8 @@ TEST(SincResamplerTest, Convolve) {
kSampleRateRatio,
base::Bind(&MockSource::ProvideInput, base::Unretained(&mock_source)));
- // Convolve_SSE() is slightly more precise than Convolve_C(), so comparison
- // must be done using an epsilon.
+ // The optimized Convolve methods are slightly more precise than Convolve_C(),
+ // so comparison must be done using an epsilon.
static const double kEpsilon = 0.00000005;
// Use a kernel from SincResampler as input and kernel data, this has the
@@ -112,16 +124,16 @@ TEST(SincResamplerTest, Convolve) {
double result = resampler.Convolve_C(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
- double result2 = resampler.Convolve_SSE(
+ double result2 = resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
EXPECT_NEAR(result2, result, kEpsilon);
- // Test Convolve_SSE() w/ unaligned input pointer.
+ // Test Convolve() w/ unaligned input pointer.
result = resampler.Convolve_C(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
- result2 = resampler.Convolve_SSE(
+ result2 = resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
EXPECT_NEAR(result2, result, kEpsilon);
@@ -158,36 +170,40 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
(base::TimeTicks::HighResNow() - start).InMillisecondsF();
printf("Convolve_C took %.2fms.\n", total_time_c_ms);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
- // Benchmark Convolve_SSE() with unaligned input pointer.
+#if defined(CONVOLVE_FUNC)
+ // Benchmark with unaligned input pointer.
start = base::TimeTicks::HighResNow();
for (int j = 0; j < convolve_iterations; ++j) {
- resampler.Convolve_SSE(
+ resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
}
- double total_time_sse_unaligned_ms =
+ double total_time_optimized_unaligned_ms =
(base::TimeTicks::HighResNow() - start).InMillisecondsF();
- printf("Convolve_SSE (unaligned) took %.2fms; which is %.2fx faster than"
- " Convolve_C.\n", total_time_sse_unaligned_ms,
- total_time_c_ms / total_time_sse_unaligned_ms);
+ printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx "
+ "faster than Convolve_C.\n", total_time_optimized_unaligned_ms,
+ total_time_c_ms / total_time_optimized_unaligned_ms);
- // Benchmark Convolve_SSE() with aligned input pointer.
+ // Benchmark with aligned input pointer.
start = base::TimeTicks::HighResNow();
for (int j = 0; j < convolve_iterations; ++j) {
- resampler.Convolve_SSE(
+ resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
}
- double total_time_sse_aligned_ms =
+ double total_time_optimized_aligned_ms =
(base::TimeTicks::HighResNow() - start).InMillisecondsF();
- printf("Convolve_SSE (aligned) took %.2fms; which is %.2fx faster than"
- " Convolve_C and %.2fx faster than Convolve_SSE (unaligned).\n",
- total_time_sse_aligned_ms, total_time_c_ms / total_time_sse_aligned_ms,
- total_time_sse_unaligned_ms / total_time_sse_aligned_ms);
+ printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx "
+ "faster than Convolve_C and %.2fx faster than "
+ STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n",
+ total_time_optimized_aligned_ms,
+ total_time_c_ms / total_time_optimized_aligned_ms,
+ total_time_optimized_unaligned_ms / total_time_optimized_aligned_ms);
#endif
}
+#undef CONVOLVE_FUNC
+
// Fake audio source for testing the resampler. Generates a sinusoidal linear
// chirp (http://en.wikipedia.org/wiki/Chirp) which can be tuned to stress the
// resampler for the specific sample rate conversion being used.
diff --git a/media/media.gyp b/media/media.gyp
index 02d9ca5..9603c2d 100644
--- a/media/media.gyp
+++ b/media/media.gyp
@@ -327,6 +327,11 @@
],
},
'conditions': [
+ ['arm_neon == 1', {
+ 'defines': [
+ 'USE_NEON'
+ ],
+ }],
['OS != "ios"', {
'dependencies': [
'../base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations',
@@ -648,6 +653,11 @@
'webm/webm_parser_unittest.cc',
],
'conditions': [
+ ['arm_neon == 1', {
+ 'defines': [
+ 'USE_NEON'
+ ],
+ }],
['OS != "ios"', {
'dependencies': [
'shared_memory_support',