diff options
author | dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-04-23 18:39:48 +0000 |
---|---|---|
committer | dalecurtis@google.com <dalecurtis@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-04-23 18:39:48 +0000 |
commit | 7218e4ead141aaf8c3369705e683b54e9abcdaeb (patch) | |
tree | c66d0ab8ba44ddf2d0ac48dee3f502fbf44664b4 /media/base/vector_math.cc | |
parent | 8b257c9762e9051c1d4e794900d1084fd81913cd (diff) | |
download | chromium_src-7218e4ead141aaf8c3369705e683b54e9abcdaeb.zip chromium_src-7218e4ead141aaf8c3369705e683b54e9abcdaeb.tar.gz chromium_src-7218e4ead141aaf8c3369705e683b54e9abcdaeb.tar.bz2 |
Add NEON optimizations for FMAC and FMUL.
Benchmarks show a 2.60x and 2.77x speedup for FMAC and FMUL
respectively.
BUG=none
TEST=media_unittests
Review URL: https://codereview.chromium.org/14188032
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@195855 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base/vector_math.cc')
-rw-r--r-- | media/base/vector_math.cc | 37 |
1 files changed, 37 insertions, 0 deletions
diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc
index f534d92..603ae0b 100644
--- a/media/base/vector_math.cc
+++ b/media/base/vector_math.cc
@@ -7,6 +7,11 @@
 
 #include "base/cpu.h"
 #include "base/logging.h"
+#include "build/build_config.h"
+
+#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+#include <arm_neon.h>
+#endif
 
 namespace media {
 namespace vector_math {
@@ -31,6 +36,8 @@ void FMAC(const float src[], float scale, int len, float dest[]) {
   static const VectorFMACProc kVectorFMACProc =
       base::CPU().has_sse() ? FMAC_SSE : FMAC_C;
 #endif
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+  static const VectorFMACProc kVectorFMACProc = FMAC_NEON;
 #else
   static const VectorFMACProc kVectorFMACProc = FMAC_C;
 #endif
@@ -63,6 +70,8 @@ void FMUL(const float src[], float scale, int len, float dest[]) {
   static const VectorFMULProc kVectorFMULProc =
       base::CPU().has_sse() ? FMUL_SSE : FMUL_C;
 #endif
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+  static const VectorFMULProc kVectorFMULProc = FMUL_NEON;
 #else
   static const VectorFMULProc kVectorFMULProc = FMUL_C;
 #endif
@@ -75,5 +84,33 @@ void FMUL_C(const float src[], float scale, int len, float dest[]) {
     dest[i] = src[i] * scale;
 }
 
+#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
+  const int rem = len % 4;
+  const int last_index = len - rem;
+  float32x4_t m_scale = vmovq_n_f32(scale);
+  for (int i = 0; i < last_index; i += 4) {
+    vst1q_f32(dest + i, vmlaq_f32(
+        vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
+  }
+
+  // Handle any remaining values that wouldn't fit in an NEON pass.
+  for (int i = last_index; i < len; ++i)
+    dest[i] += src[i] * scale;
+}
+
+void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
+  const int rem = len % 4;
+  const int last_index = len - rem;
+  float32x4_t m_scale = vmovq_n_f32(scale);
+  for (int i = 0; i < last_index; i += 4)
+    vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));
+
+  // Handle any remaining values that wouldn't fit in an NEON pass.
+  for (int i = last_index; i < len; ++i)
+    dest[i] = src[i] * scale;
+}
+#endif
+
 }  // namespace vector_math
 }  // namespace media