diff options
author | dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-05-31 00:50:51 +0000 |
---|---|---|
committer | dalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-05-31 00:50:51 +0000 |
commit | 62f909dc4da6185d52ff88a093d4fe4f2a43d6f0 (patch) | |
tree | d88e687fc596befb3d2dd04dfb2b574442b8ab4e /media/base | |
parent | 0776bb9e6646e05fdc1e5ead47af1d0c44ef56fd (diff) | |
download | chromium_src-62f909dc4da6185d52ff88a093d4fe4f2a43d6f0.zip chromium_src-62f909dc4da6185d52ff88a093d4fe4f2a43d6f0.tar.gz chromium_src-62f909dc4da6185d52ff88a093d4fe4f2a43d6f0.tar.bz2 |
Remove runtime CPU detection for SSE optimized media/ methods.
SSE2 is now the baseline for X86 platforms.
BUG=378608
TEST=compiles
Review URL: https://codereview.chromium.org/308003004
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@273964 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base')
-rw-r--r-- | media/base/media.cc | 5 | ||||
-rw-r--r-- | media/base/simd/sinc_resampler_sse.cc | 48 | ||||
-rw-r--r-- | media/base/simd/vector_math_sse.cc | 118 | ||||
-rw-r--r-- | media/base/sinc_resampler.cc | 80 | ||||
-rw-r--r-- | media/base/sinc_resampler.h | 4 | ||||
-rw-r--r-- | media/base/sinc_resampler_perftest.cc | 4 | ||||
-rw-r--r-- | media/base/sinc_resampler_unittest.cc | 5 | ||||
-rw-r--r-- | media/base/vector_math.cc | 151 | ||||
-rw-r--r-- | media/base/vector_math.h | 5 | ||||
-rw-r--r-- | media/base/vector_math_perftest.cc | 35 | ||||
-rw-r--r-- | media/base/vector_math_testing.h | 2 | ||||
-rw-r--r-- | media/base/vector_math_unittest.cc | 4 |
12 files changed, 162 insertions, 299 deletions
diff --git a/media/base/media.cc b/media/base/media.cc index 75625fe..37fc02a 100644 --- a/media/base/media.cc +++ b/media/base/media.cc @@ -9,8 +9,6 @@ #include "base/path_service.h" #include "base/synchronization/lock.h" #include "build/build_config.h" -#include "media/base/sinc_resampler.h" -#include "media/base/vector_math.h" #include "media/base/yuv_convert.h" namespace media { @@ -44,9 +42,6 @@ class MediaInitializer { : initialized_(false), tried_initialize_(false) { // Perform initialization of libraries which require runtime CPU detection. - // TODO(dalecurtis): Add initialization of YUV, SincResampler. - vector_math::Initialize(); - SincResampler::InitializeCPUSpecificFeatures(); InitializeCPUSpecificYUVConversions(); } diff --git a/media/base/simd/sinc_resampler_sse.cc b/media/base/simd/sinc_resampler_sse.cc deleted file mode 100644 index f0aec1c..0000000 --- a/media/base/simd/sinc_resampler_sse.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2013 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "media/base/sinc_resampler.h" - -#include <xmmintrin.h> - -namespace media { - -float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - __m128 m_input; - __m128 m_sums1 = _mm_setzero_ps(); - __m128 m_sums2 = _mm_setzero_ps(); - - // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling - // these loops hurt performance in local testing. - if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_loadu_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } else { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_load_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } - - // Linearly interpolate the two "convolutions". - m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); - m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); - m_sums1 = _mm_add_ps(m_sums1, m_sums2); - - // Sum components together. - float result; - m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); - _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( - m_sums2, m_sums2, 1))); - - return result; -} - -} // namespace media diff --git a/media/base/simd/vector_math_sse.cc b/media/base/simd/vector_math_sse.cc deleted file mode 100644 index c212122..0000000 --- a/media/base/simd/vector_math_sse.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2013 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "media/base/vector_math_testing.h" - -#include <algorithm> - -#include <xmmintrin.h> // NOLINT - -namespace media { -namespace vector_math { - -void FMUL_SSE(const float src[], float scale, int len, float dest[]) { - const int rem = len % 4; - const int last_index = len - rem; - __m128 m_scale = _mm_set_ps1(scale); - for (int i = 0; i < last_index; i += 4) - _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale)); - - // Handle any remaining values that wouldn't fit in an SSE pass. - for (int i = last_index; i < len; ++i) - dest[i] = src[i] * scale; -} - -void FMAC_SSE(const float src[], float scale, int len, float dest[]) { - const int rem = len % 4; - const int last_index = len - rem; - __m128 m_scale = _mm_set_ps1(scale); - for (int i = 0; i < last_index; i += 4) { - _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i), - _mm_mul_ps(_mm_load_ps(src + i), m_scale))); - } - - // Handle any remaining values that wouldn't fit in an SSE pass. - for (int i = last_index; i < len; ++i) - dest[i] += src[i] * scale; -} - -// Convenience macro to extract float 0 through 3 from the vector |a|. This is -// needed because compilers other than clang don't support access via -// operator[](). -#define EXTRACT_FLOAT(a, i) \ - (i == 0 ? \ - _mm_cvtss_f32(a) : \ - _mm_cvtss_f32(_mm_shuffle_ps(a, a, i))) - -std::pair<float, float> EWMAAndMaxPower_SSE( - float initial_value, const float src[], int len, float smoothing_factor) { - // When the recurrence is unrolled, we see that we can split it into 4 - // separate lanes of evaluation: - // - // y[n] = a(S[n]^2) + (1-a)(y[n-1]) - // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ... - // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) - // - // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ... - // - // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in - // each of the 4 lanes, and then combine them to give y[n]. - - const int rem = len % 4; - const int last_index = len - rem; - - const __m128 smoothing_factor_x4 = _mm_set_ps1(smoothing_factor); - const float weight_prev = 1.0f - smoothing_factor; - const __m128 weight_prev_x4 = _mm_set_ps1(weight_prev); - const __m128 weight_prev_squared_x4 = - _mm_mul_ps(weight_prev_x4, weight_prev_x4); - const __m128 weight_prev_4th_x4 = - _mm_mul_ps(weight_prev_squared_x4, weight_prev_squared_x4); - - // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and - // 0, respectively. - __m128 max_x4 = _mm_setzero_ps(); - __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value); - int i; - for (i = 0; i < last_index; i += 4) { - ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4); - const __m128 sample_x4 = _mm_load_ps(src + i); - const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4); - max_x4 = _mm_max_ps(max_x4, sample_squared_x4); - // Note: The compiler optimizes this to a single multiply-and-accumulate - // instruction: - ewma_x4 = _mm_add_ps(ewma_x4, - _mm_mul_ps(sample_squared_x4, smoothing_factor_x4)); - } - - // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) - float ewma = EXTRACT_FLOAT(ewma_x4, 3); - ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); - ewma += EXTRACT_FLOAT(ewma_x4, 2); - ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); - ewma += EXTRACT_FLOAT(ewma_x4, 1); - ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4); - ewma += EXTRACT_FLOAT(ewma_x4, 0); - - // Fold the maximums together to get the overall maximum. - max_x4 = _mm_max_ps(max_x4, - _mm_shuffle_ps(max_x4, max_x4, _MM_SHUFFLE(3, 3, 1, 1))); - max_x4 = _mm_max_ss(max_x4, _mm_shuffle_ps(max_x4, max_x4, 2)); - - std::pair<float, float> result(ewma, EXTRACT_FLOAT(max_x4, 0)); - - // Handle remaining values at the end of |src|. - for (; i < len; ++i) { - result.first *= weight_prev; - const float sample = src[i]; - const float sample_squared = sample * sample; - result.first += sample_squared * smoothing_factor; - result.second = std::max(result.second, sample_squared); - } - - return result; -} - -} // namespace vector_math -} // namespace media diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc index d3d494d..900648e 100644 --- a/media/base/sinc_resampler.cc +++ b/media/base/sinc_resampler.cc @@ -81,11 +81,16 @@ #include <cmath> #include <limits> -#include "base/cpu.h" #include "base/logging.h" -#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +#if defined(ARCH_CPU_X86_FAMILY) +#include <xmmintrin.h> +#define CONVOLVE_FUNC Convolve_SSE +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) #include <arm_neon.h> +#define CONVOLVE_FUNC Convolve_NEON +#else +#define CONVOLVE_FUNC Convolve_C #endif namespace media { @@ -106,36 +111,6 @@ static double SincScaleFactor(double io_ratio) { return sinc_scale_factor; } -// If we know the minimum architecture at compile time, avoid CPU detection. -// Force NaCl code to use C routines since (at present) nothing there uses these -// methods and plumbing the -msse built library is non-trivial. -#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) -#if defined(__SSE__) -#define CONVOLVE_FUNC Convolve_SSE -void SincResampler::InitializeCPUSpecificFeatures() {} -#else -// X86 CPU detection required. Functions will be set by -// InitializeCPUSpecificFeatures(). -// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. -#define CONVOLVE_FUNC g_convolve_proc_ - -typedef float (*ConvolveProc)(const float*, const float*, const float*, double); -static ConvolveProc g_convolve_proc_ = NULL; - -void SincResampler::InitializeCPUSpecificFeatures() { - CHECK(!g_convolve_proc_); - g_convolve_proc_ = base::CPU().has_sse() ? Convolve_SSE : Convolve_C; -} -#endif -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) -#define CONVOLVE_FUNC Convolve_NEON -void SincResampler::InitializeCPUSpecificFeatures() {} -#else -// Unknown architecture. -#define CONVOLVE_FUNC Convolve_C -void SincResampler::InitializeCPUSpecificFeatures() {} -#endif - SincResampler::SincResampler(double io_sample_rate_ratio, int request_frames, const ReadCB& read_cb) @@ -321,8 +296,6 @@ void SincResampler::Resample(int frames, float* destination) { } } -#undef CONVOLVE_FUNC - int SincResampler::ChunkSize() const { return block_size_ / io_sample_rate_ratio_; } @@ -354,7 +327,44 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1, + kernel_interpolation_factor * sum2; } -#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +#if defined(ARCH_CPU_X86_FAMILY) +float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m128 m_input; + __m128 m_sums1 = _mm_setzero_ps(); + __m128 m_sums2 = _mm_setzero_ps(); + + // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling + // these loops hurt performance in local testing. + if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_loadu_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } else { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_load_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } + + // Linearly interpolate the two "convolutions". + m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); + m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); + m_sums1 = _mm_add_ps(m_sums1, m_sums2); + + // Sum components together. + float result; + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); + _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( + m_sums2, m_sums2, 1))); + + return result; +} +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h index af9a302..79db853 100644 --- a/media/base/sinc_resampler.h +++ b/media/base/sinc_resampler.h @@ -34,10 +34,6 @@ class MEDIA_EXPORT SincResampler { kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), }; - // Selects runtime specific CPU features like SSE. Must be called before - // using SincResampler. - static void InitializeCPUSpecificFeatures(); - // Callback type for providing more data into the resampler. Expects |frames| // of data to be rendered into |destination|; zero padded if not enough frames // are available to satisfy the request. diff --git a/media/base/sinc_resampler_perftest.cc b/media/base/sinc_resampler_perftest.cc index 21c6ec3..b54056a 100644 --- a/media/base/sinc_resampler_perftest.cc +++ b/media/base/sinc_resampler_perftest.cc @@ -4,7 +4,6 @@ #include "base/bind.h" #include "base/bind_helpers.h" -#include "base/cpu.h" #include "base/time/time.h" #include "media/base/sinc_resampler.h" #include "testing/gmock/include/gmock/gmock.h" @@ -61,9 +60,6 @@ TEST(SincResamplerPerfTest, Convolve) { &resampler, SincResampler::Convolve_C, true, "unoptimized_aligned"); #if defined(CONVOLVE_FUNC) -#if defined(ARCH_CPU_X86_FAMILY) - ASSERT_TRUE(base::CPU().has_sse()); -#endif RunConvolveBenchmark( &resampler, SincResampler::CONVOLVE_FUNC, true, "optimized_aligned"); RunConvolveBenchmark( diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc index 3b460a3..c0f9d98 100644 --- a/media/base/sinc_resampler_unittest.cc +++ b/media/base/sinc_resampler_unittest.cc @@ -9,7 +9,6 @@ #include "base/bind.h" #include "base/bind_helpers.h" -#include "base/cpu.h" #include "base/strings/string_number_conversions.h" #include "base/time/time.h" #include "build/build_config.h" @@ -121,10 +120,6 @@ TEST(SincResamplerTest, DISABLED_SetRatioBench) { static const double kKernelInterpolationFactor = 0.5; TEST(SincResamplerTest, Convolve) { -#if defined(ARCH_CPU_X86_FAMILY) - ASSERT_TRUE(base::CPU().has_sse()); -#endif - // Initialize a dummy resampler. MockSource mock_source; SincResampler resampler( diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc index 6152204..71721b6 100644 --- a/media/base/vector_math.cc +++ b/media/base/vector_math.cc @@ -7,63 +7,29 @@ #include <algorithm> -#include "base/cpu.h" #include "base/logging.h" #include "build/build_config.h" -#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) -#include <arm_neon.h> -#endif - -namespace media { -namespace vector_math { - -// If we know the minimum architecture at compile time, avoid CPU detection. -// Force NaCl code to use C routines since (at present) nothing there uses these -// methods and plumbing the -msse built library is non-trivial. +// NaCl does not allow intrinsics. #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) -#if defined(__SSE__) +#include <xmmintrin.h> #define FMAC_FUNC FMAC_SSE #define FMUL_FUNC FMUL_SSE #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE -void Initialize() {} -#else -// X86 CPU detection required. Functions will be set by Initialize(). -// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. -#define FMAC_FUNC g_fmac_proc_ -#define FMUL_FUNC g_fmul_proc_ -#define EWMAAndMaxPower_FUNC g_ewma_power_proc_ - -typedef void (*MathProc)(const float src[], float scale, int len, float dest[]); -static MathProc g_fmac_proc_ = NULL; -static MathProc g_fmul_proc_ = NULL; -typedef std::pair<float, float> (*EWMAAndMaxPowerProc)( - float initial_value, const float src[], int len, float smoothing_factor); -static EWMAAndMaxPowerProc g_ewma_power_proc_ = NULL; - -void Initialize() { - CHECK(!g_fmac_proc_); - CHECK(!g_fmul_proc_); - CHECK(!g_ewma_power_proc_); - const bool kUseSSE = base::CPU().has_sse(); - g_fmac_proc_ = kUseSSE ? FMAC_SSE : FMAC_C; - g_fmul_proc_ = kUseSSE ? FMUL_SSE : FMUL_C; - g_ewma_power_proc_ = kUseSSE ? EWMAAndMaxPower_SSE : EWMAAndMaxPower_C; -} -#endif #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +#include <arm_neon.h> #define FMAC_FUNC FMAC_NEON #define FMUL_FUNC FMUL_NEON #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON -void Initialize() {} #else -// Unknown architecture. #define FMAC_FUNC FMAC_C #define FMUL_FUNC FMUL_C #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C -void Initialize() {} #endif +namespace media { +namespace vector_math { + void FMAC(const float src[], float scale, int len, float dest[]) { // Ensure |src| and |dest| are 16-byte aligned. DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); @@ -116,6 +82,111 @@ std::pair<float, float> EWMAAndMaxPower_C( return result; } +#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) +void FMUL_SSE(const float src[], float scale, int len, float dest[]) { + const int rem = len % 4; + const int last_index = len - rem; + __m128 m_scale = _mm_set_ps1(scale); + for (int i = 0; i < last_index; i += 4) + _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale)); + + // Handle any remaining values that wouldn't fit in an SSE pass. + for (int i = last_index; i < len; ++i) + dest[i] = src[i] * scale; +} + +void FMAC_SSE(const float src[], float scale, int len, float dest[]) { + const int rem = len % 4; + const int last_index = len - rem; + __m128 m_scale = _mm_set_ps1(scale); + for (int i = 0; i < last_index; i += 4) { + _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i), + _mm_mul_ps(_mm_load_ps(src + i), m_scale))); + } + + // Handle any remaining values that wouldn't fit in an SSE pass. + for (int i = last_index; i < len; ++i) + dest[i] += src[i] * scale; +} + +// Convenience macro to extract float 0 through 3 from the vector |a|. This is +// needed because compilers other than clang don't support access via +// operator[](). +#define EXTRACT_FLOAT(a, i) \ + (i == 0 ? \ + _mm_cvtss_f32(a) : \ + _mm_cvtss_f32(_mm_shuffle_ps(a, a, i))) + +std::pair<float, float> EWMAAndMaxPower_SSE( + float initial_value, const float src[], int len, float smoothing_factor) { + // When the recurrence is unrolled, we see that we can split it into 4 + // separate lanes of evaluation: + // + // y[n] = a(S[n]^2) + (1-a)(y[n-1]) + // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ... + // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) + // + // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ... + // + // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in + // each of the 4 lanes, and then combine them to give y[n]. + + const int rem = len % 4; + const int last_index = len - rem; + + const __m128 smoothing_factor_x4 = _mm_set_ps1(smoothing_factor); + const float weight_prev = 1.0f - smoothing_factor; + const __m128 weight_prev_x4 = _mm_set_ps1(weight_prev); + const __m128 weight_prev_squared_x4 = + _mm_mul_ps(weight_prev_x4, weight_prev_x4); + const __m128 weight_prev_4th_x4 = + _mm_mul_ps(weight_prev_squared_x4, weight_prev_squared_x4); + + // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and + // 0, respectively. + __m128 max_x4 = _mm_setzero_ps(); + __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value); + int i; + for (i = 0; i < last_index; i += 4) { + ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4); + const __m128 sample_x4 = _mm_load_ps(src + i); + const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4); + max_x4 = _mm_max_ps(max_x4, sample_squared_x4); + // Note: The compiler optimizes this to a single multiply-and-accumulate + // instruction: + ewma_x4 = _mm_add_ps(ewma_x4, + _mm_mul_ps(sample_squared_x4, smoothing_factor_x4)); + } + + // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) + float ewma = EXTRACT_FLOAT(ewma_x4, 3); + ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); + ewma += EXTRACT_FLOAT(ewma_x4, 2); + ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); + ewma += EXTRACT_FLOAT(ewma_x4, 1); + ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4); + ewma += EXTRACT_FLOAT(ewma_x4, 0); + + // Fold the maximums together to get the overall maximum. + max_x4 = _mm_max_ps(max_x4, + _mm_shuffle_ps(max_x4, max_x4, _MM_SHUFFLE(3, 3, 1, 1))); + max_x4 = _mm_max_ss(max_x4, _mm_shuffle_ps(max_x4, max_x4, 2)); + + std::pair<float, float> result(ewma, EXTRACT_FLOAT(max_x4, 0)); + + // Handle remaining values at the end of |src|. + for (; i < len; ++i) { + result.first *= weight_prev; + const float sample = src[i]; + const float sample_squared = sample * sample; + result.first += sample_squared * smoothing_factor; + result.second = std::max(result.second, sample_squared); + } + + return result; +} +#endif + #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) void FMAC_NEON(const float src[], float scale, int len, float dest[]) { const int rem = len % 4; diff --git a/media/base/vector_math.h b/media/base/vector_math.h index 0a2cb06..a148ca0 100644 --- a/media/base/vector_math.h +++ b/media/base/vector_math.h @@ -15,11 +15,6 @@ namespace vector_math { // Required alignment for inputs and outputs to all vector math functions enum { kRequiredAlignment = 16 }; -// Selects runtime specific optimizations such as SSE. Must be called prior to -// calling FMAC() or FMUL(). Called during media library initialization; most -// users should never have to call this. -MEDIA_EXPORT void Initialize(); - // Multiply each element of |src| (up to |len|) by |scale| and add to |dest|. // |src| and |dest| must be aligned by kRequiredAlignment. MEDIA_EXPORT void FMAC(const float src[], float scale, int len, float dest[]); diff --git a/media/base/vector_math_perftest.cc b/media/base/vector_math_perftest.cc index 6adcfa6..2cf4691 100644 --- a/media/base/vector_math_perftest.cc +++ b/media/base/vector_math_perftest.cc @@ -2,7 +2,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "base/cpu.h" #include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" #include "base/time/time.h" @@ -80,11 +79,15 @@ class VectorMathPerfTest : public testing::Test { DISALLOW_COPY_AND_ASSIGN(VectorMathPerfTest); }; -// Define platform independent function name for FMAC* perf tests. +// Define platform dependent function names for SIMD optimized methods. #if defined(ARCH_CPU_X86_FAMILY) #define FMAC_FUNC FMAC_SSE +#define FMUL_FUNC FMUL_SSE +#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) #define FMAC_FUNC FMAC_NEON +#define FMUL_FUNC FMUL_NEON +#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON #endif // Benchmark for each optimized vector_math::FMAC() method. @@ -93,9 +96,6 @@ TEST_F(VectorMathPerfTest, FMAC) { RunBenchmark( vector_math::FMAC_C, true, "vector_math_fmac", "unoptimized"); #if defined(FMAC_FUNC) -#if defined(ARCH_CPU_X86_FAMILY) - ASSERT_TRUE(base::CPU().has_sse()); -#endif // Benchmark FMAC_FUNC() with unaligned size. ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment / sizeof(float)), 0U); @@ -109,24 +109,12 @@ TEST_F(VectorMathPerfTest, FMAC) { #endif } -#undef FMAC_FUNC - -// Define platform independent function name for FMULBenchmark* tests. -#if defined(ARCH_CPU_X86_FAMILY) -#define FMUL_FUNC FMUL_SSE -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) -#define FMUL_FUNC FMUL_NEON -#endif - // Benchmark for each optimized vector_math::FMUL() method. TEST_F(VectorMathPerfTest, FMUL) { // Benchmark FMUL_C(). RunBenchmark( vector_math::FMUL_C, true, "vector_math_fmul", "unoptimized"); #if defined(FMUL_FUNC) -#if defined(ARCH_CPU_X86_FAMILY) - ASSERT_TRUE(base::CPU().has_sse()); -#endif // Benchmark FMUL_FUNC() with unaligned size. ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment / sizeof(float)), 0U); @@ -140,14 +128,6 @@ TEST_F(VectorMathPerfTest, FMUL) { #endif } -#undef FMUL_FUNC - -#if defined(ARCH_CPU_X86_FAMILY) -#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) -#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON -#endif - // Benchmark for each optimized vector_math::EWMAAndMaxPower() method. TEST_F(VectorMathPerfTest, EWMAAndMaxPower) { // Benchmark EWMAAndMaxPower_C(). @@ -156,9 +136,6 @@ TEST_F(VectorMathPerfTest, EWMAAndMaxPower) { "vector_math_ewma_and_max_power", "unoptimized"); #if defined(EWMAAndMaxPower_FUNC) -#if defined(ARCH_CPU_X86_FAMILY) - ASSERT_TRUE(base::CPU().has_sse()); -#endif // Benchmark EWMAAndMaxPower_FUNC() with unaligned size. ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment / sizeof(float)), 0U); @@ -176,6 +153,4 @@ TEST_F(VectorMathPerfTest, EWMAAndMaxPower) { #endif } -#undef EWMAAndMaxPower_FUNC - } // namespace media diff --git a/media/base/vector_math_testing.h b/media/base/vector_math_testing.h index b0b30440..9240fbf 100644 --- a/media/base/vector_math_testing.h +++ b/media/base/vector_math_testing.h @@ -19,7 +19,7 @@ MEDIA_EXPORT void FMUL_C(const float src[], float scale, int len, float dest[]); MEDIA_EXPORT std::pair<float, float> EWMAAndMaxPower_C( float initial_value, const float src[], int len, float smoothing_factor); -#if defined(ARCH_CPU_X86_FAMILY) +#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) MEDIA_EXPORT void FMAC_SSE(const float src[], float scale, int len, float dest[]); MEDIA_EXPORT void FMUL_SSE(const float src[], float scale, int len, diff --git a/media/base/vector_math_unittest.cc b/media/base/vector_math_unittest.cc index 95433ca..a936923 100644 --- a/media/base/vector_math_unittest.cc +++ b/media/base/vector_math_unittest.cc @@ -6,7 +6,6 @@ #define _USE_MATH_DEFINES #include <cmath> -#include "base/cpu.h" #include "base/memory/aligned_memory.h" #include "base/memory/scoped_ptr.h" #include "base/strings/string_number_conversions.h" @@ -76,7 +75,6 @@ TEST_F(VectorMathTest, FMAC) { #if defined(ARCH_CPU_X86_FAMILY) { - ASSERT_TRUE(base::CPU().has_sse()); SCOPED_TRACE("FMAC_SSE"); FillTestVectors(kInputFillValue, kOutputFillValue); vector_math::FMAC_SSE( @@ -118,7 +116,6 @@ TEST_F(VectorMathTest, FMUL) { #if defined(ARCH_CPU_X86_FAMILY) { - ASSERT_TRUE(base::CPU().has_sse()); SCOPED_TRACE("FMUL_SSE"); FillTestVectors(kInputFillValue, kOutputFillValue); vector_math::FMUL_SSE( @@ -227,7 +224,6 @@ class EWMATestScenario { #if defined(ARCH_CPU_X86_FAMILY) { - ASSERT_TRUE(base::CPU().has_sse()); SCOPED_TRACE("EWMAAndMaxPower_SSE"); const std::pair<float, float>& result = vector_math::EWMAAndMaxPower_SSE( initial_value_, data_.get(), data_len_, smoothing_factor_); |