summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--media/base/simd/sinc_resampler_sse.cc48
-rw-r--r--media/base/simd/vector_math_sse.cc27
-rw-r--r--media/base/sinc_resampler.cc89
-rw-r--r--media/base/sinc_resampler.h35
-rw-r--r--media/base/sinc_resampler_unittest.cc11
-rw-r--r--media/base/vector_math.cc26
-rw-r--r--media/base/vector_math_testing.h4
-rw-r--r--media/base/vector_math_unittest.cc8
-rw-r--r--media/media.gyp37
9 files changed, 175 insertions, 110 deletions
diff --git a/media/base/simd/sinc_resampler_sse.cc b/media/base/simd/sinc_resampler_sse.cc
new file mode 100644
index 0000000..f0aec1c
--- /dev/null
+++ b/media/base/simd/sinc_resampler_sse.cc
@@ -0,0 +1,48 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/sinc_resampler.h"
+
+#include <xmmintrin.h>
+
+namespace media {
+
+float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor) {
+ __m128 m_input;
+ __m128 m_sums1 = _mm_setzero_ps();
+ __m128 m_sums2 = _mm_setzero_ps();
+
+ // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
+ // these loops hurt performance in local testing.
+ if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
+ for (int i = 0; i < kKernelSize; i += 4) {
+ m_input = _mm_loadu_ps(input_ptr + i);
+ m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+ m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+ }
+ } else {
+ for (int i = 0; i < kKernelSize; i += 4) {
+ m_input = _mm_load_ps(input_ptr + i);
+ m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+ m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+ }
+ }
+
+ // Linearly interpolate the two "convolutions".
+ m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
+ m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
+ m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+ // Sum components together.
+ float result;
+ m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+ _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
+ m_sums2, m_sums2, 1)));
+
+ return result;
+}
+
+} // namespace media
diff --git a/media/base/simd/vector_math_sse.cc b/media/base/simd/vector_math_sse.cc
new file mode 100644
index 0000000..5cc2df9
--- /dev/null
+++ b/media/base/simd/vector_math_sse.cc
@@ -0,0 +1,27 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "media/base/vector_math_testing.h"
+
+#include <xmmintrin.h> // NOLINT
+
+namespace media {
+namespace vector_math {
+
+void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
+ const int rem = len % 4;
+ const int last_index = len - rem;
+ __m128 m_scale = _mm_set_ps1(scale);
+ for (int i = 0; i < last_index; i += 4) {
+ _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
+ _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
+ }
+
+ // Handle any remaining values that wouldn't fit in an SSE pass.
+ for (int i = last_index; i < len; ++i)
+ dest[i] += src[i] * scale;
+}
+
+} // namespace vector_math
+} // namespace media
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc
index d836fc7..09ff49d 100644
--- a/media/base/sinc_resampler.cc
+++ b/media/base/sinc_resampler.cc
@@ -40,11 +40,6 @@
#include "base/cpu.h"
#include "base/logging.h"
-#include "build/build_config.h"
-
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif
#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
#include <arm_neon.h>
@@ -52,33 +47,6 @@
namespace media {
-namespace {
-
-enum {
- // The kernel size can be adjusted for quality (higher is better) at the
- // expense of performance. Must be a multiple of 32.
- // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
- kKernelSize = 32,
-
- // The number of destination frames generated per processing pass. Affects
- // how often and for how much SincResampler calls back for input. Must be
- // greater than kKernelSize.
- kBlockSize = 512,
-
- // The kernel offset count is used for interpolation and is the number of
- // sub-sample kernel shifts. Can be adjusted for quality (higher is better)
- // at the expense of allocating more memory.
- kKernelOffsetCount = 32,
- kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
-
- // The size (in samples) of the internal buffer used by the resampler.
- kBufferSize = kBlockSize + kKernelSize
-};
-
-} // namespace
-
-const int SincResampler::kMaximumLookAheadSize = kBufferSize;
-
SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
: io_sample_rate_ratio_(io_sample_rate_ratio),
virtual_source_idx_(0),
@@ -222,7 +190,7 @@ void SincResampler::Resample(float* destination, int frames) {
}
}
-int SincResampler::ChunkSize() {
+int SincResampler::ChunkSize() const {
return kBlockSize / io_sample_rate_ratio_;
}
@@ -235,14 +203,23 @@ void SincResampler::Flush() {
float SincResampler::Convolve(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor) {
+ // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
+ // so long as kKernelSize is a multiple of 16.
+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
// Rely on function level static initialization to keep ConvolveProc selection
// thread safe.
typedef float (*ConvolveProc)(const float* src, const float* k1,
const float* k2,
double kernel_interpolation_factor);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(__SSE__)
+ static const ConvolveProc kConvolveProc = Convolve_SSE;
+#else
static const ConvolveProc kConvolveProc =
base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#endif
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
static const ConvolveProc kConvolveProc = Convolve_NEON;
#else
@@ -271,50 +248,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
+ kernel_interpolation_factor * sum2;
}
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
- const float* k2,
- double kernel_interpolation_factor) {
- // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
- // so long as kKernelSize is a multiple of 16.
- DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
- DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
-
- __m128 m_input;
- __m128 m_sums1 = _mm_setzero_ps();
- __m128 m_sums2 = _mm_setzero_ps();
-
- // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
- // these loops hurt performance in local testing.
- if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
- for (int i = 0; i < kKernelSize; i += 4) {
- m_input = _mm_loadu_ps(input_ptr + i);
- m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
- m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
- }
- } else {
- for (int i = 0; i < kKernelSize; i += 4) {
- m_input = _mm_load_ps(input_ptr + i);
- m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
- m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
- }
- }
-
- // Linearly interpolate the two "convolutions".
- m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
- m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
- m_sums1 = _mm_add_ps(m_sums1, m_sums2);
-
- // Sum components together.
- float result;
- m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
- _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
- m_sums2, m_sums2, 1)));
-
- return result;
-}
-#endif
-
#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
const float* k2,
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index a1d3cf7..f4eaf5f 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -9,6 +9,7 @@
#include "base/gtest_prod_util.h"
#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
+#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
@@ -16,9 +17,30 @@ namespace media {
// SincResampler is a high-quality single-channel sample-rate converter.
class MEDIA_EXPORT SincResampler {
public:
- // The maximum number of samples that may be requested from the callback ahead
- // of the current position in the stream.
- static const int kMaximumLookAheadSize;
+ enum {
+ // The kernel size can be adjusted for quality (higher is better) at the
+ // expense of performance. Must be a multiple of 32.
+ // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
+ kKernelSize = 32,
+
+ // The number of destination frames generated per processing pass. Affects
+ // how often and for how much SincResampler calls back for input. Must be
+ // greater than kKernelSize.
+ kBlockSize = 512,
+
+ // The kernel offset count is used for interpolation and is the number of
+ // sub-sample kernel shifts. Can be adjusted for quality (higher is better)
+ // at the expense of allocating more memory.
+ kKernelOffsetCount = 32,
+ kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
+
+ // The size (in samples) of the internal buffer used by the resampler.
+ kBufferSize = kBlockSize + kKernelSize,
+
+ // The maximum number of samples that may be requested from the callback
+ // ahead of the current position in the stream.
+ kMaximumLookAheadSize = kBufferSize
+ };
// Callback type for providing more data into the resampler. Expects |frames|
// of data to be rendered into |destination|; zero padded if not enough frames
@@ -36,7 +58,7 @@ class MEDIA_EXPORT SincResampler {
// The maximum size in frames that guarantees Resample() will only make a
// single call to |read_cb_| for more data.
- int ChunkSize();
+ int ChunkSize() const;
// Flush all buffered data and reset internal indices.
void Flush();
@@ -55,15 +77,18 @@ class MEDIA_EXPORT SincResampler {
const float* k2, double kernel_interpolation_factor);
static float Convolve_C(const float* input_ptr, const float* k1,
const float* k2, double kernel_interpolation_factor);
+#if defined(ARCH_CPU_X86_FAMILY)
static float Convolve_SSE(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
static float Convolve_NEON(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
+#endif
// The ratio of input / output sample rates.
- double io_sample_rate_ratio_;
+ const double io_sample_rate_ratio_;
// An index on the source input buffer with sub-sample precision. It must be
// double precision to avoid drift.
diff --git a/media/base/sinc_resampler_unittest.cc b/media/base/sinc_resampler_unittest.cc
index 0f718f2..b7aaec4 100644
--- a/media/base/sinc_resampler_unittest.cc
+++ b/media/base/sinc_resampler_unittest.cc
@@ -10,6 +10,7 @@
#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/command_line.h"
+#include "base/cpu.h"
#include "base/logging.h"
#include "base/string_number_conversions.h"
#include "base/strings/stringize_macros.h"
@@ -98,7 +99,7 @@ TEST(SincResamplerTest, Flush) {
}
// Define platform independent function name for Convolve* tests.
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
#define CONVOLVE_FUNC Convolve_SSE
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
#define CONVOLVE_FUNC Convolve_NEON
@@ -109,6 +110,10 @@ TEST(SincResamplerTest, Flush) {
// will be tested by the parameterized SincResampler tests below.
#if defined(CONVOLVE_FUNC)
TEST(SincResamplerTest, Convolve) {
+#if defined(ARCH_CPU_X86_FAMILY)
+ ASSERT_TRUE(base::CPU().has_sse());
+#endif
+
// Initialize a dummy resampler.
MockSource mock_source;
SincResampler resampler(
@@ -171,6 +176,10 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
printf("Convolve_C took %.2fms.\n", total_time_c_ms);
#if defined(CONVOLVE_FUNC)
+#if defined(ARCH_CPU_X86_FAMILY)
+ ASSERT_TRUE(base::CPU().has_sse());
+#endif
+
// Benchmark with unaligned input pointer.
start = base::TimeTicks::HighResNow();
for (int j = 0; j < convolve_iterations; ++j) {
diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc
index edd95cd..96f94d9 100644
--- a/media/base/vector_math.cc
+++ b/media/base/vector_math.cc
@@ -7,11 +7,6 @@
#include "base/cpu.h"
#include "base/logging.h"
-#include "build/build_config.h"
-
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-#include <xmmintrin.h>
-#endif
namespace media {
namespace vector_math {
@@ -25,9 +20,13 @@ void FMAC(const float src[], float scale, int len, float dest[]) {
// selection thread safe.
typedef void (*VectorFMACProc)(const float src[], float scale, int len,
float dest[]);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+#if defined(__SSE__)
+ static const VectorFMACProc kVectorFMACProc = FMAC_SSE;
+#else
static const VectorFMACProc kVectorFMACProc =
base::CPU().has_sse() ? FMAC_SSE : FMAC_C;
+#endif
#else
static const VectorFMACProc kVectorFMACProc = FMAC_C;
#endif
@@ -40,20 +39,5 @@ void FMAC_C(const float src[], float scale, int len, float dest[]) {
dest[i] += src[i] * scale;
}
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
-void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
- __m128 m_scale = _mm_set_ps1(scale);
- int rem = len % 4;
- for (int i = 0; i < len - rem; i += 4) {
- _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
- _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
- }
-
- // Handle any remaining values that wouldn't fit in an SSE pass.
- if (rem)
- FMAC_C(src + len - rem, scale, rem, dest + len - rem);
-}
-#endif
-
} // namespace vector_math
} // namespace media
diff --git a/media/base/vector_math_testing.h b/media/base/vector_math_testing.h
index d364b74..503ca6a 100644
--- a/media/base/vector_math_testing.h
+++ b/media/base/vector_math_testing.h
@@ -5,6 +5,7 @@
#ifndef MEDIA_BASE_VECTOR_MATH_TESTING_H_
#define MEDIA_BASE_VECTOR_MATH_TESTING_H_
+#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
@@ -13,8 +14,11 @@ namespace vector_math {
// Optimized versions of FMAC() function exposed for testing. See vector_math.h
// for details.
MEDIA_EXPORT void FMAC_C(const float src[], float scale, int len, float dest[]);
+
+#if defined(ARCH_CPU_X86_FAMILY)
MEDIA_EXPORT void FMAC_SSE(const float src[], float scale, int len,
float dest[]);
+#endif
} // namespace vector_math
} // namespace media
diff --git a/media/base/vector_math_unittest.cc b/media/base/vector_math_unittest.cc
index 153378e..e64c7c9 100644
--- a/media/base/vector_math_unittest.cc
+++ b/media/base/vector_math_unittest.cc
@@ -7,6 +7,7 @@
#include <cmath>
#include "base/command_line.h"
+#include "base/cpu.h"
#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
#include "base/string_number_conversions.h"
@@ -90,8 +91,9 @@ TEST_F(VectorMathTest, FMAC) {
VerifyOutput(kResult);
}
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
{
+ ASSERT_TRUE(base::CPU().has_sse());
SCOPED_TRACE("FMAC_SSE");
FillTestVectors(kInputFillValue, kOutputFillValue);
vector_math::FMAC_SSE(
@@ -118,7 +120,9 @@ TEST_F(VectorMathTest, FMACBenchmark) {
double total_time_c_ms = (TimeTicks::HighResNow() - start).InMillisecondsF();
printf("FMAC_C took %.2fms.\n", total_time_c_ms);
-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#if defined(ARCH_CPU_X86_FAMILY)
+ ASSERT_TRUE(base::CPU().has_sse());
+
// Benchmark FMAC_SSE() with unaligned size.
ASSERT_NE((kVectorSize - 1) % (vector_math::kRequiredAlignment /
sizeof(float)), 0U);
diff --git a/media/media.gyp b/media/media.gyp
index 526c089..0b369ea 100644
--- a/media/media.gyp
+++ b/media/media.gyp
@@ -683,7 +683,7 @@
'message': 'Generating Pulse stubs for dynamic loading.',
},
],
- 'conditions': [
+ 'conditions': [
# Linux/Solaris need libdl for dlopen() and friends.
['OS == "linux" or OS == "solaris"', {
'link_settings': {
@@ -811,6 +811,12 @@
'../build/linux/system.gyp:gtk',
],
}],
+ # ios check is necessary due to http://crbug.com/172682.
+ ['OS != "ios" and (target_arch == "ia32" or target_arch == "x64")', {
+ 'dependencies': [
+ 'media_sse',
+ ],
+ }],
],
'target_conditions': [
['OS == "ios"', {
@@ -1018,12 +1024,15 @@
'audio/audio_low_latency_input_output_unittest.cc',
],
}],
- [ 'target_arch=="ia32" or target_arch=="x64"', {
+ ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
'sources': [
'base/simd/convert_rgb_to_yuv_unittest.cc',
],
+ 'dependencies': [
+ 'media_sse',
+ ],
}],
- [ 'screen_capture_supported == 0', {
+ ['screen_capture_supported == 0', {
'sources/': [
['exclude', '^video/capture/screen/'],
],
@@ -1610,5 +1619,27 @@
}, # end of target differ_block_sse2
],
}],
+ # ios check is necessary due to http://crbug.com/172682.
+ ['OS != "ios" and (target_arch=="ia32" or target_arch=="x64")', {
+ 'targets': [
+ {
+ 'target_name': 'media_sse',
+ 'type': 'static_library',
+ 'cflags': [
+ '-msse',
+ ],
+ 'include_dirs': [
+ '..',
+ ],
+ 'defines': [
+ 'MEDIA_IMPLEMENTATION',
+ ],
+ 'sources': [
+ 'base/simd/sinc_resampler_sse.cc',
+ 'base/simd/vector_math_sse.cc',
+ ],
+ }, # end of target media_sse
+ ],
+ }],
],
}