summaryrefslogtreecommitdiffstats
path: root/media/base/sinc_resampler.h
diff options
context:
space:
mode:
authordalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-08-02 02:59:42 +0000
committerdalecurtis@chromium.org <dalecurtis@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-08-02 02:59:42 +0000
commitfce61dcbdd3fb31839ba056b204bf903580a163b (patch)
tree5a8c37cfe0867ea14e0e37be2a7ce381a4ac2e27 /media/base/sinc_resampler.h
parent3770d6951c0db5ef047e77bcaee1b5bd9864f952 (diff)
downloadchromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.zip
chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.tar.gz
chromium_src-fce61dcbdd3fb31839ba056b204bf903580a163b.tar.bz2
Add SSE optimizations to SincResampler.
These are not the same optimizations as in the WebKit version of SincResampler. The WebKit version focuses on aligning the input vector, resulting in at worst two unaligned loads on each kernel index, or 2 * kKernelSize / 4 unaligned loads per call. Instead, I chose to focus on keeping the kernel vectors aligned and eating at worst a single unaligned load on the input vector, or kKernelSize / 4 unaligned loads per call. Performance results from SincResamplerTest.ConvolveBenchmark: clang version 3.2 (trunk 159409): Convolve_C took 2100ms for 50000000 iterations. Convolve_SSE (aligned) took 677ms for 50000000 iterations. Convolve_SSE (unaligned) took 717ms for 50000000 iterations. gcc (Ubuntu 4.4.3-4ubuntu5.1) 4.4.3: Convolve_C took 2183ms for 50000000 iterations. Convolve_SSE (aligned) took 806ms for 50000000 iterations. Convolve_SSE (unaligned) took 844ms for 50000000 iterations. For reference, the original WebKit optimizations: clang version 3.2 (trunk 159409): Convolve_C took 2132ms for 50000000 iterations. Convolve_SSE (aligned) took 1146ms for 50000000 iterations. Convolve_SSE (unaligned) took 1797ms for 50000000 iterations. gcc (Ubuntu 4.4.3-4ubuntu5.1) 4.4.3: Convolve_C took 2209ms for 50000000 iterations. Convolve_SSE (aligned) took 1450ms for 50000000 iterations. Convolve_SSE (unaligned) took 4415ms for 50000000 iterations. In summary, SSE provides an ~2.6x to ~3x speedup on GCC and clang respectively. BUG=133637 TEST=media_unittests + SincResampler/* tests. Review URL: https://chromiumcodereview.appspot.com/10803003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@149569 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'media/base/sinc_resampler.h')
-rw-r--r--media/base/sinc_resampler.h21
1 files changed, 19 insertions, 2 deletions
diff --git a/media/base/sinc_resampler.h b/media/base/sinc_resampler.h
index 58f5c2d..4c55eab 100644
--- a/media/base/sinc_resampler.h
+++ b/media/base/sinc_resampler.h
@@ -6,7 +6,10 @@
#define MEDIA_BASE_SINC_RESAMPLER_H_
#include "base/callback.h"
+#include "base/gtest_prod_util.h"
+#include "base/memory/aligned_memory.h"
#include "base/memory/scoped_ptr.h"
+#include "build/build_config.h"
#include "media/base/media_export.h"
namespace media {
@@ -33,8 +36,22 @@ class MEDIA_EXPORT SincResampler {
int ChunkSize();
private:
+ FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve);
+ FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark);
+
void InitializeKernel();
+ // Compute convolution of |k1| and |k2| over |input_ptr|. The resultant sums
+ // are linearly interpolated using |kernel_interpolation_factor|. The
+ // underlying implementation is chosen at run time based on SSE support.
+ static float Convolve(const float* input_ptr, const float* k1,
+ const float* k2, double kernel_interpolation_factor);
+ static float Convolve_C(const float* input_ptr, const float* k1,
+ const float* k2, double kernel_interpolation_factor);
+ static float Convolve_SSE(const float* input_ptr, const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor);
+
// The ratio of input / output sample rates.
double io_sample_rate_ratio_;
@@ -51,10 +68,10 @@ class MEDIA_EXPORT SincResampler {
// Contains kKernelOffsetCount kernels back-to-back, each of size kKernelSize.
// The kernel offsets are sub-sample shifts of a windowed sinc shifted from
// 0.0 to 1.0 sample.
- scoped_array<float> kernel_storage_;
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> kernel_storage_;
// Data from the source is copied into this buffer for each processing pass.
- scoped_array<float> input_buffer_;
+ scoped_ptr_malloc<float, base::ScopedPtrAlignedFree> input_buffer_;
// Pointers to the various regions inside |input_buffer_|. See the diagram at
// the top of the .cc file for more information.