diff options
author | finnur@chromium.org <finnur@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-11 11:58:29 +0000 |
---|---|---|
committer | finnur@chromium.org <finnur@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-11 11:58:29 +0000 |
commit | f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6 (patch) | |
tree | a55b89c0f6de8a76d75b3cea20bd99d2bb4da3ff /remoting/host/differ_block_sse2.cc | |
parent | 5e5376de8ce113df5653351f21a3efb7ba2db007 (diff) | |
download | chromium_src-f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6.zip chromium_src-f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6.tar.gz chromium_src-f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6.tar.bz2 |
Revert 74583 - Revert 74571 - Use SSE2 block differ for chromoting
(Quick test to see if it is the cause of crashes in media_unittests)
(Test showed this CL is not to blame)
We have the SSE2 block differ lying around in the tree, but it is never used.
This change will allow us to use it.
BUG=None
TEST=Chromoting to a host machine and the diff will work correctly
Review URL: http://codereview.chromium.org/6469022
TBR=hclam@chromium.org
Review URL: http://codereview.chromium.org/6488023
TBR=finnur@chromium.org
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@74588 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'remoting/host/differ_block_sse2.cc')
-rw-r--r-- | remoting/host/differ_block_sse2.cc | 111 |
1 files changed, 111 insertions, 0 deletions
diff --git a/remoting/host/differ_block_sse2.cc b/remoting/host/differ_block_sse2.cc new file mode 100644 index 0000000..c0cc3b6 --- /dev/null +++ b/remoting/host/differ_block_sse2.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#if defined(_MSC_VER) +#include <intrin.h> +#else +#include <mmintrin.h> +#include <emmintrin.h> +#endif + +#include "remoting/host/differ_block.h" +#include "remoting/host/differ_block_internal.h" + +namespace remoting { + +extern int BlockDifference_SSE2_W16(const uint8* image1, const uint8* image2, + int stride) { + __m128i acc = _mm_setzero_si128(); + __m128i v0; + __m128i v1; + __m128i sad; + for (int y = 0; y < kBlockHeight; ++y) { + const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); + const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); + v0 = _mm_loadu_si128(i1); + v1 = _mm_loadu_si128(i2); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 1); + v1 = _mm_loadu_si128(i2 + 1); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 2); + v1 = _mm_loadu_si128(i2 + 2); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 3); + v1 = _mm_loadu_si128(i2 + 3); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + + // This essential means sad = acc >> 64. We only care about the lower 16 + // bits. 
+ sad = _mm_shuffle_epi32(acc, 0xEE); + sad = _mm_adds_epu16(sad, acc); + int diff = _mm_cvtsi128_si32(sad); + if (diff) + return 1; + image1 += stride; + image2 += stride; + } + return 0; +} + +extern int BlockDifference_SSE2_W32(const uint8* image1, const uint8* image2, + int stride) { + __m128i acc = _mm_setzero_si128(); + __m128i v0; + __m128i v1; + __m128i sad; + for (int y = 0; y < kBlockHeight; ++y) { + const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); + const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); + v0 = _mm_loadu_si128(i1); + v1 = _mm_loadu_si128(i2); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 1); + v1 = _mm_loadu_si128(i2 + 1); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 2); + v1 = _mm_loadu_si128(i2 + 2); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 3); + v1 = _mm_loadu_si128(i2 + 3); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 4); + v1 = _mm_loadu_si128(i2 + 4); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 5); + v1 = _mm_loadu_si128(i2 + 5); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 6); + v1 = _mm_loadu_si128(i2 + 6); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + v0 = _mm_loadu_si128(i1 + 7); + v1 = _mm_loadu_si128(i2 + 7); + sad = _mm_sad_epu8(v0, v1); + acc = _mm_adds_epu16(acc, sad); + + // This essential means sad = acc >> 64. We only care about the lower 16 + // bits. + sad = _mm_shuffle_epi32(acc, 0xEE); + sad = _mm_adds_epu16(sad, acc); + int diff = _mm_cvtsi128_si32(sad); + if (diff) + return 1; + image1 += stride; + image2 += stride; + } + return 0; +} + +} // namespace remoting |