summaryrefslogtreecommitdiffstats
path: root/remoting/host/differ_block_sse2.cc
diff options
context:
space:
mode:
authorfinnur@chromium.org <finnur@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-02-11 11:58:29 +0000
committerfinnur@chromium.org <finnur@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-02-11 11:58:29 +0000
commitf92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6 (patch)
treea55b89c0f6de8a76d75b3cea20bd99d2bb4da3ff /remoting/host/differ_block_sse2.cc
parent5e5376de8ce113df5653351f21a3efb7ba2db007 (diff)
downloadchromium_src-f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6.zip
chromium_src-f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6.tar.gz
chromium_src-f92d0bbbd1e7eb2bd77e1ebb7edc33a516bc29f6.tar.bz2
Revert 74583 - Revert 74571 - Use SSE2 block differ for chromoting
(Quick test to see if it is the cause of crashes in media_unittests) (Test showed this CL is not to blame) We have the SSE2 lying around in the tree just never being used. This will allow us to use it. BUG=None TEST=Chromoting to a host machine and the diff will work correctly Review URL: http://codereview.chromium.org/6469022 TBR=hclam@chromium.org Review URL: http://codereview.chromium.org/6488023 TBR=finnur@chromium.org git-svn-id: svn://svn.chromium.org/chrome/trunk/src@74588 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'remoting/host/differ_block_sse2.cc')
-rw-r--r--remoting/host/differ_block_sse2.cc111
1 files changed, 111 insertions, 0 deletions
diff --git a/remoting/host/differ_block_sse2.cc b/remoting/host/differ_block_sse2.cc
new file mode 100644
index 0000000..c0cc3b6
--- /dev/null
+++ b/remoting/host/differ_block_sse2.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#include "remoting/host/differ_block.h"
+#include "remoting/host/differ_block_internal.h"
+
+namespace remoting {
+
+extern int BlockDifference_SSE2_W16(const uint8* image1, const uint8* image2,
+ int stride) {  // Returns 1 at the first row whose 64 bytes differ, else 0.
+ __m128i acc = _mm_setzero_si128();  // Running SAD; we return once it is nonzero.
+ __m128i v0;
+ __m128i v1;
+ __m128i sad;
+ for (int y = 0; y < kBlockHeight; ++y) {  // kBlockHeight is declared elsewhere.
+ const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
+ const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
+ v0 = _mm_loadu_si128(i1);  // Unaligned 16-byte loads: bytes 0-15.
+ v1 = _mm_loadu_si128(i2);
+ sad = _mm_sad_epu8(v0, v1);  // Sum of |a - b| over each group of 8 bytes.
+ acc = _mm_adds_epu16(acc, sad);  // Unsigned saturating 16-bit add.
+ v0 = _mm_loadu_si128(i1 + 1);  // Bytes 16-31.
+ v1 = _mm_loadu_si128(i2 + 1);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 2);  // Bytes 32-47.
+ v1 = _mm_loadu_si128(i2 + 2);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 3);  // Bytes 48-63.
+ v1 = _mm_loadu_si128(i2 + 3);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+
+ // This essentially means sad = acc >> 64, so the add below folds the upper
+ // half of acc onto the lower half. We only care about the lower 16 bits.
+ sad = _mm_shuffle_epi32(acc, 0xEE);  // 0xEE copies the upper 64 bits to both halves.
+ sad = _mm_adds_epu16(sad, acc);
+ int diff = _mm_cvtsi128_si32(sad);  // Extract the low 32 bits.
+ if (diff)
+ return 1;  // At least one byte in this row differs.
+ image1 += stride;  // Advance by stride; presumably the row pitch in bytes.
+ image2 += stride;
+ }
+ return 0;  // No difference found in any row.
+}
+
+extern int BlockDifference_SSE2_W32(const uint8* image1, const uint8* image2,
+ int stride) {  // Returns 1 at the first row whose 128 bytes differ, else 0.
+ __m128i acc = _mm_setzero_si128();  // Running SAD; we return once it is nonzero.
+ __m128i v0;
+ __m128i v1;
+ __m128i sad;
+ for (int y = 0; y < kBlockHeight; ++y) {  // kBlockHeight is declared elsewhere.
+ const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
+ const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
+ v0 = _mm_loadu_si128(i1);  // Unaligned 16-byte loads: bytes 0-15.
+ v1 = _mm_loadu_si128(i2);
+ sad = _mm_sad_epu8(v0, v1);  // Sum of |a - b| over each group of 8 bytes.
+ acc = _mm_adds_epu16(acc, sad);  // Unsigned saturating 16-bit add.
+ v0 = _mm_loadu_si128(i1 + 1);  // Bytes 16-31.
+ v1 = _mm_loadu_si128(i2 + 1);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 2);  // Bytes 32-47.
+ v1 = _mm_loadu_si128(i2 + 2);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 3);  // Bytes 48-63.
+ v1 = _mm_loadu_si128(i2 + 3);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 4);  // Bytes 64-79.
+ v1 = _mm_loadu_si128(i2 + 4);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 5);  // Bytes 80-95.
+ v1 = _mm_loadu_si128(i2 + 5);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 6);  // Bytes 96-111.
+ v1 = _mm_loadu_si128(i2 + 6);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+ v0 = _mm_loadu_si128(i1 + 7);  // Bytes 112-127.
+ v1 = _mm_loadu_si128(i2 + 7);
+ sad = _mm_sad_epu8(v0, v1);
+ acc = _mm_adds_epu16(acc, sad);
+
+ // This essentially means sad = acc >> 64, so the add below folds the upper
+ // half of acc onto the lower half. We only care about the lower 16 bits.
+ sad = _mm_shuffle_epi32(acc, 0xEE);  // 0xEE copies the upper 64 bits to both halves.
+ sad = _mm_adds_epu16(sad, acc);
+ int diff = _mm_cvtsi128_si32(sad);  // Extract the low 32 bits.
+ if (diff)
+ return 1;  // At least one byte in this row differs.
+ image1 += stride;  // Advance by stride; presumably the row pitch in bytes.
+ image2 += stride;
+ }
+ return 0;  // No difference found in any row.
+}
+
+} // namespace remoting