1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "remoting/host/differ_block.h"

#include <stdlib.h>
#include <string.h>
// Decide at compile time whether the SSE2 implementation of BlockDifference()
// is built. A build may predefine USE_SSE2 (to 0 or 1) to override the
// detection below.
#if !defined(USE_SSE2)
// _MSC_VER only identifies the compiler, not the target CPU, so for MSVC also
// require an x86/x64 target; otherwise MSVC builds for ARM would try to use
// the SSE2 intrinsics.
#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || \
    (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
#define USE_SSE2 1
#else
#define USE_SSE2 0
#endif
#endif
// The SSE2 intrinsics (__m128i, _mm_*) are declared in <emmintrin.h>.
#if USE_SSE2
#include <emmintrin.h>
#endif
namespace remoting {
#if USE_SSE2
int BlockDifference(const uint8* image1, const uint8* image2, int stride) {
__m128i acc = _mm_setzero_si128();
__m128i v0;
__m128i v1;
__m128i sad;
for (int y = 0; y < kBlockHeight; ++y) {
const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
v0 = _mm_loadu_si128(i1);
v1 = _mm_loadu_si128(i2);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 1);
v1 = _mm_loadu_si128(i2 + 1);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 2);
v1 = _mm_loadu_si128(i2 + 2);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 3);
v1 = _mm_loadu_si128(i2 + 3);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 4);
v1 = _mm_loadu_si128(i2 + 4);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 5);
v1 = _mm_loadu_si128(i2 + 5);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 6);
v1 = _mm_loadu_si128(i2 + 6);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
v0 = _mm_loadu_si128(i1 + 7);
v1 = _mm_loadu_si128(i2 + 7);
sad = _mm_sad_epu8(v0, v1);
acc = _mm_adds_epu16(acc, sad);
sad = _mm_shuffle_epi32(acc, 0xEE); // [acc3, acc2, acc3, acc2]
sad = _mm_adds_epu16(sad, acc);
int diff = _mm_cvtsi128_si32(sad);
if (diff) {
return 1;
}
image1 += stride;
image2 += stride;
}
return 0;
}
#else
// Portable implementation, built when SSE2 is not available.
//
// Compares a block of kBlockHeight rows, each kBlockWidth * kBytesPerPixel
// bytes wide, in |image1| against the same block in |image2|. Successive rows
// are |stride| bytes apart in both images.
//
// Returns 1 as soon as a differing row is found, and 0 if all rows match.
int BlockDifference(const uint8* image1, const uint8* image2, int stride) {
  // Compare raw bytes with memcmp() rather than reading the rows through
  // uint64 pointers: the arguments are plain uint8 pointers, so nothing
  // guarantees 8-byte alignment (a misaligned 64-bit load is undefined
  // behaviour and can fault on strict-alignment CPUs), and memcmp() also
  // covers the whole row even if its size is not a multiple of 8 bytes.
  // We only care whether anything changed, not what the change is.
  const size_t row_bytes = static_cast<size_t>(kBlockWidth * kBytesPerPixel);
  for (int y = 0; y < kBlockHeight; ++y) {
    if (memcmp(image1, image2, row_bytes) != 0) {
      return 1;
    }
    image1 += stride;
    image2 += stride;
  }
  return 0;
}
#endif
} // namespace remoting
|