blob: ac9f0d406678da1af06b280d45230bb61dd4bef5 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// We want to measure the similarity of two sequences of bytes as a surrogate
// for measuring how well a second sequence will compress differentially to the
// first sequence.
#include "courgette/difference_estimator.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "base/containers/hash_tables.h"
#include "base/macros.h"
namespace courgette {
// Our difference measure is the number of k-tuples that occur in Subject that
// don't occur in Base.
const int kTupleSize = 4;
namespace {
static_assert(kTupleSize >= 4 && kTupleSize <= 8,
"kTupleSize should be between 4 and 8");
size_t HashTuple(const uint8_t* source) {
size_t hash1 = *reinterpret_cast<const uint32_t*>(source);
size_t hash2 = *reinterpret_cast<const uint32_t*>(source + kTupleSize - 4);
size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
return hash;
}
bool RegionsEqual(const Region& a, const Region& b) {
if (a.length() != b.length())
return false;
return memcmp(a.start(), b.start(), a.length()) == 0;
}
} // anonymous namepace
class DifferenceEstimator::Base {
public:
explicit Base(const Region& region) : region_(region) { }
void Init() {
if (region_.length() < kTupleSize)
return;
const uint8_t* start = region_.start();
const uint8_t* end = region_.end() - (kTupleSize - 1);
for (const uint8_t* p = start; p < end; ++p) {
size_t hash = HashTuple(p);
hashes_.insert(hash);
}
}
const Region& region() const { return region_; }
private:
Region region_;
base::hash_set<size_t> hashes_;
friend class DifferenceEstimator;
DISALLOW_COPY_AND_ASSIGN(Base);
};
class DifferenceEstimator::Subject {
public:
explicit Subject(const Region& region) : region_(region) {}
const Region& region() const { return region_; }
private:
Region region_;
DISALLOW_COPY_AND_ASSIGN(Subject);
};
DifferenceEstimator::DifferenceEstimator() {}
DifferenceEstimator::~DifferenceEstimator() {
for (size_t i = 0; i < owned_bases_.size(); ++i)
delete owned_bases_[i];
for (size_t i = 0; i < owned_subjects_.size(); ++i)
delete owned_subjects_[i];
}
DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
Base* base = new Base(region);
base->Init();
owned_bases_.push_back(base);
return base;
}
DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
const Region& region) {
Subject* subject = new Subject(region);
owned_subjects_.push_back(subject);
return subject;
}
size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
size_t mismatches = 0;
if (subject->region().length() >= kTupleSize) {
const uint8_t* start = subject->region().start();
const uint8_t* end = subject->region().end() - (kTupleSize - 1);
const uint8_t* p = start;
while (p < end) {
size_t hash = HashTuple(p);
if (base->hashes_.find(hash) == base->hashes_.end()) {
++mismatches;
}
p += 1;
}
}
if (mismatches == 0) {
if (RegionsEqual(base->region(), subject->region()))
return 0;
}
++mismatches; // Guarantee not zero.
return mismatches;
}
} // namespace
|