diff options
author | shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-10 21:50:43 +0000 |
---|---|---|
committer | shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-10 21:50:43 +0000 |
commit | b6cb7cfef7d9500f419163b34fb9035fbebe5ac3 (patch) | |
tree | c150a94a3585a62983a8540bd782482808abdc6f /chrome/browser/safe_browsing/safe_browsing_database.cc | |
parent | a89ca3ca54c3a74ec4e751b99104b5fd96b9e308 (diff) | |
download | chromium_src-b6cb7cfef7d9500f419163b34fb9035fbebe5ac3.zip chromium_src-b6cb7cfef7d9500f419163b34fb9035fbebe5ac3.tar.gz chromium_src-b6cb7cfef7d9500f419163b34fb9035fbebe5ac3.tar.bz2 |
PrefixSet as an alternate to BloomFilter for safe-browsing.
The safe-browsing prefix data is uniformly distributed across the
32-bit integer space. When sorted, the average delta between items is
about 8,000, which can be encoded in a 16-bit integer. PrefixSet
takes advantage of this to compress the prefixes into a structure
which is relatively efficient to query.
The current CL adds the new structure, but continues to use the bloom
filter to control things. Histograms are logged to track differences
in results.
BUG=71832
TEST=none
Review URL: http://codereview.chromium.org/6286072
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@74487 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/safe_browsing/safe_browsing_database.cc')
-rw-r--r-- | chrome/browser/safe_browsing/safe_browsing_database.cc | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/chrome/browser/safe_browsing/safe_browsing_database.cc b/chrome/browser/safe_browsing/safe_browsing_database.cc index 3027289..595886f 100644 --- a/chrome/browser/safe_browsing/safe_browsing_database.cc +++ b/chrome/browser/safe_browsing/safe_browsing_database.cc @@ -12,6 +12,7 @@ #include "base/process_util.h" #include "base/sha2.h" #include "chrome/browser/safe_browsing/bloom_filter.h" +#include "chrome/browser/safe_browsing/prefix_set.h" #include "chrome/browser/safe_browsing/safe_browsing_store_file.h" #include "googleurl/src/gurl.h" @@ -177,6 +178,37 @@ bool SBAddFullHashPrefixLess(const SBAddFullHash& a, const SBAddFullHash& b) { return a.full_hash.prefix < b.full_hash.prefix; } +// Create a |PrefixSet| from a vector of add-prefixes. +safe_browsing::PrefixSet* CreatePrefixSet( + const std::vector<SBAddPrefix>& add_prefixes) { + std::vector<SBPrefix> prefixes; + for (size_t i = 0; i < add_prefixes.size(); ++i) { + prefixes.push_back(add_prefixes[i].prefix); + } + return new safe_browsing::PrefixSet(prefixes); +} + +// As compared to the bloom filter, PrefixSet should have these +// properties: +// - Any bloom filter miss should be a prefix set miss. +// - Any prefix set hit should be a bloom filter hit. +// - Bloom filter false positives are prefix set misses. +// The following is to log actual performance to verify this. +enum PrefixSetEvent { + PREFIX_SET_EVENT_HIT, + PREFIX_SET_EVENT_BLOOM_HIT, + PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT, + + // Memory space for histograms is determined by the max. ALWAYS ADD + // NEW VALUES BEFORE THIS ONE. + PREFIX_SET_EVENT_MAX +}; + +void RecordPrefixSetInfo(PrefixSetEvent event_type) { + UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetEvent", event_type, + PREFIX_SET_EVENT_MAX); +} + } // namespace // The default SafeBrowsingDatabaseFactory. @@ -327,6 +359,9 @@ bool SafeBrowsingDatabaseNew::ResetDatabase() { // TODO(shess): This could probably be |bloom_filter_.reset()|. browse_bloom_filter_ = new BloomFilter(BloomFilter::kBloomFilterMinSize * BloomFilter::kBloomFilterSizeRatio); + // TODO(shess): It is simpler for the code to assume that presence + // of a bloom filter always implies presence of a prefix set. + prefix_set_.reset(CreatePrefixSet(std::vector<SBAddPrefix>())); } return true; @@ -355,13 +390,24 @@ bool SafeBrowsingDatabaseNew::ContainsBrowseUrl( if (!browse_bloom_filter_.get()) return false; + DCHECK(prefix_set_.get()); size_t miss_count = 0; for (size_t i = 0; i < prefixes.size(); ++i) { + bool found = prefix_set_->Exists(prefixes[i]); + if (browse_bloom_filter_->Exists(prefixes[i])) { + RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_HIT); + if (found) + RecordPrefixSetInfo(PREFIX_SET_EVENT_HIT); prefix_hits->push_back(prefixes[i]); if (prefix_miss_cache_.count(prefixes[i]) > 0) ++miss_count; + } else { + // Bloom filter misses should never be in prefix set. + DCHECK(!found); + if (found) + RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT); } } @@ -778,6 +824,9 @@ void SafeBrowsingDatabaseNew::UpdateBrowseStore() { filter->Insert(add_prefixes[i].prefix); } + scoped_ptr<safe_browsing::PrefixSet> + prefix_set(CreatePrefixSet(add_prefixes)); + // This needs to be in sorted order by prefix for efficient access. std::sort(add_full_hashes.begin(), add_full_hashes.end(), SBAddFullHashPrefixLess); @@ -795,6 +844,7 @@ void SafeBrowsingDatabaseNew::UpdateBrowseStore() { pending_browse_hashes_.clear(); prefix_miss_cache_.clear(); browse_bloom_filter_.swap(filter); + prefix_set_.swap(prefix_set); } const base::TimeDelta bloom_gen = base::Time::Now() - before; @@ -878,6 +928,12 @@ void SafeBrowsingDatabaseNew::LoadBloomFilter() { if (!browse_bloom_filter_.get()) RecordFailure(FAILURE_DATABASE_FILTER_READ); + + // Manually re-generate the prefix set from the main database. + // TODO(shess): Write/read for prefix set. + std::vector<SBAddPrefix> add_prefixes; + browse_store_->GetAddPrefixes(&add_prefixes); + prefix_set_.reset(CreatePrefixSet(add_prefixes)); } bool SafeBrowsingDatabaseNew::Delete() { |