diff options
Diffstat (limited to 'chrome')
8 files changed, 245 insertions, 88 deletions
diff --git a/chrome/browser/safe_browsing/bloom_filter.cc b/chrome/browser/safe_browsing/bloom_filter.cc index 1187e4e..45e1c572 100644 --- a/chrome/browser/safe_browsing/bloom_filter.cc +++ b/chrome/browser/safe_browsing/bloom_filter.cc @@ -6,14 +6,45 @@ #include <string.h> +#include "base/logging.h" +#include "base/rand_util.h" +#include "net/base/file_stream.h" +#include "net/base/net_errors.h" + +namespace { + +// The Jenkins 96 bit mix function: +// http://www.concentric.net/~Ttwang/tech/inthash.htm +uint32 HashMix(uint64 hash_key, uint32 c) { + uint32 a = static_cast<uint32>(hash_key) & 0xFFFFFFFF; + uint32 b = static_cast<uint32>(hash_key >> 32) & 0xFFFFFFFF; + + a = a - b; a = a - c; a = a ^ (c >> 13); + b = b - c; b = b - a; b = b ^ (a << 8); + c = c - a; c = c - b; c = c ^ (b >> 13); + a = a - b; a = a - c; a = a ^ (c >> 12); + b = b - c; b = b - a; b = b ^ (a << 16); + c = c - a; c = c - b; c = c ^ (b >> 5); + a = a - b; a = a - c; a = a ^ (c >> 3); + b = b - c; b = b - a; b = b ^ (a << 10); + c = c - a; c = c - b; c = c ^ (b >> 15); + + return c; +} + +} // namespace + BloomFilter::BloomFilter(int bit_size) { + for (int i = 0; i < kNumHashKeys; ++i) + hash_keys_.push_back(base::RandUint64()); byte_size_ = bit_size / 8 + 1; bit_size_ = byte_size_ * 8; data_.reset(new char[byte_size_]); memset(data_.get(), 0, byte_size_); } -BloomFilter::BloomFilter(char* data, int size) { +BloomFilter::BloomFilter(char* data, int size, const std::vector<uint64>& keys) + : hash_keys_(keys) { byte_size_ = size; bit_size_ = byte_size_ * 8; data_.reset(data); @@ -25,10 +56,9 @@ BloomFilter::~BloomFilter() { void BloomFilter::Insert(int hash_int) { uint32 hash; memcpy(&hash, &hash_int, sizeof(hash)); - for (int i = 0; i < 4; ++i) { - hash = RotateLeft(hash); - uint32 index = hash % bit_size_; - + for (int i = 0; i < static_cast<int>(hash_keys_.size()); ++i) { + uint32 mix = HashMix(hash_keys_[i], hash); + uint32 index = mix % bit_size_; int byte = index / 8; int bit = index % 8; data_.get()[byte] |= 1 << bit; @@ -38,10 +68,9 @@ void BloomFilter::Insert(int hash_int) { bool BloomFilter::Exists(int hash_int) const { uint32 hash; memcpy(&hash, &hash_int, sizeof(hash)); - for (int i = 0; i < 4; ++i) { - hash = RotateLeft(hash); - uint32 index = hash % bit_size_; - + for (int i = 0; i < static_cast<int>(hash_keys_.size()); ++i) { + uint32 mix = HashMix(hash_keys_[i], hash); + uint32 index = mix % bit_size_; int byte = index / 8; int bit = index % 8; char data = data_.get()[byte]; @@ -52,9 +81,87 @@ bool BloomFilter::Exists(int hash_int) const { return true; } -uint32 BloomFilter::RotateLeft(uint32 hash) { - uint32 left_byte = hash >> 24; - hash = hash << 8; - hash |= left_byte; - return hash; +// static. +BloomFilter* BloomFilter::LoadFile(const FilePath& filter_name) { + net::FileStream filter; + + if (filter.Open(filter_name, + base::PLATFORM_FILE_OPEN | + base::PLATFORM_FILE_READ) != net::OK) + return NULL; + + // Make sure we have a file version that we can understand. + int file_version; + int bytes_read = filter.Read(reinterpret_cast<char*>(&file_version), + sizeof(file_version), NULL); + if (bytes_read != sizeof(file_version) || file_version != kFileVersion) + return NULL; + + // Get all the random hash keys. + int num_keys; + bytes_read = filter.Read(reinterpret_cast<char*>(&num_keys), + sizeof(num_keys), NULL); + if (bytes_read != sizeof(num_keys) || num_keys < 1 || num_keys > kNumHashKeys) + return NULL; + + std::vector<uint64> hash_keys; + for (int i = 0; i < num_keys; ++i) { + uint64 key; + bytes_read = filter.Read(reinterpret_cast<char*>(&key), sizeof(key), NULL); + if (bytes_read != sizeof(key)) + return NULL; + hash_keys.push_back(key); + } + + // Read in the filter data, with sanity checks on min and max sizes. + int64 remaining64 = filter.Available(); + if (remaining64 < kBloomFilterMinSize || remaining64 > kBloomFilterMaxSize) + return NULL; + + int byte_size = static_cast<int>(remaining64); + scoped_array<char> data(new char[byte_size]); + bytes_read = filter.Read(data.get(), byte_size, NULL); + if (bytes_read != byte_size) + return NULL; + + // We've read everything okay, commit the data. + return new BloomFilter(data.release(), byte_size, hash_keys); +} + +bool BloomFilter::WriteFile(const FilePath& filter_name) { + net::FileStream filter; + + if (filter.Open(filter_name, + base::PLATFORM_FILE_WRITE | + base::PLATFORM_FILE_CREATE_ALWAYS) != net::OK) + return false; + + // Write the version information. + int version = kFileVersion; + int bytes_written = filter.Write(reinterpret_cast<char*>(&version), + sizeof(version), NULL); + if (bytes_written != sizeof(version)) + return false; + + // Write the number of random hash keys. + int num_keys = static_cast<int>(hash_keys_.size()); + bytes_written = filter.Write(reinterpret_cast<char*>(&num_keys), + sizeof(num_keys), NULL); + if (bytes_written != sizeof(num_keys)) + return false; + + for (int i = 0; i < num_keys; ++i) { + bytes_written = filter.Write(reinterpret_cast<char*>(&hash_keys_[i]), + sizeof(hash_keys_[i]), NULL); + if (bytes_written != sizeof(hash_keys_[i])) + return false; + } + + // Write the filter data. + bytes_written = filter.Write(data_.get(), byte_size_, NULL); + if (bytes_written != byte_size_) + return false; + + return true; } + diff --git a/chrome/browser/safe_browsing/bloom_filter.h b/chrome/browser/safe_browsing/bloom_filter.h index 1ec0624..d005216b 100644 --- a/chrome/browser/safe_browsing/bloom_filter.h +++ b/chrome/browser/safe_browsing/bloom_filter.h @@ -2,24 +2,38 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // -// A simple bloom filter. It's currently limited to four hashing functions, -// which are calculated from the item's hash. +// A simple bloom filter. It uses a large number (20) of hashes to reduce the +// possibility of false positives. The bloom filter's hashing uses random keys +// in order to minimize the chance that a false positive for one user is a false +// positive for all. +// +// The bloom filter manages it serialization to disk with the following file +// format: +// 4 byte version number +// 4 byte number of hash keys (n) +// n * 8 bytes of hash keys +// Remaining bytes are the filter data. #ifndef CHROME_BROWSER_SAFE_BROWSING_BLOOM_FILTER_H_ #define CHROME_BROWSER_SAFE_BROWSING_BLOOM_FILTER_H_ +#include <vector> + +#include "base/file_path.h" #include "base/ref_counted.h" #include "base/scoped_ptr.h" #include "base/basictypes.h" +#include "testing/gtest/include/gtest/gtest_prod.h" class BloomFilter : public base::RefCountedThreadSafe<BloomFilter> { public: // Constructs an empty filter with the given size. - BloomFilter(int bit_size); + explicit BloomFilter(int bit_size); + + // Constructs a filter from serialized data. This object owns the memory and + // will delete it on destruction. + BloomFilter(char* data, int size, const std::vector<uint64>& keys); - // Constructs a filter from serialized data. This object owns the memory - // and will delete it on destruction. - BloomFilter(char* data, int size); ~BloomFilter(); void Insert(int hash); @@ -28,12 +42,36 @@ class BloomFilter : public base::RefCountedThreadSafe<BloomFilter> { const char* data() const { return data_.get(); } int size() const { return byte_size_; } + // Loading and storing the filter from / to disk. + static BloomFilter* LoadFile(const FilePath& filter_name); + bool WriteFile(const FilePath& filter_name); + + // How many bits to use per item. See the design doc for more information. + static const int kBloomFilterSizeRatio = 25; + + // Force a minimum size on the bloom filter to prevent a high false positive + // hash request rate (in bytes). + static const int kBloomFilterMinSize = 250000; + + // Force a maximum size on the bloom filter to avoid using too much memory + // (in bytes). + static const int kBloomFilterMaxSize = 2 * 1024 * 1024; + private: - static uint32 RotateLeft(uint32 hash); + FRIEND_TEST(SafeBrowsingBloomFilter, BloomFilterUse); + FRIEND_TEST(SafeBrowsingBloomFilter, BloomFilterFile); + + static const int kNumHashKeys = 20; + static const int kFileVersion = 1; int byte_size_; // size in bytes int bit_size_; // size in bits scoped_array<char> data_; + + // Random keys used for hashing. + std::vector<uint64> hash_keys_; + + DISALLOW_COPY_AND_ASSIGN(BloomFilter); }; #endif // CHROME_BROWSER_SAFE_BROWSING_BLOOM_FILTER_H_ diff --git a/chrome/browser/safe_browsing/bloom_filter_unittest.cc b/chrome/browser/safe_browsing/bloom_filter_unittest.cc index 06db478..f8dde07 100644 --- a/chrome/browser/safe_browsing/bloom_filter_unittest.cc +++ b/chrome/browser/safe_browsing/bloom_filter_unittest.cc @@ -8,8 +8,11 @@ #include <limits.h> #include <set> +#include <vector> +#include "base/file_util.h" #include "base/logging.h" +#include "base/path_service.h" #include "base/rand_util.h" #include "base/ref_counted.h" #include "base/string_util.h" @@ -23,12 +26,13 @@ uint32 GenHash() { } -TEST(SafeBrowsing, BloomFilter) { +TEST(SafeBrowsingBloomFilter, BloomFilterUse) { // Use a small number for unit test so it's not slow. uint32 count = 1000; // Build up the bloom filter. - scoped_refptr<BloomFilter> filter = new BloomFilter(count * 10); + scoped_refptr<BloomFilter> filter = + new BloomFilter(count * BloomFilter::kBloomFilterSizeRatio); typedef std::set<int> Values; Values values; @@ -42,7 +46,7 @@ TEST(SafeBrowsing, BloomFilter) { char* data_copy = new char[filter->size()]; memcpy(data_copy, filter->data(), filter->size()); scoped_refptr<BloomFilter> filter_copy = - new BloomFilter(data_copy, filter->size()); + new BloomFilter(data_copy, filter->size(), filter->hash_keys_); // Check no false negatives by ensuring that every time we inserted exists. for (Values::iterator i = values.begin(); i != values.end(); ++i) { @@ -75,3 +79,42 @@ TEST(SafeBrowsing, BloomFilter) { LOG(INFO) << "For safe browsing bloom filter of size " << count << ", the FP rate was " << fp_rate << " %"; } + +// Test that we can read and write the bloom filter file. +TEST(SafeBrowsingBloomFilter, BloomFilterFile) { + // Create initial filter. + const int kTestEntries = BloomFilter::kBloomFilterMinSize; + scoped_refptr<BloomFilter> filter_write = + new BloomFilter(kTestEntries * BloomFilter::kBloomFilterSizeRatio); + + for (int i = 0; i < kTestEntries; ++i) + filter_write->Insert(GenHash()); + + // Remove any left over test filters and serialize. + FilePath filter_path; + PathService::Get(base::DIR_TEMP, &filter_path); + filter_path = filter_path.AppendASCII("SafeBrowsingTestFilter"); + file_util::Delete(filter_path, false); + ASSERT_FALSE(file_util::PathExists(filter_path)); + ASSERT_TRUE(filter_write->WriteFile(filter_path)); + + // Create new empty filter and load from disk. + BloomFilter* filter = BloomFilter::LoadFile(filter_path); + ASSERT_TRUE(filter != NULL); + scoped_refptr<BloomFilter> filter_read = filter; + + // Check data consistency. + EXPECT_EQ(filter_write->hash_keys_.size(), filter_read->hash_keys_.size()); + + for (int i = 0; i < static_cast<int>(filter_write->hash_keys_.size()); ++i) + EXPECT_EQ(filter_write->hash_keys_[i], filter_read->hash_keys_[i]); + + EXPECT_EQ(filter_write->size(), filter_read->size()); + + EXPECT_TRUE(memcmp(filter_write->data(), + filter_read->data(), + filter_read->size()) == 0); + + file_util::Delete(filter_path, false); +} + diff --git a/chrome/browser/safe_browsing/filter_false_positive_perftest.cc b/chrome/browser/safe_browsing/filter_false_positive_perftest.cc index 40a546f..9bb7290 100644 --- a/chrome/browser/safe_browsing/filter_false_positive_perftest.cc +++ b/chrome/browser/safe_browsing/filter_false_positive_perftest.cc @@ -46,6 +46,7 @@ #include <fstream> #include <iostream> +#include <vector> #include "base/command_line.h" #include "base/file_path.h" @@ -55,7 +56,8 @@ #include "base/scoped_ptr.h" #include "base/sha2.h" #include "base/string_util.h" -#include "chrome/browser/safe_browsing/safe_browsing_database_bloom.h" +#include "chrome/browser/safe_browsing/bloom_filter.h" +#include "chrome/browser/safe_browsing/safe_browsing_util.h" #include "chrome/common/chrome_paths.h" #include "chrome/common/sqlite_compiled_statement.h" #include "chrome/common/sqlite_utils.h" @@ -294,7 +296,7 @@ TEST(SafeBrowsingBloomFilter, FalsePositives) { if (!ReadDatabase(data_dir, &prefix_list)) return; - int start = SafeBrowsingDatabaseBloom::kBloomFilterSizeRatio; + int start = BloomFilter::kBloomFilterSizeRatio; if (CommandLine::ForCurrentProcess()->HasSwitch(kFilterStart)) { start = StringToInt( CommandLine::ForCurrentProcess()->GetSwitchValue(kFilterStart)); diff --git a/chrome/browser/safe_browsing/safe_browsing_database.cc b/chrome/browser/safe_browsing/safe_browsing_database.cc index 3b370f1..c97f696 100644 --- a/chrome/browser/safe_browsing/safe_browsing_database.cc +++ b/chrome/browser/safe_browsing/safe_browsing_database.cc @@ -5,6 +5,7 @@ #include "chrome/browser/safe_browsing/safe_browsing_database.h" #include "base/file_util.h" +#include "base/histogram.h" #include "base/logging.h" #include "base/sha2.h" #include "chrome/browser/safe_browsing/safe_browsing_database_bloom.h" @@ -14,7 +15,7 @@ using base::Time; // Filename suffix for the bloom filter. static const FilePath::CharType kBloomFilterFile[] = - FILE_PATH_LITERAL(" Filter"); + FILE_PATH_LITERAL(" Filter 2"); // Factory method. SafeBrowsingDatabase* SafeBrowsingDatabase::Create() { @@ -63,23 +64,28 @@ FilePath SafeBrowsingDatabase::BloomFilterFilename( void SafeBrowsingDatabase::LoadBloomFilter() { DCHECK(!bloom_filter_filename_.empty()); + // If we're missing either of the database or filter files, we wait until the + // next update to generate a new filter. + // TODO(paulg): Investigate how often the filter file is missing and how + // expensive it would be to regenerate it. int64 size_64; + if (!file_util::GetFileSize(filename_, &size_64) || size_64 == 0) + return; + if (!file_util::GetFileSize(bloom_filter_filename_, &size_64) || size_64 == 0) { - BuildBloomFilter(); + UMA_HISTOGRAM_COUNTS("SB2.FilterMissing", 1); return; } - int size = static_cast<int>(size_64); - char* data = new char[size]; - CHECK(data); - + // We have a bloom filter file, so use that as our filter. Time before = Time::Now(); - file_util::ReadFile(bloom_filter_filename_, data, size); - SB_DLOG(INFO) << "SafeBrowsingDatabase read bloom filter in " << - (Time::Now() - before).InMilliseconds() << " ms"; + bloom_filter_ = BloomFilter::LoadFile(bloom_filter_filename_); + SB_DLOG(INFO) << "SafeBrowsingDatabase read bloom filter in " + << (Time::Now() - before).InMilliseconds() << " ms"; - bloom_filter_ = new BloomFilter(data, size); + if (!bloom_filter_.get()) + UMA_HISTOGRAM_COUNTS("SB2.FilterReadFail", 1); } void SafeBrowsingDatabase::DeleteBloomFilter() { @@ -91,9 +97,10 @@ void SafeBrowsingDatabase::WriteBloomFilter() { return; Time before = Time::Now(); - file_util::WriteFile(bloom_filter_filename_, - bloom_filter_->data(), - bloom_filter_->size()); + bool write_ok = bloom_filter_->WriteFile(bloom_filter_filename_); SB_DLOG(INFO) << "SafeBrowsingDatabase wrote bloom filter in " << (Time::Now() - before).InMilliseconds() << " ms"; + + if (!write_ok) + UMA_HISTOGRAM_COUNTS("SB2.FilterWriteFail", 1); } diff --git a/chrome/browser/safe_browsing/safe_browsing_database_bloom.cc b/chrome/browser/safe_browsing/safe_browsing_database_bloom.cc index cc8bfcc..f2fbff2 100644 --- a/chrome/browser/safe_browsing/safe_browsing_database_bloom.cc +++ b/chrome/browser/safe_browsing/safe_browsing_database_bloom.cc @@ -26,10 +26,6 @@ using base::TimeDelta; // database is reset. static const int kDatabaseVersion = 6; -// Don't want to create too small of a bloom filter initially while we're -// downloading the data and then keep having to rebuild it. -static const int kBloomFilterMinSize = 250000; - // When we awake from a low power state, we try to avoid doing expensive disk // operations for a few minutes to let the system page itself in and settle // down. @@ -75,36 +71,6 @@ bool SafeBrowsingDatabaseBloom::Init(const FilePath& filename, return true; } -void SafeBrowsingDatabaseBloom::LoadBloomFilter() { - DCHECK(!bloom_filter_filename_.empty()); - - // If we're missing either of the database or filter files, we wait until the - // next update to generate a new filter. - // TODO(paulg): Investigate how often the filter file is missing and how - // expensive it would be to regenerate it. - int64 size_64; - if (!file_util::GetFileSize(filename_, &size_64) || size_64 == 0) - return; - - if (!file_util::GetFileSize(bloom_filter_filename_, &size_64) || - size_64 == 0) { - UMA_HISTOGRAM_COUNTS("SB2.FilterMissing", 1); - return; - } - - // We have a bloom filter file, so use that as our filter. - int size = static_cast<int>(size_64); - char* data = new char[size]; - CHECK(data); - - Time before = Time::Now(); - file_util::ReadFile(bloom_filter_filename_, data, size); - SB_DLOG(INFO) << "SafeBrowsingDatabase read bloom filter in " - << (Time::Now() - before).InMilliseconds() << " ms"; - - bloom_filter_ = new BloomFilter(data, size); -} - bool SafeBrowsingDatabaseBloom::Open() { if (db_) return true; @@ -244,8 +210,8 @@ bool SafeBrowsingDatabaseBloom::ResetDatabase() { return false; } - bloom_filter_ = - new BloomFilter(kBloomFilterMinSize * kBloomFilterSizeRatio); + bloom_filter_ = new BloomFilter(BloomFilter::kBloomFilterMinSize * + BloomFilter::kBloomFilterSizeRatio); file_util::Delete(bloom_filter_filename_, false); // TODO(paulg): Fix potential infinite recursion between Open and Reset. @@ -1064,9 +1030,11 @@ bool SafeBrowsingDatabaseBloom::WritePrefixes( // Determine the size of the new bloom filter. We will cap the maximum size at // 2 MB to prevent an error from consuming large amounts of memory. - int number_of_keys = std::max(add_count_, kBloomFilterMinSize); - int filter_size = std::min(number_of_keys * kBloomFilterSizeRatio, - 2 * 1024 * 1024 * 8); + const int default_min = BloomFilter::kBloomFilterMinSize; + int number_of_keys = std::max(add_count_, default_min); + int filter_size = + std::min(number_of_keys * BloomFilter::kBloomFilterSizeRatio, + BloomFilter::kBloomFilterMaxSize * 8); BloomFilter* new_filter = new BloomFilter(filter_size); SBPair* add = adds; int new_count = 0; diff --git a/chrome/browser/safe_browsing/safe_browsing_database_bloom.h b/chrome/browser/safe_browsing/safe_browsing_database_bloom.h index d8108d7..f983ce1 100644 --- a/chrome/browser/safe_browsing/safe_browsing_database_bloom.h +++ b/chrome/browser/safe_browsing/safe_browsing_database_bloom.h @@ -81,9 +81,6 @@ class SafeBrowsingDatabaseBloom : public SafeBrowsingDatabase { virtual bool NeedToCheckUrl(const GURL& url); - // How many bits to use per item. See the design doc for more information. - static const int kBloomFilterSizeRatio = 25; - private: // Opens the database. bool Open(); @@ -196,9 +193,6 @@ class SafeBrowsingDatabaseBloom : public SafeBrowsingDatabase { // Flush in memory temporary caches. void ClearUpdateCaches(); - // Load the bloom filter off disk. - virtual void LoadBloomFilter(); - // Encode the list id in the lower bit of the chunk. static inline int EncodeChunkId(int chunk, int list_id) { DCHECK(list_id == 0 || list_id == 1); diff --git a/chrome/browser/safe_browsing/safe_browsing_database_unittest.cc b/chrome/browser/safe_browsing/safe_browsing_database_unittest.cc index 94a9a52..bea9020 100644 --- a/chrome/browser/safe_browsing/safe_browsing_database_unittest.cc +++ b/chrome/browser/safe_browsing/safe_browsing_database_unittest.cc @@ -20,10 +20,8 @@ using base::Time; -static const FilePath::CharType kBloomSuffix[] = - FILE_PATH_LITERAL(" Bloom"); -static const FilePath::CharType kFilterSuffix[] = - FILE_PATH_LITERAL(" Filter"); +static const FilePath::CharType kBloomSuffix[] = FILE_PATH_LITERAL(" Bloom"); +static const FilePath::CharType kFilterSuffix[] = FILE_PATH_LITERAL(" Filter"); namespace { SBPrefix Sha256Prefix(const std::string& str) { |