summaryrefslogtreecommitdiffstats
path: root/chrome/browser/safe_browsing
diff options
context:
space:
mode:
authorpaulg@google.com <paulg@google.com@0039d316-1c4b-4281-b951-d872f2087c98>2009-04-17 23:31:45 +0000
committerpaulg@google.com <paulg@google.com@0039d316-1c4b-4281-b951-d872f2087c98>2009-04-17 23:31:45 +0000
commitbe5e9e1ee5bc539e50952e063c5ad2ec5219e3d4 (patch)
tree6e0345de8ae20391da2e872a594fdbab37883651 /chrome/browser/safe_browsing
parent958c75c8132a692a31580f20d55a27ccba3335df (diff)
downloadchromium_src-be5e9e1ee5bc539e50952e063c5ad2ec5219e3d4.zip
chromium_src-be5e9e1ee5bc539e50952e063c5ad2ec5219e3d4.tar.gz
chromium_src-be5e9e1ee5bc539e50952e063c5ad2ec5219e3d4.tar.bz2
Create a test program for generating false positive statistics
for SafeBrowsing data. The program reads a regular SafeBrowsing database and creates bloom filters of various sizes, comparing the hit and miss statistics against a list of URLs. BUG=10584 (http://crbug.com/10584) Review URL: http://codereview.chromium.org/67245 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@13981 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/safe_browsing')
-rw-r--r--chrome/browser/safe_browsing/filter_false_positive_perftest.cc280
1 files changed, 280 insertions, 0 deletions
diff --git a/chrome/browser/safe_browsing/filter_false_positive_perftest.cc b/chrome/browser/safe_browsing/filter_false_positive_perftest.cc
new file mode 100644
index 0000000..01350b0
--- /dev/null
+++ b/chrome/browser/safe_browsing/filter_false_positive_perftest.cc
@@ -0,0 +1,280 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This test performs a series false positive checks using a list of URLs
+// against a known set of SafeBrowsing data.
+//
+// It uses a normal SafeBrowsing database to create a bloom filter where it
+// looks up all the URLs in the url file. A URL that has a prefix found in the
+// bloom filter and found in the database is considered a hit: a valid lookup
+// that will result in a gethash request. A URL that has a prefix found in the
+// bloom filter but not in the database is a miss: a false positive lookup that
+// will result in an unnecessary gethash request.
+//
+// By varying the size of the bloom filter and using a constant set of
+// SafeBrowsing data, we can check a known set of URLs against the filter and
+// determine the false positive rate.
+//
+// Usage:
+// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives
+// --filter-start=<integer>
+// --filter-steps=<integer>
+// --filter-verbose
+//
+// --filter-start: The filter multiplier to begin with. This represents the
+// number of bits per prefix of memory to use in the filter.
+// The default value is identical to the current SafeBrowsing
+// database value.
+// --filter-steps: The number of iterations to run, with each iteration
+// increasing the filter multiplier by 1. The default value
+// is 1.
+// --filter-verbose: Used to print out the hit / miss results per URL.
+//
+// Data files:
+// chrome/test/data/safe_browsing/filter/database
+// chrome/test/data/safe_browsing/filter/urls
+//
+// database: A normal SafeBrowsing database.
+// urls: A text file containing a list of URLs, one per line.
+
+#include <fstream>
+#include <iostream>
+
+#include "base/command_line.h"
+#include "base/file_path.h"
+#include "base/file_util.h"
+#include "base/logging.h"
+#include "base/path_service.h"
+#include "base/scoped_ptr.h"
+#include "base/sha2.h"
+#include "base/string_util.h"
+#include "chrome/browser/safe_browsing/safe_browsing_database_bloom.h"
+#include "chrome/common/chrome_paths.h"
+#include "chrome/common/sqlite_compiled_statement.h"
+#include "chrome/common/sqlite_utils.h"
+#include "googleurl/src/gurl.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace {
+
+// Ensures the SafeBrowsing database is closed properly.
+class ScopedPerfDatabase {
+ public:
+ explicit ScopedPerfDatabase(sqlite3* db) : db_(db) {}
+ ~ScopedPerfDatabase() {
+ sqlite3_close(db_);
+ }
+
+ private:
+ sqlite3* db_;
+
+ DISALLOW_COPY_AND_ASSIGN(ScopedPerfDatabase);
+};
+
+// Command line flags.
+const wchar_t kFilterVerbose[] = L"filter-verbose";
+const wchar_t kFilterStart[] = L"filter-start";
+const wchar_t kFilterSteps[] = L"filter-steps";
+
+// Returns the path to the data used in this test, relative to the top of the
+// source directory.
+FilePath GetFullDataPath() {
+ FilePath full_path;
+ CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path));
+ full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing"));
+ full_path = full_path.Append(FILE_PATH_LITERAL("filter"));
+ CHECK(file_util::PathExists(full_path));
+ return full_path;
+}
+
+// Constructs a bloom filter of the appropriate size from the provided prefixes.
+void BuildBloomFilter(int size_multiplier,
+ const std::vector<SBPrefix>& prefixes,
+ BloomFilter** bloom_filter) {
+ std::cout << "Building bloom filter with " << prefixes.size()
+ << " prefixes (size multiplier = " << size_multiplier
+ << ")" << std::endl;
+
+ // Create a BloomFilter with the specified size.
+ const int key_count = std::max(static_cast<int>(prefixes.size()), 250000);
+ const int filter_size = key_count * size_multiplier;
+ *bloom_filter = new BloomFilter(filter_size);
+
+ // Add the prefixes to it.
+ for (size_t i = 0; i < prefixes.size(); ++i)
+ (*bloom_filter)->Insert(prefixes[i]);
+}
+
+// Reads the set of add prefixes contained in a SafeBrowsing database into a
+// sorted array suitable for fast searching. This takes significantly less time
+// to look up a given prefix than performing SQL queries.
+bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) {
+ FilePath database_file = path.Append(FILE_PATH_LITERAL("database"));
+ sqlite3* db = NULL;
+ if (OpenSqliteDb(database_file, &db) != SQLITE_OK) {
+ sqlite3_close(db);
+ return false;
+ }
+
+ ScopedPerfDatabase database(db);
+ scoped_ptr<SqliteStatementCache> sql_cache(new SqliteStatementCache(db));
+
+ // Get the number of items in the add_prefix table.
+ std::string sql = "SELECT COUNT(*) FROM add_prefix";
+ SQLITE_UNIQUE_STATEMENT(count_statement, *sql_cache, sql.c_str());
+ if (!count_statement.is_valid())
+ return false;
+
+ if (count_statement->step() != SQLITE_ROW)
+ return false;
+
+ const int count = count_statement->column_int(0);
+
+ // Load them into a prefix vector and sort
+ prefixes->reserve(count);
+ sql = "SELECT prefix FROM add_prefix";
+ SQLITE_UNIQUE_STATEMENT(prefix_statement, *sql_cache, sql.c_str());
+ if (!prefix_statement.is_valid())
+ return false;
+
+ while (prefix_statement->step() == SQLITE_ROW)
+ prefixes->push_back(prefix_statement->column_int(0));
+
+ DCHECK(static_cast<int>(prefixes->size()) == count);
+ sort(prefixes->begin(), prefixes->end());
+
+ return true;
+}
+
+// Generates all legal SafeBrowsing prefixes for the specified URL, and returns
+// the set of Prefixes that exist in the bloom filter.
+void GeneratePrefixHits(const std::string url,
+ BloomFilter* bloom_filter,
+ std::vector<SBPrefix>* prefixes) {
+ GURL url_check(url);
+ std::vector<std::string> hosts;
+ if (url_check.HostIsIPAddress()) {
+ hosts.push_back(url_check.host());
+ } else {
+ safe_browsing_util::GenerateHostsToCheck(url_check, &hosts);
+ }
+
+ std::vector<std::string> paths;
+ safe_browsing_util::GeneratePathsToCheck(url_check, &paths);
+
+ for (size_t i = 0; i < hosts.size(); ++i) {
+ for (size_t j = 0; j < paths.size(); ++j) {
+ SBPrefix prefix;
+ base::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix));
+ if (bloom_filter->Exists(prefix))
+ prefixes->push_back(prefix);
+ }
+ }
+}
+
+// Binary search of sorted prefixes.
+bool IsPrefixInDatabase(SBPrefix prefix,
+ const std::vector<SBPrefix>& prefixes) {
+ if (prefixes.empty())
+ return false;
+
+ int low = 0;
+ int high = prefixes.size() - 1;
+ while (low <= high) {
+ int mid = ((unsigned int)low + (unsigned int)high) >> 1;
+ SBPrefix prefix_mid = prefixes[mid];
+ if (prefix_mid == prefix)
+ return true;
+ if (prefix_mid < prefix)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+
+ return false;
+}
+
+// Construct a bloom filter with the given prefixes and multiplier, and test the
+// false positive rate (misses) against a URL list.
+void CalculateBloomFilterFalsePositives(
+ int size_multiplier,
+ const FilePath& data_dir,
+ const std::vector<SBPrefix>& prefix_list) {
+ BloomFilter* bloom_filter = NULL;
+ BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter);
+ scoped_refptr<BloomFilter> scoped_filter(bloom_filter);
+
+ // Read in data file line at a time.
+ FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls"));
+ std::ifstream url_stream(WideToASCII(url_file.ToWStringHack()).c_str());
+
+ // Keep track of stats
+ int hits = 0;
+ int misses = 0;
+
+ // Print out volumes of data (per URL hit and miss information).
+ bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose);
+
+ std::string url;
+ while (std::getline(url_stream, url)) {
+ // See if the URL is in the bloom filter.
+ std::vector<SBPrefix> prefixes;
+ GeneratePrefixHits(url, bloom_filter, &prefixes);
+
+ // See if the prefix is actually in the database (in-memory prefix list).
+ for (size_t i = 0; i < prefixes.size(); ++i) {
+ if (IsPrefixInDatabase(prefixes[i], prefix_list)) {
+ ++hits;
+ if (verbose) {
+ std::cout << "Hit for URL: " << url
+ << " (Prefix = " << prefixes[i] << ")"
+ << std::endl;
+ }
+ } else {
+ ++misses;
+ if (verbose) {
+ std::cout << "Miss for URL: " << url
+ << " (Prefix = " << prefixes[i] << ")"
+ << std::endl;
+ }
+ }
+ }
+ }
+
+ // Print out the results for this test.
+ std::cout << "Multiplier: " << size_multiplier
+ << ", Hits: " << hits
+ << ", Misses: " << misses
+ << ", Size: " << bloom_filter->size()
+ << std::endl;
+}
+
+} // namespace
+
+// This test can take several minutes to perform its calculations, so it should
+// be disabled until you need to run it.
+TEST(SafeBrowsingBloomFilter, FalsePositives) {
+ std::vector<SBPrefix> prefix_list;
+ FilePath data_dir = GetFullDataPath();
+ if (!ReadDatabase(data_dir, &prefix_list))
+ return;
+
+ int start = SafeBrowsingDatabaseBloom::kBloomFilterSizeRatio;
+ if (CommandLine::ForCurrentProcess()->HasSwitch(kFilterStart)) {
+ start = StringToInt(
+ CommandLine::ForCurrentProcess()->GetSwitchValue(kFilterStart));
+ }
+
+ int steps = 1;
+ if (CommandLine::ForCurrentProcess()->HasSwitch(kFilterSteps)) {
+ steps = StringToInt(
+ CommandLine::ForCurrentProcess()->GetSwitchValue(kFilterSteps));
+ }
+
+ int stop = start + steps;
+
+ for (int multiplier = start; multiplier < stop; ++multiplier)
+ CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list);
+}
+