diff options
author | paulg@google.com <paulg@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-05-05 01:24:40 +0000 |
---|---|---|
committer | paulg@google.com <paulg@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-05-05 01:24:40 +0000 |
commit | 58956203f3268d5b4642a8ad586fdd66ad7d9375 (patch) | |
tree | 74369412c4a16b323df8010b5e136c9bb050444f /chrome | |
parent | 321539f84acd68316617fa658be4e4ce3ff517f3 (diff) | |
download | chromium_src-58956203f3268d5b4642a8ad586fdd66ad7d9375.zip chromium_src-58956203f3268d5b4642a8ad586fdd66ad7d9375.tar.gz chromium_src-58956203f3268d5b4642a8ad586fdd66ad7d9375.tar.bz2 |
Add support for handling files containing URLs with a sampling
weight. Also print out a few more statistics like the number
of URLs examined and the number of host + path pairs looked up.
Review URL: http://codereview.chromium.org/105005
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@15263 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- | chrome/browser/safe_browsing/filter_false_positive_perftest.cc | 68 |
1 files changed, 51 insertions, 17 deletions
diff --git a/chrome/browser/safe_browsing/filter_false_positive_perftest.cc b/chrome/browser/safe_browsing/filter_false_positive_perftest.cc index 01350b0..40a546f 100644 --- a/chrome/browser/safe_browsing/filter_false_positive_perftest.cc +++ b/chrome/browser/safe_browsing/filter_false_positive_perftest.cc @@ -30,13 +30,19 @@ // increasing the filter multiplier by 1. The default value // is 1. // --filter-verbose: Used to print out the hit / miss results per URL. +// --filter-csv: The URL file contains information about the number of +// unique views (the popularity) of each URL. See the format +// description below. // // Data files: // chrome/test/data/safe_browsing/filter/database // chrome/test/data/safe_browsing/filter/urls // // database: A normal SafeBrowsing database. -// urls: A text file containing a list of URLs, one per line. +// urls: A text file containing a list of URLs, one per line. If the option +// --filter-csv is specified, the format of each line in the file is +// <url>,<weight> where weight is an integer indicating the number of +// unique views for the URL. #include <fstream> #include <iostream> @@ -76,6 +82,7 @@ class ScopedPerfDatabase { const wchar_t kFilterVerbose[] = L"filter-verbose"; const wchar_t kFilterStart[] = L"filter-start"; const wchar_t kFilterSteps[] = L"filter-steps"; +const wchar_t kFilterCsv[] = L"filter-csv"; // Returns the path to the data used in this test, relative to the top of the // source directory. @@ -92,10 +99,6 @@ FilePath GetFullDataPath() { void BuildBloomFilter(int size_multiplier, const std::vector<SBPrefix>& prefixes, BloomFilter** bloom_filter) { - std::cout << "Building bloom filter with " << prefixes.size() - << " prefixes (size multiplier = " << size_multiplier - << ")" << std::endl; - // Create a BloomFilter with the specified size. const int key_count = std::max(static_cast<int>(prefixes.size()), 250000); const int filter_size = key_count * size_multiplier; @@ -104,6 +107,11 @@ void BuildBloomFilter(int size_multiplier, // Add the prefixes to it. for (size_t i = 0; i < prefixes.size(); ++i) (*bloom_filter)->Insert(prefixes[i]); + + std::cout << "Bloom filter with prefixes: " << prefixes.size() + << ", multiplier: " << size_multiplier + << ", size (bytes): " << (*bloom_filter)->size() + << std::endl; } // Reads the set of add prefixes contained in a SafeBrowsing database into a @@ -148,10 +156,11 @@ bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) { } // Generates all legal SafeBrowsing prefixes for the specified URL, and returns -// the set of Prefixes that exist in the bloom filter. -void GeneratePrefixHits(const std::string url, - BloomFilter* bloom_filter, - std::vector<SBPrefix>* prefixes) { +// the set of Prefixes that exist in the bloom filter. The function returns the +// number of host + path combinations checked. +int GeneratePrefixHits(const std::string url, + BloomFilter* bloom_filter, + std::vector<SBPrefix>* prefixes) { GURL url_check(url); std::vector<std::string> hosts; if (url_check.HostIsIPAddress()) { @@ -171,6 +180,8 @@ void GeneratePrefixHits(const std::string url, prefixes->push_back(prefix); } } + + return hosts.size() * paths.size(); } // Binary search of sorted prefixes. @@ -212,30 +223,49 @@ void CalculateBloomFilterFalsePositives( // Keep track of stats int hits = 0; int misses = 0; + int weighted_hits = 0; + int weighted_misses = 0; + int url_count = 0; + int prefix_count = 0; // Print out volumes of data (per URL hit and miss information). bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose); + bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv); std::string url; while (std::getline(url_stream, url)) { + ++url_count; + + // Handle a format that contains URLs weighted by unique views. + int weight = 1; + if (use_weights) { + std::string::size_type pos = url.find_last_of(","); + if (pos != std::string::npos) { + weight = StringToInt(std::string(url, pos + 1)); + url = url.substr(0, pos); + } + } + // See if the URL is in the bloom filter. std::vector<SBPrefix> prefixes; - GeneratePrefixHits(url, bloom_filter, &prefixes); + prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes); // See if the prefix is actually in the database (in-memory prefix list). for (size_t i = 0; i < prefixes.size(); ++i) { if (IsPrefixInDatabase(prefixes[i], prefix_list)) { ++hits; + weighted_hits += weight; if (verbose) { std::cout << "Hit for URL: " << url - << " (Prefix = " << prefixes[i] << ")" + << " (prefix = " << prefixes[i] << ")" << std::endl; } } else { ++misses; + weighted_misses += weight; if (verbose) { std::cout << "Miss for URL: " << url - << " (Prefix = " << prefixes[i] << ")" + << " (prefix = " << prefixes[i] << ")" << std::endl; } } @@ -243,11 +273,15 @@ void CalculateBloomFilterFalsePositives( } // Print out the results for this test. - std::cout << "Multiplier: " << size_multiplier - << ", Hits: " << hits - << ", Misses: " << misses - << ", Size: " << bloom_filter->size() - << std::endl; + std::cout << "URLs checked: " << url_count + << ", prefix compares: " << prefix_count + << ", hits: " << hits + << ", misses: " << misses; + if (use_weights) { + std::cout << ", weighted hits: " << weighted_hits + << ", weighted misses: " << weighted_misses; + } + std::cout << std::endl; } } // namespace |