diff options
author | fischman@chromium.org <fischman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-08-08 23:44:55 +0000 |
---|---|---|
committer | fischman@chromium.org <fischman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-08-08 23:44:55 +0000 |
commit | b5a0c87bc18661fe8f4c4d2eae5716e8e072c25d (patch) | |
tree | dbafe7a58082d44a3159a435afd81e5ddcea953f /net/base/ssl_false_start_blacklist_process.cc | |
parent | 7291c4878d7afa5e575084d8fae75b58c11a8262 (diff) | |
download | chromium_src-b5a0c87bc18661fe8f4c4d2eae5716e8e072c25d.zip chromium_src-b5a0c87bc18661fe8f4c4d2eae5716e8e072c25d.tar.gz chromium_src-b5a0c87bc18661fe8f4c4d2eae5716e8e072c25d.tar.bz2 |
Revert 95907 - Clean up SSL false start blacklist code. Numerous changes, including:
* Handle trailing dots in LastTwoLabels() as in http://codereview.chromium.org/7518035/ . Rename this function to LastTwoComponents() to match the terminology used in the RegistryControlledDomainService and elsewhere in Chrome.
* Since callers are using std::string anyway, make the functions in the header take const std::string& instead of char*. This also allows doing string operations on them.
* Use string operations (like find_last_of()) in place of hand-written algorithms, for brevity, clarity, and safety.
* Avoid "unsigned", which the style guide forbids, and use allowed types like size_t, uint32, or int (depending on the situation).
* Avoid #define and "using".
* Use standard algorithms for similar reasons as using string ops.
* Use file_util functions to significantly abbreviate file reading/writing code.
* Use wmain() (on Windows) in combination with FilePath to avoid issues if the provided pathname has extended characters that don't flatten losslessly to the default codepage (thanks Darin for pointing out this issue).
* Avoid casting where possible. Avoid some casts for printf()-style calls by using a string stream, which also allows for slightly less boilerplate.
* Convert non-error uses of stderr to the chrome-standard VLOG(1).
* Correctly handle hostnames with trailing dots in the input file.
* In general, shorten code where possible.
Because this adds a dependency on base, and ssl_false_start_blacklist_process has the "#host" specifier in net.gyp, bradnelson tells me that base and its dependencies need an explicit "host, target" toolchain list for the Linux builds to work correctly. It would be nice if we could avoid this, but I guess gyp would have to be smarter or something.
BUG=none
TEST=none
Review URL: http://codereview.chromium.org/7550002
TBR=pkasting@chromium.org
Review URL: http://codereview.chromium.org/7529035
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@95910 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/base/ssl_false_start_blacklist_process.cc')
-rw-r--r-- | net/base/ssl_false_start_blacklist_process.cc | 377 |
1 files changed, 217 insertions, 160 deletions
diff --git a/net/base/ssl_false_start_blacklist_process.cc b/net/base/ssl_false_start_blacklist_process.cc index e1ff7d2..762a665 100644 --- a/net/base/ssl_false_start_blacklist_process.cc +++ b/net/base/ssl_false_start_blacklist_process.cc @@ -5,115 +5,116 @@ // This utility program exists to process the False Start blacklist file into // a static hash table so that it can be efficiently queried by Chrome. -#include <algorithm> -#include <cstdio> +#include <stdio.h> +#include <stdlib.h> + #include <set> -#include <sstream> #include <string> #include <vector> #include "base/basictypes.h" -#include "base/file_util.h" -#include "base/string_util.h" #include "net/base/ssl_false_start_blacklist.h" -typedef std::vector<std::string> Hosts; +using net::SSLFalseStartBlacklist; -// Parses |input| as a blacklist data file, and returns the set of hosts it -// contains. -Hosts ParseHosts(const std::string& input) { - Hosts hosts; - size_t line_start = 0; - bool is_comment = false; - bool non_whitespace_seen = false; - for (size_t i = 0; i <= input.size(); ++i) { - if (i == input.size() || input[i] == '\n') { - if (!is_comment && non_whitespace_seen) { - size_t len = i - line_start; - if (i > 0 && input[i - 1] == '\r') - len--; - hosts.push_back(input.substr(line_start, len)); - } - is_comment = false; - non_whitespace_seen = false; - line_start = i + 1; - } else if (input[i] != ' ' && input[i] != '\t' && input[i] != '\r') { - non_whitespace_seen = true; - if (i == line_start && input[i] == '#') - is_comment = true; - } - } - VLOG(1) << "Have " << hosts.size() << " hosts after parse"; - return hosts; +static const unsigned kBuckets = SSLFalseStartBlacklist::kBuckets; + +static bool verbose = false; + +static int +usage(const char* argv0) { + fprintf(stderr, "Usage: %s <blacklist file> <output .c file>\n", argv0); + return 1; } -// Returns |host| with any initial "www." and trailing dots removed. Partly -// based on net::StripWWW(). 
-std::string StripWWWAndTrailingDots(const std::string& host) { - const std::string www("www."); - const size_t start = StartsWithASCII(host, www, true) ? www.length() : 0; - const size_t end = host.find_last_not_of('.'); - return (end == std::string::npos) ? - std::string() : host.substr(start, end - start + 1); +// StripWWWPrefix removes "www." from the beginning of any elements of the +// vector. +static void StripWWWPrefix(std::vector<std::string>* hosts) { + static const char kPrefix[] = "www."; + static const unsigned kPrefixLen = sizeof(kPrefix) - 1; + + for (size_t i = 0; i < hosts->size(); i++) { + const std::string& h = (*hosts)[i]; + if (h.size() >= kPrefixLen && + memcmp(h.data(), kPrefix, kPrefixLen) == 0) { + (*hosts)[i] = h.substr(kPrefixLen, h.size() - kPrefixLen); + } + } } -// Removes all duplicates from |hosts|. +// RemoveDuplicateEntries removes all duplicates from |hosts|. static void RemoveDuplicateEntries(std::vector<std::string>* hosts) { - std::sort(hosts->begin(), hosts->end()); - hosts->erase(std::unique(hosts->begin(), hosts->end()), hosts->end()); - VLOG(1) << "Have " << hosts->size() << " hosts after removing duplicates"; -} + std::set<std::string> hosts_set; + std::vector<std::string> ret; + + for (std::vector<std::string>::const_iterator + i = hosts->begin(); i != hosts->end(); i++) { + if (hosts_set.count(*i)) { + if (verbose) + fprintf(stderr, "Removing duplicate entry for %s\n", i->c_str()); + continue; + } + hosts_set.insert(*i); + ret.push_back(*i); + } -// Returns the parent domain for |host|, or the empty string if the name is a -// top-level domain. -static std::string ParentDomain(const std::string& host) { - const size_t first_dot = host.find('.'); - return (first_dot == std::string::npos) ? - std::string() : host.substr(first_dot + 1); + hosts->swap(ret); } -// Predicate which returns true when a hostname has a parent domain in the set -// of hosts provided at construction time. 
-class ParentInSet : public std::unary_function<std::string, bool> { - public: - explicit ParentInSet(const std::set<std::string>& hosts) : hosts_(hosts) {} - - bool operator()(const std::string& host) const { - for (std::string parent(ParentDomain(host)); !parent.empty(); - parent = ParentDomain(parent)) { - if (hosts_.count(parent)) { - VLOG(1) << "Removing " << host << " as redundant"; - return true; - } +// ParentDomain returns the parent domain for a given domain name or the empty +// string if the name is a top-level domain. +static std::string ParentDomain(const std::string& in) { + for (size_t i = 0; i < in.size(); i++) { + if (in[i] == '.') { + return in.substr(i + 1, in.size() - i - 1); } - return false; } - private: - const std::set<std::string>& hosts_; -}; + return std::string(); +} -// Removes any hosts which are subdomains of other hosts. E.g. -// "foo.example.com" would be removed if "example.com" were also included. -static void RemoveRedundantEntries(Hosts* hosts) { +// RemoveRedundantEntries removes any entries which are subdomains of other +// entries. (i.e. foo.example.com would be removed if example.com were also +// included.) 
+static void RemoveRedundantEntries(std::vector<std::string>* hosts) { std::set<std::string> hosts_set; - for (Hosts::const_iterator i(hosts->begin()); i != hosts->end(); ++i) + std::vector<std::string> ret; + + for (std::vector<std::string>::const_iterator + i = hosts->begin(); i != hosts->end(); i++) { hosts_set.insert(*i); - hosts->erase(std::remove_if(hosts->begin(), hosts->end(), - ParentInSet(hosts_set)), hosts->end()); - VLOG(1) << "Have " << hosts->size() << " hosts after removing redundants"; + } + + for (std::vector<std::string>::const_iterator + i = hosts->begin(); i != hosts->end(); i++) { + std::string parent = ParentDomain(*i); + while (!parent.empty()) { + if (hosts_set.count(parent)) + break; + parent = ParentDomain(parent); + } + if (parent.empty()) { + ret.push_back(*i); + } else { + if (verbose) + fprintf(stderr, "Removing %s as redundant\n", i->c_str()); + } + } + + hosts->swap(ret); } -// Returns true iff all |hosts| are less than 256 bytes long (not including the -// terminating NUL) and contain two or more dot-separated components. -static bool CheckLengths(const Hosts& hosts) { - for (Hosts::const_iterator i(hosts.begin()); i != hosts.end(); ++i) { +// CheckLengths returns true iff every host is less than 256 bytes long (not +// including the terminating NUL) and contains two or more labels. 
+static bool CheckLengths(const std::vector<std::string>& hosts) { + for (std::vector<std::string>::const_iterator + i = hosts.begin(); i != hosts.end(); i++) { if (i->size() >= 256) { - fprintf(stderr, "Entry '%s' is too large\n", i->c_str()); + fprintf(stderr, "Entry %s is too large\n", i->c_str()); return false; } - if (net::SSLFalseStartBlacklist::LastTwoComponents(*i).empty()) { - fprintf(stderr, "Entry '%s' contains too few labels\n", i->c_str()); + if (SSLFalseStartBlacklist::LastTwoLabels(i->c_str()) == NULL) { + fprintf(stderr, "Entry %s contains too few labels\n", i->c_str()); return false; } } @@ -121,94 +122,150 @@ static bool CheckLengths(const Hosts& hosts) { return true; } -// Returns the contents of the output file to be written. -std::string GenerateOutput(const Hosts& hosts) { - // Hash each host into its appropriate bucket. - VLOG(1) << "Using " << net::SSLFalseStartBlacklist::kBuckets - << " entry hash table"; - Hosts buckets[net::SSLFalseStartBlacklist::kBuckets]; - for (Hosts::const_iterator i(hosts.begin()); i != hosts.end(); ++i) { - const uint32 hash = net::SSLFalseStartBlacklist::Hash( - net::SSLFalseStartBlacklist::LastTwoComponents(*i)); - buckets[hash & (net::SSLFalseStartBlacklist::kBuckets - 1)].push_back(*i); - } - - // Write header. - std::ostringstream output; - output << "// Copyright (c) 2011 The Chromium Authors. All rights reserved.\n" - "// Use of this source code is governed by a BSD-style license that" - " can be\n// found in the LICENSE file.\n\n// WARNING: This code is" - " generated by ssl_false_start_blacklist_process.cc.\n// Do not " - "edit.\n\n#include \"net/base/ssl_false_start_blacklist.h\"\n\n" - "namespace net {\n\nconst uint32 " - "SSLFalseStartBlacklist::kHashTable[" - << net::SSLFalseStartBlacklist::kBuckets << " + 1] = {\n 0,\n"; - - // Construct data table, writing out the size as each bucket is appended. 
+int main(int argc, char** argv) { + if (argc != 3) + return usage(argv[0]); + + const char* input_file = argv[1]; + const char* output_file = argv[2]; + FILE* input = fopen(input_file, "rb"); + if (!input) { + perror("open"); + return usage(argv[0]); + } + + if (fseek(input, 0, SEEK_END)) { + perror("fseek"); + return 1; + } + + const long input_size = ftell(input); + if (input_size < 0) { + perror("ftell"); + return 1; + } + + if (fseek(input, 0, SEEK_SET)) { + perror("fseek"); + return 1; + } + + char* buffer = static_cast<char*>(malloc(input_size)); + long done = 0; + while (done < input_size) { + size_t n = fread(buffer + done, 1, input_size - done, input); + if (n == 0) { + perror("fread"); + free(buffer); + fclose(input); + return 1; + } + done += n; + } + fclose(input); + + std::vector<std::string> hosts; + + off_t line_start = 0; + bool is_comment = false; + bool non_whitespace_seen = false; + for (long i = 0; i <= input_size; i++) { + if (i == input_size || buffer[i] == '\n') { + if (!is_comment && non_whitespace_seen) { + long len = i - line_start; + if (i > 0 && buffer[i-1] == '\r') + len--; + hosts.push_back(std::string(&buffer[line_start], len)); + } + is_comment = false; + non_whitespace_seen = false; + line_start = i + 1; + continue; + } + + if (i == line_start && buffer[i] == '#') + is_comment = true; + if (buffer[i] != ' ' && buffer[i] != '\t' && buffer[i] != '\r') + non_whitespace_seen = true; + } + free(buffer); + + fprintf(stderr, "Have %d hosts after parse\n", (int) hosts.size()); + StripWWWPrefix(&hosts); + RemoveDuplicateEntries(&hosts); + fprintf(stderr, "Have %d hosts after removing duplicates\n", (int) hosts.size()); + RemoveRedundantEntries(&hosts); + fprintf(stderr, "Have %d hosts after removing redundants\n", (int) hosts.size()); + if (!CheckLengths(hosts)) { + fprintf(stderr, "One or more entries is too large or too small\n"); + return 2; + } + + fprintf(stderr, "Using %d entry hash table\n", kBuckets); + uint32 table[kBuckets]; + 
std::vector<std::string> buckets[kBuckets]; + + for (std::vector<std::string>::const_iterator + i = hosts.begin(); i != hosts.end(); i++) { + const char* last_two_labels = + SSLFalseStartBlacklist::LastTwoLabels(i->c_str()); + const unsigned h = SSLFalseStartBlacklist::Hash(last_two_labels); + buckets[h & (kBuckets - 1)].push_back(*i); + } + std::string table_data; - size_t max_bucket_size = 0; - for (size_t i = 0; i < net::SSLFalseStartBlacklist::kBuckets; i++) { - max_bucket_size = std::max(max_bucket_size, buckets[i].size()); - for (Hosts::const_iterator j(buckets[i].begin()); j != buckets[i].end(); - ++j) { - table_data.push_back(static_cast<char>(j->size())); + unsigned max_bucket_size = 0; + for (unsigned i = 0; i < kBuckets; i++) { + if (buckets[i].size() > max_bucket_size) + max_bucket_size = buckets[i].size(); + + table[i] = table_data.size(); + for (std::vector<std::string>::const_iterator + j = buckets[i].begin(); j != buckets[i].end(); j++) { + table_data.push_back((char) j->size()); table_data.append(*j); } - output << " " << table_data.size() << ",\n"; } - output << "};\n\n"; - VLOG(1) << "Largest bucket has " << max_bucket_size << " entries"; - // Write data table, breaking lines after 72+ (2 indent, 70+ data) characters. - output << "const char SSLFalseStartBlacklist::kHashData[] = {\n"; - for (size_t i = 0, line_length = 0; i < table_data.size(); i++) { + fprintf(stderr, "Largest bucket has %d entries\n", max_bucket_size); + + FILE* out = fopen(output_file, "w+"); + if (!out) { + perror("opening output file"); + return 4; + } + + fprintf(out, "// Copyright (c) 2010 The Chromium Authors. All rights " + "reserved.\n// Use of this source code is governed by a BSD-style " + "license that can be\n// found in the LICENSE file.\n\n"); + fprintf(out, "// WARNING: this code is generated by\n" + "// ssl_false_start_blacklist_process.cc. 
Do not edit.\n\n"); + fprintf(out, "#include \"base/basictypes.h\"\n\n"); + fprintf(out, "#include \"net/base/ssl_false_start_blacklist.h\"\n\n"); + fprintf(out, "namespace net {\n\n"); + fprintf(out, "const uint32 SSLFalseStartBlacklist::kHashTable[%d + 1] = {\n", + kBuckets); + for (unsigned i = 0; i < kBuckets; i++) { + fprintf(out, " %u,\n", (unsigned) table[i]); + } + fprintf(out, " %u,\n", (unsigned) table_data.size()); + fprintf(out, "};\n\n"); + + fprintf(out, "const char SSLFalseStartBlacklist::kHashData[] = {\n"); + for (unsigned i = 0, line_length = 0; i < table_data.size(); i++) { if (line_length == 0) - output << " "; - std::ostringstream::pos_type current_length = output.tellp(); - output << static_cast<int>(table_data[i]) << ", "; - line_length += output.tellp() - current_length; + fprintf(out, " "); + uint8 c = static_cast<uint8>(table_data[i]); + line_length += fprintf(out, "%d, ", c); if (i == table_data.size() - 1) { - output << "\n};\n"; + fprintf(out, "\n};\n"); } else if (line_length >= 70) { - output << "\n"; + fprintf(out, "\n"); line_length = 0; } } - output << "\n} // namespace net\n"; - return output.str(); -} - -#if defined(OS_WIN) -int wmain(int argc, wchar_t* argv[], wchar_t* envp[]) { -#elif defined(OS_POSIX) -int main(int argc, char* argv[], char* envp[]) { -#endif - if (argc != 3) { - fprintf(stderr, "Usage: %s <blacklist file> <output .c file>\n", argv[0]); - return 1; - } + fprintf(out, "\n} // namespace net\n"); + fclose(out); - // Read input file. - std::string input; - if (!file_util::ReadFileToString(FilePath(argv[1]), &input)) { - fprintf(stderr, "Failed to read input file '%s'\n", argv[1]); - return 2; - } - Hosts hosts(ParseHosts(input)); - - // Sanitize |hosts|. - std::transform(hosts.begin(), hosts.end(), hosts.begin(), - StripWWWAndTrailingDots); - RemoveDuplicateEntries(&hosts); - RemoveRedundantEntries(&hosts); - if (!CheckLengths(hosts)) - return 3; - - // Write output file. 
- const std::string output_str(GenerateOutput(hosts)); - if (file_util::WriteFile(FilePath(argv[2]), output_str.data(), - output_str.size()) == static_cast<int>(output_str.size())) - return 0; - fprintf(stderr, "Failed to write output file '%s'\n", argv[2]); - return 4; + return 0; } |