diff options
author | ttuttle@chromium.org <ttuttle@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-07-04 23:22:07 +0000 |
---|---|---|
committer | ttuttle@chromium.org <ttuttle@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-07-04 23:22:07 +0000 |
commit | 460fff7d4363a9eb8f0f19fc98324c6bdd1656b5 (patch) | |
tree | d031dd2fbc28883c97ecfdb9c1a29891bd784123 /net/dns | |
parent | 0e357c3e1cb8f788ad592ab6760900716f9b018d (diff) | |
download | chromium_src-460fff7d4363a9eb8f0f19fc98324c6bdd1656b5.zip chromium_src-460fff7d4363a9eb8f0f19fc98324c6bdd1656b5.tar.gz chromium_src-460fff7d4363a9eb8f0f19fc98324c6bdd1656b5.tar.bz2 |
Optimize parsing of /etc/hosts.
Right now, Chrome won't parse hosts files larger than 64k, since the
parser is kind of slow. This CL takes some steps to optimize it:
1. Replace nested StringTokenizers with a custom, one-pass parser.
2. Cache the literal IP from the previous line so we can skip parsing
the same IP address if it's listed many times in a row.
(Ad-blocking hosts files can have tens or hundreds of thousands of
entries in a row that all point to 127.0.0.1.)
3. Replace std::map with a base::hash_map.
A rough benchmark suggests that these changes make ParseHosts run about
three times faster on a large ad-blocking hosts file (close to 6M in size
and close to 200k lines).
TODO:
1. Break this into separate CLs, if we want?
2. Store the actual hosts file (converted to lowercase) in memory and
store StringPieces pointing into it, instead of making copies of
every hostname.
3. Fix on Android (its hash_map doesn't seem to implement operator==).
BUG=107810
TEST=net_unittests still pass; may want nastier parsing tests
Review URL: https://chromiumcodereview.appspot.com/18407003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@210237 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/dns')
-rw-r--r-- | net/dns/dns_hosts.cc | 142 | ||||
-rw-r--r-- | net/dns/dns_hosts.h | 34 | ||||
-rw-r--r-- | net/dns/dns_hosts_unittest.cc | 9 |
3 files changed, 157 insertions, 28 deletions
diff --git a/net/dns/dns_hosts.cc b/net/dns/dns_hosts.cc index 5a7c3cc..60cac9b 100644 --- a/net/dns/dns_hosts.cc +++ b/net/dns/dns_hosts.cc @@ -10,40 +10,128 @@ #include "base/strings/string_util.h" #include "base/strings/string_tokenizer.h" +using base::StringPiece; + namespace net { +// Parses the contents of a hosts file. Returns one token (IP or hostname) at +// a time. Doesn't copy anything; accepts the file as a StringPiece and +// returns tokens as StringPieces. +class HostsParser { + public: + explicit HostsParser(const StringPiece& text) + : text_(text), + data_(text.data()), + end_(text.size()), + pos_(0), + token_(), + token_is_ip_(false) {} + + // Advances to the next token (IP or hostname). Returns whether another + // token was available. |token_is_ip| and |token| can be used to find out + // the type and text of the token. + bool Advance() { + bool next_is_ip = (pos_ == 0); + while (pos_ < end_ && pos_ != std::string::npos) { + SkipWhitespace(); + switch (text_[pos_]) { + case '\r': + case '\n': + next_is_ip = true; + pos_++; + break; + + case '#': + SkipRestOfLine(); + break; + + default: { + size_t token_start = pos_; + SkipToken(); + size_t token_end = (pos_ == std::string::npos) ? end_ : pos_; + + token_ = StringPiece(data_ + token_start, token_end - token_start); + token_is_ip_ = next_is_ip; + + return true; + } + } + } + + text_ = StringPiece(); + return false; + } + + // Fast-forwards the parser to the next line. Should be called if an IP + // address doesn't parse, to avoid wasting time tokenizing hostnames that + // will be ignored. + void SkipRestOfLine() { + pos_ = text_.find("\n", pos_); + } + + // Returns whether the last-parsed token is an IP address (true) or a + // hostname (false). + bool token_is_ip() { return token_is_ip_; } + + // Returns the text of the last-parsed token as a StringPiece referencing + // the same underlying memory as the StringPiece passed to the constructor. 
+ // Returns an empty StringPiece if no token has been parsed or the end of + // the input string has been reached. + const StringPiece& token() { return token_; } + + private: + void SkipToken() { + pos_ = text_.find_first_of(" \t\n\r#", pos_); + } + + void SkipWhitespace() { + pos_ = text_.find_first_not_of(" \t", pos_); + } + + StringPiece text_; + const char* data_; + const size_t end_; + + size_t pos_; + StringPiece token_; + bool token_is_ip_; + + DISALLOW_COPY_AND_ASSIGN(HostsParser); +}; + + + void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) { CHECK(dns_hosts); DnsHosts& hosts = *dns_hosts; - // Split into lines. Accept CR for Windows. - base::StringTokenizer contents_lines(contents, "\n\r"); - while (contents_lines.GetNext()) { - // Ignore comments after '#'. - std::string line = contents_lines.token(); - base::StringTokenizer line_parts(line, "#"); - line_parts.set_options(base::StringTokenizer::RETURN_DELIMS); - - if (line_parts.GetNext() && !line_parts.token_is_delim()) { - // Split and trim whitespace. - std::string part = line_parts.token(); - base::StringTokenizer tokens(part, " \t"); - - if (tokens.GetNext()) { - IPAddressNumber ip; - // TODO(szym): handle %iface notation on mac - if (!ParseIPLiteralToNumber(tokens.token(), &ip)) - continue; // Ignore malformed lines. - AddressFamily fam = (ip.size() == 4) ? ADDRESS_FAMILY_IPV4 : - ADDRESS_FAMILY_IPV6; - while (tokens.GetNext()) { - DnsHostsKey key(tokens.token(), fam); - StringToLowerASCII(&(key.first)); - IPAddressNumber& mapped_ip = hosts[key]; - if (mapped_ip.empty()) - mapped_ip = ip; - // else ignore this entry (first hit counts) + + StringPiece ip_text; + IPAddressNumber ip; + AddressFamily family = ADDRESS_FAMILY_IPV4; + HostsParser parser(contents); + while (parser.Advance()) { + if (parser.token_is_ip()) { + StringPiece new_ip_text = parser.token(); + // Some ad-blocking hosts files contain thousands of entries pointing to + // the same IP address (usually 127.0.0.1). 
Don't bother parsing the IP + // again if it's the same as the one above it. + if (new_ip_text != ip_text) { + IPAddressNumber new_ip; + if (ParseIPLiteralToNumber(parser.token().as_string(), &new_ip)) { + ip_text = new_ip_text; + ip.swap(new_ip); + family = (ip.size() == 4) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6; + } else { + parser.SkipRestOfLine(); } } + } else { + DnsHostsKey key(parser.token().as_string(), family); + StringToLowerASCII(&key.first); + IPAddressNumber& mapped_ip = hosts[key]; + if (mapped_ip.empty()) + mapped_ip = ip; + // else ignore this entry (first hit counts) } } } diff --git a/net/dns/dns_hosts.h b/net/dns/dns_hosts.h index a75bb27..c2b2909 100644 --- a/net/dns/dns_hosts.h +++ b/net/dns/dns_hosts.h @@ -10,12 +10,39 @@ #include <utility> #include <vector> +#include "base/basictypes.h" +#include "base/containers/hash_tables.h" #include "base/files/file_path.h" #include "net/base/address_family.h" #include "net/base/net_export.h" #include "net/base/net_util.h" // can't forward-declare IPAddressNumber namespace net { + typedef std::pair<std::string, AddressFamily> DnsHostsKey; +}; + +namespace BASE_HASH_NAMESPACE { +#if defined(COMPILER_GCC) + +template<> +struct hash<net::DnsHostsKey> { + std::size_t operator()(const net::DnsHostsKey& key) const { + hash<base::StringPiece> string_piece_hash; + return string_piece_hash(key.first) + key.second; + } +}; + +#elif defined(COMPILER_MSVC) + +inline size_t hash_value(const net::DnsHostsKey& key) { + return hash_value(key.first) + key.second; +} + +#endif // COMPILER + +} // namespace BASE_HASH_NAMESPACE + +namespace net { // Parsed results of a Hosts file. // @@ -27,8 +54,13 @@ namespace net { // 127.0.0.1 localhost // 10.0.0.1 localhost // The expected resolution of localhost is 127.0.0.1. 
-typedef std::pair<std::string, AddressFamily> DnsHostsKey; +#if !defined(OS_ANDROID) +typedef base::hash_map<DnsHostsKey, IPAddressNumber> DnsHosts; +#else +// Android's hash_map doesn't support ==, so fall back to map. (Chromium on +// Android doesn't use the built-in DNS resolver anyway, so it's irrelevant.) typedef std::map<DnsHostsKey, IPAddressNumber> DnsHosts; +#endif // Parses |contents| (as read from /etc/hosts or equivalent) and stores results // in |dns_hosts|. Invalid lines are ignored (as in most implementations). diff --git a/net/dns/dns_hosts_unittest.cc b/net/dns/dns_hosts_unittest.cc index f9ee294..76d4a72 100644 --- a/net/dns/dns_hosts_unittest.cc +++ b/net/dns/dns_hosts_unittest.cc @@ -23,6 +23,11 @@ TEST(DnsHostsTest, ParseHosts) { "\t fe00::0 ip6-localnet\r\n" "2048::2 example\n" "2048::1 company example # ignored for 'example' \n" + "127.0.0.1 cache1\n" + "127.0.0.1 cache2 # should reuse parsed IP\n" + "256.0.0.0 cache3 # bogus IP should not clear parsed IP cache\n" + "127.0.0.1 cache4 # should still be reused\n" + "127.0.0.2 cache5\n" "gibberish"; const struct { @@ -39,6 +44,10 @@ TEST(DnsHostsTest, ParseHosts) { { "ip6-localnet", ADDRESS_FAMILY_IPV6, "fe00::0" }, { "company", ADDRESS_FAMILY_IPV6, "2048::1" }, { "example", ADDRESS_FAMILY_IPV6, "2048::2" }, + { "cache1", ADDRESS_FAMILY_IPV4, "127.0.0.1" }, + { "cache2", ADDRESS_FAMILY_IPV4, "127.0.0.1" }, + { "cache4", ADDRESS_FAMILY_IPV4, "127.0.0.1" }, + { "cache5", ADDRESS_FAMILY_IPV4, "127.0.0.2" }, }; DnsHosts expected; |