summaryrefslogtreecommitdiffstats
path: root/net/dns
diff options
context:
space:
mode:
author: ttuttle@chromium.org <ttuttle@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-07-04 23:22:07 +0000
committer: ttuttle@chromium.org <ttuttle@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-07-04 23:22:07 +0000
commit: 460fff7d4363a9eb8f0f19fc98324c6bdd1656b5 (patch)
tree: d031dd2fbc28883c97ecfdb9c1a29891bd784123 /net/dns
parent: 0e357c3e1cb8f788ad592ab6760900716f9b018d (diff)
downloadchromium_src-460fff7d4363a9eb8f0f19fc98324c6bdd1656b5.zip
chromium_src-460fff7d4363a9eb8f0f19fc98324c6bdd1656b5.tar.gz
chromium_src-460fff7d4363a9eb8f0f19fc98324c6bdd1656b5.tar.bz2
Optimize parsing of /etc/hosts.
Right now, Chrome won't parse hosts files larger than 64k, since the parser is kind of slow. This CL takes some steps to optimize it:

1. Replace nested StringTokenizers with a custom, one-pass parser.
2. Cache the literal IP from the previous line so we can skip parsing the same IP address if it's listed many times in a row. (Ad-blocking hosts files can have tens or hundreds of thousands of entries in a row that all point to 127.0.0.1.)
3. Replace std::map with a base::hash_map.

A rough benchmark suggests that these changes make ParseHosts run about three times faster on a large (close to 6M and close to 200k lines) ad-blocking hosts file.

TODO:
1. Break this into separate CLs, if we want?
2. Store the actual hosts file (converted to lowercase) in memory and store StringPieces pointing into it, instead of making copies of every hostname.
3. Fix on Android (it doesn't implement == on hash_map?).

BUG=107810
TEST=net_unittests still pass; may want nastier parsing tests

Review URL: https://chromiumcodereview.appspot.com/18407003

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@210237 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/dns')
-rw-r--r--net/dns/dns_hosts.cc142
-rw-r--r--net/dns/dns_hosts.h34
-rw-r--r--net/dns/dns_hosts_unittest.cc9
3 files changed, 157 insertions, 28 deletions
diff --git a/net/dns/dns_hosts.cc b/net/dns/dns_hosts.cc
index 5a7c3cc..60cac9b 100644
--- a/net/dns/dns_hosts.cc
+++ b/net/dns/dns_hosts.cc
@@ -10,40 +10,128 @@
#include "base/strings/string_util.h"
#include "base/strings/string_tokenizer.h"
+using base::StringPiece;
+
namespace net {
+// Tokenizes the contents of a hosts file in a single pass, returning one
+// token (IP or hostname) at a time. Doesn't copy anything: the file is
+// accepted as a StringPiece and tokens are returned as StringPieces that
+// point into the same memory, so that memory must outlive the parser and any
+// token obtained from it.
+//
+// Tokens are separated by spaces and tabs. '\r' and '\n' end a line, and the
+// first token after a line break (or at the very start of the input) is
+// reported as an IP; every other token is reported as a hostname. '#' starts
+// a comment that extends to the next '\n'.
+class HostsParser {
+ public:
+  explicit HostsParser(const StringPiece& text)
+      : text_(text),
+        data_(text.data()),
+        end_(text.size()),
+        pos_(0),
+        token_(),
+        token_is_ip_(false) {}
+
+  // Advances to the next token (IP or hostname). Returns whether another
+  // token was available. |token_is_ip| and |token| can be used to find out
+  // the type and text of the token.
+  bool Advance() {
+    bool next_is_ip = (pos_ == 0);
+    // std::string::npos is the largest size_t value, so this single test also
+    // stops the loop once a search has run off the end of the input.
+    while (pos_ < end_) {
+      SkipWhitespace();
+      // SkipWhitespace() leaves |pos_| at npos when nothing but blanks
+      // remains (e.g. trailing spaces on an unterminated last line).
+      // Indexing |text_| with that value would read out of bounds.
+      if (pos_ >= end_)
+        break;
+
+      switch (text_[pos_]) {
+        case '\r':
+        case '\n':
+          next_is_ip = true;
+          pos_++;
+          break;
+
+        case '#':
+          SkipRestOfLine();
+          break;
+
+        default: {
+          size_t token_start = pos_;
+          SkipToken();
+          // No delimiter found means the token runs to the end of the input.
+          size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;
+
+          token_ = StringPiece(data_ + token_start, token_end - token_start);
+          token_is_ip_ = next_is_ip;
+
+          return true;
+        }
+      }
+    }
+
+    // End of input: make token() return an empty StringPiece, as documented.
+    token_ = StringPiece();
+    return false;
+  }
+
+  // Fast-forwards the parser to the next line. Should be called if an IP
+  // address doesn't parse, to avoid wasting time tokenizing hostnames that
+  // will be ignored. Leaves the parser on the terminating '\n' (or at the end
+  // of the input if there is none), so the following Advance() call treats
+  // the next token as the start of a new line.
+  void SkipRestOfLine() {
+    pos_ = text_.find('\n', pos_);
+  }
+
+  // Returns whether the last-parsed token is an IP address (true) or a
+  // hostname (false).
+  bool token_is_ip() const { return token_is_ip_; }
+
+  // Returns the text of the last-parsed token as a StringPiece referencing
+  // the same underlying memory as the StringPiece passed to the constructor.
+  // Returns an empty StringPiece if no token has been parsed or the end of
+  // the input string has been reached.
+  const StringPiece& token() const { return token_; }
+
+ private:
+  // Moves |pos_| to the first delimiter (blank, line break or '#') at or
+  // after the current position, or to npos if there is none.
+  void SkipToken() {
+    pos_ = text_.find_first_of(" \t\n\r#", pos_);
+  }
+
+  // Moves |pos_| past any spaces and tabs, or to npos if only spaces and tabs
+  // remain.
+  void SkipWhitespace() {
+    pos_ = text_.find_first_not_of(" \t", pos_);
+  }
+
+  StringPiece text_;         // The whole input.
+  const char* data_;         // Start of the input; base for token pointers.
+  const size_t end_;         // Length of the input.
+
+  size_t pos_;               // Current offset into |text_|, or npos.
+  StringPiece token_;        // Most recently parsed token.
+  bool token_is_ip_;         // Whether |token_| was the first token on a line.
+
+  DISALLOW_COPY_AND_ASSIGN(HostsParser);
+};
+
+
+
void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
CHECK(dns_hosts);
DnsHosts& hosts = *dns_hosts;
- // Split into lines. Accept CR for Windows.
- base::StringTokenizer contents_lines(contents, "\n\r");
- while (contents_lines.GetNext()) {
- // Ignore comments after '#'.
- std::string line = contents_lines.token();
- base::StringTokenizer line_parts(line, "#");
- line_parts.set_options(base::StringTokenizer::RETURN_DELIMS);
-
- if (line_parts.GetNext() && !line_parts.token_is_delim()) {
- // Split and trim whitespace.
- std::string part = line_parts.token();
- base::StringTokenizer tokens(part, " \t");
-
- if (tokens.GetNext()) {
- IPAddressNumber ip;
- // TODO(szym): handle %iface notation on mac
- if (!ParseIPLiteralToNumber(tokens.token(), &ip))
- continue; // Ignore malformed lines.
- AddressFamily fam = (ip.size() == 4) ? ADDRESS_FAMILY_IPV4 :
- ADDRESS_FAMILY_IPV6;
- while (tokens.GetNext()) {
- DnsHostsKey key(tokens.token(), fam);
- StringToLowerASCII(&(key.first));
- IPAddressNumber& mapped_ip = hosts[key];
- if (mapped_ip.empty())
- mapped_ip = ip;
- // else ignore this entry (first hit counts)
+
+ // |ip_text|, |ip| and |family| always describe the most recent IP literal
+ // that parsed successfully. |ip_text| points into |contents| and is only
+ // used to detect that the next line repeats the same literal.
+ StringPiece ip_text;
+ IPAddressNumber ip;
+ AddressFamily family = ADDRESS_FAMILY_IPV4;
+ HostsParser parser(contents);
+ while (parser.Advance()) {
+ if (parser.token_is_ip()) {
+ StringPiece new_ip_text = parser.token();
+ // Some ad-blocking hosts files contain thousands of entries pointing to
+ // the same IP address (usually 127.0.0.1). Don't bother parsing the IP
+ // again if it's the same as the one above it.
+ if (new_ip_text != ip_text) {
+ IPAddressNumber new_ip;
+ if (ParseIPLiteralToNumber(parser.token().as_string(), &new_ip)) {
+ ip_text = new_ip_text;
+ ip.swap(new_ip);
+ family = (ip.size() == 4) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
+ } else {
+ // Malformed IP: ignore every hostname on this line. The cached
+ // |ip_text|/|ip|/|family| are deliberately left untouched so a later
+ // line repeating the last good literal still skips re-parsing.
+ parser.SkipRestOfLine();
}
}
+ } else {
+ // Hostname token: keys are stored lowercased, paired with the address
+ // family of the IP that started this line.
+ DnsHostsKey key(parser.token().as_string(), family);
+ StringToLowerASCII(&key.first);
+ // operator[] inserts an empty IPAddressNumber for a new key, which is
+ // what the empty() check below detects.
+ IPAddressNumber& mapped_ip = hosts[key];
+ if (mapped_ip.empty())
+ mapped_ip = ip;
+ // else ignore this entry (first hit counts)
}
}
}
diff --git a/net/dns/dns_hosts.h b/net/dns/dns_hosts.h
index a75bb27..c2b2909 100644
--- a/net/dns/dns_hosts.h
+++ b/net/dns/dns_hosts.h
@@ -10,12 +10,39 @@
#include <utility>
#include <vector>
+#include "base/basictypes.h"
+#include "base/containers/hash_tables.h"
#include "base/files/file_path.h"
#include "net/base/address_family.h"
#include "net/base/net_export.h"
#include "net/base/net_util.h" // can't forward-declare IPAddressNumber
namespace net {
+// Key of a hosts-file entry: (hostname, address family). ParseHosts() stores
+// the hostname lowercased. Declared in its own namespace block, ahead of the
+// hash support that follows, because that code names net::DnsHostsKey.
+typedef std::pair<std::string, AddressFamily> DnsHostsKey;
+}  // namespace net
+
+namespace BASE_HASH_NAMESPACE {
+#if defined(COMPILER_GCC)
+
+// Hash functor for net::DnsHostsKey: the hash of the hostname string plus the
+// numeric value of the address family.
+template<>
+struct hash<net::DnsHostsKey> {
+  std::size_t operator()(const net::DnsHostsKey& key) const {
+    const std::size_t hostname_hash = hash<base::StringPiece>()(key.first);
+    return hostname_hash + key.second;
+  }
+};
+
+#elif defined(COMPILER_MSVC)
+
+// MSVC counterpart of the functor above: the hostname's hash_value plus the
+// numeric value of the address family.
+inline size_t hash_value(const net::DnsHostsKey& key) {
+  const size_t hostname_hash = hash_value(key.first);
+  return hostname_hash + key.second;
+}
+
+#endif  // COMPILER
+
+}  // namespace BASE_HASH_NAMESPACE
+
+namespace net {
// Parsed results of a Hosts file.
//
@@ -27,8 +54,13 @@ namespace net {
// 127.0.0.1 localhost
// 10.0.0.1 localhost
// The expected resolution of localhost is 127.0.0.1.
-typedef std::pair<std::string, AddressFamily> DnsHostsKey;
+#if defined(OS_ANDROID)
+// Android's hash_map doesn't support ==, so an ordered map is used there
+// instead. (Chromium on Android doesn't use the built-in DNS resolver anyway,
+// so it's irrelevant.)
typedef std::map<DnsHostsKey, IPAddressNumber> DnsHosts;
+#else
+// Everywhere else, use a hash map keyed on (hostname, address family).
+typedef base::hash_map<DnsHostsKey, IPAddressNumber> DnsHosts;
+#endif
// Parses |contents| (as read from /etc/hosts or equivalent) and stores results
// in |dns_hosts|. Invalid lines are ignored (as in most implementations).
diff --git a/net/dns/dns_hosts_unittest.cc b/net/dns/dns_hosts_unittest.cc
index f9ee294..76d4a72 100644
--- a/net/dns/dns_hosts_unittest.cc
+++ b/net/dns/dns_hosts_unittest.cc
@@ -23,6 +23,11 @@ TEST(DnsHostsTest, ParseHosts) {
"\t fe00::0 ip6-localnet\r\n"
"2048::2 example\n"
"2048::1 company example # ignored for 'example' \n"
+ "127.0.0.1 cache1\n"
+ "127.0.0.1 cache2 # should reuse parsed IP\n"
+ "256.0.0.0 cache3 # bogus IP should not clear parsed IP cache\n"
+ "127.0.0.1 cache4 # should still be reused\n"
+ "127.0.0.2 cache5\n"
"gibberish";
const struct {
@@ -39,6 +44,10 @@ TEST(DnsHostsTest, ParseHosts) {
{ "ip6-localnet", ADDRESS_FAMILY_IPV6, "fe00::0" },
{ "company", ADDRESS_FAMILY_IPV6, "2048::1" },
{ "example", ADDRESS_FAMILY_IPV6, "2048::2" },
+ { "cache1", ADDRESS_FAMILY_IPV4, "127.0.0.1" },
+ { "cache2", ADDRESS_FAMILY_IPV4, "127.0.0.1" },
+ { "cache4", ADDRESS_FAMILY_IPV4, "127.0.0.1" },
+ { "cache5", ADDRESS_FAMILY_IPV4, "127.0.0.2" },
};
DnsHosts expected;