author     nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2013-05-13 18:12:48 +0000
committer  nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2013-05-13 18:12:48 +0000
commit     1e690d65afc4f616671ddbabbd2f47dce45c514d (patch)
tree       f4312c7ca39b90b6647b1c6089c37aab74efb7ab /net/tools
parent     c5cba4cb2c4eceda230f8e6a07121285725bff3a (diff)
Add support for split Public Suffix List distinctions.
This adds support for the private additions to the Public Suffix List.

* Since net::RegistryControlledDomainService only contained static methods,
  this CL changes these methods to be contained within the namespace
  net::registry_controlled_domains and removes the class entirely.
* All methods defined as part of net::registry_controlled_domains now have a
  mandatory argument to specify whether the private registries should be
  included.
* Since the old implementation did not take into account the private
  registries, this sets all old callers to use EXCLUDE_PRIVATE as the
  net::registry_controlled_domains::PrivateRegistryFilter argument.
* Changes the parameter for including unknown registries or not to be an enum
  instead of a boolean, using a similar naming scheme as for the private
  registries: net::registry_controlled_domains::UnknownRegistryFilter.
* This also updates the effective-TLD data file to:
  45cfff9c781f 2013-04-23 11:51 +0100
  It includes changes from a number of Mozilla bugs, listed on
  https://hg.mozilla.org/mozilla-central/log/45cfff9c781f/netwerk/dns/effective_tld_names.dat
  between 290afd57d2a8 (2012-07-04 16:08 +0100) and 45cfff9c781f
  (2013-04-23 11:51 +0100).

BUG=37436,96086
R=brettw@chromium.org, erikwright@chromium.org, pam@chromium.org, rsleevi@chromium.org, sky@chromium.org

Review URL: https://codereview.chromium.org/13979002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@199771 0039d316-1c4b-4281-b951-d872f2087c98
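As an illustration of the caller-side change described above, the sketch below shows how a call site might supply the new mandatory filter argument. It is only a sketch: the header path, the GetDomainAndRegistry function, and the enum spellings INCLUDE_PRIVATE_REGISTRIES / EXCLUDE_PRIVATE_REGISTRIES are assumptions inferred from this commit message, not code taken from this patch.

  // Illustrative sketch only; the names below (PrivateRegistryFilter,
  // EXCLUDE_PRIVATE_REGISTRIES, GetDomainAndRegistry, header path) are
  // assumed from the commit message and are not part of this patch.
  #include <string>

  #include "googleurl/src/gurl.h"
  #include "net/base/registry_controlled_domains/registry_controlled_domain.h"

  std::string DomainAndRegistryFor(const GURL& url, bool include_private) {
    // Pre-existing callers are migrated to EXCLUDE_PRIVATE_REGISTRIES so
    // behavior matches the old implementation, which ignored the private
    // section of the Public Suffix List.
    net::registry_controlled_domains::PrivateRegistryFilter filter =
        include_private
            ? net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES
            : net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
    return net::registry_controlled_domains::GetDomainAndRegistry(url, filter);
  }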
Diffstat (limited to 'net/tools')
-rw-r--r--  net/tools/tld_cleanup/README                        |  15
-rw-r--r--  net/tools/tld_cleanup/tld_cleanup.cc                | 238
-rw-r--r--  net/tools/tld_cleanup/tld_cleanup.gyp               |   7
-rw-r--r--  net/tools/tld_cleanup/tld_cleanup_util.cc           | 251
-rw-r--r--  net/tools/tld_cleanup/tld_cleanup_util.h            |  48
-rw-r--r--  net/tools/tld_cleanup/tld_cleanup_util_unittest.cc  | 168
6 files changed, 490 insertions, 237 deletions
diff --git a/net/tools/tld_cleanup/README b/net/tools/tld_cleanup/README
index adaac7e..7b468b5 100644
--- a/net/tools/tld_cleanup/README
+++ b/net/tools/tld_cleanup/README
@@ -4,18 +4,27 @@ When updating src/net/base/registry_controlled_domains/effective_tld_names.dat:
http://goo.gl/Ji2bB
2. Remove whitespace from the ends of the lines.
+ You could possibly use something like:
+ sed -i -e "s/\s*$//g" \
+ src/net/base/registry_controlled_domains/effective_tld_names.dat
-3. Add the Chromium note back in.
+3. Add the Chromium note back in just after the license at the top, and just
+ before '===BEGIN ICANN DOMAINS==='. Ensure there is an empty line above and
+ two empty lines below the note. The note should say:
+// Chromium note: this is based on Mozilla's file:
+// http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1
-4. Build tld_cleanup.exe (the "(net)" > "tld_cleanup" project)
+4. Build tld_cleanup (the "(net)" > "tld_cleanup" project)
5. Run it (no arguments needed), typically from src/build/Release or
src/build/Debug. It will re-generate
src/net/base/registry_controlled_domains/effective_tld_names.gperf.
6. Run gperf on the new effective_tld_names.gperf:
+ pushd src/net/base/registry_controlled_domains;
gperf -a -L "C++" -C -c -o -t -k '*' -NFindDomain -D -m 5 \
- effective_tld_names.gperf > effective_tld_names.cc
+ effective_tld_names.gperf > effective_tld_names.cc;
+ popd;
It will produce a new effective_tld_names.cc.
7. Check in the updated effective_tld_names.dat, effective_tld_names.gperf,
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc
index 1162d98..485bece 100644
--- a/net/tools/tld_cleanup/tld_cleanup.cc
+++ b/net/tools/tld_cleanup/tld_cleanup.cc
@@ -21,243 +21,18 @@
// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
// * Canonicalizes each rule's domain by converting it to a GURL and back.
// * Adds explicit rules for true TLDs found in any rule.
-// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
-// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
-// and "// ===END PRIVATE DOMAINS===".
-
-#include <map>
-#include <set>
-#include <string>
+// * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
+// and "// ===END PRIVATE DOMAINS===" as private.
#include "base/at_exit.h"
#include "base/command_line.h"
#include "base/file_util.h"
-#include "base/file_util.h"
#include "base/files/file_path.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/process_util.h"
-#include "base/string_util.h"
-#include "googleurl/src/gurl.h"
-#include "googleurl/src/url_parse.h"
-
-namespace {
-struct Rule {
- bool exception;
- bool wildcard;
-};
-
-typedef std::map<std::string, Rule> RuleMap;
-typedef std::set<std::string> RuleSet;
-
-const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
-const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
-}
-
-// Writes the list of domain rules contained in the 'rules' set to the
-// 'outfile', with each rule terminated by a LF. The file must already have
-// been created with write access.
-bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
- std::string data;
- data.append(
-"%{\n"
-"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"
-"// Use of this source code is governed by a BSD-style license that can be\n"
-"// found in the LICENSE file.\n\n"
-"// This file is generated by net/tools/tld_cleanup/.\n"
-"// DO NOT MANUALLY EDIT!\n"
-"%}\n"
-"struct DomainRule {\n"
-" const char *name;\n"
-" int type; // 1: exception, 2: wildcard\n"
-"};\n"
-"%%\n"
- );
-
- for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
- data.append(i->first);
- data.append(", ");
- if (i->second.exception) {
- data.append("1");
- } else if (i->second.wildcard) {
- data.append("2");
- } else {
- data.append("0");
- }
- data.append("\n");
- }
-
- data.append("%%\n");
-
- int written = file_util::WriteFile(outfile, data.data(), data.size());
-
- return written == static_cast<int>(data.size());
-}
-
-// These result codes should be in increasing order of severity.
-typedef enum {
- kSuccess,
- kWarning,
- kError,
-} NormalizeResult;
-
-// Adjusts the rule to a standard form: removes single extraneous dots and
-// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
-// valid; logs a warning and returns kWarning if it is probably invalid; and
-// logs an error and returns kError if the rule is (almost) certainly invalid.
-NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
- NormalizeResult result = kSuccess;
-
- // Strip single leading and trailing dots.
- if (domain->at(0) == '.')
- domain->erase(0, 1);
- if (domain->empty()) {
- LOG(WARNING) << "Ignoring empty rule";
- return kWarning;
- }
- if (domain->at(domain->size() - 1) == '.')
- domain->erase(domain->size() - 1, 1);
- if (domain->empty()) {
- LOG(WARNING) << "Ignoring empty rule";
- return kWarning;
- }
-
- // Allow single leading '*.' or '!', saved here so it's not canonicalized.
- size_t start_offset = 0;
- if (domain->at(0) == '!') {
- domain->erase(0, 1);
- rule->exception = true;
- } else if (domain->find("*.") == 0) {
- domain->erase(0, 2);
- rule->wildcard = true;
- }
- if (domain->empty()) {
- LOG(WARNING) << "Ignoring empty rule";
- return kWarning;
- }
-
- // Warn about additional '*.' or '!'.
- if (domain->find("*.", start_offset) != std::string::npos ||
- domain->find('!', start_offset) != std::string::npos) {
- LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
- result = kWarning;
- }
-
- // Make a GURL and normalize it, then get the host back out.
- std::string url = "http://";
- url.append(*domain);
- GURL gurl(url);
- const std::string& spec = gurl.possibly_invalid_spec();
- url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
- if (host.len < 0) {
- LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
- return kError;
- }
- if (!gurl.is_valid()) {
- LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
- result = kWarning;
- }
- domain->assign(spec.substr(host.begin, host.len));
-
- return result;
-}
-
-// Loads the file described by 'in_filename', converts it to the desired format
-// (see the file comments above), and saves it into 'out_filename'. Returns
-// the most severe of the result codes encountered when normalizing the rules.
-NormalizeResult NormalizeFile(const base::FilePath& in_filename,
- const base::FilePath& out_filename) {
- std::string data;
- if (!file_util::ReadFileToString(in_filename, &data)) {
- LOG(ERROR) << "Unable to read file";
- // We return success since we've already reported the error.
- return kSuccess;
- }
-
- // We do a lot of string assignment during parsing, but simplicity is more
- // important than performance here.
- std::string domain;
- NormalizeResult result = kSuccess;
- size_t line_start = 0;
- size_t line_end = 0;
- RuleMap rules;
- RuleSet extra_rules;
- int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
- while (line_start < data.size()) {
- // Skip the entire section of private domains.
- // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
- if (line_start + begin_private_length < data.size() &&
- !data.compare(line_start, begin_private_length,
- kBeginPrivateDomainsComment)) {
- line_end = data.find(kEndPrivateDomainsComment, line_start);
- if (line_end == std::string::npos) {
- LOG(WARNING) << "Private-domain section had no end marker.";
- line_end = data.size();
- }
- } else if (line_start + 1 < data.size() &&
- data[line_start] == '/' &&
- data[line_start + 1] == '/') {
- // Skip comments.
- line_end = data.find_first_of("\r\n", line_start);
- if (line_end == std::string::npos)
- line_end = data.size();
- } else {
- // Truncate at first whitespace.
- line_end = data.find_first_of("\r\n \t", line_start);
- if (line_end == std::string::npos)
- line_end = data.size();
- domain.assign(data.data(), line_start, line_end - line_start);
-
- Rule rule;
- rule.wildcard = false;
- rule.exception = false;
- NormalizeResult new_result = NormalizeRule(&domain, &rule);
- if (new_result != kError) {
- // Check the existing rules to make sure we don't have an exception and
- // wildcard for the same rule. If we did, we'd have to update our
- // parsing code to handle this case.
- CHECK(rules.find(domain) == rules.end());
-
- rules[domain] = rule;
- // Add true TLD for multi-level rules. We don't add them right now, in
- // case there's an exception or wild card that either exists or might be
- // added in a later iteration. In those cases, there's no need to add
- // it and it would just slow down parsing the data.
- size_t tld_start = domain.find_last_of('.');
- if (tld_start != std::string::npos && tld_start + 1 < domain.size())
- extra_rules.insert(domain.substr(tld_start + 1));
- }
- result = std::max(result, new_result);
- }
-
- // Find beginning of next non-empty line.
- line_start = data.find_first_of("\r\n", line_end);
- if (line_start == std::string::npos)
- line_start = data.size();
- line_start = data.find_first_not_of("\r\n", line_start);
- if (line_start == std::string::npos)
- line_start = data.size();
- }
-
- for (RuleSet::const_iterator iter = extra_rules.begin();
- iter != extra_rules.end();
- ++iter) {
- if (rules.find(*iter) == rules.end()) {
- Rule rule;
- rule.exception = false;
- rule.wildcard = false;
- rules[*iter] = rule;
- }
- }
-
- if (!WriteRules(rules, out_filename)) {
- LOG(ERROR) << "Error(s) writing output file";
- result = kError;
- }
-
- return result;
-}
+#include "net/tools/tld_cleanup/tld_cleanup_util.h"
int main(int argc, const char* argv[]) {
base::EnableTerminationOnHeapCorruption();
@@ -307,13 +82,14 @@ int main(int argc, const char* argv[]) {
"registry_controlled_domains"))
.Append(FILE_PATH_LITERAL(
"effective_tld_names.gperf"));
- NormalizeResult result = NormalizeFile(input_file, output_file);
- if (result != kSuccess) {
+ net::tld_cleanup::NormalizeResult result =
+ net::tld_cleanup::NormalizeFile(input_file, output_file);
+ if (result != net::tld_cleanup::kSuccess) {
fprintf(stderr,
"Errors or warnings processing file. See log in tld_cleanup.log.");
}
- if (result == kError)
+ if (result == net::tld_cleanup::kError)
return 1;
return 0;
}
diff --git a/net/tools/tld_cleanup/tld_cleanup.gyp b/net/tools/tld_cleanup/tld_cleanup.gyp
index 245df98..227022c 100644
--- a/net/tools/tld_cleanup/tld_cleanup.gyp
+++ b/net/tools/tld_cleanup/tld_cleanup.gyp
@@ -8,14 +8,15 @@
},
'targets': [
{
- 'target_name': 'tld_cleanup',
- 'type': 'executable',
+ 'target_name': 'tld_cleanup_util',
+ 'type': 'static_library',
'dependencies': [
'../../../base/base.gyp:base',
'../../../build/temp_gyp/googleurl.gyp:googleurl',
],
'sources': [
- 'tld_cleanup.cc',
+ 'tld_cleanup_util.h',
+ 'tld_cleanup_util.cc',
],
},
],
diff --git a/net/tools/tld_cleanup/tld_cleanup_util.cc b/net/tools/tld_cleanup/tld_cleanup_util.cc
new file mode 100644
index 0000000..2f5496e
--- /dev/null
+++ b/net/tools/tld_cleanup/tld_cleanup_util.cc
@@ -0,0 +1,251 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/tools/tld_cleanup/tld_cleanup_util.h"
+
+#include "base/file_util.h"
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "googleurl/src/gurl.h"
+#include "googleurl/src/url_parse.h"
+
+namespace {
+
+const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
+const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
+}
+
+namespace net {
+namespace tld_cleanup {
+
+// Writes the list of domain rules contained in the 'rules' set to the
+// 'outfile', with each rule terminated by a LF. The file must already have
+// been created with write access.
+bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
+ std::string data;
+ data.append(
+"%{\n"
+"// Copyright 2012 The Chromium Authors. All rights reserved.\n"
+"// Use of this source code is governed by a BSD-style license that can be\n"
+"// found in the LICENSE file.\n\n"
+"// This file is generated by net/tools/tld_cleanup/.\n"
+"// DO NOT MANUALLY EDIT!\n"
+"%}\n"
+"struct DomainRule {\n"
+" const char *name;\n"
+" int type; // 1: exception, 2: wildcard\n"
+" bool is_private;\n"
+"};\n"
+"%%\n"
+ );
+
+ for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
+ data.append(i->first);
+ data.append(", ");
+ if (i->second.exception) {
+ data.append("1");
+ } else if (i->second.wildcard) {
+ data.append("2");
+ } else {
+ data.append("0");
+ }
+ if (i->second.is_private) {
+ data.append(", true");
+ } else {
+ data.append(", false");
+ }
+ data.append("\n");
+ }
+
+ data.append("%%\n");
+
+ int written = file_util::WriteFile(outfile, data.data(), data.size());
+
+ return written == static_cast<int>(data.size());
+}
+
+// Adjusts the rule to a standard form: removes single extraneous dots and
+// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
+// valid; logs a warning and returns kWarning if it is probably invalid; and
+// logs an error and returns kError if the rule is (almost) certainly invalid.
+NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
+ NormalizeResult result = kSuccess;
+
+ // Strip single leading and trailing dots.
+ if (domain->at(0) == '.')
+ domain->erase(0, 1);
+ if (domain->empty()) {
+ LOG(WARNING) << "Ignoring empty rule";
+ return kWarning;
+ }
+ if (domain->at(domain->size() - 1) == '.')
+ domain->erase(domain->size() - 1, 1);
+ if (domain->empty()) {
+ LOG(WARNING) << "Ignoring empty rule";
+ return kWarning;
+ }
+
+ // Allow single leading '*.' or '!', saved here so it's not canonicalized.
+ size_t start_offset = 0;
+ if (domain->at(0) == '!') {
+ domain->erase(0, 1);
+ rule->exception = true;
+ } else if (domain->find("*.") == 0) {
+ domain->erase(0, 2);
+ rule->wildcard = true;
+ }
+ if (domain->empty()) {
+ LOG(WARNING) << "Ignoring empty rule";
+ return kWarning;
+ }
+
+ // Warn about additional '*.' or '!'.
+ if (domain->find("*.", start_offset) != std::string::npos ||
+ domain->find('!', start_offset) != std::string::npos) {
+ LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
+ result = kWarning;
+ }
+
+ // Make a GURL and normalize it, then get the host back out.
+ std::string url = "http://";
+ url.append(*domain);
+ GURL gurl(url);
+ const std::string& spec = gurl.possibly_invalid_spec();
+ url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
+ if (host.len < 0) {
+ LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
+ return kError;
+ }
+ if (!gurl.is_valid()) {
+ LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
+ result = kWarning;
+ }
+ domain->assign(spec.substr(host.begin, host.len));
+
+ return result;
+}
+
+NormalizeResult NormalizeDataToRuleMap(const std::string data,
+ RuleMap* rules) {
+ CHECK(rules);
+ // We do a lot of string assignment during parsing, but simplicity is more
+ // important than performance here.
+ std::string domain;
+ NormalizeResult result = kSuccess;
+ size_t line_start = 0;
+ size_t line_end = 0;
+ bool is_private = false;
+ RuleMap extra_rules;
+ int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
+ int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
+ while (line_start < data.size()) {
+ if (line_start + begin_private_length < data.size() &&
+ !data.compare(line_start, begin_private_length,
+ kBeginPrivateDomainsComment)) {
+ is_private = true;
+ line_end = line_start + begin_private_length;
+ } else if (line_start + end_private_length < data.size() &&
+ !data.compare(line_start, end_private_length,
+ kEndPrivateDomainsComment)) {
+ is_private = false;
+ line_end = line_start + end_private_length;
+ } else if (line_start + 1 < data.size() &&
+ data[line_start] == '/' &&
+ data[line_start + 1] == '/') {
+ // Skip comments.
+ line_end = data.find_first_of("\r\n", line_start);
+ if (line_end == std::string::npos)
+ line_end = data.size();
+ } else {
+ // Truncate at first whitespace.
+ line_end = data.find_first_of("\r\n \t", line_start);
+ if (line_end == std::string::npos)
+ line_end = data.size();
+ domain.assign(data.data(), line_start, line_end - line_start);
+
+ Rule rule;
+ rule.wildcard = false;
+ rule.exception = false;
+ rule.is_private = is_private;
+ NormalizeResult new_result = NormalizeRule(&domain, &rule);
+ if (new_result != kError) {
+ // Check the existing rules to make sure we don't have an exception and
+ // wildcard for the same rule, or that the same domain is listed as both
+ // private and not private. If we did, we'd have to update our
+ // parsing code to handle this case.
+ CHECK(rules->find(domain) == rules->end());
+
+ (*rules)[domain] = rule;
+ // Add true TLD for multi-level rules. We don't add them right now, in
+ // case there's an exception or wild card that either exists or might be
+ // added in a later iteration. In those cases, there's no need to add
+ // it and it would just slow down parsing the data.
+ size_t tld_start = domain.find_last_of('.');
+ if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
+ std::string extra_rule_domain = domain.substr(tld_start + 1);
+ RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
+ Rule extra_rule;
+ extra_rule.exception = false;
+ extra_rule.wildcard = false;
+ if (iter == extra_rules.end()) {
+ extra_rule.is_private = is_private;
+ } else {
+ // A rule already exists, so we ensure that if any of the entries is
+ // not private the result should be that the entry is not private.
+ // An example is .au which is not listed as a real TLD, but only
+ // lists second-level domains such as com.au. Subdomains of .au
+ // (eg. blogspot.com.au) are also listed in the private section,
+ // which is processed later, so this ensures that the real TLD
+ // (eg. .au) is listed as public.
+ extra_rule.is_private = is_private && iter->second.is_private;
+ }
+ extra_rules[extra_rule_domain] = extra_rule;
+ }
+ }
+ result = std::max(result, new_result);
+ }
+
+ // Find beginning of next non-empty line.
+ line_start = data.find_first_of("\r\n", line_end);
+ if (line_start == std::string::npos)
+ line_start = data.size();
+ line_start = data.find_first_not_of("\r\n", line_start);
+ if (line_start == std::string::npos)
+ line_start = data.size();
+ }
+
+ for (RuleMap::const_iterator iter = extra_rules.begin();
+ iter != extra_rules.end();
+ ++iter) {
+ if (rules->find(iter->first) == rules->end()) {
+ (*rules)[iter->first] = iter->second;
+ }
+ }
+
+ return result;
+}
+
+NormalizeResult NormalizeFile(const base::FilePath& in_filename,
+ const base::FilePath& out_filename) {
+ RuleMap rules;
+ std::string data;
+ if (!file_util::ReadFileToString(in_filename, &data)) {
+ LOG(ERROR) << "Unable to read file";
+ // We return success since we've already reported the error.
+ return kSuccess;
+ }
+
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+
+ if (!WriteRules(rules, out_filename)) {
+ LOG(ERROR) << "Error(s) writing output file";
+ result = kError;
+ }
+
+ return result;
+}
+
+
+} // namespace tld_cleanup
+} // namespace net
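For reference, the gperf source that WriteRules() above emits would look roughly like the following for a small rule set containing a public rule "bar", a private rule "baz.bar", and a wildcard rule "*.qux". This is an illustrative sketch derived from the code in this patch, not captured tool output:

  %{
  // ...generated-file header (copyright notice, "DO NOT MANUALLY EDIT")...
  %}
  struct DomainRule {
    const char *name;
    int type; // 1: exception, 2: wildcard
    bool is_private;
  };
  %%
  bar, 0, false
  baz.bar, 0, true
  qux, 2, false
  %%

Each rule becomes one gperf key line: the canonicalized domain, a type digit (0 plain, 1 exception, 2 wildcard), and the is_private flag added by this CL.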
diff --git a/net/tools/tld_cleanup/tld_cleanup_util.h b/net/tools/tld_cleanup/tld_cleanup_util.h
new file mode 100644
index 0000000..5900206
--- /dev/null
+++ b/net/tools/tld_cleanup/tld_cleanup_util.h
@@ -0,0 +1,48 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef NET_TOOLS_TLD_CLEANUP_TLD_CLEANUP_UTIL_H_
+#define NET_TOOLS_TLD_CLEANUP_TLD_CLEANUP_UTIL_H_
+
+#include <map>
+#include <string>
+
+namespace base {
+class FilePath;
+} // namespace base
+
+namespace net {
+namespace tld_cleanup {
+
+struct Rule {
+ bool exception;
+ bool wildcard;
+ bool is_private;
+};
+
+typedef std::map<std::string, Rule> RuleMap;
+
+// These result codes should be in increasing order of severity.
+typedef enum {
+ kSuccess,
+ kWarning,
+ kError,
+} NormalizeResult;
+
+// Loads the file described by |in_filename|, converts it to the desired format
+// (see the file comments in tld_cleanup.cc), and saves it into |out_filename|.
+// Returns the most severe of the result codes encountered when normalizing the
+// rules.
+NormalizeResult NormalizeFile(const base::FilePath& in_filename,
+ const base::FilePath& out_filename);
+
+// Parses |data|, and converts it to the internal data format RuleMap. Returns
+// the most severe of the result codes encountered when normalizing the rules.
+NormalizeResult NormalizeDataToRuleMap(const std::string data,
+ RuleMap* rules);
+
+} // namespace tld_cleanup
+} // namespace net
+
+#endif // NET_TOOLS_TLD_CLEANUP_TLD_CLEANUP_UTIL_H_
diff --git a/net/tools/tld_cleanup/tld_cleanup_util_unittest.cc b/net/tools/tld_cleanup/tld_cleanup_util_unittest.cc
new file mode 100644
index 0000000..6b1d02a
--- /dev/null
+++ b/net/tools/tld_cleanup/tld_cleanup_util_unittest.cc
@@ -0,0 +1,168 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/tools/tld_cleanup/tld_cleanup_util.h"
+
+#include "base/files/file_path.h"
+#include "base/path_service.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace net {
+namespace tld_cleanup {
+
+std::string SetupData(std::string icann_domains, std::string private_domains) {
+ return "// ===BEGIN ICANN DOMAINS===\n" +
+ icann_domains +
+ "// ===END ICANN DOMAINS===\n" +
+ "// ===BEGIN PRIVATE DOMAINS===\n" +
+ private_domains +
+ "// ===END PRIVATE DOMAINS===\n";
+}
+
+TEST(TldCleanupUtilTest, TwoRealTldsSuccessfullyRead) {
+ std::string icann_domains = "foo\n"
+ "bar\n";
+ std::string private_domains = "";
+ std::string data = SetupData(icann_domains, private_domains);
+ RuleMap rules;
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+ ASSERT_EQ(kSuccess, result);
+ ASSERT_EQ(2U, rules.size());
+ RuleMap::const_iterator foo_iter = rules.find("foo");
+ ASSERT_FALSE(rules.end() == foo_iter);
+ EXPECT_FALSE(foo_iter->second.wildcard);
+ EXPECT_FALSE(foo_iter->second.exception);
+ EXPECT_FALSE(foo_iter->second.is_private);
+ RuleMap::const_iterator bar_iter = rules.find("bar");
+ ASSERT_FALSE(rules.end() == bar_iter);
+ EXPECT_FALSE(bar_iter->second.wildcard);
+ EXPECT_FALSE(bar_iter->second.exception);
+ EXPECT_FALSE(bar_iter->second.is_private);
+}
+
+TEST(TldCleanupUtilTest, RealTldAutomaticallyAddedForSubdomain) {
+ std::string icann_domains = "foo.bar\n";
+ std::string private_domains = "";
+ std::string data = SetupData(icann_domains, private_domains);
+ RuleMap rules;
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+ ASSERT_EQ(kSuccess, result);
+ ASSERT_EQ(2U, rules.size());
+ RuleMap::const_iterator foo_bar_iter = rules.find("foo.bar");
+ ASSERT_FALSE(rules.end() == foo_bar_iter);
+ EXPECT_FALSE(foo_bar_iter->second.wildcard);
+ EXPECT_FALSE(foo_bar_iter->second.exception);
+ EXPECT_FALSE(foo_bar_iter->second.is_private);
+ RuleMap::const_iterator bar_iter = rules.find("bar");
+ ASSERT_FALSE(rules.end() == bar_iter);
+ EXPECT_FALSE(bar_iter->second.wildcard);
+ EXPECT_FALSE(bar_iter->second.exception);
+ EXPECT_FALSE(bar_iter->second.is_private);
+}
+
+TEST(TldCleanupUtilTest, PrivateTldMarkedAsPrivate) {
+ std::string icann_domains = "foo\n"
+ "bar\n";
+ std::string private_domains = "baz\n";
+ std::string data = SetupData(icann_domains, private_domains);
+ RuleMap rules;
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+ ASSERT_EQ(kSuccess, result);
+ ASSERT_EQ(3U, rules.size());
+ RuleMap::const_iterator foo_iter = rules.find("foo");
+ ASSERT_FALSE(rules.end() == foo_iter);
+ EXPECT_FALSE(foo_iter->second.wildcard);
+ EXPECT_FALSE(foo_iter->second.exception);
+ EXPECT_FALSE(foo_iter->second.is_private);
+ RuleMap::const_iterator bar_iter = rules.find("bar");
+ ASSERT_FALSE(rules.end() == bar_iter);
+ EXPECT_FALSE(bar_iter->second.wildcard);
+ EXPECT_FALSE(bar_iter->second.exception);
+ EXPECT_FALSE(bar_iter->second.is_private);
+ RuleMap::const_iterator baz_iter = rules.find("baz");
+ ASSERT_FALSE(rules.end() == baz_iter);
+ EXPECT_FALSE(baz_iter->second.wildcard);
+ EXPECT_FALSE(baz_iter->second.exception);
+ EXPECT_TRUE(baz_iter->second.is_private);
+}
+
+TEST(TldCleanupUtilTest, PrivateDomainMarkedAsPrivate) {
+ std::string icann_domains = "bar\n";
+ std::string private_domains = "foo.bar\n";
+ std::string data = SetupData(icann_domains, private_domains);
+ RuleMap rules;
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+ ASSERT_EQ(kSuccess, result);
+ ASSERT_EQ(2U, rules.size());
+ RuleMap::const_iterator bar_iter = rules.find("bar");
+ ASSERT_FALSE(rules.end() == bar_iter);
+ EXPECT_FALSE(bar_iter->second.wildcard);
+ EXPECT_FALSE(bar_iter->second.exception);
+ EXPECT_FALSE(bar_iter->second.is_private);
+ RuleMap::const_iterator foo_bar_iter = rules.find("foo.bar");
+ ASSERT_FALSE(rules.end() == foo_bar_iter);
+ EXPECT_FALSE(foo_bar_iter->second.wildcard);
+ EXPECT_FALSE(foo_bar_iter->second.exception);
+ EXPECT_TRUE(foo_bar_iter->second.is_private);
+}
+
+TEST(TldCleanupUtilTest, ExtraTldRuleIsNotMarkedPrivate) {
+ std::string icann_domains = "foo.bar\n"
+ "baz.bar\n";
+ std::string private_domains = "qux.bar\n";
+ std::string data = SetupData(icann_domains, private_domains);
+ RuleMap rules;
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+ ASSERT_EQ(kSuccess, result);
+ ASSERT_EQ(4U, rules.size());
+ RuleMap::const_iterator foo_bar_iter = rules.find("foo.bar");
+ ASSERT_FALSE(rules.end() == foo_bar_iter);
+ EXPECT_FALSE(foo_bar_iter->second.wildcard);
+ EXPECT_FALSE(foo_bar_iter->second.exception);
+ EXPECT_FALSE(foo_bar_iter->second.is_private);
+ RuleMap::const_iterator baz_bar_iter = rules.find("baz.bar");
+ ASSERT_FALSE(rules.end() == baz_bar_iter);
+ EXPECT_FALSE(baz_bar_iter->second.wildcard);
+ EXPECT_FALSE(baz_bar_iter->second.exception);
+ EXPECT_FALSE(baz_bar_iter->second.is_private);
+ RuleMap::const_iterator bar_iter = rules.find("bar");
+ ASSERT_FALSE(rules.end() == bar_iter);
+ EXPECT_FALSE(bar_iter->second.wildcard);
+ EXPECT_FALSE(bar_iter->second.exception);
+ EXPECT_FALSE(bar_iter->second.is_private);
+ RuleMap::const_iterator qux_bar_iter = rules.find("qux.bar");
+ ASSERT_FALSE(rules.end() == qux_bar_iter);
+ EXPECT_FALSE(qux_bar_iter->second.wildcard);
+ EXPECT_FALSE(qux_bar_iter->second.exception);
+ EXPECT_TRUE(qux_bar_iter->second.is_private);
+}
+
+TEST(TldCleanupUtilTest, WildcardAndExceptionParsedCorrectly) {
+ std::string icann_domains = "*.bar\n"
+ "!foo.bar\n";
+ std::string private_domains = "!baz.bar\n";
+ std::string data = SetupData(icann_domains, private_domains);
+ RuleMap rules;
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
+ ASSERT_EQ(kSuccess, result);
+ ASSERT_EQ(3U, rules.size());
+ RuleMap::const_iterator foo_bar_iter = rules.find("bar");
+ ASSERT_FALSE(rules.end() == foo_bar_iter);
+ EXPECT_TRUE(foo_bar_iter->second.wildcard);
+ EXPECT_FALSE(foo_bar_iter->second.exception);
+ EXPECT_FALSE(foo_bar_iter->second.is_private);
+ RuleMap::const_iterator bar_iter = rules.find("foo.bar");
+ ASSERT_FALSE(rules.end() == bar_iter);
+ EXPECT_FALSE(bar_iter->second.wildcard);
+ EXPECT_TRUE(bar_iter->second.exception);
+ EXPECT_FALSE(bar_iter->second.is_private);
+ RuleMap::const_iterator baz_bar_iter = rules.find("baz.bar");
+ ASSERT_FALSE(rules.end() == baz_bar_iter);
+ EXPECT_FALSE(baz_bar_iter->second.wildcard);
+ EXPECT_TRUE(baz_bar_iter->second.exception);
+ EXPECT_TRUE(baz_bar_iter->second.is_private);
+}
+
+} // namespace tld_cleanup
+} // namespace net