author     nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2013-05-13 18:33:30 +0000
committer  nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2013-05-13 18:33:30 +0000
commit     9aff2d195bab7e0040e80c4f70ebc8b23601076f (patch)
tree       c9163b54377a40050454bfe101eecc8467e2a938 /net/tools/tld_cleanup/tld_cleanup.cc
parent     c60e3d89f914a4b1be19114fe695c6ad7e2a5b32 (diff)
Revert 199771 "Add support for split Public Suffix List distinct..."
> Add support for split Public Suffix List distinctions.
>
> This adds support for the private additions to the Public Suffix List.
>
> * Since net::RegistryControlledDomainService only contained static methods, this
> CL changes these methods to be contained within the namespace
> net::registry_controlled_domains and removes the class entirely.
> * All methods defined as part of net::registry_controlled_domains now
> have a mandatory argument to specify whether the private registries
> should be included.
> * Since the old implementation did not take into account the private
> registries, this sets all old callers to use EXCLUDE_PRIVATE as the
> net::registry_controlled_domains::PrivateRegistryFilter argument.
> * Changes the parameter for including unknown registries or not to be an enum
> instead of a boolean, using a similar naming scheme as for the private
> registries: net::registry_controlled_domains::UnknownRegistryFilter.
> * This also updates the effective-TLD data file to:
> 45cfff9c781f 2013-04-23 11:51 +0100
> It includes changes from a number of Mozilla bugs, listed on
> https://hg.mozilla.org/mozilla-central/log/45cfff9c781f/netwerk/dns/effective_tld_names.dat
> between 290afd57d2a8 (2012-07-04 16:08 +0100) and
> 45cfff9c781f (2013-04-23 11:51 +0100).
>
> BUG=37436,96086
> R=brettw@chromium.org, erikwright@chromium.org, pam@chromium.org, rsleevi@chromium.org, sky@chromium.org
>
> Review URL: https://codereview.chromium.org/13979002

TBR=nyquist@chromium.org

Review URL: https://codereview.chromium.org/14767028

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@199774 0039d316-1c4b-4281-b951-d872f2087c98
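For orientation, here is a minimal sketch of the call shape the reverted CL describes: every call into net::registry_controlled_domains takes a mandatory private-registry filter, and migrated callers pass the EXCLUDE variant to keep the old behavior. The header path and the exact spellings GetDomainAndRegistry and EXCLUDE_PRIVATE_REGISTRIES are assumptions based on the commit message (which abbreviates the latter as EXCLUDE_PRIVATE); they are not taken from this diff, which restores the pre-CL code.

#include <string>

#include "googleurl/src/gurl.h"
// Assumed header location for the registry-controlled-domain helpers.
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"

// Hypothetical call site after the original CL 13979002: the private-registry
// filter argument is mandatory, and pre-existing callers were switched to the
// EXCLUDE variant so private PSL entries (e.g. hosting-provider suffixes) are
// still ignored, matching the old behavior.
std::string SiteFor(const GURL& url) {
  return net::registry_controlled_domains::GetDomainAndRegistry(
      url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
}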
Diffstat (limited to 'net/tools/tld_cleanup/tld_cleanup.cc')
-rw-r--r--  net/tools/tld_cleanup/tld_cleanup.cc  238
1 file changed, 231 insertions, 7 deletions
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc
index 485bece..1162d98 100644
--- a/net/tools/tld_cleanup/tld_cleanup.cc
+++ b/net/tools/tld_cleanup/tld_cleanup.cc
@@ -21,18 +21,243 @@
// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
// * Canonicalizes each rule's domain by converting it to a GURL and back.
// * Adds explicit rules for true TLDs found in any rule.
-// * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
-// and "// ===END PRIVATE DOMAINS===" as private.
+// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
+// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
+// and "// ===END PRIVATE DOMAINS===".
+
+#include <map>
+#include <set>
+#include <string>
#include "base/at_exit.h"
#include "base/command_line.h"
#include "base/file_util.h"
+#include "base/file_util.h"
#include "base/files/file_path.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/process_util.h"
-#include "net/tools/tld_cleanup/tld_cleanup_util.h"
+#include "base/string_util.h"
+#include "googleurl/src/gurl.h"
+#include "googleurl/src/url_parse.h"
+
+namespace {
+struct Rule {
+ bool exception;
+ bool wildcard;
+};
+
+typedef std::map<std::string, Rule> RuleMap;
+typedef std::set<std::string> RuleSet;
+
+const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
+const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
+}
+
+// Writes the list of domain rules contained in the 'rules' set to the
+// 'outfile', with each rule terminated by a LF. The file must already have
+// been created with write access.
+bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
+ std::string data;
+ data.append(
+"%{\n"
+"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"
+"// Use of this source code is governed by a BSD-style license that can be\n"
+"// found in the LICENSE file.\n\n"
+"// This file is generated by net/tools/tld_cleanup/.\n"
+"// DO NOT MANUALLY EDIT!\n"
+"%}\n"
+"struct DomainRule {\n"
+" const char *name;\n"
+" int type; // 1: exception, 2: wildcard\n"
+"};\n"
+"%%\n"
+ );
+
+ for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
+ data.append(i->first);
+ data.append(", ");
+ if (i->second.exception) {
+ data.append("1");
+ } else if (i->second.wildcard) {
+ data.append("2");
+ } else {
+ data.append("0");
+ }
+ data.append("\n");
+ }
+
+ data.append("%%\n");
+
+ int written = file_util::WriteFile(outfile, data.data(), data.size());
+
+ return written == static_cast<int>(data.size());
+}
+
+// These result codes should be in increasing order of severity.
+typedef enum {
+ kSuccess,
+ kWarning,
+ kError,
+} NormalizeResult;
+
+// Adjusts the rule to a standard form: removes single extraneous dots and
+// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
+// valid; logs a warning and returns kWarning if it is probably invalid; and
+// logs an error and returns kError if the rule is (almost) certainly invalid.
+NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
+ NormalizeResult result = kSuccess;
+
+ // Strip single leading and trailing dots.
+ if (domain->at(0) == '.')
+ domain->erase(0, 1);
+ if (domain->empty()) {
+ LOG(WARNING) << "Ignoring empty rule";
+ return kWarning;
+ }
+ if (domain->at(domain->size() - 1) == '.')
+ domain->erase(domain->size() - 1, 1);
+ if (domain->empty()) {
+ LOG(WARNING) << "Ignoring empty rule";
+ return kWarning;
+ }
+
+ // Allow single leading '*.' or '!', saved here so it's not canonicalized.
+ size_t start_offset = 0;
+ if (domain->at(0) == '!') {
+ domain->erase(0, 1);
+ rule->exception = true;
+ } else if (domain->find("*.") == 0) {
+ domain->erase(0, 2);
+ rule->wildcard = true;
+ }
+ if (domain->empty()) {
+ LOG(WARNING) << "Ignoring empty rule";
+ return kWarning;
+ }
+
+ // Warn about additional '*.' or '!'.
+ if (domain->find("*.", start_offset) != std::string::npos ||
+ domain->find('!', start_offset) != std::string::npos) {
+ LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
+ result = kWarning;
+ }
+
+ // Make a GURL and normalize it, then get the host back out.
+ std::string url = "http://";
+ url.append(*domain);
+ GURL gurl(url);
+ const std::string& spec = gurl.possibly_invalid_spec();
+ url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
+ if (host.len < 0) {
+ LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
+ return kError;
+ }
+ if (!gurl.is_valid()) {
+ LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
+ result = kWarning;
+ }
+ domain->assign(spec.substr(host.begin, host.len));
+
+ return result;
+}
+
+// Loads the file described by 'in_filename', converts it to the desired format
+// (see the file comments above), and saves it into 'out_filename'. Returns
+// the most severe of the result codes encountered when normalizing the rules.
+NormalizeResult NormalizeFile(const base::FilePath& in_filename,
+ const base::FilePath& out_filename) {
+ std::string data;
+ if (!file_util::ReadFileToString(in_filename, &data)) {
+ LOG(ERROR) << "Unable to read file";
+ // We return success since we've already reported the error.
+ return kSuccess;
+ }
+
+ // We do a lot of string assignment during parsing, but simplicity is more
+ // important than performance here.
+ std::string domain;
+ NormalizeResult result = kSuccess;
+ size_t line_start = 0;
+ size_t line_end = 0;
+ RuleMap rules;
+ RuleSet extra_rules;
+ int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
+ while (line_start < data.size()) {
+ // Skip the entire section of private domains.
+ // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
+ if (line_start + begin_private_length < data.size() &&
+ !data.compare(line_start, begin_private_length,
+ kBeginPrivateDomainsComment)) {
+ line_end = data.find(kEndPrivateDomainsComment, line_start);
+ if (line_end == std::string::npos) {
+ LOG(WARNING) << "Private-domain section had no end marker.";
+ line_end = data.size();
+ }
+ } else if (line_start + 1 < data.size() &&
+ data[line_start] == '/' &&
+ data[line_start + 1] == '/') {
+ // Skip comments.
+ line_end = data.find_first_of("\r\n", line_start);
+ if (line_end == std::string::npos)
+ line_end = data.size();
+ } else {
+ // Truncate at first whitespace.
+ line_end = data.find_first_of("\r\n \t", line_start);
+ if (line_end == std::string::npos)
+ line_end = data.size();
+ domain.assign(data.data(), line_start, line_end - line_start);
+
+ Rule rule;
+ rule.wildcard = false;
+ rule.exception = false;
+ NormalizeResult new_result = NormalizeRule(&domain, &rule);
+ if (new_result != kError) {
+ // Check the existing rules to make sure we don't have an exception and
+ // wildcard for the same rule. If we did, we'd have to update our
+ // parsing code to handle this case.
+ CHECK(rules.find(domain) == rules.end());
+
+ rules[domain] = rule;
+ // Add true TLD for multi-level rules. We don't add them right now, in
+ // case there's an exception or wild card that either exists or might be
+ // added in a later iteration. In those cases, there's no need to add
+ // it and it would just slow down parsing the data.
+ size_t tld_start = domain.find_last_of('.');
+ if (tld_start != std::string::npos && tld_start + 1 < domain.size())
+ extra_rules.insert(domain.substr(tld_start + 1));
+ }
+ result = std::max(result, new_result);
+ }
+
+ // Find beginning of next non-empty line.
+ line_start = data.find_first_of("\r\n", line_end);
+ if (line_start == std::string::npos)
+ line_start = data.size();
+ line_start = data.find_first_not_of("\r\n", line_start);
+ if (line_start == std::string::npos)
+ line_start = data.size();
+ }
+
+ for (RuleSet::const_iterator iter = extra_rules.begin();
+ iter != extra_rules.end();
+ ++iter) {
+ if (rules.find(*iter) == rules.end()) {
+ Rule rule;
+ rule.exception = false;
+ rule.wildcard = false;
+ rules[*iter] = rule;
+ }
+ }
+
+ if (!WriteRules(rules, out_filename)) {
+ LOG(ERROR) << "Error(s) writing output file";
+ result = kError;
+ }
+
+ return result;
+}
int main(int argc, const char* argv[]) {
base::EnableTerminationOnHeapCorruption();
@@ -82,14 +307,13 @@ int main(int argc, const char* argv[]) {
"registry_controlled_domains"))
.Append(FILE_PATH_LITERAL(
"effective_tld_names.gperf"));
- net::tld_cleanup::NormalizeResult result =
- net::tld_cleanup::NormalizeFile(input_file, output_file);
- if (result != net::tld_cleanup::kSuccess) {
+ NormalizeResult result = NormalizeFile(input_file, output_file);
+ if (result != kSuccess) {
fprintf(stderr,
"Errors or warnings processing file. See log in tld_cleanup.log.");
}
- if (result == net::tld_cleanup::kError)
+ if (result == kError)
return 1;
return 0;
}
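To make the output format concrete, below is a sketch of the generated effective_tld_names.gperf for a small, hypothetical input file; the domain names are invented, and the copyright header that WriteRules emits between %{ and %} is abbreviated. A plain rule gets type 0, an exception rule ("!") type 1, and a wildcard rule ("*.") type 2; entries appear in the RuleMap's lexicographic key order. Note that "foo" is not in the input at all: it is the true-TLD rule that NormalizeFile adds for the multi-level rules.

// Hypothetical input lines (effective_tld_names.dat style):
//   *.bar.foo
//   !baz.bar.foo

%{
// (generated-file copyright header and "DO NOT MANUALLY EDIT" notice go here)
%}
struct DomainRule {
  const char *name;
  int type;  // 1: exception, 2: wildcard
};
%%
bar.foo, 2
baz.bar.foo, 1
foo, 0
%%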