diff options
author | nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-13 18:33:30 +0000 |
---|---|---|
committer | nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-13 18:33:30 +0000 |
commit | 9aff2d195bab7e0040e80c4f70ebc8b23601076f (patch) | |
tree | c9163b54377a40050454bfe101eecc8467e2a938 /net/tools/tld_cleanup/tld_cleanup.cc | |
parent | c60e3d89f914a4b1be19114fe695c6ad7e2a5b32 (diff) | |
download | chromium_src-9aff2d195bab7e0040e80c4f70ebc8b23601076f.zip chromium_src-9aff2d195bab7e0040e80c4f70ebc8b23601076f.tar.gz chromium_src-9aff2d195bab7e0040e80c4f70ebc8b23601076f.tar.bz2 |
Revert 199771 "Add support for split Public Suffix List distinct..."
> Add support for split Public Suffix List distinctions.
>
> This adds support for the private additions to the Public Suffix List.
>
> * Since net::RegistryControlledDomainService only contained static methods, this
> CL changes these methods to be contained within the namespace
> net::registry_controlled_domains and removes the class entirely.
> * All methods defined as part of net::registry_controlled_domains now
> have a mandatory argument to specify whether the private registries
> should be included.
> * Since the old implementation did not take into account the private
> registries, this sets all old callers to use EXCLUDE_PRIVATE as the
> net::registry_controlled_domains::PrivateRegistryFilter argument.
> * Changes the parameter for including unknown registries or not to be an enum
> instead of a boolean, using a similar naming scheme as for the private
> registries: net::registry_controlled_domains::UnknownRegistryFilter.
> * This also updates the effective-TLD data file to:
> 45cfff9c781f 2013-04-23 11:51 +0100
> It includes changes from a number of Mozilla bugs, listed on
> https://hg.mozilla.org/mozilla-central/log/45cfff9c781f/netwerk/dns/effective_tld_names.dat
> between 290afd57d2a8 (2012-07-04 16:08 +0100) and
> 45cfff9c781f (2013-04-23 11:51 +0100).
>
> BUG=37436,96086
> R=brettw@chromium.org, erikwright@chromium.org, pam@chromium.org, rsleevi@chromium.org, sky@chromium.org
>
> Review URL: https://codereview.chromium.org/13979002
TBR=nyquist@chromium.org
Review URL: https://codereview.chromium.org/14767028
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@199774 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/tools/tld_cleanup/tld_cleanup.cc')
-rw-r--r-- | net/tools/tld_cleanup/tld_cleanup.cc | 238 |
1 files changed, 231 insertions, 7 deletions
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc index 485bece..1162d98 100644 --- a/net/tools/tld_cleanup/tld_cleanup.cc +++ b/net/tools/tld_cleanup/tld_cleanup.cc @@ -21,18 +21,243 @@ // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. // * Canonicalizes each rule's domain by converting it to a GURL and back. // * Adds explicit rules for true TLDs found in any rule. -// * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS===" -// and "// ===END PRIVATE DOMAINS===" as private. +// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. +// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" +// and "// ===END PRIVATE DOMAINS===". + +#include <map> +#include <set> +#include <string> #include "base/at_exit.h" #include "base/command_line.h" #include "base/file_util.h" +#include "base/file_util.h" #include "base/files/file_path.h" #include "base/i18n/icu_util.h" #include "base/logging.h" #include "base/path_service.h" #include "base/process_util.h" -#include "net/tools/tld_cleanup/tld_cleanup_util.h" +#include "base/string_util.h" +#include "googleurl/src/gurl.h" +#include "googleurl/src/url_parse.h" + +namespace { +struct Rule { + bool exception; + bool wildcard; +}; + +typedef std::map<std::string, Rule> RuleMap; +typedef std::set<std::string> RuleSet; + +const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; +const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; +} + +// Writes the list of domain rules contained in the 'rules' set to the +// 'outfile', with each rule terminated by a LF. The file must already have +// been created with write access. +bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { + std::string data; + data.append( +"%{\n" +"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" +"// Use of this source code is governed by a BSD-style license that can be\n" +"// found in the LICENSE file.\n\n" +"// This file is generated by net/tools/tld_cleanup/.\n" +"// DO NOT MANUALLY EDIT!\n" +"%}\n" +"struct DomainRule {\n" +" const char *name;\n" +" int type; // 1: exception, 2: wildcard\n" +"};\n" +"%%\n" + ); + + for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { + data.append(i->first); + data.append(", "); + if (i->second.exception) { + data.append("1"); + } else if (i->second.wildcard) { + data.append("2"); + } else { + data.append("0"); + } + data.append("\n"); + } + + data.append("%%\n"); + + int written = file_util::WriteFile(outfile, data.data(), data.size()); + + return written == static_cast<int>(data.size()); +} + +// These result codes should be in increasing order of severity. +typedef enum { + kSuccess, + kWarning, + kError, +} NormalizeResult; + +// Adjusts the rule to a standard form: removes single extraneous dots and +// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as +// valid; logs a warning and returns kWarning if it is probably invalid; and +// logs an error and returns kError if the rule is (almost) certainly invalid. +NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { + NormalizeResult result = kSuccess; + + // Strip single leading and trailing dots. + if (domain->at(0) == '.') + domain->erase(0, 1); + if (domain->empty()) { + LOG(WARNING) << "Ignoring empty rule"; + return kWarning; + } + if (domain->at(domain->size() - 1) == '.') + domain->erase(domain->size() - 1, 1); + if (domain->empty()) { + LOG(WARNING) << "Ignoring empty rule"; + return kWarning; + } + + // Allow single leading '*.' or '!', saved here so it's not canonicalized. + size_t start_offset = 0; + if (domain->at(0) == '!') { + domain->erase(0, 1); + rule->exception = true; + } else if (domain->find("*.") == 0) { + domain->erase(0, 2); + rule->wildcard = true; + } + if (domain->empty()) { + LOG(WARNING) << "Ignoring empty rule"; + return kWarning; + } + + // Warn about additional '*.' or '!'. + if (domain->find("*.", start_offset) != std::string::npos || + domain->find('!', start_offset) != std::string::npos) { + LOG(WARNING) << "Keeping probably invalid rule: " << *domain; + result = kWarning; + } + + // Make a GURL and normalize it, then get the host back out. + std::string url = "http://"; + url.append(*domain); + GURL gurl(url); + const std::string& spec = gurl.possibly_invalid_spec(); + url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; + if (host.len < 0) { + LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; + return kError; + } + if (!gurl.is_valid()) { + LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; + result = kWarning; + } + domain->assign(spec.substr(host.begin, host.len)); + + return result; +} + +// Loads the file described by 'in_filename', converts it to the desired format +// (see the file comments above), and saves it into 'out_filename'. Returns +// the most severe of the result codes encountered when normalizing the rules. +NormalizeResult NormalizeFile(const base::FilePath& in_filename, + const base::FilePath& out_filename) { + std::string data; + if (!file_util::ReadFileToString(in_filename, &data)) { + LOG(ERROR) << "Unable to read file"; + // We return success since we've already reported the error. + return kSuccess; + } + + // We do a lot of string assignment during parsing, but simplicity is more + // important than performance here. + std::string domain; + NormalizeResult result = kSuccess; + size_t line_start = 0; + size_t line_end = 0; + RuleMap rules; + RuleSet extra_rules; + int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; + while (line_start < data.size()) { + // Skip the entire section of private domains. + // TODO(pamg): remove this when http://crbug.com/96086 is fixed. + if (line_start + begin_private_length < data.size() && + !data.compare(line_start, begin_private_length, + kBeginPrivateDomainsComment)) { + line_end = data.find(kEndPrivateDomainsComment, line_start); + if (line_end == std::string::npos) { + LOG(WARNING) << "Private-domain section had no end marker."; + line_end = data.size(); + } + } else if (line_start + 1 < data.size() && + data[line_start] == '/' && + data[line_start + 1] == '/') { + // Skip comments. + line_end = data.find_first_of("\r\n", line_start); + if (line_end == std::string::npos) + line_end = data.size(); + } else { + // Truncate at first whitespace. + line_end = data.find_first_of("\r\n \t", line_start); + if (line_end == std::string::npos) + line_end = data.size(); + domain.assign(data.data(), line_start, line_end - line_start); + + Rule rule; + rule.wildcard = false; + rule.exception = false; + NormalizeResult new_result = NormalizeRule(&domain, &rule); + if (new_result != kError) { + // Check the existing rules to make sure we don't have an exception and + // wildcard for the same rule. If we did, we'd have to update our + // parsing code to handle this case. + CHECK(rules.find(domain) == rules.end()); + + rules[domain] = rule; + // Add true TLD for multi-level rules. We don't add them right now, in + // case there's an exception or wild card that either exists or might be + // added in a later iteration. In those cases, there's no need to add + // it and it would just slow down parsing the data. + size_t tld_start = domain.find_last_of('.'); + if (tld_start != std::string::npos && tld_start + 1 < domain.size()) + extra_rules.insert(domain.substr(tld_start + 1)); + } + result = std::max(result, new_result); + } + + // Find beginning of next non-empty line. + line_start = data.find_first_of("\r\n", line_end); + if (line_start == std::string::npos) + line_start = data.size(); + line_start = data.find_first_not_of("\r\n", line_start); + if (line_start == std::string::npos) + line_start = data.size(); + } + + for (RuleSet::const_iterator iter = extra_rules.begin(); + iter != extra_rules.end(); + ++iter) { + if (rules.find(*iter) == rules.end()) { + Rule rule; + rule.exception = false; + rule.wildcard = false; + rules[*iter] = rule; + } + } + + if (!WriteRules(rules, out_filename)) { + LOG(ERROR) << "Error(s) writing output file"; + result = kError; + } + + return result; +} int main(int argc, const char* argv[]) { base::EnableTerminationOnHeapCorruption(); @@ -82,14 +307,13 @@ int main(int argc, const char* argv[]) { "registry_controlled_domains")) .Append(FILE_PATH_LITERAL( "effective_tld_names.gperf")); - net::tld_cleanup::NormalizeResult result = - net::tld_cleanup::NormalizeFile(input_file, output_file); - if (result != net::tld_cleanup::kSuccess) { + NormalizeResult result = NormalizeFile(input_file, output_file); + if (result != kSuccess) { fprintf(stderr, "Errors or warnings processing file. See log in tld_cleanup.log."); } - if (result == net::tld_cleanup::kError) + if (result == kError) return 1; return 0; } |