diff options
author | nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-14 20:49:29 +0000 |
---|---|---|
committer | nyquist@chromium.org <nyquist@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-14 20:49:29 +0000 |
commit | ed32c214924922c8729ac5d40782bb0b6687d6d0 (patch) | |
tree | 2194c4bdc380a423fcf8c9fefeeea1c79a723843 /net/tools/tld_cleanup/tld_cleanup.cc | |
parent | b0e9dbad146da85a98b6fe1240b2aa5ce6d51daf (diff) | |
download | chromium_src-ed32c214924922c8729ac5d40782bb0b6687d6d0.zip chromium_src-ed32c214924922c8729ac5d40782bb0b6687d6d0.tar.gz chromium_src-ed32c214924922c8729ac5d40782bb0b6687d6d0.tar.bz2 |
Add support for split Public Suffix List distinctions.
This adds support for the private additions to the Public Suffix List.
* Since net::RegistryControlledDomainService contained only static methods, this
CL moves these methods into the namespace
net::registry_controlled_domains and removes the class entirely.
* All methods defined as part of net::registry_controlled_domains now
have a mandatory argument to specify whether the private registries
should be included.
* Since the old implementation did not take into account the private
registries, this sets all old callers to use EXCLUDE_PRIVATE as the
net::registry_controlled_domains::PrivateRegistryFilter argument.
* Changes the parameter that controls whether unknown registries are included
from a boolean to an enum, using a naming scheme similar to the one used for the
private registries: net::registry_controlled_domains::UnknownRegistryFilter.
* This also updates the effective-TLD data file to:
45cfff9c781f 2013-04-23 11:51 +0100
It includes changes from a number of Mozilla bugs, listed on
https://hg.mozilla.org/mozilla-central/log/45cfff9c781f/netwerk/dns/effective_tld_names.dat
between 290afd57d2a8 (2012-07-04 16:08 +0100) and
45cfff9c781f (2013-04-23 11:51 +0100).
Patch set 1 is equal to the committed patch set from:
https://codereview.chromium.org/13979002/
TBRing OWNERs from original CL.
TBR=pam@chromium.org
BUG=37436, 96086
Review URL: https://codereview.chromium.org/15140003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@200066 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/tools/tld_cleanup/tld_cleanup.cc')
-rw-r--r-- | net/tools/tld_cleanup/tld_cleanup.cc | 238 |
1 files changed, 7 insertions, 231 deletions
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc index 1162d98..485bece 100644 --- a/net/tools/tld_cleanup/tld_cleanup.cc +++ b/net/tools/tld_cleanup/tld_cleanup.cc @@ -21,243 +21,18 @@ // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. // * Canonicalizes each rule's domain by converting it to a GURL and back. // * Adds explicit rules for true TLDs found in any rule. -// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. -// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" -// and "// ===END PRIVATE DOMAINS===". - -#include <map> -#include <set> -#include <string> +// * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS===" +// and "// ===END PRIVATE DOMAINS===" as private. #include "base/at_exit.h" #include "base/command_line.h" #include "base/file_util.h" -#include "base/file_util.h" #include "base/files/file_path.h" #include "base/i18n/icu_util.h" #include "base/logging.h" #include "base/path_service.h" #include "base/process_util.h" -#include "base/string_util.h" -#include "googleurl/src/gurl.h" -#include "googleurl/src/url_parse.h" - -namespace { -struct Rule { - bool exception; - bool wildcard; -}; - -typedef std::map<std::string, Rule> RuleMap; -typedef std::set<std::string> RuleSet; - -const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; -const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; -} - -// Writes the list of domain rules contained in the 'rules' set to the -// 'outfile', with each rule terminated by a LF. The file must already have -// been created with write access. -bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { - std::string data; - data.append( -"%{\n" -"// Copyright (c) 2012 The Chromium Authors. 
All rights reserved.\n" -"// Use of this source code is governed by a BSD-style license that can be\n" -"// found in the LICENSE file.\n\n" -"// This file is generated by net/tools/tld_cleanup/.\n" -"// DO NOT MANUALLY EDIT!\n" -"%}\n" -"struct DomainRule {\n" -" const char *name;\n" -" int type; // 1: exception, 2: wildcard\n" -"};\n" -"%%\n" - ); - - for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { - data.append(i->first); - data.append(", "); - if (i->second.exception) { - data.append("1"); - } else if (i->second.wildcard) { - data.append("2"); - } else { - data.append("0"); - } - data.append("\n"); - } - - data.append("%%\n"); - - int written = file_util::WriteFile(outfile, data.data(), data.size()); - - return written == static_cast<int>(data.size()); -} - -// These result codes should be in increasing order of severity. -typedef enum { - kSuccess, - kWarning, - kError, -} NormalizeResult; - -// Adjusts the rule to a standard form: removes single extraneous dots and -// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as -// valid; logs a warning and returns kWarning if it is probably invalid; and -// logs an error and returns kError if the rule is (almost) certainly invalid. -NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { - NormalizeResult result = kSuccess; - - // Strip single leading and trailing dots. - if (domain->at(0) == '.') - domain->erase(0, 1); - if (domain->empty()) { - LOG(WARNING) << "Ignoring empty rule"; - return kWarning; - } - if (domain->at(domain->size() - 1) == '.') - domain->erase(domain->size() - 1, 1); - if (domain->empty()) { - LOG(WARNING) << "Ignoring empty rule"; - return kWarning; - } - - // Allow single leading '*.' or '!', saved here so it's not canonicalized. 
- size_t start_offset = 0; - if (domain->at(0) == '!') { - domain->erase(0, 1); - rule->exception = true; - } else if (domain->find("*.") == 0) { - domain->erase(0, 2); - rule->wildcard = true; - } - if (domain->empty()) { - LOG(WARNING) << "Ignoring empty rule"; - return kWarning; - } - - // Warn about additional '*.' or '!'. - if (domain->find("*.", start_offset) != std::string::npos || - domain->find('!', start_offset) != std::string::npos) { - LOG(WARNING) << "Keeping probably invalid rule: " << *domain; - result = kWarning; - } - - // Make a GURL and normalize it, then get the host back out. - std::string url = "http://"; - url.append(*domain); - GURL gurl(url); - const std::string& spec = gurl.possibly_invalid_spec(); - url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; - if (host.len < 0) { - LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; - return kError; - } - if (!gurl.is_valid()) { - LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; - result = kWarning; - } - domain->assign(spec.substr(host.begin, host.len)); - - return result; -} - -// Loads the file described by 'in_filename', converts it to the desired format -// (see the file comments above), and saves it into 'out_filename'. Returns -// the most severe of the result codes encountered when normalizing the rules. -NormalizeResult NormalizeFile(const base::FilePath& in_filename, - const base::FilePath& out_filename) { - std::string data; - if (!file_util::ReadFileToString(in_filename, &data)) { - LOG(ERROR) << "Unable to read file"; - // We return success since we've already reported the error. - return kSuccess; - } - - // We do a lot of string assignment during parsing, but simplicity is more - // important than performance here. 
- std::string domain; - NormalizeResult result = kSuccess; - size_t line_start = 0; - size_t line_end = 0; - RuleMap rules; - RuleSet extra_rules; - int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; - while (line_start < data.size()) { - // Skip the entire section of private domains. - // TODO(pamg): remove this when http://crbug.com/96086 is fixed. - if (line_start + begin_private_length < data.size() && - !data.compare(line_start, begin_private_length, - kBeginPrivateDomainsComment)) { - line_end = data.find(kEndPrivateDomainsComment, line_start); - if (line_end == std::string::npos) { - LOG(WARNING) << "Private-domain section had no end marker."; - line_end = data.size(); - } - } else if (line_start + 1 < data.size() && - data[line_start] == '/' && - data[line_start + 1] == '/') { - // Skip comments. - line_end = data.find_first_of("\r\n", line_start); - if (line_end == std::string::npos) - line_end = data.size(); - } else { - // Truncate at first whitespace. - line_end = data.find_first_of("\r\n \t", line_start); - if (line_end == std::string::npos) - line_end = data.size(); - domain.assign(data.data(), line_start, line_end - line_start); - - Rule rule; - rule.wildcard = false; - rule.exception = false; - NormalizeResult new_result = NormalizeRule(&domain, &rule); - if (new_result != kError) { - // Check the existing rules to make sure we don't have an exception and - // wildcard for the same rule. If we did, we'd have to update our - // parsing code to handle this case. - CHECK(rules.find(domain) == rules.end()); - - rules[domain] = rule; - // Add true TLD for multi-level rules. We don't add them right now, in - // case there's an exception or wild card that either exists or might be - // added in a later iteration. In those cases, there's no need to add - // it and it would just slow down parsing the data. 
- size_t tld_start = domain.find_last_of('.'); - if (tld_start != std::string::npos && tld_start + 1 < domain.size()) - extra_rules.insert(domain.substr(tld_start + 1)); - } - result = std::max(result, new_result); - } - - // Find beginning of next non-empty line. - line_start = data.find_first_of("\r\n", line_end); - if (line_start == std::string::npos) - line_start = data.size(); - line_start = data.find_first_not_of("\r\n", line_start); - if (line_start == std::string::npos) - line_start = data.size(); - } - - for (RuleSet::const_iterator iter = extra_rules.begin(); - iter != extra_rules.end(); - ++iter) { - if (rules.find(*iter) == rules.end()) { - Rule rule; - rule.exception = false; - rule.wildcard = false; - rules[*iter] = rule; - } - } - - if (!WriteRules(rules, out_filename)) { - LOG(ERROR) << "Error(s) writing output file"; - result = kError; - } - - return result; -} +#include "net/tools/tld_cleanup/tld_cleanup_util.h" int main(int argc, const char* argv[]) { base::EnableTerminationOnHeapCorruption(); @@ -307,13 +82,14 @@ int main(int argc, const char* argv[]) { "registry_controlled_domains")) .Append(FILE_PATH_LITERAL( "effective_tld_names.gperf")); - NormalizeResult result = NormalizeFile(input_file, output_file); - if (result != kSuccess) { + net::tld_cleanup::NormalizeResult result = + net::tld_cleanup::NormalizeFile(input_file, output_file); + if (result != net::tld_cleanup::kSuccess) { fprintf(stderr, "Errors or warnings processing file. See log in tld_cleanup.log."); } - if (result == kError) + if (result == net::tld_cleanup::kError) return 1; return 0; } |