diff options
Diffstat (limited to 'net/tools/tld_cleanup/tld_cleanup.cc')
-rw-r--r-- | net/tools/tld_cleanup/tld_cleanup.cc | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc new file mode 100644 index 0000000..2efac1b7 --- /dev/null +++ b/net/tools/tld_cleanup/tld_cleanup.cc @@ -0,0 +1,266 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This command-line program converts an effective-TLD data file in UTF-8 from +// the format provided by Mozilla to the format expected by Chrome. Any errors +// or warnings are recorded in tld_cleanup.log. +// +// In particular, it +// * Strips blank lines and comments, as well as notes for individual rules. +// * Changes all line endings to LF. +// * Strips a single leading and/or trailing dot from each rule, if present. +// * Logs a warning if a rule contains '!' or '*.' other than at the beginning +// of the rule. (This also catches multiple ! or *. at the start of a rule.) +// * Logs a warning if GURL reports a rule as invalid, but keeps the rule. +// * Canonicalizes each rule's domain by converting it to a GURL and back. +// * Adds explicit rules for true TLDs found in any rule. + +#include <windows.h> +#include <set> +#include <string> + +#include "base/file_util.h" +#include "base/icu_util.h" +#include "base/logging.h" +#include "base/path_service.h" +#include "base/string_util.h" +#include "googleurl/src/gurl.h" +#include "googleurl/src/url_parse.h" + +static const wchar_t* const kLogFileName = L"tld_cleanup.log"; +typedef std::set<std::string> StringSet; + +// Writes the list of domain rules contained in the 'rules' set to the +// 'outfile', with each rule terminated by a LF. The file must already have +// been created with write access. +bool WriteRules(const StringSet& rules, HANDLE outfile) { + std::string data; + for (StringSet::const_iterator iter = rules.begin(); + iter != rules.end(); + ++iter) { + data.append(*iter); + data.append(1, '\n'); + } + + unsigned long written = 0; + BOOL success = WriteFile(outfile, + data.data(), + static_cast<long>(data.size()), + &written, + NULL); + return (success && written == static_cast<long>(data.size())); +} + +// These result codes should be in increasing order of severity. +typedef enum { + kSuccess, + kWarning, + kError, +} NormalizeResult; + +// Adjusts the rule to a standard form: removes single extraneous dots and +// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as +// valid; logs a warning and returns kWarning if it is probably invalid; and +// logs an error and returns kError if the rule is (almost) certainly invalid. +NormalizeResult NormalizeRule(std::string* rule) { + NormalizeResult result = kSuccess; + + // Strip single leading and trailing dots. + if (rule->at(0) == '.') + rule->erase(0, 1); + if (rule->size() == 0) { + LOG(WARNING) << "Ignoring empty rule"; + return kWarning; + } + if (rule->at(rule->size() - 1) == '.') + rule->erase(rule->size() - 1, 1); + if (rule->size() == 0) { + LOG(WARNING) << "Ignoring empty rule"; + return kWarning; + } + + // Allow single leading '*.' or '!', saved here so it's not canonicalized. + bool wildcard = false; + bool exception = false; + size_t start_offset = 0; + if (rule->at(0) == '!') { + rule->erase(0, 1); + exception = true; + } else if (rule->find("*.") == 0) { + rule->erase(0, 2); + wildcard = true; + } + if (rule->size() == 0) { + LOG(WARNING) << "Ignoring empty rule"; + return kWarning; + } + + // Warn about additional '*.' or '!'. + if (rule->find("*.", start_offset) != std::string::npos || + rule->find('!', start_offset) != std::string::npos) { + LOG(WARNING) << "Keeping probably invalid rule: " << *rule; + result = kWarning; + } + + // Make a GURL and normalize it, then get the host back out. + std::string url = "http://"; + url.append(*rule); + GURL gurl(url); + const std::string& spec = gurl.possibly_invalid_spec(); + url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; + if (host.len < 0) { + LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *rule; + return kError; + } + if (!gurl.is_valid()) { + LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *rule; + result = kWarning; + } + rule->assign(spec.substr(host.begin, host.len)); + + // Restore wildcard or exception marker. + if (exception) + rule->insert(0, 1, '!'); + else if (wildcard) + rule->insert(0, "*."); + + return result; +} + +// Loads the file described by 'in_filename', converts it to the desired format +// (see the file comments above), and saves it into 'out_filename'. Returns +// the most severe of the result codes encountered when normalizing the rules. +NormalizeResult NormalizeFile(const std::wstring& in_filename, + const std::wstring& out_filename) { + std::string data; + if (!file_util::ReadFileToString(in_filename, &data)) { + fwprintf(stderr, L"Unable to read file %s\n", in_filename.c_str()); + // We return success since we've already reported the error. + return kSuccess; + } + + HANDLE outfile(CreateFile(out_filename.c_str(), + GENERIC_WRITE, + 0, + NULL, + CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL, + NULL)); + if (outfile == INVALID_HANDLE_VALUE) { + fwprintf(stderr, L"Unable to write file %s\n", out_filename.c_str()); + // We return success since we've already reported the error. + return kSuccess; + } + + // We do a lot of string assignment during parsing, but simplicity is more + // important than performance here. + std::string rule; + NormalizeResult result = kSuccess; + size_t line_start = 0; + size_t line_end = 0; + StringSet rules; + while (line_start < data.size()) { + // Skip comments. + if (line_start + 1 < data.size() && + data[line_start] == '/' && + data[line_start + 1] == '/') { + line_end = data.find_first_of("\r\n", line_start); + if (line_end == std::string::npos) + line_end = data.size(); + } else { + // Truncate at first whitespace. + line_end = data.find_first_of("\r\n \t", line_start); + if (line_end == std::string::npos) + line_end = data.size(); + rule.assign(data.data(), line_start, line_end - line_start); + + NormalizeResult new_result = NormalizeRule(&rule); + if (new_result != kError) { + rules.insert(rule); + // Add true TLD for multi-level rules. + size_t tld_start = rule.find_last_of('.'); + if (tld_start != std::string::npos && tld_start + 1 < rule.size()) + rules.insert(rule.substr(tld_start + 1)); + } + result = std::max(result, new_result); + } + + // Find beginning of next non-empty line. + line_start = data.find_first_of("\r\n", line_end); + if (line_start == std::string::npos) + line_start = data.size(); + line_start = data.find_first_not_of("\r\n", line_start); + if (line_start == std::string::npos) + line_start = data.size(); + } + + if (!WriteRules(rules, outfile)) { + LOG(ERROR) << "Error(s) writing " << out_filename; + result = kError; + } + + return result; +} + +int main(int argc, const char* argv[]) { + if (argc != 3) { + fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); + fprintf(stderr, "Usage: %s <input> <output>\n", argv[0]); + return 1; + } + + // Only use OutputDebugString in debug mode. +#ifdef NDEBUG + logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE; +#else + logging::LoggingDestination destination = + logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG; +#endif + + std::wstring log_filename; + PathService::Get(base::DIR_EXE, &log_filename); + file_util::AppendToPath(&log_filename, kLogFileName); + logging::InitLogging(log_filename.c_str(), + destination, + logging::LOCK_LOG_FILE, + logging::DELETE_OLD_LOG_FILE); + + icu_util::Initialize(); + + NormalizeResult result = NormalizeFile(UTF8ToWide(argv[1]), + UTF8ToWide(argv[2])); + if (result != kSuccess) { + fwprintf(stderr, L"Errors or warnings processing file. See log in %s.", + kLogFileName); + } + + if (result == kError) + return 1; + return 0; +} |