diff options
author | inferno@chromium.org <inferno@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-30 17:40:00 +0000 |
---|---|---|
committer | inferno@chromium.org <inferno@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-30 17:40:00 +0000 |
commit | 69ee9c92af91fdad5aabf0f5696cd0432102985f (patch) | |
tree | dd6bd42ab4fb07d42fa13a74f0c445b31988148e /chrome/browser/safe_browsing | |
parent | 87a833ed78a8b1b94f134aa55a7803d53dccb03d (diff) | |
download | chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.zip chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.tar.gz chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.tar.bz2 |
Canonicalize the URL as specified in Section 6.1 of the Safe Browsing spec.
BUG=7713
TEST=SafeBrowsingUtilTest.CanonicalizeUrl
Review URL: http://codereview.chromium.org/1275002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@43100 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/safe_browsing')
-rw-r--r-- | chrome/browser/safe_browsing/safe_browsing_util.cc | 152 | ||||
-rw-r--r-- | chrome/browser/safe_browsing/safe_browsing_util.h | 7 | ||||
-rw-r--r-- | chrome/browser/safe_browsing/safe_browsing_util_unittest.cc | 202 |
3 files changed, 355 insertions, 6 deletions
diff --git a/chrome/browser/safe_browsing/safe_browsing_util.cc b/chrome/browser/safe_browsing/safe_browsing_util.cc index 4d7c7ae..3fa3a27 100644 --- a/chrome/browser/safe_browsing/safe_browsing_util.cc +++ b/chrome/browser/safe_browsing/safe_browsing_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -10,6 +10,7 @@ #include "base/string_util.h" #include "chrome/browser/google_util.h" #include "googleurl/src/gurl.h" +#include "googleurl/src/url_util.h" #include "net/base/escape.h" #include "unicode/locid.h" @@ -161,9 +162,144 @@ std::string GetListName(int list_id) { return (list_id == PHISH) ? kPhishingList : std::string(); } +std::string Unescape(const std::string& url) { + std::string unescaped_str(url); + std::string old_unescaped_str; + const int kMaxLoopIterations = 1024; + int loop_var = 0; + do { + old_unescaped_str = unescaped_str; + unescaped_str = UnescapeURLComponent(old_unescaped_str, + UnescapeRule::CONTROL_CHARS | UnescapeRule::SPACES | + UnescapeRule::URL_SPECIAL_CHARS); + } while (unescaped_str != old_unescaped_str && ++loop_var <= + kMaxLoopIterations); + + return unescaped_str; +} + +std::string Escape(const std::string& url) { + std::string escaped_str; + const char* kHexString = "0123456789ABCDEF"; + for (size_t i = 0; i < url.length(); i++) { + unsigned char c = static_cast<unsigned char>(url[i]); + if (c <= ' ' || c > '~' || c == '#' || c == '%') { + escaped_str.push_back('%'); + escaped_str.push_back(kHexString[c >> 4]); + escaped_str.push_back(kHexString[c & 0xf]); + } else { + escaped_str.push_back(c); + } + } + + return escaped_str; +} + +std::string RemoveConsecutiveChars(const std::string& str, const char c) { + std::string output(str); + std::string string_to_find; + std::string::size_type loc = 0; + string_to_find.append(2, 
c); + while ((loc = output.find(string_to_find, loc)) != std::string::npos) { + output.erase(loc, 1); + } + + return output; +} + +// Canonicalizes url as per Google Safe Browsing Specification. +// See section 6.1 in +// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. +void CanonicalizeUrl(const GURL& url, + std::string* canonicalized_hostname, + std::string* canonicalized_path, + std::string* canonicalized_query) { + // Following canonicalization steps are excluded since url parsing takes care + // of those :- + // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. + // (Exclude escaped version of these chars). + // 2. Normalize hostname to 4 dot-separated decimal values. + // 3. Lowercase hostname. + // 4. Resolve path sequences "/../" and "/./". + + // That leaves us with the following :- + // 1. Remove fragment in URL. + GURL url_without_fragment; + GURL::Replacements f_replacements; + f_replacements.ClearRef(); + f_replacements.ClearUsername(); + f_replacements.ClearPassword(); + url_without_fragment = url.ReplaceComponents(f_replacements); + + // 2. Do URL unescaping until no more hex encoded characters exist. + std::string url_unescaped_str(Unescape(url_without_fragment.spec())); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(url_unescaped_str.data(), + url_unescaped_str.length(), &parsed); + + // 3. In hostname, remove all leading and trailing dots. + const std::string host = (parsed.host.len > 0) ? url_unescaped_str.substr( + parsed.host.begin, parsed.host.len) : ""; + const char kCharsToTrim[] = "."; + std::string host_without_end_dots; + TrimString(host, kCharsToTrim, &host_without_end_dots); + + // 4. In hostname, replace consecutive dots with a single dot. + std::string host_without_consecutive_dots(RemoveConsecutiveChars( + host_without_end_dots, '.')); + + // 5. In path, replace runs of consecutive slashes with a single slash. + std::string path = (parsed.path.len > 0) ? 
url_unescaped_str.substr( + parsed.path.begin, parsed.path.len): ""; + std::string path_without_consecutive_slash(RemoveConsecutiveChars( + path, '/')); + + url_canon::Replacements<char> hp_replacements; + hp_replacements.SetHost(host_without_consecutive_dots.data(), + url_parse::Component(0, host_without_consecutive_dots.length())); + hp_replacements.SetPath(path_without_consecutive_slash.data(), + url_parse::Component(0, path_without_consecutive_slash.length())); + + std::string url_unescaped_with_can_hostpath; + url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); + url_parse::Parsed temp_parsed; + url_util::ReplaceComponents(url_unescaped_str.data(), + url_unescaped_str.length(), parsed, + hp_replacements, NULL, &output, &temp_parsed); + output.Complete(); + + // 6. Step needed to revert escaping done in url_util::ReplaceComponents. + url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); + + // 7. After performing all above steps, percent-escape all chars in url which + // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. 
+ std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); + url_parse::Parsed final_parsed; + url_parse::ParseStandardURL(escaped_canon_url_str.data(), + escaped_canon_url_str.length(), &final_parsed); + + if (canonicalized_hostname && final_parsed.host.len > 0) { + *canonicalized_hostname = + escaped_canon_url_str.substr(final_parsed.host.begin, + final_parsed.host.len); + } + if (canonicalized_path && final_parsed.path.len > 0) { + *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, + final_parsed.path.len); + } + if (canonicalized_query && final_parsed.query.len > 0) { + *canonicalized_query = escaped_canon_url_str.substr( + final_parsed.query.begin, final_parsed.query.len); + } +} + void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { hosts->clear(); - const std::string host = url.host(); // const sidesteps GCC bugs below! + + std::string canon_host; + CanonicalizeUrl(url, &canon_host, NULL, NULL); + + const std::string host = canon_host; // const sidesteps GCC bugs below! if (host.empty()) return; @@ -196,7 +332,13 @@ void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { paths->clear(); - const std::string path = url.path(); // const sidesteps GCC bugs below! + + std::string canon_path; + std::string canon_query; + CanonicalizeUrl(url, NULL, &canon_path, &canon_query); + + const std::string path = canon_path; // const sidesteps GCC bugs below! + const std::string query = canon_query; if (path.empty()) return; @@ -215,8 +357,8 @@ void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { if (paths->back() != path) paths->push_back(path); - if (url.has_query()) - paths->push_back(path + "?" + url.query()); + if (!query.empty()) + paths->push_back(path + "?" 
+ query); } int CompareFullHashes(const GURL& url, diff --git a/chrome/browser/safe_browsing/safe_browsing_util.h b/chrome/browser/safe_browsing/safe_browsing_util.h index ea58891..ea95a33 100644 --- a/chrome/browser/safe_browsing/safe_browsing_util.h +++ b/chrome/browser/safe_browsing/safe_browsing_util.h @@ -274,6 +274,13 @@ enum ListType { int GetListId(const std::string& name); std::string GetListName(int list_id); +// Canonicalizes url as per Google Safe Browsing Specification. +// See section 6.1 in +// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. +void CanonicalizeUrl(const GURL& url, std::string* canonicalized_hostname, + std::string* canonicalized_path, + std::string* canonicalized_query); + // Given a URL, returns all the hosts we need to check. They are returned // in order of size (i.e. b.c is first, then a.b.c). void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts); diff --git a/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc b/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc index 8e37b9c..d74982c 100644 --- a/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc +++ b/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc @@ -1,10 +1,11 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include <algorithm> #include "base/sha2.h" +#include "base/string_util.h" #include "chrome/browser/safe_browsing/safe_browsing_util.h" #include "googleurl/src/gurl.h" #include "testing/gtest/include/gtest/gtest.h" @@ -59,6 +60,205 @@ TEST(SafeBrowsingUtilTest, UrlParsing) { EXPECT_TRUE(VectorContains(paths, "/")); } +// Tests the url canonicalization according to the Safe Browsing spec. +// See section 6.1 in +// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. 
+TEST(SafeBrowsingUtilTest, CanonicalizeUrl) { + struct { + const char* input_url; + const char* expected_canonicalized_hostname; + const char* expected_canonicalized_path; + const char* expected_canonicalized_query; + } tests[] = { + { + "http://host/%25%32%35", + "host", + "/%25", + "" + }, { + "http://host/%25%32%35%25%32%35", + "host", + "/%25%25", + "" + }, { + "http://host/%2525252525252525", + "host", + "/%25", + "" + }, { + "http://host/asdf%25%32%35asd", + "host", + "/asdf%25asd", + "" + }, { + "http://host/%%%25%32%35asd%%", + "host", + "/%25%25%25asd%25%25", + "" + }, { + "http://host/%%%25%32%35asd%%", + "host", + "/%25%25%25asd%25%25", + "" + }, { + "http://www.google.com/", + "www.google.com", + "/", + "" + }, { + "http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77" + "%77%77%2E%65%62%61%79%2E%63%6F%6D/", + "168.188.99.26", + "/.secure/www.ebay.com/", + "" + }, { + "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserd" + "ataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", + "195.127.0.11", + "/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmv" + "alidateinfoswqpcmlx=hgplmcx/", + "" + }, { + "http://host.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A" + "22%252833%252944_55%252B", + "host.com", + "/~a!b@c%23d$e%25f^00&11*22(33)44_55+", + "" + }, { + "http://3279880203/blah", + "195.127.0.11", + "/blah", + "" + }, { + "http://www.google.com/blah/..", + "www.google.com", + "/", + "" + }, { + "http://www.google.com/blah#fraq", + "www.google.com", + "/blah", + "" + }, { + "http://www.GOOgle.com/", + "www.google.com", + "/", + "" + }, { + "http://www.google.com.../", + "www.google.com", + "/", + "" + }, { + "http://www.google.com/q?", + "www.google.com", + "/q", + "" + }, { + "http://www.google.com/q?r?", + "www.google.com", + "/q", + "r?" 
+ }, { + "http://www.google.com/q?r?s", + "www.google.com", + "/q", + "r?s" + }, { + "http://evil.com/foo#bar#baz", + "evil.com", + "/foo", + "" + }, { + "http://evil.com/foo;", + "evil.com", + "/foo;", + "" + }, { + "http://evil.com/foo?bar;", + "evil.com", + "/foo", + "bar;" + }, { + "http://notrailingslash.com", + "notrailingslash.com", + "/", + "" + }, { + "http://www.gotaport.com:1234/", + "www.gotaport.com", + "/", + "" + }, { + " http://www.google.com/ ", + "www.google.com", + "/", + "" + }, { + "http:// leadingspace.com/", + "%20leadingspace.com", + "/", + "" + }, { + "http://%20leadingspace.com/", + "%20leadingspace.com", + "/", + "" + }, { + "https://www.securesite.com/", + "www.securesite.com", + "/", + "" + }, { + "http://host.com/ab%23cd", + "host.com", + "/ab%23cd", + "" + }, { + "http://host%3e.com//twoslashes?more//slashes", + "host>.com", + "/twoslashes", + "more//slashes" + }, { + "http://host.com/abc?val=xyz#anything", + "host.com", + "/abc", + "val=xyz" + }, { + "http://abc:def@host.com/xyz", + "host.com", + "/xyz", + "" + }, { + "http://host%3e.com/abc/%2e%2e%2fdef", + "host>.com", + "/def", + "" + }, { + "http://.......host...com.....//abc/////def%2F%2F%2Fxyz", + "host.com", + "/abc/def/xyz", + "" + }, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) { + SCOPED_TRACE(StringPrintf("Test: %s", tests[i].input_url)); + GURL url(tests[i].input_url); + + std::string canonicalized_hostname; + std::string canonicalized_path; + std::string canonicalized_query; + safe_browsing_util::CanonicalizeUrl(url, &canonicalized_hostname, + &canonicalized_path, &canonicalized_query); + + EXPECT_EQ(tests[i].expected_canonicalized_hostname, + canonicalized_hostname); + EXPECT_EQ(tests[i].expected_canonicalized_path, + canonicalized_path); + EXPECT_EQ(tests[i].expected_canonicalized_query, + canonicalized_query); + } +} TEST(SafeBrowsingUtilTest, FullHashCompare) { GURL url("http://www.evil.com/phish.html"); |