Canonicalize the url based on Section 6.1 Safe Browsing Spec.

BUG=7713 TEST=SafeBrowsingUtilTest.CanonicalizeUrl Review URL: http://codereview.chromium.org/1275002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@43100 0039d316-1c4b-4281-b951-d872f2087c98
author: inferno@chromium.org <inferno@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-03-30 17:40:00 +0000
committer: inferno@chromium.org <inferno@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-03-30 17:40:00 +0000
commit: 69ee9c92af91fdad5aabf0f5696cd0432102985f (patch)
tree: dd6bd42ab4fb07d42fa13a74f0c445b31988148e /chrome/browser/safe_browsing
parent: 87a833ed78a8b1b94f134aa55a7803d53dccb03d (diff)
download: chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.zip
chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.tar.gz
chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.tar.bz2
3 files changed, 355 insertions, 6 deletions
diff --git a/chrome/browser/safe_browsing/safe_browsing_util.cc b/chrome/browser/safe_browsing/safe_browsing_util.cc
index 4d7c7ae..3fa3a27 100644
--- a/chrome/browser/safe_browsing/safe_browsing_util.cc
+++ b/chrome/browser/safe_browsing/safe_browsing_util.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -10,6 +10,7 @@
 #include "base/string_util.h"
 #include "chrome/browser/google_util.h"
 #include "googleurl/src/gurl.h"
+#include "googleurl/src/url_util.h"
 #include "net/base/escape.h"
 #include "unicode/locid.h"
 
@@ -161,9 +162,144 @@ std::string GetListName(int list_id) {
   return (list_id == PHISH) ? kPhishingList : std::string();
 }
 
+std::string Unescape(const std::string& url) {
+  std::string unescaped_str(url);  // Output of the latest unescape pass.
+  std::string old_unescaped_str;   // Input to the latest unescape pass.
+  const int kMaxLoopIterations = 1024;  // Cap on repeat passes.
+  int loop_var = 0;
+  do {  // Re-run until a pass leaves the string unchanged or the cap is hit.
+    old_unescaped_str = unescaped_str;
+    unescaped_str = UnescapeURLComponent(old_unescaped_str,
+        UnescapeRule::CONTROL_CHARS | UnescapeRule::SPACES |
+        UnescapeRule::URL_SPECIAL_CHARS);
+  } while (unescaped_str != old_unescaped_str && ++loop_var <=
+           kMaxLoopIterations);
+
+  return unescaped_str;
+}
+
+std::string Escape(const std::string& url) {  // Percent-escapes |url|.
+  std::string escaped_str;
+  const char* kHexString = "0123456789ABCDEF";  // Uppercase hex digits.
+  for (size_t i = 0; i < url.length(); i++) {
+    unsigned char c = static_cast<unsigned char>(url[i]);  // Keeps c >> 4 < 16.
+    if (c <= ' ' || c > '~' || c == '#' || c == '%') {  // Needs escaping.
+      escaped_str.push_back('%');
+      escaped_str.push_back(kHexString[c >> 4]);   // High nibble.
+      escaped_str.push_back(kHexString[c & 0xf]);  // Low nibble.
+    } else {
+      escaped_str.push_back(c);  // Copied through unchanged.
+    }
+  }
+
+  return escaped_str;
+}
+
+std::string RemoveConsecutiveChars(const std::string& str, const char c) {
+  std::string output(str);  // Copy of |str| in which runs of |c| are collapsed.
+  std::string string_to_find;
+  std::string::size_type loc = 0;
+  string_to_find.append(2, c);  // Search pattern: two adjacent |c| chars.
+  while ((loc = output.find(string_to_find, loc)) != std::string::npos) {
+    output.erase(loc, 1);  // Drop one |c|; the next find restarts at |loc|.
+  }
+
+  return output;
+}
+
+// Canonicalizes url per the Google Safe Browsing spec, section 6.1 of
+// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec
+// Each out-param is set only if it is non-NULL and its component is non-empty.
+void CanonicalizeUrl(const GURL& url,
+                     std::string* canonicalized_hostname,
+                     std::string* canonicalized_path,
+                     std::string* canonicalized_query) {
+  // The following canonicalization steps are skipped because url parsing
+  // already takes care of them:
+  // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
+  //    (Exclude escaped version of these chars).
+  // 2. Normalize hostname to 4 dot-separated decimal values.
+  // 3. Lowercase hostname.
+  // 4. Resolve path sequences "/../" and "/./".
+
+  // That leaves us with the following:
+  // 1. Remove fragment (and any username/password) in URL.
+  GURL url_without_fragment;
+  GURL::Replacements f_replacements;
+  f_replacements.ClearRef();
+  f_replacements.ClearUsername();
+  f_replacements.ClearPassword();
+  url_without_fragment = url.ReplaceComponents(f_replacements);
+
+  // 2. Do URL unescaping until no more hex encoded characters exist.
+  std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
+  url_parse::Parsed parsed;
+  url_parse::ParseStandardURL(url_unescaped_str.data(),
+      url_unescaped_str.length(), &parsed);
+
+  // 3. In hostname, remove all leading and trailing dots.
+  const std::string host = (parsed.host.len > 0) ? url_unescaped_str.substr(
+      parsed.host.begin, parsed.host.len) : "";
+  const char kCharsToTrim[] = ".";
+  std::string host_without_end_dots;
+  TrimString(host, kCharsToTrim, &host_without_end_dots);
+
+  // 4. In hostname, replace consecutive dots with a single dot.
+  std::string host_without_consecutive_dots(RemoveConsecutiveChars(
+      host_without_end_dots, '.'));
+
+  // 5. In path, replace runs of consecutive slashes with a single slash.
+  std::string path = (parsed.path.len > 0) ? url_unescaped_str.substr(
+       parsed.path.begin, parsed.path.len): "";
+  std::string path_without_consecutive_slash(RemoveConsecutiveChars(
+      path, '/'));
+
+  url_canon::Replacements<char> hp_replacements;  // New host and path only.
+  hp_replacements.SetHost(host_without_consecutive_dots.data(),
+  url_parse::Component(0, host_without_consecutive_dots.length()));
+  hp_replacements.SetPath(path_without_consecutive_slash.data(),
+  url_parse::Component(0, path_without_consecutive_slash.length()));
+
+  std::string url_unescaped_with_can_hostpath;
+  url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
+  url_parse::Parsed temp_parsed;  // Required by the call; not read afterwards.
+  url_util::ReplaceComponents(url_unescaped_str.data(),
+                              url_unescaped_str.length(), parsed,
+                              hp_replacements, NULL, &output, &temp_parsed);
+  output.Complete();
+
+  // 6. Step needed to revert escaping done in url_util::ReplaceComponents.
+  url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
+
+  // 7. After performing all above steps, percent-escape all chars in url which
+  // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
+  std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
+  url_parse::Parsed final_parsed;
+  url_parse::ParseStandardURL(escaped_canon_url_str.data(),
+                              escaped_canon_url_str.length(), &final_parsed);
+
+  if (canonicalized_hostname && final_parsed.host.len > 0) {
+    *canonicalized_hostname =
+        escaped_canon_url_str.substr(final_parsed.host.begin,
+                                     final_parsed.host.len);
+  }
+  if (canonicalized_path && final_parsed.path.len > 0) {
+    *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
+                                                       final_parsed.path.len);
+  }
+  if (canonicalized_query && final_parsed.query.len > 0) {
+    *canonicalized_query = escaped_canon_url_str.substr(
+        final_parsed.query.begin, final_parsed.query.len);
+  }
+}
+
 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
   hosts->clear();
-  const std::string host = url.host();  // const sidesteps GCC bugs below!
+
+  std::string canon_host;
+  CanonicalizeUrl(url, &canon_host, NULL, NULL);
+
+  const std::string host = canon_host;  // const sidesteps GCC bugs below!
   if (host.empty())
     return;
 
@@ -196,7 +332,13 @@ void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
 
 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
   paths->clear();
-  const std::string path = url.path();  // const sidesteps GCC bugs below!
+
+  std::string canon_path;
+  std::string canon_query;
+  CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
+
+  const std::string path = canon_path;   // const sidesteps GCC bugs below!
+  const std::string query = canon_query;
   if (path.empty())
     return;
 
@@ -215,8 +357,8 @@ void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
   if (paths->back() != path)
     paths->push_back(path);
 
-  if (url.has_query())
-    paths->push_back(path + "?" + url.query());
+  if (!query.empty())
+    paths->push_back(path + "?" + query);
 }
 
 int CompareFullHashes(const GURL& url,
diff --git a/chrome/browser/safe_browsing/safe_browsing_util.h b/chrome/browser/safe_browsing/safe_browsing_util.h
index ea58891..ea95a33 100644
--- a/chrome/browser/safe_browsing/safe_browsing_util.h
+++ b/chrome/browser/safe_browsing/safe_browsing_util.h
@@ -274,6 +274,13 @@ enum ListType {
 int GetListId(const std::string& name);
 std::string GetListName(int list_id);
 
+// Canonicalizes url per the Google Safe Browsing spec, section 6.1 of
+// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec
+// Each out-param is set only if it is non-NULL and its component is non-empty.
+void CanonicalizeUrl(const GURL& url, std::string* canonicalized_hostname,
+                     std::string* canonicalized_path,
+                     std::string* canonicalized_query);
+
 // Given a URL, returns all the hosts we need to check.  They are returned
 // in order of size (i.e. b.c is first, then a.b.c).
 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts);
diff --git a/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc b/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc
index 8e37b9c..d74982c 100644
--- a/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc
+++ b/chrome/browser/safe_browsing/safe_browsing_util_unittest.cc
@@ -1,10 +1,11 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #include <algorithm>
 
 #include "base/sha2.h"
+#include "base/string_util.h"
 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
 #include "googleurl/src/gurl.h"
 #include "testing/gtest/include/gtest/gtest.h"
@@ -59,6 +60,205 @@ TEST(SafeBrowsingUtilTest, UrlParsing) {
   EXPECT_TRUE(VectorContains(paths, "/"));
 }
 
+// Tests the url canonicalization according to the Safe Browsing spec.
+// See section 6.1 in
+// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
+TEST(SafeBrowsingUtilTest, CanonicalizeUrl) {
+  struct {
+    const char* input_url;  // Raw string handed to the GURL constructor.
+    const char* expected_canonicalized_hostname;
+    const char* expected_canonicalized_path;
+    const char* expected_canonicalized_query;  // "" when no query is expected.
+  } tests[] = {
+    {
+      "http://host/%25%32%35",
+      "host",
+      "/%25",
+      ""
+    }, {
+      "http://host/%25%32%35%25%32%35",
+      "host",
+      "/%25%25",
+      ""
+    }, {
+      "http://host/%2525252525252525",
+      "host",
+      "/%25",
+      ""
+    }, {
+      "http://host/asdf%25%32%35asd",
+      "host",
+      "/asdf%25asd",
+      ""
+    }, {
+      "http://host/%%%25%32%35asd%%",
+      "host",
+      "/%25%25%25asd%25%25",
+      ""
+    }, {
+      "http://host/%%%25%32%35asd%%",  // NOTE(review): same as previous case.
+      "host",
+      "/%25%25%25asd%25%25",
+      ""
+    }, {
+      "http://www.google.com/",
+      "www.google.com",
+      "/",
+      ""
+    }, {
+      "http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77"
+          "%77%77%2E%65%62%61%79%2E%63%6F%6D/",
+      "168.188.99.26",
+      "/.secure/www.ebay.com/",
+      ""
+    }, {
+      "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserd"
+          "ataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/",
+      "195.127.0.11",
+      "/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmv"
+          "alidateinfoswqpcmlx=hgplmcx/",
+      ""
+    }, {
+      "http://host.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A"
+          "22%252833%252944_55%252B",
+      "host.com",
+      "/~a!b@c%23d$e%25f^00&11*22(33)44_55+",
+      ""
+    }, {
+      "http://3279880203/blah",  // Integer host, expected as dotted decimal.
+      "195.127.0.11",
+      "/blah",
+      ""
+    }, {
+      "http://www.google.com/blah/..",
+      "www.google.com",
+      "/",
+      ""
+    }, {
+      "http://www.google.com/blah#fraq",
+      "www.google.com",
+      "/blah",
+      ""
+    }, {
+      "http://www.GOOgle.com/",
+      "www.google.com",
+      "/",
+      ""
+    }, {
+      "http://www.google.com.../",
+      "www.google.com",
+      "/",
+      ""
+    }, {
+      "http://www.google.com/q?",
+      "www.google.com",
+      "/q",
+      ""
+    }, {
+      "http://www.google.com/q?r?",
+      "www.google.com",
+      "/q",
+      "r?"
+    }, {
+      "http://www.google.com/q?r?s",
+      "www.google.com",
+      "/q",
+      "r?s"
+    }, {
+      "http://evil.com/foo#bar#baz",
+      "evil.com",
+      "/foo",
+      ""
+    }, {
+      "http://evil.com/foo;",
+      "evil.com",
+      "/foo;",
+      ""
+    }, {
+      "http://evil.com/foo?bar;",
+      "evil.com",
+      "/foo",
+      "bar;"
+    }, {
+      "http://notrailingslash.com",
+      "notrailingslash.com",
+      "/",
+      ""
+    }, {
+      "http://www.gotaport.com:1234/",  // Port is not part of any expectation.
+      "www.gotaport.com",
+      "/",
+      ""
+    }, {
+      "  http://www.google.com/  ",
+      "www.google.com",
+      "/",
+      ""
+    }, {
+      "http:// leadingspace.com/",
+      "%20leadingspace.com",
+      "/",
+      ""
+    }, {
+      "http://%20leadingspace.com/",
+      "%20leadingspace.com",
+      "/",
+      ""
+    }, {
+      "https://www.securesite.com/",
+      "www.securesite.com",
+      "/",
+      ""
+    }, {
+      "http://host.com/ab%23cd",
+      "host.com",
+      "/ab%23cd",
+      ""
+    }, {
+      "http://host%3e.com//twoslashes?more//slashes",
+      "host>.com",
+      "/twoslashes",
+      "more//slashes"
+    }, {
+      "http://host.com/abc?val=xyz#anything",
+      "host.com",
+      "/abc",
+      "val=xyz"
+    }, {
+      "http://abc:def@host.com/xyz",  // Credentials must not reach the output.
+      "host.com",
+      "/xyz",
+      ""
+    }, {
+      "http://host%3e.com/abc/%2e%2e%2fdef",
+      "host>.com",
+      "/def",
+      ""
+    }, {
+      "http://.......host...com.....//abc/////def%2F%2F%2Fxyz",
+      "host.com",
+      "/abc/def/xyz",
+      ""
+    },
+  };
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) {
+    SCOPED_TRACE(StringPrintf("Test: %s", tests[i].input_url));
+    GURL url(tests[i].input_url);
+
+    std::string canonicalized_hostname;  // Outputs start empty for each case.
+    std::string canonicalized_path;
+    std::string canonicalized_query;
+    safe_browsing_util::CanonicalizeUrl(url, &canonicalized_hostname,
+        &canonicalized_path, &canonicalized_query);
+
+    EXPECT_EQ(tests[i].expected_canonicalized_hostname,
+              canonicalized_hostname);
+    EXPECT_EQ(tests[i].expected_canonicalized_path,
+              canonicalized_path);
+    EXPECT_EQ(tests[i].expected_canonicalized_query,
+              canonicalized_query);
+  }
+}
 
 TEST(SafeBrowsingUtilTest, FullHashCompare) {
   GURL url("http://www.evil.com/phish.html");
author	inferno@chromium.org <inferno@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-03-30 17:40:00 +0000
committer	inferno@chromium.org <inferno@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-03-30 17:40:00 +0000
commit	69ee9c92af91fdad5aabf0f5696cd0432102985f (patch)
tree	dd6bd42ab4fb07d42fa13a74f0c445b31988148e /chrome/browser/safe_browsing
parent	87a833ed78a8b1b94f134aa55a7803d53dccb03d (diff)
download	chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.zip chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.tar.gz chromium_src-69ee9c92af91fdad5aabf0f5696cd0432102985f.tar.bz2