Support URL fragment resolution against non-hierarchical schemes

Support URL fragment resolution against non-hierarchical schemes As a result, data: about: etc now have 'query' and 'ref' components parsed; as a result a new GURL::GetContent() convenience is added to retrieve the spec with the scheme stripped off. A complication in supporting this is that we now need to allow whitespace to trailing whitespace to be preserved when transferring url_parse::Parsed structs between KURL and GURL. Without this, the URL prior to the #fragment can change (i.e. whitespace stripped) when following an anchor link which breaks the page (causes reload from source). See http://crbug.com/291747 for more details on this. R=brettw@chromium.org TBR=cbentzel@chromium.org BUG=291747 Review URL: https://codereview.chromium.org/23835019 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@236917 0039d316-1c4b-4281-b951-d872f2087c98
author: joth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-11-23 01:53:52 +0000
committer: joth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-11-23 01:53:52 +0000
commit: 369e84f70d256d188a1866d8cef52edf4468cd9b (patch)
tree: e7e90408125f4831ce7983fd37414ad763b965b1 /url
parent: a7e3691579181327dc65b02d043e7c01d4b06cb9 (diff)
download: chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.zip
chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.gz
chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.bz2
13 files changed, 209 insertions, 113 deletions
diff --git a/url/gurl.cc b/url/gurl.cc
index a550c46..8eb2d61 100644
--- a/url/gurl.cc
+++ b/url/gurl.cc
@@ -19,25 +19,6 @@
 
 namespace {
 
-// External template that can handle initialization of either character type.
-// The input spec is given, and the canonical version will be placed in
-// |*canonical|, along with the parsing of the canonical spec in |*parsed|.
-template<typename STR>
-bool InitCanonical(const STR& input_spec,
-                   std::string* canonical,
-                   url_parse::Parsed* parsed) {
-  // Reserve enough room in the output for the input, plus some extra so that
-  // we have room if we have to escape a few things without reallocating.
-  canonical->reserve(input_spec.size() + 32);
-  url_canon::StdStringCanonOutput output(canonical);
-  bool success = url_util::Canonicalize(
-      input_spec.data(), static_cast<int>(input_spec.length()),
-      NULL, &output, parsed);
-
-  output.Complete();  // Must be done before using string.
-  return success;
-}
-
 static std::string* empty_string = NULL;
 static GURL* empty_gurl = NULL;
 
@@ -94,21 +75,15 @@ GURL::GURL(const GURL& other)
 }
 
 GURL::GURL(const std::string& url_string) {
-  is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
-  if (is_valid_ && SchemeIsFileSystem()) {
-    inner_url_.reset(
-        new GURL(spec_.data(), parsed_.Length(),
-                 *parsed_.inner_parsed(), true));
-  }
+  InitCanonical(url_string, true);
 }
 
 GURL::GURL(const base::string16& url_string) {
-  is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
-  if (is_valid_ && SchemeIsFileSystem()) {
-    inner_url_.reset(
-        new GURL(spec_.data(), parsed_.Length(),
-                 *parsed_.inner_parsed(), true));
-  }
+  InitCanonical(url_string, true);
+}
+
+GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
+  InitCanonical(url_string, false);
 }
 
 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
@@ -127,6 +102,23 @@ GURL::GURL(std::string canonical_spec,
   InitializeFromCanonicalSpec();
 }
 
+template<typename STR>
+void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  spec_.reserve(input_spec.size() + 32);
+  url_canon::StdStringCanonOutput output(&spec_);
+  is_valid_ = url_util::Canonicalize(
+      input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
+      NULL, &output, &parsed_);
+
+  output.Complete();  // Must be done before using string.
+  if (is_valid_ && SchemeIsFileSystem()) {
+    inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
+                              *parsed_.inner_parsed(), true));
+  }
+}
+
 void GURL::InitializeFromCanonicalSpec() {
   if (is_valid_ && SchemeIsFileSystem()) {
     inner_url_.reset(
@@ -140,13 +132,17 @@ void GURL::InitializeFromCanonicalSpec() {
   // and we can't always canonicalize then reproducabely.
   if (is_valid_) {
     url_parse::Component scheme;
+    // We can't do this check on the inner_url of a filesystem URL, as
+    // canonical_spec actually points to the start of the outer URL, so we'd
+    // end up with infinite recursion in this constructor.
     if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(),
                                         "filesystem", &scheme) ||
         scheme.begin == parsed_.scheme.begin) {
-      // We can't do this check on the inner_url of a filesystem URL, as
-      // canonical_spec actually points to the start of the outer URL, so we'd
-      // end up with infinite recursion in this constructor.
-      GURL test_url(spec_);
+      // We need to retain trailing whitespace on path URLs, as the |parsed_|
+      // spec we originally received may legitimately contain trailing white-
+      // space on the path or  components e.g. if the #ref has been
+      // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
+      GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
 
       DCHECK(test_url.is_valid_ == is_valid_);
       DCHECK(test_url.spec_ == spec_);
diff --git a/url/gurl.h b/url/gurl.h
index d145ab6..dc88fec25 100644
--- a/url/gurl.h
+++ b/url/gurl.h
@@ -358,6 +358,17 @@ class URL_EXPORT GURL {
   }
 
  private:
+  // Variant of the string parsing constructor that allows the caller to elect
+  // retain trailing whitespace, if any, on the passed URL spec but only  if the
+  // scheme is one that allows trailing whitespace. The primary use-case is
+  // for data: URLs. In most cases, you want to use the single parameter
+  // constructor above.
+  enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
+  GURL(const std::string& url_string, RetainWhiteSpaceSelector);
+
+  template<typename STR>
+  void InitCanonical(const STR& input_spec, bool trim_path_end);
+
   void InitializeFromCanonicalSpec();
 
   // Returns the substring of the input identified by the given component.
diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
index 767333c..c2d86e9 100644
--- a/url/gurl_unittest.cc
+++ b/url/gurl_unittest.cc
@@ -350,6 +350,29 @@ TEST(GURLTest, Replacements) {
   }
 }
 
+TEST(GURLTest, ClearFragmentOnDataUrl) {
+  // http://crbug.com/291747 - a data URL may legitimately have trailing
+  // whitespace in the spec after the ref is cleared. Test this does not trigger
+  // the url_parse::Parsed importing validation DCHECK in GURL.
+  GURL url(" data: one ? two # three ");
+
+  // By default the trailing whitespace will have been stripped.
+  EXPECT_EQ("data: one ? two # three", url.spec());
+  GURL::Replacements repl;
+  repl.ClearRef();
+  GURL url_no_ref = url.ReplaceComponents(repl);
+
+  EXPECT_EQ("data: one ? two ", url_no_ref.spec());
+
+  // Importing a parsed url via this constructor overload will retain trailing
+  // whitespace.
+  GURL import_url(url_no_ref.spec(),
+                  url_no_ref.parsed_for_possibly_invalid_spec(),
+                  url_no_ref.is_valid());
+  EXPECT_EQ(url_no_ref, import_url);
+  EXPECT_EQ(import_url.query(), " two ");
+}
+
 TEST(GURLTest, PathForRequest) {
   struct TestCase {
     const char* input;
diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc
index fbc8a9b..84a7558 100644
--- a/url/third_party/mozilla/url_parse.cc
+++ b/url/third_party/mozilla/url_parse.cc
@@ -455,45 +455,53 @@ void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
 // Initializes a path URL which is merely a scheme followed by a path. Examples
 // include "about:foo" and "javascript:alert('bar');"
 template<typename CHAR>
-void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+void DoParsePathURL(const CHAR* spec, int spec_len,
+                    bool trim_path_end,
+                    Parsed* parsed) {
   // Get the non-path and non-scheme parts of the URL out of the way, we never
   // use them.
   parsed->username.reset();
   parsed->password.reset();
   parsed->host.reset();
   parsed->port.reset();
+  parsed->path.reset();
   parsed->query.reset();
   parsed->ref.reset();
 
   // Strip leading & trailing spaces and control characters.
-  int begin = 0;
-  TrimURL(spec, &begin, &spec_len);
+  int scheme_begin = 0;
+  TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);
 
   // Handle empty specs or ones that contain only whitespace or control chars.
-  if (begin == spec_len) {
+  if (scheme_begin == spec_len) {
     parsed->scheme.reset();
     parsed->path.reset();
     return;
   }
 
+  int path_begin;
   // Extract the scheme, with the path being everything following. We also
   // handle the case where there is no scheme.
-  if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+  if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
+                    &parsed->scheme)) {
     // Offset the results since we gave ExtractScheme a substring.
-    parsed->scheme.begin += begin;
-
-    // For compatability with the standard URL parser, we treat no path as
-    // -1, rather than having a length of 0 (we normally wouldn't care so
-    // much for these non-standard URLs).
-    if (parsed->scheme.end() == spec_len - 1)
-      parsed->path.reset();
-    else
-      parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
+    parsed->scheme.begin += scheme_begin;
+    path_begin = parsed->scheme.end() + 1;
   } else {
-    // No scheme found, just path.
+    // No scheme case.
     parsed->scheme.reset();
-    parsed->path = MakeRange(begin, spec_len);
+    path_begin = scheme_begin;
   }
+
+  if (path_begin == spec_len)
+    return;
+  DCHECK_LT(path_begin, spec_len);
+
+  ParsePath(spec,
+            MakeRange(path_begin, spec_len),
+            &parsed->path,
+            &parsed->query,
+            &parsed->ref);
 }
 
 template<typename CHAR>
@@ -875,12 +883,18 @@ void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) {
   DoParseStandardURL(url, url_len, parsed);
 }
 
-void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
-  DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const char* url,
+                  int url_len,
+                  bool trim_path_end,
+                  Parsed* parsed) {
+  DoParsePathURL(url, url_len, trim_path_end, parsed);
 }
 
-void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) {
-  DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const base::char16* url,
+                  int url_len,
+                  bool trim_path_end,
+                  Parsed* parsed) {
+  DoParsePathURL(url, url_len, trim_path_end, parsed);
 }
 
 void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h
index 5fa9322..9d943e5 100644
--- a/url/third_party/mozilla/url_parse.h
+++ b/url/third_party/mozilla/url_parse.h
@@ -238,9 +238,13 @@ URL_EXPORT void ParseStandardURL(const base::char16* url,
 // section but that aren't file URLs either. The scheme is parsed, and
 // everything after the scheme is considered as the path. This is used for
 // things like "about:" and "javascript:"
-URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParsePathURL(const char* url,
+                             int url_len,
+                             bool trim_path_end,
+                             Parsed* parsed);
 URL_EXPORT void ParsePathURL(const base::char16* url,
                              int url_len,
+                             bool trim_path_end,
                              Parsed* parsed);
 
 // FileURL is for file URLs. There are some special rules for interpreting
diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc
index bc681f4..8f7dee4 100644
--- a/url/url_canon_pathurl.cc
+++ b/url/url_canon_pathurl.cc
@@ -13,6 +13,39 @@ namespace url_canon {
 
 namespace {
 
+// Canonicalize the given |component| from |source| into |output| and
+// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
+// prior to the canonicalized component; i.e. for the '?' or '#' characters.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizePathComponent(const CHAR* source,
+                                 const url_parse::Component& component,
+                                 CHAR seperator,
+                                 CanonOutput* output,
+                                 url_parse::Component* new_component) {
+  bool success = true;
+  if (component.is_valid()) {
+    if (seperator)
+      output->push_back(seperator);
+    // Copy the path using path URL's more lax escaping rules (think for
+    // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
+    // ASCII characters alone. This helps readability of JavaStript.
+    new_component->begin = output->length();
+    int end = component.end();
+    for (int i = component.begin; i < end; i++) {
+      UCHAR uch = static_cast<UCHAR>(source[i]);
+      if (uch < 0x20 || uch >= 0x80)
+        success &= AppendUTF8EscapedChar(source, &i, end, output);
+      else
+        output->push_back(static_cast<char>(uch));
+    }
+    new_component->len = output->length() - new_component->begin;
+  } else {
+    // Empty part.
+    new_component->reset();
+  }
+  return success;
+}
+
 template<typename CHAR, typename UCHAR>
 bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
                            const url_parse::Parsed& parsed,
@@ -28,29 +61,14 @@ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
   new_parsed->password.reset();
   new_parsed->host.reset();
   new_parsed->port.reset();
-
-  if (parsed.path.is_valid()) {
-    // Copy the path using path URL's more lax escaping rules (think for
-    // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
-    // ASCII characters alone. This helps readability of JavaStript.
-    new_parsed->path.begin = output->length();
-    int end = parsed.path.end();
-    for (int i = parsed.path.begin; i < end; i++) {
-      UCHAR uch = static_cast<UCHAR>(source.path[i]);
-      if (uch < 0x20 || uch >= 0x80)
-        success &= AppendUTF8EscapedChar(source.path, &i, end, output);
-      else
-        output->push_back(static_cast<char>(uch));
-    }
-    new_parsed->path.len = output->length() - new_parsed->path.begin;
-  } else {
-    // Empty path.
-    new_parsed->path.reset();
-  }
-
-  // Assume there's no query or ref.
-  new_parsed->query.reset();
-  new_parsed->ref.reset();
+  // We allow path URLs to have the path, query and fragment components, but we
+  // will canonicalize each of the via the weaker path URL rules.
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.path, parsed.path, 0, output, &new_parsed->path);
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.query, parsed.query, '?', output, &new_parsed->query);
+  success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+      source.ref, parsed.ref, '#', output, &new_parsed->ref);
 
   return success;
 }
diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
index 4edd6ced..33b814c 100644
--- a/url/url_canon_relative.cc
+++ b/url/url_canon_relative.cc
@@ -101,10 +101,16 @@ bool DoIsRelativeURL(const char* base,
   // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
   // empty, we treat it as relative (":foo") like IE does.
   url_parse::Component scheme;
-  if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
-    // Don't allow relative URLs if the base scheme doesn't support it.
-    if (!is_base_hierarchical)
+  const bool scheme_is_empty =
+      !url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
+  if (scheme_is_empty) {
+    if (url[begin] == '#') {
+      // |url| is a bare fragement (e.g. "#foo"). This can be resolved against
+      // any base. Fall-through.
+    } else if (!is_base_hierarchical) {
+      // Don't allow relative URLs if the base scheme doesn't support it.
       return false;
+    }
 
     *relative_component = url_parse::MakeRange(begin, url_len);
     *is_relative = true;
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index 1b59a00..2275429 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc
@@ -1609,7 +1609,7 @@ TEST(URLCanonTest, ReplacePathURL) {
     const ReplaceCase& cur = replace_cases[i];
     int base_len = static_cast<int>(strlen(cur.base));
     url_parse::Parsed parsed;
-    url_parse::ParsePathURL(cur.base, base_len, &parsed);
+    url_parse::ParsePathURL(cur.base, base_len, false, &parsed);
 
     url_canon::Replacements<char> r;
     typedef url_canon::Replacements<char> R;  // Clean up syntax.
@@ -1830,7 +1830,7 @@ TEST(URLCanonTest, CanonicalizePathURL) {
   for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) {
     int url_len = static_cast<int>(strlen(path_cases[i].input));
     url_parse::Parsed parsed;
-    url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed);
+    url_parse::ParsePathURL(path_cases[i].input, url_len, true, &parsed);
 
     url_parse::Parsed out_parsed;
     std::string out_str;
@@ -1848,8 +1848,8 @@ TEST(URLCanonTest, CanonicalizePathURL) {
 
     // When we end with a colon at the end, there should be no path.
     if (path_cases[i].input[url_len - 1] == ':') {
-      EXPECT_EQ(0, out_parsed.path.begin);
-      EXPECT_EQ(-1, out_parsed.path.len);
+      EXPECT_EQ(0, out_parsed.GetContent().begin);
+      EXPECT_EQ(-1, out_parsed.GetContent().len);
     }
   }
 }
@@ -2156,7 +2156,7 @@ TEST(URLCanonTest, ResolveRelativeURL) {
     else if (cur_case.is_base_hier)
       url_parse::ParseStandardURL(cur_case.base, base_len, &parsed);
     else
-      url_parse::ParsePathURL(cur_case.base, base_len, &parsed);
+      url_parse::ParsePathURL(cur_case.base, base_len, false, &parsed);
 
     // First see if it is relative.
     int test_len = static_cast<int>(strlen(cur_case.test));
@@ -2188,12 +2188,15 @@ TEST(URLCanonTest, ResolveRelativeURL) {
       // the URL freshly.
       url_parse::Parsed ref_parsed;
       int resolved_len = static_cast<int>(resolved.size());
-      if (cur_case.is_base_file)
+      if (cur_case.is_base_file) {
         url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
-      else if (cur_case.is_base_hier)
-        url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
-      else
-        url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed);
+      } else if (cur_case.is_base_hier) {
+        url_parse::ParseStandardURL(resolved.c_str(), resolved_len,
+                                    &ref_parsed);
+      } else {
+        url_parse::ParsePathURL(resolved.c_str(), resolved_len, false,
+                                &ref_parsed);
+      }
       EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
     }
   }
diff --git a/url/url_parse_internal.h b/url/url_parse_internal.h
index 9c2b2b6..e61379c 100644
--- a/url/url_parse_internal.h
+++ b/url/url_parse_internal.h
@@ -28,15 +28,19 @@ inline bool ShouldTrimFromURL(base::char16 ch) {
 // in the input string (so the string starts at character |*begin| in the spec,
 // and goes until |*len|).
 template<typename CHAR>
-inline void TrimURL(const CHAR* spec, int* begin, int* len) {
+inline void TrimURL(const CHAR* spec, int* begin, int* len,
+                    bool trim_path_end = true) {
   // Strip leading whitespace and control characters.
   while (*begin < *len && ShouldTrimFromURL(spec[*begin]))
     (*begin)++;
 
-  // Strip trailing whitespace and control characters. We need the >i test for
-  // when the input string is all blanks; we don't want to back past the input.
-  while (*len > *begin && ShouldTrimFromURL(spec[*len - 1]))
-    (*len)--;
+  if (trim_path_end) {
+    // Strip trailing whitespace and control characters. We need the >i test
+    // for when the input string is all blanks; we don't want to back past the
+    // input.
+    while (*len > *begin && ShouldTrimFromURL(spec[*len - 1]))
+      (*len)--;
+  }
 }
 
 // Counts the number of consecutive slashes starting at the given offset
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
index ad47616..f080fb5 100644
--- a/url/url_parse_unittest.cc
+++ b/url/url_parse_unittest.cc
@@ -343,11 +343,11 @@ static PathURLParseCase path_cases[] = {
 {":",                                       "",            NULL},
 {":/",                                      "",            "/"},
 {"/",                                       NULL,          "/"},
-{" This is \\interesting// \t",             NULL,          "This is \\interesting//"},
+{" This is \\interesting// \t",             NULL,          "This is \\interesting// \t"},
 {"about:",                                  "about",       NULL},
 {"about:blank",                             "about",       "blank"},
-{"  about: blank ",                         "about",       " blank"},
-{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\");"},
+{"  about: blank ",                         "about",       " blank "},
+{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\"); "},
 };
 
 TEST(URLParser, PathURL) {
@@ -356,18 +356,18 @@ TEST(URLParser, PathURL) {
   url_parse::Parsed parsed;
   for (size_t i = 0; i < arraysize(path_cases); i++) {
     const char* url = path_cases[i].input;
-    url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed);
+    url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), false, &parsed);
 
-    EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme));
-    EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme))
+        << i;
+    EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent()))
+        << i;
 
     // The remaining components are never used for path urls.
     ExpectInvalidComponent(parsed.username);
     ExpectInvalidComponent(parsed.password);
     ExpectInvalidComponent(parsed.host);
     ExpectInvalidComponent(parsed.port);
-    ExpectInvalidComponent(parsed.query);
-    ExpectInvalidComponent(parsed.ref);
   }
 }
 
diff --git a/url/url_util.cc b/url/url_util.cc
index f16af98..2b4f7ad 100644
--- a/url/url_util.cc
+++ b/url/url_util.cc
@@ -121,6 +121,7 @@ bool DoFindAndCompareScheme(const CHAR* str,
 
 template<typename CHAR>
 bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
+                    bool trim_path_end,
                     url_canon::CharsetConverter* charset_converter,
                     url_canon::CanonOutput* output,
                     url_parse::Parsed* output_parsed) {
@@ -188,7 +189,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
 
   } else {
     // "Weird" URLs like data: and javascript:
-    url_parse::ParsePathURL(spec, spec_len, &parsed_input);
+    url_parse::ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
     success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
                                              output, output_parsed);
   }
@@ -252,7 +253,8 @@ bool DoResolveRelative(const char* base_spec,
       // The output_parsed is incorrect at this point (because it was built
       // based on base_parsed_authority instead of base_parsed) and needs to be
       // re-created.
-      ParsePathURL(output->data(), output->length(), output_parsed);
+      ParsePathURL(output->data(), output->length(), true,
+                   output_parsed);
       return did_resolve_succeed;
     }
   } else if (is_relative) {
@@ -266,7 +268,7 @@ bool DoResolveRelative(const char* base_spec,
   }
 
   // Not relative, canonicalize the input.
-  return DoCanonicalize(relative, relative_length, charset_converter,
+  return DoCanonicalize(relative, relative_length, true, charset_converter,
                         output, output_parsed);
 }
 
@@ -314,7 +316,7 @@ bool DoReplaceComponents(const char* spec,
     // may have changed with the different scheme.
     url_canon::RawCanonOutput<128> recanonicalized;
     url_parse::Parsed recanonicalized_parsed;
-    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
                    charset_converter,
                    &recanonicalized, &recanonicalized_parsed);
 
@@ -429,19 +431,21 @@ bool FindAndCompareScheme(const base::char16* str,
 
 bool Canonicalize(const char* spec,
                   int spec_len,
+                  bool trim_path_end,
                   url_canon::CharsetConverter* charset_converter,
                   url_canon::CanonOutput* output,
                   url_parse::Parsed* output_parsed) {
-  return DoCanonicalize(spec, spec_len, charset_converter,
+  return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
                         output, output_parsed);
 }
 
 bool Canonicalize(const base::char16* spec,
                   int spec_len,
+                  bool trim_path_end,
                   url_canon::CharsetConverter* charset_converter,
                   url_canon::CanonOutput* output,
                   url_parse::Parsed* output_parsed) {
-  return DoCanonicalize(spec, spec_len, charset_converter,
+  return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
                         output, output_parsed);
 }
 
diff --git a/url/url_util.h b/url/url_util.h
index 4494819..eff386c 100644
--- a/url/url_util.h
+++ b/url/url_util.h
@@ -112,11 +112,13 @@ inline bool IsStandard(const char* spec, int spec_len,
 // but they will not represent a loadable URL.
 URL_EXPORT bool Canonicalize(const char* spec,
                              int spec_len,
+                             bool trim_path_end,
                              url_canon::CharsetConverter* charset_converter,
                              url_canon::CanonOutput* output,
                              url_parse::Parsed* output_parsed);
 URL_EXPORT bool Canonicalize(const base::char16* spec,
                              int spec_len,
+                             bool trim_path_end,
                              url_canon::CharsetConverter* charset_converter,
                              url_canon::CanonOutput* output,
                              url_parse::Parsed* output_parsed);
diff --git a/url/url_util_unittest.cc b/url/url_util_unittest.cc
index dfbdb40..70adc5b 100644
--- a/url/url_util_unittest.cc
+++ b/url/url_util_unittest.cc
@@ -93,7 +93,7 @@ static std::string CheckReplaceScheme(const char* base_url,
   // Make sure the input is canonicalized.
   url_canon::RawCanonOutput<32> original;
   url_parse::Parsed original_parsed;
-  url_util::Canonicalize(base_url, strlen(base_url), NULL,
+  url_util::Canonicalize(base_url, strlen(base_url), true, NULL,
                          &original, &original_parsed);
 
   url_canon::Replacements<char> replacements;
@@ -135,6 +135,9 @@ TEST(URLUtilTest, ReplaceScheme) {
   // http://crbug.com/160 which should also be an acceptable result.
   EXPECT_EQ("about://google.com/",
             CheckReplaceScheme("http://google.com/", "about"));
+
+  EXPECT_EQ("http://example.com/%20hello%20# world",
+            CheckReplaceScheme("myscheme:example.com/ hello # world ", "http"));
 }
 
 TEST(URLUtilTest, DecodeURLEscapeSequences) {
@@ -270,12 +273,20 @@ TEST(URLUtilTest, TestResolveRelativeWithNonStandardBase) {
       // Resolving should fail if the base URL is authority-based but is
       // missing a path component (the '/' at the end).
     {"scheme://Authority", "path", false, ""},
+      // Test resolving a fragment (only) against any kind of base-URL.
+    {"about:blank", "#id42", true, "about:blank#id42" },
+    {"about:blank", " #id42", true, "about:blank#id42" },
+    {"about:blank#oldfrag", "#newfrag", true, "about:blank#newfrag" },
+      // A surprising side effect of allowing fragments to resolve against
+      // any URL scheme is we might break javascript: URLs by doing so...
+    {"javascript:alert('foo#bar')", "#badfrag", true,
+      "javascript:alert('foo#badfrag" },
   };
 
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(resolve_non_standard_cases); i++) {
     const ResolveRelativeCase& test_data = resolve_non_standard_cases[i];
     url_parse::Parsed base_parsed;
-    url_parse::ParsePathURL(test_data.base, strlen(test_data.base),
+    url_parse::ParsePathURL(test_data.base, strlen(test_data.base), false,
                             &base_parsed);
 
     std::string resolved;
author	joth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-11-23 01:53:52 +0000
committer	joth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-11-23 01:53:52 +0000
commit	369e84f70d256d188a1866d8cef52edf4468cd9b (patch)
tree	e7e90408125f4831ce7983fd37414ad763b965b1 /url
parent	a7e3691579181327dc65b02d043e7c01d4b06cb9 (diff)
download	chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.zip chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.gz chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.bz2