summaryrefslogtreecommitdiffstats
path: root/url
diff options
context:
space:
mode:
authorjoth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-11-23 01:53:52 +0000
committerjoth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-11-23 01:53:52 +0000
commit369e84f70d256d188a1866d8cef52edf4468cd9b (patch)
treee7e90408125f4831ce7983fd37414ad763b965b1 /url
parenta7e3691579181327dc65b02d043e7c01d4b06cb9 (diff)
downloadchromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.zip
chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.gz
chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.bz2
Support URL fragment resolution against non-hierarchical schemes
Support URL fragment resolution against non-hierarchical schemes As a result, data: about: etc now have 'query' and 'ref' components parsed; as a result a new GURL::GetContent() convenience is added to retrieve the spec with the scheme stripped off. A complication in supporting this is that we now need to allow whitespace to trailing whitespace to be preserved when transferring url_parse::Parsed structs between KURL and GURL. Without this, the URL prior to the #fragment can change (i.e. whitespace stripped) when following an anchor link which breaks the page (causes reload from source). See http://crbug.com/291747 for more details on this. R=brettw@chromium.org TBR=cbentzel@chromium.org BUG=291747 Review URL: https://codereview.chromium.org/23835019 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@236917 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'url')
-rw-r--r--url/gurl.cc66
-rw-r--r--url/gurl.h11
-rw-r--r--url/gurl_unittest.cc23
-rw-r--r--url/third_party/mozilla/url_parse.cc54
-rw-r--r--url/third_party/mozilla/url_parse.h6
-rw-r--r--url/url_canon_pathurl.cc64
-rw-r--r--url/url_canon_relative.cc12
-rw-r--r--url/url_canon_unittest.cc23
-rw-r--r--url/url_parse_internal.h14
-rw-r--r--url/url_parse_unittest.cc16
-rw-r--r--url/url_util.cc16
-rw-r--r--url/url_util.h2
-rw-r--r--url/url_util_unittest.cc15
13 files changed, 209 insertions, 113 deletions
diff --git a/url/gurl.cc b/url/gurl.cc
index a550c46..8eb2d61 100644
--- a/url/gurl.cc
+++ b/url/gurl.cc
@@ -19,25 +19,6 @@
namespace {
-// External template that can handle initialization of either character type.
-// The input spec is given, and the canonical version will be placed in
-// |*canonical|, along with the parsing of the canonical spec in |*parsed|.
-template<typename STR>
-bool InitCanonical(const STR& input_spec,
- std::string* canonical,
- url_parse::Parsed* parsed) {
- // Reserve enough room in the output for the input, plus some extra so that
- // we have room if we have to escape a few things without reallocating.
- canonical->reserve(input_spec.size() + 32);
- url_canon::StdStringCanonOutput output(canonical);
- bool success = url_util::Canonicalize(
- input_spec.data(), static_cast<int>(input_spec.length()),
- NULL, &output, parsed);
-
- output.Complete(); // Must be done before using string.
- return success;
-}
-
static std::string* empty_string = NULL;
static GURL* empty_gurl = NULL;
@@ -94,21 +75,15 @@ GURL::GURL(const GURL& other)
}
GURL::GURL(const std::string& url_string) {
- is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
- if (is_valid_ && SchemeIsFileSystem()) {
- inner_url_.reset(
- new GURL(spec_.data(), parsed_.Length(),
- *parsed_.inner_parsed(), true));
- }
+ InitCanonical(url_string, true);
}
GURL::GURL(const base::string16& url_string) {
- is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
- if (is_valid_ && SchemeIsFileSystem()) {
- inner_url_.reset(
- new GURL(spec_.data(), parsed_.Length(),
- *parsed_.inner_parsed(), true));
- }
+ InitCanonical(url_string, true);
+}
+
+GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
+ InitCanonical(url_string, false);
}
GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
@@ -127,6 +102,23 @@ GURL::GURL(std::string canonical_spec,
InitializeFromCanonicalSpec();
}
+template<typename STR>
+void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
+ // Reserve enough room in the output for the input, plus some extra so that
+ // we have room if we have to escape a few things without reallocating.
+ spec_.reserve(input_spec.size() + 32);
+ url_canon::StdStringCanonOutput output(&spec_);
+ is_valid_ = url_util::Canonicalize(
+ input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
+ NULL, &output, &parsed_);
+
+ output.Complete(); // Must be done before using string.
+ if (is_valid_ && SchemeIsFileSystem()) {
+ inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
+ *parsed_.inner_parsed(), true));
+ }
+}
+
void GURL::InitializeFromCanonicalSpec() {
if (is_valid_ && SchemeIsFileSystem()) {
inner_url_.reset(
@@ -140,13 +132,17 @@ void GURL::InitializeFromCanonicalSpec() {
// and we can't always canonicalize then reproducabely.
if (is_valid_) {
url_parse::Component scheme;
+ // We can't do this check on the inner_url of a filesystem URL, as
+ // canonical_spec actually points to the start of the outer URL, so we'd
+ // end up with infinite recursion in this constructor.
if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(),
"filesystem", &scheme) ||
scheme.begin == parsed_.scheme.begin) {
- // We can't do this check on the inner_url of a filesystem URL, as
- // canonical_spec actually points to the start of the outer URL, so we'd
- // end up with infinite recursion in this constructor.
- GURL test_url(spec_);
+ // We need to retain trailing whitespace on path URLs, as the |parsed_|
+ // spec we originally received may legitimately contain trailing white-
+ // space on the path or components e.g. if the #ref has been
+ // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
+ GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
DCHECK(test_url.is_valid_ == is_valid_);
DCHECK(test_url.spec_ == spec_);
diff --git a/url/gurl.h b/url/gurl.h
index d145ab6..dc88fec25 100644
--- a/url/gurl.h
+++ b/url/gurl.h
@@ -358,6 +358,17 @@ class URL_EXPORT GURL {
}
private:
+ // Variant of the string parsing constructor that allows the caller to elect
+ // retain trailing whitespace, if any, on the passed URL spec but only if the
+ // scheme is one that allows trailing whitespace. The primary use-case is
+ // for data: URLs. In most cases, you want to use the single parameter
+ // constructor above.
+ enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
+ GURL(const std::string& url_string, RetainWhiteSpaceSelector);
+
+ template<typename STR>
+ void InitCanonical(const STR& input_spec, bool trim_path_end);
+
void InitializeFromCanonicalSpec();
// Returns the substring of the input identified by the given component.
diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
index 767333c..c2d86e9 100644
--- a/url/gurl_unittest.cc
+++ b/url/gurl_unittest.cc
@@ -350,6 +350,29 @@ TEST(GURLTest, Replacements) {
}
}
+TEST(GURLTest, ClearFragmentOnDataUrl) {
+ // http://crbug.com/291747 - a data URL may legitimately have trailing
+ // whitespace in the spec after the ref is cleared. Test this does not trigger
+ // the url_parse::Parsed importing validation DCHECK in GURL.
+ GURL url(" data: one ? two # three ");
+
+ // By default the trailing whitespace will have been stripped.
+ EXPECT_EQ("data: one ? two # three", url.spec());
+ GURL::Replacements repl;
+ repl.ClearRef();
+ GURL url_no_ref = url.ReplaceComponents(repl);
+
+ EXPECT_EQ("data: one ? two ", url_no_ref.spec());
+
+ // Importing a parsed url via this constructor overload will retain trailing
+ // whitespace.
+ GURL import_url(url_no_ref.spec(),
+ url_no_ref.parsed_for_possibly_invalid_spec(),
+ url_no_ref.is_valid());
+ EXPECT_EQ(url_no_ref, import_url);
+ EXPECT_EQ(import_url.query(), " two ");
+}
+
TEST(GURLTest, PathForRequest) {
struct TestCase {
const char* input;
diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc
index fbc8a9b..84a7558 100644
--- a/url/third_party/mozilla/url_parse.cc
+++ b/url/third_party/mozilla/url_parse.cc
@@ -455,45 +455,53 @@ void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
// Initializes a path URL which is merely a scheme followed by a path. Examples
// include "about:foo" and "javascript:alert('bar');"
template<typename CHAR>
-void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+void DoParsePathURL(const CHAR* spec, int spec_len,
+ bool trim_path_end,
+ Parsed* parsed) {
// Get the non-path and non-scheme parts of the URL out of the way, we never
// use them.
parsed->username.reset();
parsed->password.reset();
parsed->host.reset();
parsed->port.reset();
+ parsed->path.reset();
parsed->query.reset();
parsed->ref.reset();
// Strip leading & trailing spaces and control characters.
- int begin = 0;
- TrimURL(spec, &begin, &spec_len);
+ int scheme_begin = 0;
+ TrimURL(spec, &scheme_begin, &spec_len, trim_path_end);
// Handle empty specs or ones that contain only whitespace or control chars.
- if (begin == spec_len) {
+ if (scheme_begin == spec_len) {
parsed->scheme.reset();
parsed->path.reset();
return;
}
+ int path_begin;
// Extract the scheme, with the path being everything following. We also
// handle the case where there is no scheme.
- if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+ if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin,
+ &parsed->scheme)) {
// Offset the results since we gave ExtractScheme a substring.
- parsed->scheme.begin += begin;
-
- // For compatability with the standard URL parser, we treat no path as
- // -1, rather than having a length of 0 (we normally wouldn't care so
- // much for these non-standard URLs).
- if (parsed->scheme.end() == spec_len - 1)
- parsed->path.reset();
- else
- parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
+ parsed->scheme.begin += scheme_begin;
+ path_begin = parsed->scheme.end() + 1;
} else {
- // No scheme found, just path.
+ // No scheme case.
parsed->scheme.reset();
- parsed->path = MakeRange(begin, spec_len);
+ path_begin = scheme_begin;
}
+
+ if (path_begin == spec_len)
+ return;
+ DCHECK_LT(path_begin, spec_len);
+
+ ParsePath(spec,
+ MakeRange(path_begin, spec_len),
+ &parsed->path,
+ &parsed->query,
+ &parsed->ref);
}
template<typename CHAR>
@@ -875,12 +883,18 @@ void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) {
DoParseStandardURL(url, url_len, parsed);
}
-void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
- DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const char* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ DoParsePathURL(url, url_len, trim_path_end, parsed);
}
-void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) {
- DoParsePathURL(url, url_len, parsed);
+void ParsePathURL(const base::char16* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ DoParsePathURL(url, url_len, trim_path_end, parsed);
}
void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h
index 5fa9322..9d943e5 100644
--- a/url/third_party/mozilla/url_parse.h
+++ b/url/third_party/mozilla/url_parse.h
@@ -238,9 +238,13 @@ URL_EXPORT void ParseStandardURL(const base::char16* url,
// section but that aren't file URLs either. The scheme is parsed, and
// everything after the scheme is considered as the path. This is used for
// things like "about:" and "javascript:"
-URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+URL_EXPORT void ParsePathURL(const char* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed);
URL_EXPORT void ParsePathURL(const base::char16* url,
int url_len,
+ bool trim_path_end,
Parsed* parsed);
// FileURL is for file URLs. There are some special rules for interpreting
diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc
index bc681f4..8f7dee4 100644
--- a/url/url_canon_pathurl.cc
+++ b/url/url_canon_pathurl.cc
@@ -13,6 +13,39 @@ namespace url_canon {
namespace {
+// Canonicalize the given |component| from |source| into |output| and
+// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
+// prior to the canonicalized component; i.e. for the '?' or '#' characters.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizePathComponent(const CHAR* source,
+ const url_parse::Component& component,
+ CHAR seperator,
+ CanonOutput* output,
+ url_parse::Component* new_component) {
+ bool success = true;
+ if (component.is_valid()) {
+ if (seperator)
+ output->push_back(seperator);
+ // Copy the path using path URL's more lax escaping rules (think for
+ // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
+ // ASCII characters alone. This helps readability of JavaStript.
+ new_component->begin = output->length();
+ int end = component.end();
+ for (int i = component.begin; i < end; i++) {
+ UCHAR uch = static_cast<UCHAR>(source[i]);
+ if (uch < 0x20 || uch >= 0x80)
+ success &= AppendUTF8EscapedChar(source, &i, end, output);
+ else
+ output->push_back(static_cast<char>(uch));
+ }
+ new_component->len = output->length() - new_component->begin;
+ } else {
+ // Empty part.
+ new_component->reset();
+ }
+ return success;
+}
+
template<typename CHAR, typename UCHAR>
bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
const url_parse::Parsed& parsed,
@@ -28,29 +61,14 @@ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
new_parsed->password.reset();
new_parsed->host.reset();
new_parsed->port.reset();
-
- if (parsed.path.is_valid()) {
- // Copy the path using path URL's more lax escaping rules (think for
- // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
- // ASCII characters alone. This helps readability of JavaStript.
- new_parsed->path.begin = output->length();
- int end = parsed.path.end();
- for (int i = parsed.path.begin; i < end; i++) {
- UCHAR uch = static_cast<UCHAR>(source.path[i]);
- if (uch < 0x20 || uch >= 0x80)
- success &= AppendUTF8EscapedChar(source.path, &i, end, output);
- else
- output->push_back(static_cast<char>(uch));
- }
- new_parsed->path.len = output->length() - new_parsed->path.begin;
- } else {
- // Empty path.
- new_parsed->path.reset();
- }
-
- // Assume there's no query or ref.
- new_parsed->query.reset();
- new_parsed->ref.reset();
+ // We allow path URLs to have the path, query and fragment components, but we
+ // will canonicalize each of the via the weaker path URL rules.
+ success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+ source.path, parsed.path, 0, output, &new_parsed->path);
+ success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+ source.query, parsed.query, '?', output, &new_parsed->query);
+ success &= DoCanonicalizePathComponent<CHAR, UCHAR>(
+ source.ref, parsed.ref, '#', output, &new_parsed->ref);
return success;
}
diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
index 4edd6ced..33b814c 100644
--- a/url/url_canon_relative.cc
+++ b/url/url_canon_relative.cc
@@ -101,10 +101,16 @@ bool DoIsRelativeURL(const char* base,
// "http:foo.html" is a relative URL with path "foo.html". If the scheme is
// empty, we treat it as relative (":foo") like IE does.
url_parse::Component scheme;
- if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
- // Don't allow relative URLs if the base scheme doesn't support it.
- if (!is_base_hierarchical)
+ const bool scheme_is_empty =
+ !url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
+ if (scheme_is_empty) {
+ if (url[begin] == '#') {
+ // |url| is a bare fragement (e.g. "#foo"). This can be resolved against
+ // any base. Fall-through.
+ } else if (!is_base_hierarchical) {
+ // Don't allow relative URLs if the base scheme doesn't support it.
return false;
+ }
*relative_component = url_parse::MakeRange(begin, url_len);
*is_relative = true;
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index 1b59a00..2275429 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc
@@ -1609,7 +1609,7 @@ TEST(URLCanonTest, ReplacePathURL) {
const ReplaceCase& cur = replace_cases[i];
int base_len = static_cast<int>(strlen(cur.base));
url_parse::Parsed parsed;
- url_parse::ParsePathURL(cur.base, base_len, &parsed);
+ url_parse::ParsePathURL(cur.base, base_len, false, &parsed);
url_canon::Replacements<char> r;
typedef url_canon::Replacements<char> R; // Clean up syntax.
@@ -1830,7 +1830,7 @@ TEST(URLCanonTest, CanonicalizePathURL) {
for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) {
int url_len = static_cast<int>(strlen(path_cases[i].input));
url_parse::Parsed parsed;
- url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed);
+ url_parse::ParsePathURL(path_cases[i].input, url_len, true, &parsed);
url_parse::Parsed out_parsed;
std::string out_str;
@@ -1848,8 +1848,8 @@ TEST(URLCanonTest, CanonicalizePathURL) {
// When we end with a colon at the end, there should be no path.
if (path_cases[i].input[url_len - 1] == ':') {
- EXPECT_EQ(0, out_parsed.path.begin);
- EXPECT_EQ(-1, out_parsed.path.len);
+ EXPECT_EQ(0, out_parsed.GetContent().begin);
+ EXPECT_EQ(-1, out_parsed.GetContent().len);
}
}
}
@@ -2156,7 +2156,7 @@ TEST(URLCanonTest, ResolveRelativeURL) {
else if (cur_case.is_base_hier)
url_parse::ParseStandardURL(cur_case.base, base_len, &parsed);
else
- url_parse::ParsePathURL(cur_case.base, base_len, &parsed);
+ url_parse::ParsePathURL(cur_case.base, base_len, false, &parsed);
// First see if it is relative.
int test_len = static_cast<int>(strlen(cur_case.test));
@@ -2188,12 +2188,15 @@ TEST(URLCanonTest, ResolveRelativeURL) {
// the URL freshly.
url_parse::Parsed ref_parsed;
int resolved_len = static_cast<int>(resolved.size());
- if (cur_case.is_base_file)
+ if (cur_case.is_base_file) {
url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
- else if (cur_case.is_base_hier)
- url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
- else
- url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed);
+ } else if (cur_case.is_base_hier) {
+ url_parse::ParseStandardURL(resolved.c_str(), resolved_len,
+ &ref_parsed);
+ } else {
+ url_parse::ParsePathURL(resolved.c_str(), resolved_len, false,
+ &ref_parsed);
+ }
EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
}
}
diff --git a/url/url_parse_internal.h b/url/url_parse_internal.h
index 9c2b2b6..e61379c 100644
--- a/url/url_parse_internal.h
+++ b/url/url_parse_internal.h
@@ -28,15 +28,19 @@ inline bool ShouldTrimFromURL(base::char16 ch) {
// in the input string (so the string starts at character |*begin| in the spec,
// and goes until |*len|).
template<typename CHAR>
-inline void TrimURL(const CHAR* spec, int* begin, int* len) {
+inline void TrimURL(const CHAR* spec, int* begin, int* len,
+ bool trim_path_end = true) {
// Strip leading whitespace and control characters.
while (*begin < *len && ShouldTrimFromURL(spec[*begin]))
(*begin)++;
- // Strip trailing whitespace and control characters. We need the >i test for
- // when the input string is all blanks; we don't want to back past the input.
- while (*len > *begin && ShouldTrimFromURL(spec[*len - 1]))
- (*len)--;
+ if (trim_path_end) {
+ // Strip trailing whitespace and control characters. We need the >i test
+ // for when the input string is all blanks; we don't want to back past the
+ // input.
+ while (*len > *begin && ShouldTrimFromURL(spec[*len - 1]))
+ (*len)--;
+ }
}
// Counts the number of consecutive slashes starting at the given offset
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
index ad47616..f080fb5 100644
--- a/url/url_parse_unittest.cc
+++ b/url/url_parse_unittest.cc
@@ -343,11 +343,11 @@ static PathURLParseCase path_cases[] = {
{":", "", NULL},
{":/", "", "/"},
{"/", NULL, "/"},
-{" This is \\interesting// \t", NULL, "This is \\interesting//"},
+{" This is \\interesting// \t", NULL, "This is \\interesting// \t"},
{"about:", "about", NULL},
{"about:blank", "about", "blank"},
-{" about: blank ", "about", " blank"},
-{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\");"},
+{" about: blank ", "about", " blank "},
+{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\"); "},
};
TEST(URLParser, PathURL) {
@@ -356,18 +356,18 @@ TEST(URLParser, PathURL) {
url_parse::Parsed parsed;
for (size_t i = 0; i < arraysize(path_cases); i++) {
const char* url = path_cases[i].input;
- url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed);
+ url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), false, &parsed);
- EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme));
- EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.path));
+ EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme))
+ << i;
+ EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent()))
+ << i;
// The remaining components are never used for path urls.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.host);
ExpectInvalidComponent(parsed.port);
- ExpectInvalidComponent(parsed.query);
- ExpectInvalidComponent(parsed.ref);
}
}
diff --git a/url/url_util.cc b/url/url_util.cc
index f16af98..2b4f7ad 100644
--- a/url/url_util.cc
+++ b/url/url_util.cc
@@ -121,6 +121,7 @@ bool DoFindAndCompareScheme(const CHAR* str,
template<typename CHAR>
bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
+ bool trim_path_end,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* output_parsed) {
@@ -188,7 +189,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
} else {
// "Weird" URLs like data: and javascript:
- url_parse::ParsePathURL(spec, spec_len, &parsed_input);
+ url_parse::ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
output, output_parsed);
}
@@ -252,7 +253,8 @@ bool DoResolveRelative(const char* base_spec,
// The output_parsed is incorrect at this point (because it was built
// based on base_parsed_authority instead of base_parsed) and needs to be
// re-created.
- ParsePathURL(output->data(), output->length(), output_parsed);
+ ParsePathURL(output->data(), output->length(), true,
+ output_parsed);
return did_resolve_succeed;
}
} else if (is_relative) {
@@ -266,7 +268,7 @@ bool DoResolveRelative(const char* base_spec,
}
// Not relative, canonicalize the input.
- return DoCanonicalize(relative, relative_length, charset_converter,
+ return DoCanonicalize(relative, relative_length, true, charset_converter,
output, output_parsed);
}
@@ -314,7 +316,7 @@ bool DoReplaceComponents(const char* spec,
// may have changed with the different scheme.
url_canon::RawCanonOutput<128> recanonicalized;
url_parse::Parsed recanonicalized_parsed;
- DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+ DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
charset_converter,
&recanonicalized, &recanonicalized_parsed);
@@ -429,19 +431,21 @@ bool FindAndCompareScheme(const base::char16* str,
bool Canonicalize(const char* spec,
int spec_len,
+ bool trim_path_end,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* output_parsed) {
- return DoCanonicalize(spec, spec_len, charset_converter,
+ return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
output, output_parsed);
}
bool Canonicalize(const base::char16* spec,
int spec_len,
+ bool trim_path_end,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* output_parsed) {
- return DoCanonicalize(spec, spec_len, charset_converter,
+ return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter,
output, output_parsed);
}
diff --git a/url/url_util.h b/url/url_util.h
index 4494819..eff386c 100644
--- a/url/url_util.h
+++ b/url/url_util.h
@@ -112,11 +112,13 @@ inline bool IsStandard(const char* spec, int spec_len,
// but they will not represent a loadable URL.
URL_EXPORT bool Canonicalize(const char* spec,
int spec_len,
+ bool trim_path_end,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* output_parsed);
URL_EXPORT bool Canonicalize(const base::char16* spec,
int spec_len,
+ bool trim_path_end,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* output_parsed);
diff --git a/url/url_util_unittest.cc b/url/url_util_unittest.cc
index dfbdb40..70adc5b 100644
--- a/url/url_util_unittest.cc
+++ b/url/url_util_unittest.cc
@@ -93,7 +93,7 @@ static std::string CheckReplaceScheme(const char* base_url,
// Make sure the input is canonicalized.
url_canon::RawCanonOutput<32> original;
url_parse::Parsed original_parsed;
- url_util::Canonicalize(base_url, strlen(base_url), NULL,
+ url_util::Canonicalize(base_url, strlen(base_url), true, NULL,
&original, &original_parsed);
url_canon::Replacements<char> replacements;
@@ -135,6 +135,9 @@ TEST(URLUtilTest, ReplaceScheme) {
// http://crbug.com/160 which should also be an acceptable result.
EXPECT_EQ("about://google.com/",
CheckReplaceScheme("http://google.com/", "about"));
+
+ EXPECT_EQ("http://example.com/%20hello%20# world",
+ CheckReplaceScheme("myscheme:example.com/ hello # world ", "http"));
}
TEST(URLUtilTest, DecodeURLEscapeSequences) {
@@ -270,12 +273,20 @@ TEST(URLUtilTest, TestResolveRelativeWithNonStandardBase) {
// Resolving should fail if the base URL is authority-based but is
// missing a path component (the '/' at the end).
{"scheme://Authority", "path", false, ""},
+ // Test resolving a fragment (only) against any kind of base-URL.
+ {"about:blank", "#id42", true, "about:blank#id42" },
+ {"about:blank", " #id42", true, "about:blank#id42" },
+ {"about:blank#oldfrag", "#newfrag", true, "about:blank#newfrag" },
+ // A surprising side effect of allowing fragments to resolve against
+ // any URL scheme is we might break javascript: URLs by doing so...
+ {"javascript:alert('foo#bar')", "#badfrag", true,
+ "javascript:alert('foo#badfrag" },
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(resolve_non_standard_cases); i++) {
const ResolveRelativeCase& test_data = resolve_non_standard_cases[i];
url_parse::Parsed base_parsed;
- url_parse::ParsePathURL(test_data.base, strlen(test_data.base),
+ url_parse::ParsePathURL(test_data.base, strlen(test_data.base), false,
&base_parsed);
std::string resolved;