diff options
author | joth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-11-23 01:53:52 +0000 |
---|---|---|
committer | joth@chromium.org <joth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-11-23 01:53:52 +0000 |
commit | 369e84f70d256d188a1866d8cef52edf4468cd9b (patch) | |
tree | e7e90408125f4831ce7983fd37414ad763b965b1 | |
parent | a7e3691579181327dc65b02d043e7c01d4b06cb9 (diff) | |
download | chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.zip chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.gz chromium_src-369e84f70d256d188a1866d8cef52edf4468cd9b.tar.bz2 |
Support URL fragment resolution against non-hierarchical schemes
Support URL fragment resolution against non-hierarchical schemes
As a result, data: about: etc now have 'query' and 'ref' components
parsed; as a result a new GURL::GetContent() convenience is added to
retrieve the spec with the scheme stripped off.
A complication in supporting this is that we now need to allow whitespace
to trailing whitespace to be preserved when transferring url_parse::Parsed
structs between KURL and GURL. Without this, the URL prior to the
#fragment can change (i.e. whitespace stripped) when following an anchor
link which breaks the page (causes reload from source). See
http://crbug.com/291747 for more details on this.
R=brettw@chromium.org
TBR=cbentzel@chromium.org
BUG=291747
Review URL: https://codereview.chromium.org/23835019
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@236917 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | net/base/net_util_unittest.cc | 4 | ||||
-rw-r--r-- | url/gurl.cc | 66 | ||||
-rw-r--r-- | url/gurl.h | 11 | ||||
-rw-r--r-- | url/gurl_unittest.cc | 23 | ||||
-rw-r--r-- | url/third_party/mozilla/url_parse.cc | 54 | ||||
-rw-r--r-- | url/third_party/mozilla/url_parse.h | 6 | ||||
-rw-r--r-- | url/url_canon_pathurl.cc | 64 | ||||
-rw-r--r-- | url/url_canon_relative.cc | 12 | ||||
-rw-r--r-- | url/url_canon_unittest.cc | 23 | ||||
-rw-r--r-- | url/url_parse_internal.h | 14 | ||||
-rw-r--r-- | url/url_parse_unittest.cc | 16 | ||||
-rw-r--r-- | url/url_util.cc | 16 | ||||
-rw-r--r-- | url/url_util.h | 2 | ||||
-rw-r--r-- | url/url_util_unittest.cc | 15 |
14 files changed, 211 insertions, 115 deletions
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index f783ca4..8200ad3 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -3026,9 +3026,9 @@ TEST(NetUtilTest, SimplifyUrlForRequest) { "ftp://user:pass@google.com:80/sup?yo#X#X", "ftp://google.com:80/sup?yo", }, - { // Try an nonstandard URL - "foobar://user:pass@google.com:80/sup?yo#X#X", + { // Try a nonstandard URL "foobar://user:pass@google.com:80/sup?yo#X#X", + "foobar://user:pass@google.com:80/sup?yo", }, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) { diff --git a/url/gurl.cc b/url/gurl.cc index a550c46..8eb2d61 100644 --- a/url/gurl.cc +++ b/url/gurl.cc @@ -19,25 +19,6 @@ namespace { -// External template that can handle initialization of either character type. -// The input spec is given, and the canonical version will be placed in -// |*canonical|, along with the parsing of the canonical spec in |*parsed|. -template<typename STR> -bool InitCanonical(const STR& input_spec, - std::string* canonical, - url_parse::Parsed* parsed) { - // Reserve enough room in the output for the input, plus some extra so that - // we have room if we have to escape a few things without reallocating. - canonical->reserve(input_spec.size() + 32); - url_canon::StdStringCanonOutput output(canonical); - bool success = url_util::Canonicalize( - input_spec.data(), static_cast<int>(input_spec.length()), - NULL, &output, parsed); - - output.Complete(); // Must be done before using string. - return success; -} - static std::string* empty_string = NULL; static GURL* empty_gurl = NULL; @@ -94,21 +75,15 @@ GURL::GURL(const GURL& other) } GURL::GURL(const std::string& url_string) { - is_valid_ = InitCanonical(url_string, &spec_, &parsed_); - if (is_valid_ && SchemeIsFileSystem()) { - inner_url_.reset( - new GURL(spec_.data(), parsed_.Length(), - *parsed_.inner_parsed(), true)); - } + InitCanonical(url_string, true); } GURL::GURL(const base::string16& url_string) { - is_valid_ = InitCanonical(url_string, &spec_, &parsed_); - if (is_valid_ && SchemeIsFileSystem()) { - inner_url_.reset( - new GURL(spec_.data(), parsed_.Length(), - *parsed_.inner_parsed(), true)); - } + InitCanonical(url_string, true); +} + +GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) { + InitCanonical(url_string, false); } GURL::GURL(const char* canonical_spec, size_t canonical_spec_len, @@ -127,6 +102,23 @@ GURL::GURL(std::string canonical_spec, InitializeFromCanonicalSpec(); } +template<typename STR> +void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) { + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + spec_.reserve(input_spec.size() + 32); + url_canon::StdStringCanonOutput output(&spec_); + is_valid_ = url_util::Canonicalize( + input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end, + NULL, &output, &parsed_); + + output.Complete(); // Must be done before using string. + if (is_valid_ && SchemeIsFileSystem()) { + inner_url_.reset(new GURL(spec_.data(), parsed_.Length(), + *parsed_.inner_parsed(), true)); + } +} + void GURL::InitializeFromCanonicalSpec() { if (is_valid_ && SchemeIsFileSystem()) { inner_url_.reset( @@ -140,13 +132,17 @@ void GURL::InitializeFromCanonicalSpec() { // and we can't always canonicalize then reproducabely. if (is_valid_) { url_parse::Component scheme; + // We can't do this check on the inner_url of a filesystem URL, as + // canonical_spec actually points to the start of the outer URL, so we'd + // end up with infinite recursion in this constructor. if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(), "filesystem", &scheme) || scheme.begin == parsed_.scheme.begin) { - // We can't do this check on the inner_url of a filesystem URL, as - // canonical_spec actually points to the start of the outer URL, so we'd - // end up with infinite recursion in this constructor. - GURL test_url(spec_); + // We need to retain trailing whitespace on path URLs, as the |parsed_| + // spec we originally received may legitimately contain trailing white- + // space on the path or components e.g. if the #ref has been + // removed from a "foo:hello #ref" URL (see http://crbug.com/291747). + GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE); DCHECK(test_url.is_valid_ == is_valid_); DCHECK(test_url.spec_ == spec_); @@ -358,6 +358,17 @@ class URL_EXPORT GURL { } private: + // Variant of the string parsing constructor that allows the caller to elect + // retain trailing whitespace, if any, on the passed URL spec but only if the + // scheme is one that allows trailing whitespace. The primary use-case is + // for data: URLs. In most cases, you want to use the single parameter + // constructor above. + enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE }; + GURL(const std::string& url_string, RetainWhiteSpaceSelector); + + template<typename STR> + void InitCanonical(const STR& input_spec, bool trim_path_end); + void InitializeFromCanonicalSpec(); // Returns the substring of the input identified by the given component. diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc index 767333c..c2d86e9 100644 --- a/url/gurl_unittest.cc +++ b/url/gurl_unittest.cc @@ -350,6 +350,29 @@ TEST(GURLTest, Replacements) { } } +TEST(GURLTest, ClearFragmentOnDataUrl) { + // http://crbug.com/291747 - a data URL may legitimately have trailing + // whitespace in the spec after the ref is cleared. Test this does not trigger + // the url_parse::Parsed importing validation DCHECK in GURL. + GURL url(" data: one ? two # three "); + + // By default the trailing whitespace will have been stripped. + EXPECT_EQ("data: one ? two # three", url.spec()); + GURL::Replacements repl; + repl.ClearRef(); + GURL url_no_ref = url.ReplaceComponents(repl); + + EXPECT_EQ("data: one ? two ", url_no_ref.spec()); + + // Importing a parsed url via this constructor overload will retain trailing + // whitespace. + GURL import_url(url_no_ref.spec(), + url_no_ref.parsed_for_possibly_invalid_spec(), + url_no_ref.is_valid()); + EXPECT_EQ(url_no_ref, import_url); + EXPECT_EQ(import_url.query(), " two "); +} + TEST(GURLTest, PathForRequest) { struct TestCase { const char* input; diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc index fbc8a9b..84a7558 100644 --- a/url/third_party/mozilla/url_parse.cc +++ b/url/third_party/mozilla/url_parse.cc @@ -455,45 +455,53 @@ void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { // Initializes a path URL which is merely a scheme followed by a path. Examples // include "about:foo" and "javascript:alert('bar');" template<typename CHAR> -void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) { +void DoParsePathURL(const CHAR* spec, int spec_len, + bool trim_path_end, + Parsed* parsed) { // Get the non-path and non-scheme parts of the URL out of the way, we never // use them. parsed->username.reset(); parsed->password.reset(); parsed->host.reset(); parsed->port.reset(); + parsed->path.reset(); parsed->query.reset(); parsed->ref.reset(); // Strip leading & trailing spaces and control characters. - int begin = 0; - TrimURL(spec, &begin, &spec_len); + int scheme_begin = 0; + TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); // Handle empty specs or ones that contain only whitespace or control chars. - if (begin == spec_len) { + if (scheme_begin == spec_len) { parsed->scheme.reset(); parsed->path.reset(); return; } + int path_begin; // Extract the scheme, with the path being everything following. We also // handle the case where there is no scheme. - if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, + &parsed->scheme)) { // Offset the results since we gave ExtractScheme a substring. - parsed->scheme.begin += begin; - - // For compatability with the standard URL parser, we treat no path as - // -1, rather than having a length of 0 (we normally wouldn't care so - // much for these non-standard URLs). - if (parsed->scheme.end() == spec_len - 1) - parsed->path.reset(); - else - parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len); + parsed->scheme.begin += scheme_begin; + path_begin = parsed->scheme.end() + 1; } else { - // No scheme found, just path. + // No scheme case. parsed->scheme.reset(); - parsed->path = MakeRange(begin, spec_len); + path_begin = scheme_begin; } + + if (path_begin == spec_len) + return; + DCHECK_LT(path_begin, spec_len); + + ParsePath(spec, + MakeRange(path_begin, spec_len), + &parsed->path, + &parsed->query, + &parsed->ref); } template<typename CHAR> @@ -875,12 +883,18 @@ void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { DoParseStandardURL(url, url_len, parsed); } -void ParsePathURL(const char* url, int url_len, Parsed* parsed) { - DoParsePathURL(url, url_len, parsed); +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); } -void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) { - DoParsePathURL(url, url_len, parsed); +void ParsePathURL(const base::char16* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); } void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h index 5fa9322..9d943e5 100644 --- a/url/third_party/mozilla/url_parse.h +++ b/url/third_party/mozilla/url_parse.h @@ -238,9 +238,13 @@ URL_EXPORT void ParseStandardURL(const base::char16* url, // section but that aren't file URLs either. The scheme is parsed, and // everything after the scheme is considered as the path. This is used for // things like "about:" and "javascript:" -URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed); +URL_EXPORT void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed); URL_EXPORT void ParsePathURL(const base::char16* url, int url_len, + bool trim_path_end, Parsed* parsed); // FileURL is for file URLs. There are some special rules for interpreting diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc index bc681f4..8f7dee4 100644 --- a/url/url_canon_pathurl.cc +++ b/url/url_canon_pathurl.cc @@ -13,6 +13,39 @@ namespace url_canon { namespace { +// Canonicalize the given |component| from |source| into |output| and +// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput| +// prior to the canonicalized component; i.e. for the '?' or '#' characters. +template<typename CHAR, typename UCHAR> +bool DoCanonicalizePathComponent(const CHAR* source, + const url_parse::Component& component, + CHAR seperator, + CanonOutput* output, + url_parse::Component* new_component) { + bool success = true; + if (component.is_valid()) { + if (seperator) + output->push_back(seperator); + // Copy the path using path URL's more lax escaping rules (think for + // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all + // ASCII characters alone. This helps readability of JavaStript. + new_component->begin = output->length(); + int end = component.end(); + for (int i = component.begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(source[i]); + if (uch < 0x20 || uch >= 0x80) + success &= AppendUTF8EscapedChar(source, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + new_component->len = output->length() - new_component->begin; + } else { + // Empty part. + new_component->reset(); + } + return success; +} + template<typename CHAR, typename UCHAR> bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source, const url_parse::Parsed& parsed, @@ -28,29 +61,14 @@ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source, new_parsed->password.reset(); new_parsed->host.reset(); new_parsed->port.reset(); - - if (parsed.path.is_valid()) { - // Copy the path using path URL's more lax escaping rules (think for - // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all - // ASCII characters alone. This helps readability of JavaStript. - new_parsed->path.begin = output->length(); - int end = parsed.path.end(); - for (int i = parsed.path.begin; i < end; i++) { - UCHAR uch = static_cast<UCHAR>(source.path[i]); - if (uch < 0x20 || uch >= 0x80) - success &= AppendUTF8EscapedChar(source.path, &i, end, output); - else - output->push_back(static_cast<char>(uch)); - } - new_parsed->path.len = output->length() - new_parsed->path.begin; - } else { - // Empty path. - new_parsed->path.reset(); - } - - // Assume there's no query or ref. - new_parsed->query.reset(); - new_parsed->ref.reset(); + // We allow path URLs to have the path, query and fragment components, but we + // will canonicalize each of the via the weaker path URL rules. + success &= DoCanonicalizePathComponent<CHAR, UCHAR>( + source.path, parsed.path, 0, output, &new_parsed->path); + success &= DoCanonicalizePathComponent<CHAR, UCHAR>( + source.query, parsed.query, '?', output, &new_parsed->query); + success &= DoCanonicalizePathComponent<CHAR, UCHAR>( + source.ref, parsed.ref, '#', output, &new_parsed->ref); return success; } diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc index 4edd6ced..33b814c 100644 --- a/url/url_canon_relative.cc +++ b/url/url_canon_relative.cc @@ -101,10 +101,16 @@ bool DoIsRelativeURL(const char* base, // "http:foo.html" is a relative URL with path "foo.html". If the scheme is // empty, we treat it as relative (":foo") like IE does. url_parse::Component scheme; - if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) { - // Don't allow relative URLs if the base scheme doesn't support it. - if (!is_base_hierarchical) + const bool scheme_is_empty = + !url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0; + if (scheme_is_empty) { + if (url[begin] == '#') { + // |url| is a bare fragement (e.g. "#foo"). This can be resolved against + // any base. Fall-through. + } else if (!is_base_hierarchical) { + // Don't allow relative URLs if the base scheme doesn't support it. return false; + } *relative_component = url_parse::MakeRange(begin, url_len); *is_relative = true; diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc index 1b59a00..2275429 100644 --- a/url/url_canon_unittest.cc +++ b/url/url_canon_unittest.cc @@ -1609,7 +1609,7 @@ TEST(URLCanonTest, ReplacePathURL) { const ReplaceCase& cur = replace_cases[i]; int base_len = static_cast<int>(strlen(cur.base)); url_parse::Parsed parsed; - url_parse::ParsePathURL(cur.base, base_len, &parsed); + url_parse::ParsePathURL(cur.base, base_len, false, &parsed); url_canon::Replacements<char> r; typedef url_canon::Replacements<char> R; // Clean up syntax. @@ -1830,7 +1830,7 @@ TEST(URLCanonTest, CanonicalizePathURL) { for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) { int url_len = static_cast<int>(strlen(path_cases[i].input)); url_parse::Parsed parsed; - url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed); + url_parse::ParsePathURL(path_cases[i].input, url_len, true, &parsed); url_parse::Parsed out_parsed; std::string out_str; @@ -1848,8 +1848,8 @@ TEST(URLCanonTest, CanonicalizePathURL) { // When we end with a colon at the end, there should be no path. if (path_cases[i].input[url_len - 1] == ':') { - EXPECT_EQ(0, out_parsed.path.begin); - EXPECT_EQ(-1, out_parsed.path.len); + EXPECT_EQ(0, out_parsed.GetContent().begin); + EXPECT_EQ(-1, out_parsed.GetContent().len); } } } @@ -2156,7 +2156,7 @@ TEST(URLCanonTest, ResolveRelativeURL) { else if (cur_case.is_base_hier) url_parse::ParseStandardURL(cur_case.base, base_len, &parsed); else - url_parse::ParsePathURL(cur_case.base, base_len, &parsed); + url_parse::ParsePathURL(cur_case.base, base_len, false, &parsed); // First see if it is relative. int test_len = static_cast<int>(strlen(cur_case.test)); @@ -2188,12 +2188,15 @@ TEST(URLCanonTest, ResolveRelativeURL) { // the URL freshly. url_parse::Parsed ref_parsed; int resolved_len = static_cast<int>(resolved.size()); - if (cur_case.is_base_file) + if (cur_case.is_base_file) { url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed); - else if (cur_case.is_base_hier) - url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed); - else - url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed); + } else if (cur_case.is_base_hier) { + url_parse::ParseStandardURL(resolved.c_str(), resolved_len, + &ref_parsed); + } else { + url_parse::ParsePathURL(resolved.c_str(), resolved_len, false, + &ref_parsed); + } EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed)); } } diff --git a/url/url_parse_internal.h b/url/url_parse_internal.h index 9c2b2b6..e61379c 100644 --- a/url/url_parse_internal.h +++ b/url/url_parse_internal.h @@ -28,15 +28,19 @@ inline bool ShouldTrimFromURL(base::char16 ch) { // in the input string (so the string starts at character |*begin| in the spec, // and goes until |*len|). template<typename CHAR> -inline void TrimURL(const CHAR* spec, int* begin, int* len) { +inline void TrimURL(const CHAR* spec, int* begin, int* len, + bool trim_path_end = true) { // Strip leading whitespace and control characters. while (*begin < *len && ShouldTrimFromURL(spec[*begin])) (*begin)++; - // Strip trailing whitespace and control characters. We need the >i test for - // when the input string is all blanks; we don't want to back past the input. - while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) - (*len)--; + if (trim_path_end) { + // Strip trailing whitespace and control characters. We need the >i test + // for when the input string is all blanks; we don't want to back past the + // input. + while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) + (*len)--; + } } // Counts the number of consecutive slashes starting at the given offset diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc index ad47616..f080fb5 100644 --- a/url/url_parse_unittest.cc +++ b/url/url_parse_unittest.cc @@ -343,11 +343,11 @@ static PathURLParseCase path_cases[] = { {":", "", NULL}, {":/", "", "/"}, {"/", NULL, "/"}, -{" This is \\interesting// \t", NULL, "This is \\interesting//"}, +{" This is \\interesting// \t", NULL, "This is \\interesting// \t"}, {"about:", "about", NULL}, {"about:blank", "about", "blank"}, -{" about: blank ", "about", " blank"}, -{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\");"}, +{" about: blank ", "about", " blank "}, +{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\"); "}, }; TEST(URLParser, PathURL) { @@ -356,18 +356,18 @@ TEST(URLParser, PathURL) { url_parse::Parsed parsed; for (size_t i = 0; i < arraysize(path_cases); i++) { const char* url = path_cases[i].input; - url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed); + url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), false, &parsed); - EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme)); - EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme)) + << i; + EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent())) + << i; // The remaining components are never used for path urls. ExpectInvalidComponent(parsed.username); ExpectInvalidComponent(parsed.password); ExpectInvalidComponent(parsed.host); ExpectInvalidComponent(parsed.port); - ExpectInvalidComponent(parsed.query); - ExpectInvalidComponent(parsed.ref); } } diff --git a/url/url_util.cc b/url/url_util.cc index f16af98..2b4f7ad 100644 --- a/url/url_util.cc +++ b/url/url_util.cc @@ -121,6 +121,7 @@ bool DoFindAndCompareScheme(const CHAR* str, template<typename CHAR> bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, + bool trim_path_end, url_canon::CharsetConverter* charset_converter, url_canon::CanonOutput* output, url_parse::Parsed* output_parsed) { @@ -188,7 +189,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, } else { // "Weird" URLs like data: and javascript: - url_parse::ParsePathURL(spec, spec_len, &parsed_input); + url_parse::ParsePathURL(spec, spec_len, trim_path_end, &parsed_input); success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input, output, output_parsed); } @@ -252,7 +253,8 @@ bool DoResolveRelative(const char* base_spec, // The output_parsed is incorrect at this point (because it was built // based on base_parsed_authority instead of base_parsed) and needs to be // re-created. - ParsePathURL(output->data(), output->length(), output_parsed); + ParsePathURL(output->data(), output->length(), true, + output_parsed); return did_resolve_succeed; } } else if (is_relative) { @@ -266,7 +268,7 @@ bool DoResolveRelative(const char* base_spec, } // Not relative, canonicalize the input. - return DoCanonicalize(relative, relative_length, charset_converter, + return DoCanonicalize(relative, relative_length, true, charset_converter, output, output_parsed); } @@ -314,7 +316,7 @@ bool DoReplaceComponents(const char* spec, // may have changed with the different scheme. url_canon::RawCanonOutput<128> recanonicalized; url_parse::Parsed recanonicalized_parsed; - DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), + DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true, charset_converter, &recanonicalized, &recanonicalized_parsed); @@ -429,19 +431,21 @@ bool FindAndCompareScheme(const base::char16* str, bool Canonicalize(const char* spec, int spec_len, + bool trim_path_end, url_canon::CharsetConverter* charset_converter, url_canon::CanonOutput* output, url_parse::Parsed* output_parsed) { - return DoCanonicalize(spec, spec_len, charset_converter, + return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter, output, output_parsed); } bool Canonicalize(const base::char16* spec, int spec_len, + bool trim_path_end, url_canon::CharsetConverter* charset_converter, url_canon::CanonOutput* output, url_parse::Parsed* output_parsed) { - return DoCanonicalize(spec, spec_len, charset_converter, + return DoCanonicalize(spec, spec_len, trim_path_end, charset_converter, output, output_parsed); } diff --git a/url/url_util.h b/url/url_util.h index 4494819..eff386c 100644 --- a/url/url_util.h +++ b/url/url_util.h @@ -112,11 +112,13 @@ inline bool IsStandard(const char* spec, int spec_len, // but they will not represent a loadable URL. URL_EXPORT bool Canonicalize(const char* spec, int spec_len, + bool trim_path_end, url_canon::CharsetConverter* charset_converter, url_canon::CanonOutput* output, url_parse::Parsed* output_parsed); URL_EXPORT bool Canonicalize(const base::char16* spec, int spec_len, + bool trim_path_end, url_canon::CharsetConverter* charset_converter, url_canon::CanonOutput* output, url_parse::Parsed* output_parsed); diff --git a/url/url_util_unittest.cc b/url/url_util_unittest.cc index dfbdb40..70adc5b 100644 --- a/url/url_util_unittest.cc +++ b/url/url_util_unittest.cc @@ -93,7 +93,7 @@ static std::string CheckReplaceScheme(const char* base_url, // Make sure the input is canonicalized. url_canon::RawCanonOutput<32> original; url_parse::Parsed original_parsed; - url_util::Canonicalize(base_url, strlen(base_url), NULL, + url_util::Canonicalize(base_url, strlen(base_url), true, NULL, &original, &original_parsed); url_canon::Replacements<char> replacements; @@ -135,6 +135,9 @@ TEST(URLUtilTest, ReplaceScheme) { // http://crbug.com/160 which should also be an acceptable result. EXPECT_EQ("about://google.com/", CheckReplaceScheme("http://google.com/", "about")); + + EXPECT_EQ("http://example.com/%20hello%20# world", + CheckReplaceScheme("myscheme:example.com/ hello # world ", "http")); } TEST(URLUtilTest, DecodeURLEscapeSequences) { @@ -270,12 +273,20 @@ TEST(URLUtilTest, TestResolveRelativeWithNonStandardBase) { // Resolving should fail if the base URL is authority-based but is // missing a path component (the '/' at the end). {"scheme://Authority", "path", false, ""}, + // Test resolving a fragment (only) against any kind of base-URL. + {"about:blank", "#id42", true, "about:blank#id42" }, + {"about:blank", " #id42", true, "about:blank#id42" }, + {"about:blank#oldfrag", "#newfrag", true, "about:blank#newfrag" }, + // A surprising side effect of allowing fragments to resolve against + // any URL scheme is we might break javascript: URLs by doing so... + {"javascript:alert('foo#bar')", "#badfrag", true, + "javascript:alert('foo#badfrag" }, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(resolve_non_standard_cases); i++) { const ResolveRelativeCase& test_data = resolve_non_standard_cases[i]; url_parse::Parsed base_parsed; - url_parse::ParsePathURL(test_data.base, strlen(test_data.base), + url_parse::ParsePathURL(test_data.base, strlen(test_data.base), false, &base_parsed); std::string resolved; |