summaryrefslogtreecommitdiffstats
path: root/googleurl/src/url_util.cc
diff options
context:
space:
mode:
Diffstat (limited to 'googleurl/src/url_util.cc')
-rw-r--r--googleurl/src/url_util.cc232
1 files changed, 166 insertions, 66 deletions
diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc
index d623b45..7e100aa 100644
--- a/googleurl/src/url_util.cc
+++ b/googleurl/src/url_util.cc
@@ -33,6 +33,7 @@
#include "googleurl/src/url_util.h"
#include "base/logging.h"
+#include "googleurl/src/url_canon_internal.h"
#include "googleurl/src/url_file.h"
namespace url_util {
@@ -58,13 +59,15 @@ inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
const char kFileScheme[] = "file"; // Used in a number of places.
const char kMailtoScheme[] = "mailto";
-const int kNumStandardURLSchemes = 5;
+const int kNumStandardURLSchemes = 7;
const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
"http",
"https",
kFileScheme, // Yes, file urls can have a hostname!
"ftp",
"gopher",
+ "ws", // WebSocket.
+ "wss", // WebSocket secure.
};
// List of the currently installed standard schemes. This list is lazily
@@ -72,6 +75,9 @@ const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
// any destructors from being called that will slow us down or cause problems.
std::vector<const char*>* standard_schemes = NULL;
+// See the LockStandardSchemes declaration in the header.
+bool standard_schemes_locked = false;
+
// Ensures that the standard_schemes list is initialized, does nothing if it
// already has values.
void InitStandardSchemes() {
@@ -96,10 +102,9 @@ inline bool CompareSchemeComponent(const CHAR* spec,
}
// Returns true if the given scheme identified by |scheme| within |spec| is one
-// of the registered "standard" schemes. Note that this does not check for
-// "://", use IsStandard for that.
+// of the registered "standard" schemes.
template<typename CHAR>
-bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
if (!scheme.is_nonempty())
return false; // Empty or invalid schemes are non-standard.
@@ -112,34 +117,20 @@ bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
return false;
}
-// Returns true if the stuff following the scheme in the given spec indicates
-// a "standard" URL. The presence of "://" after the scheme indicates that
-// there is a hostname, etc. which we call a standard URL.
-template<typename CHAR>
-bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len,
- const url_parse::Component& scheme) {
- int after_scheme = scheme.end();
- if (spec_len < after_scheme + 3)
- return false;
- return spec[after_scheme] == ':' &&
- spec[after_scheme + 1] == '/' &&
- spec[after_scheme + 2] == '/';
-}
-
-template<typename CHAR>
-bool DoIsStandard(const CHAR* spec, int spec_len,
- const url_parse::Component& scheme) {
- return HasStandardSchemeSeparator(spec, spec_len, scheme) ||
- IsStandardScheme(spec, scheme);
-}
-
template<typename CHAR>
bool DoFindAndCompareScheme(const CHAR* str,
int str_len,
const char* compare,
url_parse::Component* found_scheme) {
+ // Before extracting scheme, canonicalize the URL to remove any whitespace.
+ // This matches the canonicalization done in DoCanonicalize function.
+ url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+ int spec_len;
+ const CHAR* spec = RemoveURLWhitespace(str, str_len,
+ &whitespace_buffer, &spec_len);
+
url_parse::Component our_scheme;
- if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {
+ if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) {
// No scheme.
if (found_scheme)
*found_scheme = url_parse::Component();
@@ -147,7 +138,7 @@ bool DoFindAndCompareScheme(const CHAR* str,
}
if (found_scheme)
*found_scheme = our_scheme;
- return CompareSchemeComponent(str, our_scheme, compare);
+ return CompareSchemeComponent(spec, our_scheme, compare);
}
template<typename CHAR>
@@ -184,7 +175,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
#endif
url_parse::Component scheme;
- if(!url_parse::ExtractScheme(spec, spec_len, &scheme))
+ if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
return false;
// This is the parsed version of the input URL, we have to canonicalize it
@@ -197,7 +188,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
charset_converter,
output, output_parsed);
- } else if (IsStandard(spec, spec_len, scheme)) {
+ } else if (DoIsStandard(spec, scheme)) {
// All "normal" URLs.
url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
@@ -239,7 +230,7 @@ bool DoResolveRelative(const char* base_spec,
// See if our base URL should be treated as "standard".
bool standard_base_scheme =
base_parsed.scheme.is_nonempty() &&
- IsStandard(base_spec, base_spec_len, base_parsed.scheme);
+ DoIsStandard(base_spec, base_parsed.scheme);
bool is_relative;
url_parse::Component relative_component;
@@ -275,53 +266,111 @@ bool DoReplaceComponents(const char* spec,
url_canon::CharsetConverter* charset_converter,
url_canon::CanonOutput* output,
url_parse::Parsed* out_parsed) {
- // Note that we dispatch to the parser according the the scheme type of
- // the OUTPUT URL. Normally, this is the same as our scheme, but if the
- // scheme is being overridden, we need to test that.
-
- if (// Either the scheme is not replaced and the old one is a file,
- (!replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||
- // ...or it is being replaced and the new one is a file.
- (replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(replacements.sources().scheme,
- replacements.components().scheme,
- kFileScheme))) {
+ // If the scheme is overridden, just do a simple string substitution and
+ // reparse the whole thing. There are lots of edge cases that we really don't
+ // want to deal with. Like what happens if I replace "http://e:8080/foo"
+ // with a file. Does it become "file:///E:/8080/foo" where the port number
+ // becomes part of the path? Parsing that string as a file URL says "yes"
+ // but almost no sane rule for dealing with the components individually would
+ // come up with that.
+ //
+ // Why allow these crazy cases at all? Programatically, there is almost no
+ // case for replacing the scheme. The most common case for hitting this is
+ // in JS when building up a URL using the location object. In this case, the
+ // JS code expects the string substitution behavior:
+ // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+ if (replacements.IsSchemeOverridden()) {
+ // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+ // the existing spec.
+ url_canon::RawCanonOutput<128> scheme_replaced;
+ url_parse::Component scheme_replaced_parsed;
+ url_canon::CanonicalizeScheme(
+ replacements.sources().scheme,
+ replacements.components().scheme,
+ &scheme_replaced, &scheme_replaced_parsed);
+
+ // We can assume that the input is canonicalized, which means it always has
+ // a colon after the scheme (or where the scheme would be).
+ int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
+ : 1;
+ if (spec_len - spec_after_colon > 0) {
+ scheme_replaced.Append(&spec[spec_after_colon],
+ spec_len - spec_after_colon);
+ }
+
+ // We now need to completely re-parse the resulting string since its meaning
+ // may have changed with the different scheme.
+ url_canon::RawCanonOutput<128> recanonicalized;
+ url_parse::Parsed recanonicalized_parsed;
+ DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+ charset_converter,
+ &recanonicalized, &recanonicalized_parsed);
+
+ // Recurse using the version with the scheme already replaced. This will now
+ // use the replacement rules for the new scheme.
+ //
+ // Warning: this code assumes that ReplaceComponents will re-check all
+ // components for validity. This is because we can't fail if DoCanonicalize
+ // failed above since theoretically the thing making it fail could be
+ // getting replaced here. If ReplaceComponents didn't re-check everything,
+ // we wouldn't know if something *not* getting replaced is a problem.
+ // If the scheme-specific replacers are made more intelligent so they don't
+ // re-check everything, we should instead recanonicalize the whole thing
+ // after this call to check validity (this assumes replacing the scheme is
+ // much much less common than other types of replacements, like clearing the
+ // ref).
+ url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
+ replacements_no_scheme.SetScheme(NULL, url_parse::Component());
+ return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
+ recanonicalized_parsed, replacements_no_scheme,
+ charset_converter, output, out_parsed);
+ }
+
+ // If we get here, then we know the scheme doesn't need to be replaced, so can
+ // just key off the scheme in the spec to know how to do the replacements.
+ if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
return url_canon::ReplaceFileURL(spec, parsed, replacements,
charset_converter, output, out_parsed);
}
-
- if (// Either the scheme is not replaced and the old one is standard,
- (!replacements.IsSchemeOverridden() &&
- IsStandard(spec, spec_len, parsed.scheme)) ||
- // ...or it is being replaced and the new one is standard.
- (replacements.IsSchemeOverridden() &&
- IsStandardScheme(replacements.sources().scheme,
- replacements.components().scheme))) {
- // Standard URL with all parts.
+ if (DoIsStandard(spec, parsed.scheme)) {
return url_canon::ReplaceStandardURL(spec, parsed, replacements,
charset_converter, output, out_parsed);
}
-
- if (// Either the scheme is not replaced and the old one is mailto,
- (!replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||
- // ...or it is being replaced and the new one is a mailto.
- (replacements.IsSchemeOverridden() &&
- CompareSchemeComponent(replacements.sources().scheme,
- replacements.components().scheme,
- kMailtoScheme))) {
+ if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
output, out_parsed);
}
+ // Default is a path URL.
return url_canon::ReplacePathURL(spec, parsed, replacements,
output, out_parsed);
}
} // namespace
+void Initialize() {
+ InitStandardSchemes();
+}
+
+void Shutdown() {
+ if (standard_schemes) {
+ delete standard_schemes;
+ standard_schemes = NULL;
+ }
+}
+
void AddStandardScheme(const char* new_scheme) {
+ // If this assert triggers, it means you've called AddStandardScheme after
+ // LockStandardSchemes have been called (see the header file for
+ // LockStandardSchemes for more).
+ //
+ // This normally means you're trying to set up a new standard scheme too late
+ // in your application's init process. Locate where your app does this
+ // initialization and calls LockStandardScheme, and add your new standard
+ // scheme there.
+ DCHECK(!standard_schemes_locked) <<
+ "Trying to add a standard scheme after the list has been locked.";
+
size_t scheme_len = strlen(new_scheme);
if (scheme_len == 0)
return;
@@ -335,14 +384,16 @@ void AddStandardScheme(const char* new_scheme) {
standard_schemes->push_back(dup_scheme);
}
-bool IsStandard(const char* spec, int spec_len,
- const url_parse::Component& scheme) {
- return DoIsStandard(spec, spec_len, scheme);
+void LockStandardSchemes() {
+ standard_schemes_locked = true;
+}
+
+bool IsStandard(const char* spec, const url_parse::Component& scheme) {
+ return DoIsStandard(spec, scheme);
}
-bool IsStandard(const char16* spec, int spec_len,
- const url_parse::Component& scheme) {
- return DoIsStandard(spec, spec_len, scheme);
+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
+ return DoIsStandard(spec, scheme);
}
bool FindAndCompareScheme(const char* str,
@@ -450,4 +501,53 @@ bool LowerCaseEqualsASCII(const char16* a_begin,
return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
+void DecodeURLEscapeSequences(const char* input, int length,
+ url_canon::CanonOutputW* output) {
+ url_canon::RawCanonOutputT<char> unescaped_chars;
+ for (int i = 0; i < length; i++) {
+ if (input[i] == '%') {
+ unsigned char ch;
+ if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
+ unescaped_chars.push_back(ch);
+ } else {
+ // Invalid escape sequence, copy the percent literal.
+ unescaped_chars.push_back('%');
+ }
+ } else {
+ // Regular non-escaped 8-bit character.
+ unescaped_chars.push_back(input[i]);
+ }
+ }
+
+ // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
+ // JavaScript URLs, but Firefox and Safari do.
+ for (int i = 0; i < unescaped_chars.length(); i++) {
+ unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+ if (uch < 0x80) {
+ // Non-UTF-8, just append directly
+ output->push_back(uch);
+ } else {
+ // next_ch will point to the last character of the decoded
+ // character.
+ int next_character = i;
+ unsigned code_point;
+ if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
+ unescaped_chars.length(), &code_point)) {
+ // Valid UTF-8 character, convert to UTF-16.
+ url_canon::AppendUTF16Value(code_point, output);
+ i = next_character;
+ } else {
+ // If there are any sequences that are not valid UTF-8, we keep
+ // invalid code points and promote to UTF-16. We copy all characters
+ // from the current position to the end of the identified sequence.
+ while (i < next_character) {
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+ i++;
+ }
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+ }
+ }
+ }
+}
+
} // namespace url_util