1 files changed, 166 insertions, 66 deletions
diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc
index d623b45..7e100aa 100644
--- a/googleurl/src/url_util.cc
+++ b/googleurl/src/url_util.cc
@@ -33,6 +33,7 @@
 #include "googleurl/src/url_util.h"
 
 #include "base/logging.h"
+#include "googleurl/src/url_canon_internal.h"
 #include "googleurl/src/url_file.h"
 
 namespace url_util {
@@ -58,13 +59,15 @@ inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
 const char kFileScheme[] = "file";  // Used in a number of places.
 const char kMailtoScheme[] = "mailto";
 
-const int kNumStandardURLSchemes = 5;
+const int kNumStandardURLSchemes = 7;
 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
   "http",
   "https",
   kFileScheme,  // Yes, file urls can have a hostname!
   "ftp",
   "gopher",
+  "ws",  // WebSocket.
+  "wss",  // WebSocket secure.
 };
 
 // List of the currently installed standard schemes. This list is lazily
@@ -72,6 +75,9 @@ const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
 // any destructors from being called that will slow us down or cause problems.
 std::vector<const char*>* standard_schemes = NULL;
 
+// See the LockStandardSchemes declaration in the header.
+bool standard_schemes_locked = false;
+
 // Ensures that the standard_schemes list is initialized, does nothing if it
 // already has values.
 void InitStandardSchemes() {
@@ -96,10 +102,9 @@ inline bool CompareSchemeComponent(const CHAR* spec,
 }
 
 // Returns true if the given scheme identified by |scheme| within |spec| is one
-// of the registered "standard" schemes. Note that this does not check for
-// "://", use IsStandard for that.
+// of the registered "standard" schemes.
 template<typename CHAR>
-bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
   if (!scheme.is_nonempty())
     return false;  // Empty or invalid schemes are non-standard.
 
@@ -112,34 +117,20 @@ bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {
   return false;
 }
 
-// Returns true if the stuff following the scheme in the given spec indicates
-// a "standard" URL. The presence of "://" after the scheme indicates that
-// there is a hostname, etc. which we call a standard URL.
-template<typename CHAR>
-bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len,
-                                const url_parse::Component& scheme) {
-  int after_scheme = scheme.end();
-  if (spec_len < after_scheme + 3)
-    return false;
-  return spec[after_scheme] == ':' &&
-         spec[after_scheme + 1] == '/' &&
-         spec[after_scheme + 2] == '/';
-}
-
-template<typename CHAR>
-bool DoIsStandard(const CHAR* spec, int spec_len,
-                  const url_parse::Component& scheme) {
-  return HasStandardSchemeSeparator(spec, spec_len, scheme) ||
-         IsStandardScheme(spec, scheme);
-}
-
 template<typename CHAR>
 bool DoFindAndCompareScheme(const CHAR* str,
                             int str_len,
                             const char* compare,
                             url_parse::Component* found_scheme) {
+  // Before extracting scheme, canonicalize the URL to remove any whitespace.
+  // This matches the canonicalization done in DoCanonicalize function.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(str, str_len,
+                                         &whitespace_buffer, &spec_len);
+
   url_parse::Component our_scheme;
-  if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {
+  if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) {
     // No scheme.
     if (found_scheme)
       *found_scheme = url_parse::Component();
@@ -147,7 +138,7 @@ bool DoFindAndCompareScheme(const CHAR* str,
   }
   if (found_scheme)
     *found_scheme = our_scheme;
-  return CompareSchemeComponent(str, our_scheme, compare);
+  return CompareSchemeComponent(spec, our_scheme, compare);
 }
 
 template<typename CHAR>
@@ -184,7 +175,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
 #endif
 
   url_parse::Component scheme;
-  if(!url_parse::ExtractScheme(spec, spec_len, &scheme))
+  if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
     return false;
 
   // This is the parsed version of the input URL, we have to canonicalize it
@@ -197,7 +188,7 @@ bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
                                              charset_converter,
                                              output, output_parsed);
 
-  } else if (IsStandard(spec, spec_len, scheme)) {
+  } else if (DoIsStandard(spec, scheme)) {
     // All "normal" URLs.
     url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
     success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
@@ -239,7 +230,7 @@ bool DoResolveRelative(const char* base_spec,
   // See if our base URL should be treated as "standard".
   bool standard_base_scheme =
       base_parsed.scheme.is_nonempty() &&
-      IsStandard(base_spec, base_spec_len, base_parsed.scheme);
+      DoIsStandard(base_spec, base_parsed.scheme);
 
   bool is_relative;
   url_parse::Component relative_component;
@@ -275,53 +266,111 @@ bool DoReplaceComponents(const char* spec,
                          url_canon::CharsetConverter* charset_converter,
                          url_canon::CanonOutput* output,
                          url_parse::Parsed* out_parsed) {
-  // Note that we dispatch to the parser according the the scheme type of
-  // the OUTPUT URL. Normally, this is the same as our scheme, but if the
-  // scheme is being overridden, we need to test that.
-
-  if (// Either the scheme is not replaced and the old one is a file,
-      (!replacements.IsSchemeOverridden() &&
-       CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||
-      // ...or it is being replaced and the new one is a file.
-      (replacements.IsSchemeOverridden() &&
-       CompareSchemeComponent(replacements.sources().scheme,
-                              replacements.components().scheme,
-                              kFileScheme))) {
+  // If the scheme is overridden, just do a simple string substitution and
+  // reparse the whole thing. There are lots of edge cases that we really don't
+  // want to deal with. Like what happens if I replace "http://e:8080/foo"
+  // with a file. Does it become "file:///E:/8080/foo" where the port number
+  // becomes part of the path? Parsing that string as a file URL says "yes"
+  // but almost no sane rule for dealing with the components individually would
+  // come up with that.
+  //
+  // Why allow these crazy cases at all? Programatically, there is almost no
+  // case for replacing the scheme. The most common case for hitting this is
+  // in JS when building up a URL using the location object. In this case, the
+  // JS code expects the string substitution behavior:
+  //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+  if (replacements.IsSchemeOverridden()) {
+    // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+    // the existing spec.
+    url_canon::RawCanonOutput<128> scheme_replaced;
+    url_parse::Component scheme_replaced_parsed;
+    url_canon::CanonicalizeScheme(
+        replacements.sources().scheme,
+        replacements.components().scheme,
+        &scheme_replaced, &scheme_replaced_parsed);
+
+    // We can assume that the input is canonicalized, which means it always has
+    // a colon after the scheme (or where the scheme would be).
+    int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
+                                                    : 1;
+    if (spec_len - spec_after_colon > 0) {
+      scheme_replaced.Append(&spec[spec_after_colon],
+                             spec_len - spec_after_colon);
+    }
+
+    // We now need to completely re-parse the resulting string since its meaning
+    // may have changed with the different scheme.
+    url_canon::RawCanonOutput<128> recanonicalized;
+    url_parse::Parsed recanonicalized_parsed;
+    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+                   charset_converter,
+                   &recanonicalized, &recanonicalized_parsed);
+
+    // Recurse using the version with the scheme already replaced. This will now
+    // use the replacement rules for the new scheme.
+    //
+    // Warning: this code assumes that ReplaceComponents will re-check all
+    // components for validity. This is because we can't fail if DoCanonicalize
+    // failed above since theoretically the thing making it fail could be
+    // getting replaced here. If ReplaceComponents didn't re-check everything,
+    // we wouldn't know if something *not* getting replaced is a problem.
+    // If the scheme-specific replacers are made more intelligent so they don't
+    // re-check everything, we should instead recanonicalize the whole thing
+    // after this call to check validity (this assumes replacing the scheme is
+    // much much less common than other types of replacements, like clearing the
+    // ref).
+    url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
+    replacements_no_scheme.SetScheme(NULL, url_parse::Component());
+    return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
+                               recanonicalized_parsed, replacements_no_scheme,
+                               charset_converter, output, out_parsed);
+  }
+
+  // If we get here, then we know the scheme doesn't need to be replaced, so can
+  // just key off the scheme in the spec to know how to do the replacements.
+  if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
     return url_canon::ReplaceFileURL(spec, parsed, replacements,
                                      charset_converter, output, out_parsed);
   }
-
-  if (// Either the scheme is not replaced and the old one is standard,
-      (!replacements.IsSchemeOverridden() &&
-       IsStandard(spec, spec_len, parsed.scheme)) ||
-      // ...or it is being replaced and the new one is standard.
-      (replacements.IsSchemeOverridden() &&
-       IsStandardScheme(replacements.sources().scheme,
-                        replacements.components().scheme))) {
-    // Standard URL with all parts.
+  if (DoIsStandard(spec, parsed.scheme)) {
     return url_canon::ReplaceStandardURL(spec, parsed, replacements,
                                          charset_converter, output, out_parsed);
   }
-
-  if (// Either the scheme is not replaced and the old one is mailto,
-      (!replacements.IsSchemeOverridden() &&
-       CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||
-      // ...or it is being replaced and the new one is a mailto.
-      (replacements.IsSchemeOverridden() &&
-       CompareSchemeComponent(replacements.sources().scheme,
-                              replacements.components().scheme,
-                              kMailtoScheme))) {
+  if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
      return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
                                         output, out_parsed);
   }
 
+  // Default is a path URL.
   return url_canon::ReplacePathURL(spec, parsed, replacements,
                                    output, out_parsed);
 }
 
 }  // namespace
 
+void Initialize() {
+  InitStandardSchemes();
+}
+
+void Shutdown() {
+  if (standard_schemes) {
+    delete standard_schemes;
+    standard_schemes = NULL;
+  }
+}
+
 void AddStandardScheme(const char* new_scheme) {
+  // If this assert triggers, it means you've called AddStandardScheme after
+  // LockStandardSchemes have been called (see the header file for
+  // LockStandardSchemes for more).
+  //
+  // This normally means you're trying to set up a new standard scheme too late
+  // in your application's init process. Locate where your app does this
+  // initialization and calls LockStandardScheme, and add your new standard
+  // scheme there.
+  DCHECK(!standard_schemes_locked) <<
+      "Trying to add a standard scheme after the list has been locked.";
+
   size_t scheme_len = strlen(new_scheme);
   if (scheme_len == 0)
     return;
@@ -335,14 +384,16 @@ void AddStandardScheme(const char* new_scheme) {
   standard_schemes->push_back(dup_scheme);
 }
 
-bool IsStandard(const char* spec, int spec_len,
-                const url_parse::Component& scheme) {
-  return DoIsStandard(spec, spec_len, scheme);
+void LockStandardSchemes() {
+  standard_schemes_locked = true;
+}
+
+bool IsStandard(const char* spec, const url_parse::Component& scheme) {
+  return DoIsStandard(spec, scheme);
 }
 
-bool IsStandard(const char16* spec, int spec_len,
-                const url_parse::Component& scheme) {
-  return DoIsStandard(spec, spec_len, scheme);
+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
+  return DoIsStandard(spec, scheme);
 }
 
 bool FindAndCompareScheme(const char* str,
@@ -450,4 +501,53 @@ bool LowerCaseEqualsASCII(const char16* a_begin,
   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 }
 
+void DecodeURLEscapeSequences(const char* input, int length,
+                              url_canon::CanonOutputW* output) {
+  url_canon::RawCanonOutputT<char> unescaped_chars;
+  for (int i = 0; i < length; i++) {
+    if (input[i] == '%') {
+      unsigned char ch;
+      if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
+        unescaped_chars.push_back(ch);
+      } else {
+        // Invalid escape sequence, copy the percent literal.
+        unescaped_chars.push_back('%');
+      }
+    } else {
+      // Regular non-escaped 8-bit character.
+      unescaped_chars.push_back(input[i]);
+    }
+  }
+
+  // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
+  // JavaScript URLs, but Firefox and Safari do.
+  for (int i = 0; i < unescaped_chars.length(); i++) {
+    unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+    if (uch < 0x80) {
+      // Non-UTF-8, just append directly
+      output->push_back(uch);
+    } else {
+      // next_ch will point to the last character of the decoded
+      // character.
+      int next_character = i;
+      unsigned code_point;
+      if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
+                                 unescaped_chars.length(), &code_point)) {
+        // Valid UTF-8 character, convert to UTF-16.
+        url_canon::AppendUTF16Value(code_point, output);
+        i = next_character;
+      } else {
+        // If there are any sequences that are not valid UTF-8, we keep
+        // invalid code points and promote to UTF-16. We copy all characters
+        // from the current position to the end of the identified sequence.
+        while (i < next_character) {
+          output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+          i++;
+        }
+        output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+      }
+    }
+  }
+}
+
 }  // namespace url_util