1 files changed, 223 insertions, 132 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 85151e9..9171e54 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -650,60 +650,51 @@ bool IsIDNComponentSafe(const char16* str,
 }
 
 // Converts one component of a host (between dots) to IDN if safe. The result
-// will be APPENDED to the given output string and  will be the same as the
-// input if it is not IDN or the IDN is unsafe to display.
-void IDNToUnicodeOneComponent(const char16* comp,
-                              int comp_len,
+// will be APPENDED to the given output string and will be the same as the input
+// if it is not IDN or the IDN is unsafe to display.  Returns whether any
+// conversion was performed.
+bool IDNToUnicodeOneComponent(const char16* comp,
+                              size_t comp_len,
                               const std::wstring& languages,
                               string16* out) {
-  DCHECK(comp_len >= 0);
+  DCHECK(out);
   if (comp_len == 0)
-    return;
+    return false;
 
-  // Expand the output string to make room for a possibly longer string
-  // (we'll expand if it's still not big enough below).
-  int extra_space = 64;
-  size_t host_begin_in_output = out->size();
-
-  // Just copy the input if it can't be an IDN component.
-  if (comp_len < 4 ||
-      comp[0] != 'x' || comp[1] != 'n' || comp[2] != '-' || comp[3] != '-') {
-    out->resize(host_begin_in_output + comp_len);
-    for (int i = 0; i < comp_len; i++)
-      (*out)[host_begin_in_output + i] = comp[i];
-    return;
-  }
+  // Only transform if the input can be an IDN component.
+  static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
+  if ((comp_len > arraysize(kIdnPrefix)) &&
+      !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) {
+    // Repeatedly expand the output string until it's big enough.  It looks like
+    // ICU will return the required size of the buffer, but that's not
+    // documented, so we'll just grow by 2x. This should be rare and is not on a
+    // critical path.
+    size_t original_length = out->length();
+    for (int extra_space = 64; ; extra_space *= 2) {
+      UErrorCode status = U_ZERO_ERROR;
+      out->resize(out->length() + extra_space);
+      int output_chars = uidna_IDNToUnicode(comp,
+          static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space,
+          UIDNA_DEFAULT, NULL, &status);
+      if (status == U_ZERO_ERROR) {
+        // Converted successfully.
+        out->resize(original_length + output_chars);
+        if (IsIDNComponentSafe(out->data() + original_length, output_chars,
+                               languages))
+          return true;
+      }
 
-  while (true) {
-    UErrorCode status = U_ZERO_ERROR;
-    out->resize(out->size() + extra_space);
-    int output_chars =
-        uidna_IDNToUnicode(comp, comp_len, &(*out)[host_begin_in_output],
-                           extra_space, UIDNA_DEFAULT, NULL, &status);
-    if (status == U_ZERO_ERROR) {
-      // Converted successfully.
-      out->resize(host_begin_in_output + output_chars);
-      if (!IsIDNComponentSafe(&out->data()[host_begin_in_output],
-                              output_chars,
-                              languages))
-        break;  // The error handling below will undo the IDN.
-      return;
+      if (status != U_BUFFER_OVERFLOW_ERROR)
+        break;
     }
-    if (status != U_BUFFER_OVERFLOW_ERROR)
-      break;
-
-    // Need to loop again with a bigger buffer. It looks like ICU will
-    // return the required size of the buffer, but that's not documented,
-    // so we'll just grow by 2x. This should be rare and is not on a
-    // critical path.
-    extra_space *= 2;
+    // Failed, revert back to original string.
+    out->resize(original_length);
   }
 
-  // We get here on error, in which case we replace anything that was added
-  // with the literal input.
-  out->resize(host_begin_in_output + comp_len);
-  for (int i = 0; i < comp_len; i++)
-    (*out)[host_begin_in_output + i] = comp[i];
+  // We get here with no IDN or on error, in which case we just append the
+  // literal input.
+  out->append(comp, comp_len);
+  return false;
 }
 
 // Helper for FormatUrl().
@@ -712,19 +703,23 @@ std::wstring FormatViewSourceUrl(const GURL& url,
                                  bool omit_username_password,
                                  UnescapeRule::Type unescape_rules,
                                  url_parse::Parsed* new_parsed,
-                                 size_t* prefix_end) {
+                                 size_t* prefix_end,
+                                 size_t* offset_for_adjustment) {
   DCHECK(new_parsed);
   const wchar_t* const kWideViewSource = L"view-source:";
   const size_t kViewSourceLengthPlus1 = 12;
 
   GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1));
+  size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ?
+      std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1);
+  size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ?
+      NULL : &temp_offset;
   std::wstring result = net::FormatUrl(real_url, languages,
-      omit_username_password, unescape_rules, new_parsed, prefix_end);
+      omit_username_password, unescape_rules, new_parsed, prefix_end,
+      temp_offset_ptr);
   result.insert(0, kWideViewSource);
 
   // Adjust position values.
-  if (prefix_end)
-    *prefix_end += kViewSourceLengthPlus1;
   if (new_parsed->scheme.is_nonempty()) {
     // Assume "view-source:real-scheme" as a scheme.
     new_parsed->scheme.len += kViewSourceLengthPlus1;
@@ -746,6 +741,12 @@ std::wstring FormatViewSourceUrl(const GURL& url,
     new_parsed->query.begin += kViewSourceLengthPlus1;
   if (new_parsed->ref.is_nonempty())
     new_parsed->ref.begin += kViewSourceLengthPlus1;
+  if (prefix_end)
+    *prefix_end += kViewSourceLengthPlus1;
+  if (temp_offset_ptr) {
+    *offset_for_adjustment = (temp_offset == std::wstring::npos) ?
+        std::wstring::npos : (temp_offset + kViewSourceLengthPlus1);
+  }
   return result;
 }
 
@@ -769,12 +770,20 @@ std::set<int> explicitly_allowed_ports;
 
 // Appends the substring |in_component| inside of the URL |spec| to |output|,
 // and the resulting range will be filled into |out_component|. |unescape_rules|
-// defines how to clean the URL for human readability.
+// defines how to clean the URL for human readability.  |offset_for_adjustment|
+// is an offset into |output| which will be adjusted based on how it maps to the
+// component being converted; if it is npos or less than output->length(), it
+// will be untouched, and if it is at least output->length() + in_component.len
+// it will be shifted by the difference in lengths between the input and output
+// components.  Otherwise it points into the component being converted, and is
+// adjusted to point to the same logical place in |output|, or set to npos if
+// the conversion yields npos.  |offset_for_adjustment| must not be NULL.
 static void AppendFormattedComponent(const std::string& spec,
                                      const url_parse::Component& in_component,
                                      UnescapeRule::Type unescape_rules,
                                      std::wstring* output,
-                                     url_parse::Component* out_component);
+                                     url_parse::Component* out_component,
+                                     size_t* offset_for_adjustment);
 
 GURL FilePathToFileURL(const FilePath& path) {
   // Produce a URL like "file:///C:/foo" for a regular file, or
@@ -849,58 +858,56 @@ std::string GetHeaderParamValue(const std::string& field,
 //
 // We may want to skip this step in the case of file URLs to allow unicode
 // UNC hostnames regardless of encodings.
-void IDNToUnicode(const char* host,
-                  int host_len,
-                  const std::wstring& languages,
-                  std::wstring* out) {
+std::wstring IDNToUnicode(const char* host,
+                          size_t host_len,
+                          const std::wstring& languages,
+                          size_t* offset_for_adjustment) {
   // Convert the ASCII input to a wide string for ICU.
   string16 input16;
   input16.reserve(host_len);
-  for (int i = 0; i < host_len; i++)
-    input16.push_back(host[i]);
+  std::copy(host, host + host_len, std::back_inserter(input16));
 
   string16 out16;
-  // The output string is appended to, so convert what's already there if
-  // needed.
-#if defined(WCHAR_T_IS_UTF32)
-  WideToUTF16(out->data(), out->length(), &out16);
-  out->clear();  // for equivalence with the swap below
-#elif defined(WCHAR_T_IS_UTF16)
-  out->swap(out16);
-#endif
+  size_t output_offset = offset_for_adjustment ?
+      *offset_for_adjustment : std::wstring::npos;
 
   // Do each component of the host separately, since we enforce script matching
   // on a per-component basis.
-  size_t cur_begin = 0;  // Beginning of the current component (inclusive).
-  while (cur_begin < input16.size()) {
-    // Find the next dot or the end of the string.
-    size_t next_dot = input16.find_first_of('.', cur_begin);
-    if (next_dot == std::wstring::npos)
-      next_dot = input16.size();  // For getting the last component.
-
-    if (next_dot > cur_begin) {
+  for (size_t component_start = 0, component_end;
+       component_start < input16.length();
+       component_start = component_end + 1) {
+    // Find the end of the component.
+    component_end = input16.find('.', component_start);
+    if (component_end == string16::npos)
+      component_end = input16.length();  // For getting the last component.
+    size_t component_length = component_end - component_start;
+
+    size_t output_component_start = out16.length();
+    bool converted_idn = false;
+    if (component_end > component_start) {
       // Add the substring that we just found.
-      IDNToUnicodeOneComponent(&input16[cur_begin],
-                               static_cast<int>(next_dot - cur_begin),
-                               languages,
-                               &out16);
+      converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,
+          component_length, languages, &out16);
+    }
+    size_t output_component_length = out16.length() - output_component_start;
+
+    if ((output_offset != std::wstring::npos) &&
+        (*offset_for_adjustment > component_start)) {
+      if ((*offset_for_adjustment < component_end) && converted_idn)
+        output_offset = std::wstring::npos;
+      else
+        output_offset += output_component_length - component_length;
     }
 
-    // Need to add the dot we just found (if we found one). This needs to be
-    // done before we break out below in case the URL ends in a dot.
-    if (next_dot < input16.size())
+    // Need to add the dot we just found (if we found one).
+    if (component_end < input16.length())
       out16.push_back('.');
-    else
-      break;  // No more components left.
-
-    cur_begin = next_dot + 1;
   }
 
-#if defined(WCHAR_T_IS_UTF32)
-  UTF16ToWide(out16.data(), out16.length(), out);
-#elif defined(WCHAR_T_IS_UTF16)
-  out->swap(out16);
-#endif
+  if (offset_for_adjustment)
+    *offset_for_adjustment = output_offset;
+
+  return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment);
 }
 
 std::string CanonicalizeHost(const std::string& host,
@@ -1262,31 +1269,48 @@ void GetIdentityFromURL(const GURL& url,
                         std::wstring* username,
                         std::wstring* password) {
   UnescapeRule::Type flags = UnescapeRule::SPACES;
-  *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags);
-  *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags);
+  *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL);
+  *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL);
 }
 
 void AppendFormattedHost(const GURL& url,
                          const std::wstring& languages,
                          std::wstring* output,
-                         url_parse::Parsed* new_parsed) {
+                         url_parse::Parsed* new_parsed,
+                         size_t* offset_for_adjustment) {
+  DCHECK(output);
   const url_parse::Component& host =
       url.parsed_for_possibly_invalid_spec().host;
 
   if (host.is_nonempty()) {
     // Handle possible IDN in the host name.
+    int new_host_begin = static_cast<int>(output->length());
     if (new_parsed)
-      new_parsed->host.begin = static_cast<int>(output->length());
+      new_parsed->host.begin = new_host_begin;
+    size_t offset_past_current_output =
+        (!offset_for_adjustment ||
+         (*offset_for_adjustment == std::wstring::npos) ||
+         (*offset_for_adjustment < output->length())) ?
+            std::wstring::npos : (*offset_for_adjustment - output->length());
+    size_t* offset_into_host =
+        (offset_past_current_output >= static_cast<size_t>(host.len)) ?
+            NULL : &offset_past_current_output;
 
     const std::string& spec = url.possibly_invalid_spec();
     DCHECK(host.begin >= 0 &&
            ((spec.length() == 0 && host.begin == 0) ||
             host.begin < static_cast<int>(spec.length())));
-    net::IDNToUnicode(&spec[host.begin], host.len, languages, output);
+    output->append(net::IDNToUnicode(&spec[host.begin],
+                   static_cast<size_t>(host.len), languages, offset_into_host));
 
-    if (new_parsed) {
-      new_parsed->host.len =
-          static_cast<int>(output->length()) - new_parsed->host.begin;
+    int new_host_len = static_cast<int>(output->length()) - new_host_begin;
+    if (new_parsed)
+      new_parsed->host.len = new_host_len;
+    if (offset_into_host) {
+      *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ?
+          std::wstring::npos : (new_host_begin + *offset_into_host);
+    } else if (offset_past_current_output != std::wstring::npos) {
+      *offset_for_adjustment += new_host_len - host.len;
     }
   } else if (new_parsed) {
     new_parsed->host.reset();
@@ -1298,19 +1322,36 @@ void AppendFormattedComponent(const std::string& spec,
                               const url_parse::Component& in_component,
                               UnescapeRule::Type unescape_rules,
                               std::wstring* output,
-                              url_parse::Component* out_component) {
+                              url_parse::Component* out_component,
+                              size_t* offset_for_adjustment) {
+  DCHECK(output);
+  DCHECK(offset_for_adjustment);
   if (in_component.is_nonempty()) {
     out_component->begin = static_cast<int>(output->length());
+    size_t offset_past_current_output =
+        ((*offset_for_adjustment == std::wstring::npos) ||
+         (*offset_for_adjustment < output->length())) ?
+            std::wstring::npos : (*offset_for_adjustment - output->length());
+    size_t* offset_into_component =
+        (offset_past_current_output >= static_cast<size_t>(in_component.len)) ?
+            NULL : &offset_past_current_output;
     if (unescape_rules == UnescapeRule::NONE) {
-      output->append(UTF8ToWide(spec.substr(
-          in_component.begin, in_component.len)));
+      output->append(UTF8ToWideAndAdjustOffset(
+          spec.substr(in_component.begin, in_component.len),
+          offset_into_component));
     } else {
       output->append(UnescapeAndDecodeUTF8URLComponent(
-          spec.substr(in_component.begin, in_component.len),
-          unescape_rules));
+          spec.substr(in_component.begin, in_component.len), unescape_rules,
+          offset_into_component));
     }
     out_component->len =
         static_cast<int>(output->length()) - out_component->begin;
+    if (offset_into_component) {
+      *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ?
+          std::wstring::npos : (out_component->begin + *offset_into_component);
+    } else if (offset_past_current_output != std::wstring::npos) {
+      *offset_for_adjustment += out_component->len - in_component.len;
+    }
   } else {
     out_component->reset();
   }
@@ -1321,10 +1362,14 @@ std::wstring FormatUrl(const GURL& url,
                        bool omit_username_password,
                        UnescapeRule::Type unescape_rules,
                        url_parse::Parsed* new_parsed,
-                       size_t* prefix_end) {
+                       size_t* prefix_end,
+                       size_t* offset_for_adjustment) {
   url_parse::Parsed parsed_temp;
   if (!new_parsed)
     new_parsed = &parsed_temp;
+  size_t offset_temp = std::wstring::npos;
+  if (!offset_for_adjustment)
+    offset_for_adjustment = &offset_temp;
 
   std::wstring url_string;
 
@@ -1332,6 +1377,7 @@ std::wstring FormatUrl(const GURL& url,
   if (url.is_empty()) {
     if (prefix_end)
       *prefix_end = 0;
+    *offset_for_adjustment = std::wstring::npos;
     return url_string;
   }
 
@@ -1343,19 +1389,22 @@ std::wstring FormatUrl(const GURL& url,
   if (url.SchemeIs(kViewSource) &&
       !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
     return FormatViewSourceUrl(url, languages, omit_username_password,
-        unescape_rules, new_parsed, prefix_end);
+        unescape_rules, new_parsed, prefix_end, offset_for_adjustment);
   }
 
   // We handle both valid and invalid URLs (this will give us the spec
   // regardless of validity).
   const std::string& spec = url.possibly_invalid_spec();
   const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
+  if (*offset_for_adjustment >= spec.length())
+    *offset_for_adjustment = std::wstring::npos;
 
   // Copy everything before the username (the scheme and the separators.)
   // These are ASCII.
-  int pre_end = parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, true);
-  for (int i = 0; i < pre_end; ++i)
-    url_string.push_back(spec[i]);
+  std::copy(spec.begin(),
+      spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
+                                                  true),
+      std::back_inserter(url_string));
   new_parsed->scheme = parsed.scheme;
 
   if (omit_username_password) {
@@ -1364,16 +1413,41 @@ std::wstring FormatUrl(const GURL& url,
     // e.g. "http://google.com:search@evil.ru/"
     new_parsed->username.reset();
     new_parsed->password.reset();
+    if ((*offset_for_adjustment != std::wstring::npos) &&
+        (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
+      if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
+        // The strict '>' and the "+ 2" below are intentional: they account for
+        // the ':' after the username and the '@' after the password.
+        if (*offset_for_adjustment >
+            static_cast<size_t>(parsed.password.end())) {
+          *offset_for_adjustment -=
+              (parsed.username.len + parsed.password.len + 2);
+        } else if (*offset_for_adjustment >
+                   static_cast<size_t>(parsed.username.begin)) {
+          *offset_for_adjustment = std::wstring::npos;
+        }
+      } else {
+        const url_parse::Component* nonempty_component =
+            parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
+        // The strict '>' and the "+ 1" below account for the '@' after the
+        // username/password.  NOTE(review): a lone password also has a ':'.
+        if (*offset_for_adjustment >
+            static_cast<size_t>(nonempty_component->end())) {
+          *offset_for_adjustment -= (nonempty_component->len + 1);
+        } else if (*offset_for_adjustment >
+                   static_cast<size_t>(nonempty_component->begin)) {
+          *offset_for_adjustment = std::wstring::npos;
+        }
+      }
+    }
   } else {
-    AppendFormattedComponent(
-        spec, parsed.username, unescape_rules,
-        &url_string, &new_parsed->username);
+    AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string,
+                             &new_parsed->username, offset_for_adjustment);
     if (parsed.password.is_valid()) {
       url_string.push_back(':');
     }
-    AppendFormattedComponent(
-        spec, parsed.password, unescape_rules,
-        &url_string, &new_parsed->password);
+    AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string,
+                             &new_parsed->password, offset_for_adjustment);
     if (parsed.username.is_valid() || parsed.password.is_valid()) {
       url_string.push_back('@');
     }
@@ -1381,39 +1455,56 @@ std::wstring FormatUrl(const GURL& url,
   if (prefix_end)
     *prefix_end = static_cast<size_t>(url_string.length());
 
-  AppendFormattedHost(url, languages, &url_string, new_parsed);
+  AppendFormattedHost(url, languages, &url_string, new_parsed,
+                      offset_for_adjustment);
 
   // Port.
   if (parsed.port.is_nonempty()) {
     url_string.push_back(':');
-    int begin = url_string.length();
-    for (int i = parsed.port.begin; i < parsed.port.end(); ++i)
-      url_string.push_back(spec[i]);
-    new_parsed->port.begin = begin;
-    new_parsed->port.len = url_string.length() - begin;
+    new_parsed->port.begin = url_string.length();
+    std::copy(spec.begin() + parsed.port.begin,
+              spec.begin() + parsed.port.end(), std::back_inserter(url_string));
+    new_parsed->port.len = url_string.length() - new_parsed->port.begin;
   } else {
     new_parsed->port.reset();
   }
 
   // Path and query both get the same general unescape & convert treatment.
-  AppendFormattedComponent(
-      spec, parsed.path, unescape_rules, &url_string,
-      &new_parsed->path);
+  AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string,
+                           &new_parsed->path, offset_for_adjustment);
   if (parsed.query.is_valid())
     url_string.push_back('?');
-  AppendFormattedComponent(
-      spec, parsed.query, unescape_rules, &url_string,
-      &new_parsed->query);
+  AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string,
+                           &new_parsed->query, offset_for_adjustment);
 
   // Reference is stored in valid, unescaped UTF-8, so we can just convert.
   if (parsed.ref.is_valid()) {
     url_string.push_back('#');
-    int begin = url_string.length();
-    if (parsed.ref.len > 0)
-      url_string.append(UTF8ToWide(std::string(&spec[parsed.ref.begin],
-                                               parsed.ref.len)));
-    new_parsed->ref.begin = begin;
-    new_parsed->ref.len = url_string.length() - begin;
+    new_parsed->ref.begin = url_string.length();
+    size_t offset_past_current_output =
+        ((*offset_for_adjustment == std::wstring::npos) ||
+         (*offset_for_adjustment < url_string.length())) ?
+            std::wstring::npos : (*offset_for_adjustment - url_string.length());
+    size_t* offset_into_ref =
+        (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ?
+            NULL : &offset_past_current_output;
+    if (parsed.ref.len > 0) {
+      url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin,
+                                                              parsed.ref.len),
+                                                  offset_into_ref));
+    }
+    new_parsed->ref.len = url_string.length() - new_parsed->ref.begin;
+    if (offset_into_ref) {
+      *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ?
+          std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref);
+    } else if (offset_past_current_output != std::wstring::npos) {
+      // We clamped the offset near the beginning of this function to ensure it
+      // was within the input URL.  If we reach here, the input was something
+      // invalid and non-parseable such that the offset was past any component
+      // we could figure out.  In this case it won't be represented in the
+      // output string, so reset it.
+      *offset_for_adjustment = std::wstring::npos;
+    }
   }
 
   return url_string;