Fix various problems with inline autocomplete and URLs that change length during fixup:

* URLs with http auth info, which gets stripped
* URLs with IDN hosts
* URLs with escaped values that get unescaped

In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes DCHECK() in debug mode.

The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :(

There is also a fix here to the URL Fixer Upper, which was obviously modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since this isn't supposed to "fix up" things that aren't simple typos, I removed some code to "fix" bogus ports, which was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it).

BUG=4010
TEST=Covered by unittests

Review URL: http://codereview.chromium.org/372017

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
author: pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-11-07 01:34:53 +0000
committer: pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-11-07 01:34:53 +0000
commit: ce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch)
tree: 0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base/i18n/icu_string_conversions.cc
parent: d90684d0cf0aa16389c9202153c97d373829b7f3 (diff)
download: chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip
chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz
chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2
1 files changed, 128 insertions, 76 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc
index ba9f9ae..c93b103 100644
--- a/base/i18n/icu_string_conversions.cc
+++ b/base/i18n/icu_string_conversions.cc
@@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE";
 
 // Codepage <-> Wide/UTF-16  ---------------------------------------------------
 
+// Convert a UTF-16 string into the specified codepage_name.  If the codepage
+// isn't found, return false.
+bool UTF16ToCodepage(const string16& utf16,
+                     const char* codepage_name,
+                     OnStringConversionError::Type on_error,
+                     std::string* encoded) {
+  encoded->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
+  return ConvertFromUTF16(converter, utf16.c_str(),
+                          static_cast<int>(utf16.length()), on_error, encoded);
+}
+
+bool CodepageToUTF16AndAdjustOffset(const std::string& encoded,
+                                    const char* codepage_name,
+                                    OnStringConversionError::Type on_error,
+                                    string16* utf16,
+                                    size_t* offset_for_adjustment) {
+  utf16->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
+  // Even in the worst case, the maximum length in 2-byte units of UTF-16
+  // output would be at most the same as the number of bytes in input. There
+  // is no single-byte encoding in which a character is mapped to a
+  // non-BMP character requiring two 2-byte units.
+  //
+  // Moreover, non-BMP characters in legacy multibyte encodings
+  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
+  // BOCU and SCSU, but we don't care about them.
+  size_t uchar_max_length = encoded.length() + 1;
+
+  SetUpErrorHandlerForToUChars(on_error, converter, &status);
+  char16* byte_buffer = WriteInto(utf16, uchar_max_length);
+  int byte_buffer_length = static_cast<int>(uchar_max_length);
+  const char* data = encoded.data();
+  int length = static_cast<int>(encoded.length());
+  int actual_size = 0;
+  if (offset_for_adjustment) {
+    if (*offset_for_adjustment >= encoded.length()) {
+      *offset_for_adjustment = string16::npos;
+    } else if (*offset_for_adjustment != 0) {
+      // Try to adjust the offset by converting the string in two pieces and
+      // using the length of the first piece as the adjusted offset.
+      actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length,
+          data, static_cast<int>(*offset_for_adjustment), &status);
+      if (U_SUCCESS(status)) {
+        // Conversion succeeded, so update the offset and then fall through to
+        // appending the second half of the string.
+        data += *offset_for_adjustment;
+        length -= *offset_for_adjustment;
+        *offset_for_adjustment = actual_size;
+        byte_buffer += actual_size;
+        byte_buffer_length -= actual_size;
+      } else {
+        // The offset may have been in the middle of an encoding sequence; mark
+        // it as having failed to adjust and then try to convert the entire
+        // string.
+        *offset_for_adjustment = string16::npos;
+        actual_size = 0;
+        ucnv_reset(converter);
+        status = U_ZERO_ERROR;
+      }
+    }
+  }
+  actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data,
+                               length, &status);
+  ucnv_close(converter);
+  if (!U_SUCCESS(status)) {
+    utf16->clear();  // Make sure the output is empty on error.
+    return false;
+  }
+
+  utf16->resize(actual_size);
+  return true;
+}
+
 // Convert a wstring into the specified codepage_name.  If the codepage
 // isn't found, return false.
 bool WideToCodepage(const std::wstring& wide,
@@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide,
 #endif  // defined(WCHAR_T_IS_UTF32)
 }
 
-// Convert a UTF-16 string into the specified codepage_name.  If the codepage
-// isn't found, return false.
-bool UTF16ToCodepage(const string16& utf16,
-                    const char* codepage_name,
-                    OnStringConversionError::Type on_error,
-                    std::string* encoded) {
-  encoded->clear();
-
-  UErrorCode status = U_ZERO_ERROR;
-  UConverter* converter = ucnv_open(codepage_name, &status);
-  if (!U_SUCCESS(status))
-    return false;
-
-  return ConvertFromUTF16(converter, utf16.c_str(),
-                          static_cast<int>(utf16.length()), on_error, encoded);
-}
-
 // Converts a string of the given codepage into wstring.
 // If the codepage isn't found, return false.
-bool CodepageToWide(const std::string& encoded,
-                    const char* codepage_name,
-                    OnStringConversionError::Type on_error,
-                    std::wstring* wide) {
+bool CodepageToWideAndAdjustOffset(const std::string& encoded,
+                                   const char* codepage_name,
+                                   OnStringConversionError::Type on_error,
+                                   std::wstring* wide,
+                                   size_t* offset_for_adjustment) {
 #if defined(WCHAR_T_IS_UTF16)
-  return CodepageToUTF16(encoded, codepage_name, on_error, wide);
+  return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide,
+                                        offset_for_adjustment);
 #elif defined(WCHAR_T_IS_UTF32)
   wide->clear();
 
@@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded,
   // this can be 4 times larger than actually needed.
   size_t wchar_max_length = encoded.length() + 1;
 
-  // The byte buffer and its length to pass to ucnv_toAlgorithimic.
-  char* byte_buffer = reinterpret_cast<char*>(
-      WriteInto(wide, wchar_max_length));
-  int byte_buffer_length = static_cast<int>(wchar_max_length) * 4;
-
   SetUpErrorHandlerForToUChars(on_error, converter, &status);
-  int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(),
-                                       converter,
-                                       byte_buffer,
-                                       byte_buffer_length,
-                                       encoded.data(),
-                                       static_cast<int>(encoded.length()),
-                                       &status);
+  char* byte_buffer =
+      reinterpret_cast<char*>(WriteInto(wide, wchar_max_length));
+  int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t);
+  const char* data = encoded.data();
+  int length = static_cast<int>(encoded.length());
+  int actual_size = 0;
+  if (offset_for_adjustment) {
+    if (*offset_for_adjustment >= encoded.length()) {
+      *offset_for_adjustment = std::wstring::npos;
+    } else if (*offset_for_adjustment != 0) {
+      // Try to adjust the offset by converting the string in two pieces and
+      // using the length of the first piece as the adjusted offset.
+      actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,
+          byte_buffer, byte_buffer_length, data,
+          static_cast<int>(*offset_for_adjustment), &status);
+      if (U_SUCCESS(status)) {
+        // Conversion succeeded, so update the offset and then fall through to
+        // appending the second half of the string.
+        data += *offset_for_adjustment;
+        length -= *offset_for_adjustment;
+        *offset_for_adjustment = actual_size / sizeof(wchar_t);
+        byte_buffer += actual_size;
+        byte_buffer_length -= actual_size;
+      } else {
+        // The offset may have been in the middle of an encoding sequence; mark
+        // it as having failed to adjust and then try to convert the entire
+        // string.
+        *offset_for_adjustment = std::wstring::npos;
+        actual_size = 0;
+        ucnv_reset(converter);
+        status = U_ZERO_ERROR;
+      }
+    }
+  }
+  actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,
+      byte_buffer, byte_buffer_length, data, length, &status);
   ucnv_close(converter);
-
   if (!U_SUCCESS(status)) {
     wide->clear();  // Make sure the output is empty on error.
     return false;
   }
 
   // actual_size is # of bytes.
-  wide->resize(actual_size / 4);
+  wide->resize(actual_size / sizeof(wchar_t));
   return true;
 #endif  // defined(WCHAR_T_IS_UTF32)
 }
 
-// Converts a string of the given codepage into UTF-16.
-// If the codepage isn't found, return false.
-bool CodepageToUTF16(const std::string& encoded,
-                     const char* codepage_name,
-                     OnStringConversionError::Type on_error,
-                     string16* utf16) {
-  utf16->clear();
-
-  UErrorCode status = U_ZERO_ERROR;
-  UConverter* converter = ucnv_open(codepage_name, &status);
-  if (!U_SUCCESS(status))
-    return false;
-
-  // Even in the worst case, the maximum length in 2-byte units of UTF-16
-  // output would be at most the same as the number of bytes in input. There
-  // is no single-byte encoding in which a character is mapped to a
-  // non-BMP character requiring two 2-byte units.
-  //
-  // Moreover, non-BMP characters in legacy multibyte encodings
-  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
-  // BOCU and SCSU, but we don't care about them.
-  size_t uchar_max_length = encoded.length() + 1;
-
-  SetUpErrorHandlerForToUChars(on_error, converter, &status);
-  int actual_size = ucnv_toUChars(converter,
-                                  WriteInto(utf16, uchar_max_length),
-                                  static_cast<int>(uchar_max_length),
-                                  encoded.data(),
-                                  static_cast<int>(encoded.length()),
-                                  &status);
-  ucnv_close(converter);
-  if (!U_SUCCESS(status)) {
-    utf16->clear();  // Make sure the output is empty on error.
-    return false;
-  }
-
-  utf16->resize(actual_size);
-  return true;
-}
-
 }  // namespace base
author	pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-11-07 01:34:53 +0000
committer	pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-11-07 01:34:53 +0000
commit	ce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch)
tree	0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base/i18n/icu_string_conversions.cc
parent	d90684d0cf0aa16389c9202153c97d373829b7f3 (diff)
download	chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2