diff options
author | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
---|---|---|
committer | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
commit | ce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch) | |
tree | 0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base/i18n/icu_string_conversions.cc | |
parent | d90684d0cf0aa16389c9202153c97d373829b7f3 (diff) | |
download | chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2 |
Fix various problems with inline autocomplete and URLs that change length during fixup:
* URLs with http auth info, which gets stripped
* URLs with IDN hosts
* URLs with escaped values that get unescaped
In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes DCHECK() in debug mode.
The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :(
There is also a fix here to the URL Fixer Upper, which was obviously modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since this isn't supposed to "fix up" things that aren't simple typos, I removed some code to "fix" bogus ports, which was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it).
BUG=4010
TEST=Covered by unittests
Review URL: http://codereview.chromium.org/372017
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/i18n/icu_string_conversions.cc')
-rw-r--r-- | base/i18n/icu_string_conversions.cc | 204 |
1 files changed, 128 insertions, 76 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc
index ba9f9ae..c93b103 100644
--- a/base/i18n/icu_string_conversions.cc
+++ b/base/i18n/icu_string_conversions.cc
@@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE";
 
 // Codepage <-> Wide/UTF-16 ---------------------------------------------------
 
+// Convert a UTF-16 string into the specified codepage_name. If the codepage
+// isn't found, return false.
+bool UTF16ToCodepage(const string16& utf16,
+                     const char* codepage_name,
+                     OnStringConversionError::Type on_error,
+                     std::string* encoded) {
+  encoded->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
+  return ConvertFromUTF16(converter, utf16.c_str(),
+                          static_cast<int>(utf16.length()), on_error, encoded);
+}
+
+bool CodepageToUTF16AndAdjustOffset(const std::string& encoded,
+                                    const char* codepage_name,
+                                    OnStringConversionError::Type on_error,
+                                    string16* utf16,
+                                    size_t* offset_for_adjustment) {
+  utf16->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
+  // Even in the worst case, the maximum length in 2-byte units of UTF-16
+  // output would be at most the same as the number of bytes in input. There
+  // is no single-byte encoding in which a character is mapped to a
+  // non-BMP character requiring two 2-byte units.
+  //
+  // Moreover, non-BMP characters in legacy multibyte encodings
+  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
+  // BOCU and SCSU, but we don't care about them.
+  size_t uchar_max_length = encoded.length() + 1;
+
+  SetUpErrorHandlerForToUChars(on_error, converter, &status);
+  char16* byte_buffer = WriteInto(utf16, uchar_max_length);
+  int byte_buffer_length = static_cast<int>(uchar_max_length);
+  const char* data = encoded.data();
+  int length = static_cast<int>(encoded.length());
+  int actual_size = 0;
+  if (offset_for_adjustment) {
+    if (*offset_for_adjustment >= encoded.length()) {
+      *offset_for_adjustment = string16::npos;
+    } else if (*offset_for_adjustment != 0) {
+      // Try to adjust the offset by converting the string in two pieces and
+      // using the length of the first piece as the adjusted offset.
+      actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length,
+          data, static_cast<int>(*offset_for_adjustment), &status);
+      if (U_SUCCESS(status)) {
+        // Conversion succeeded, so update the offset and then fall through to
+        // appending the second half of the string.
+        data += *offset_for_adjustment;
+        length -= *offset_for_adjustment;
+        *offset_for_adjustment = actual_size;
+        byte_buffer += actual_size;
+        byte_buffer_length -= actual_size;
+      } else {
+        // The offset may have been in the middle of an encoding sequence; mark
+        // it as having failed to adjust and then try to convert the entire
+        // string.
+        *offset_for_adjustment = string16::npos;
+        actual_size = 0;
+        ucnv_reset(converter);
+        status = U_ZERO_ERROR;
+      }
+    }
+  }
+  actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data,
+                               length, &status);
+  ucnv_close(converter);
+  if (!U_SUCCESS(status)) {
+    utf16->clear(); // Make sure the output is empty on error.
+    return false;
+  }
+
+  utf16->resize(actual_size);
+  return true;
+}
+
 // Convert a wstring into the specified codepage_name. If the codepage
 // isn't found, return false.
 bool WideToCodepage(const std::wstring& wide,
@@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide,
 #endif // defined(WCHAR_T_IS_UTF32)
 }
 
-// Convert a UTF-16 string into the specified codepage_name. If the codepage
-// isn't found, return false.
-bool UTF16ToCodepage(const string16& utf16,
-                     const char* codepage_name,
-                     OnStringConversionError::Type on_error,
-                     std::string* encoded) {
-  encoded->clear();
-
-  UErrorCode status = U_ZERO_ERROR;
-  UConverter* converter = ucnv_open(codepage_name, &status);
-  if (!U_SUCCESS(status))
-    return false;
-
-  return ConvertFromUTF16(converter, utf16.c_str(),
-                          static_cast<int>(utf16.length()), on_error, encoded);
-}
-
 // Converts a string of the given codepage into wstring.
 // If the codepage isn't found, return false.
-bool CodepageToWide(const std::string& encoded,
-                    const char* codepage_name,
-                    OnStringConversionError::Type on_error,
-                    std::wstring* wide) {
+bool CodepageToWideAndAdjustOffset(const std::string& encoded,
+                                   const char* codepage_name,
+                                   OnStringConversionError::Type on_error,
+                                   std::wstring* wide,
+                                   size_t* offset_for_adjustment) {
 #if defined(WCHAR_T_IS_UTF16)
-  return CodepageToUTF16(encoded, codepage_name, on_error, wide);
+  return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide,
+                                        offset_for_adjustment);
 #elif defined(WCHAR_T_IS_UTF32)
   wide->clear();
 
@@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded,
   // this can be 4 times larger than actually needed.
   size_t wchar_max_length = encoded.length() + 1;
 
-  // The byte buffer and its length to pass to ucnv_toAlgorithimic.
-  char* byte_buffer = reinterpret_cast<char*>(
-      WriteInto(wide, wchar_max_length));
-  int byte_buffer_length = static_cast<int>(wchar_max_length) * 4;
-
   SetUpErrorHandlerForToUChars(on_error, converter, &status);
-  int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(),
-                                       converter,
-                                       byte_buffer,
-                                       byte_buffer_length,
-                                       encoded.data(),
-                                       static_cast<int>(encoded.length()),
-                                       &status);
+  char* byte_buffer =
+      reinterpret_cast<char*>(WriteInto(wide, wchar_max_length));
+  int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t);
+  const char* data = encoded.data();
+  int length = static_cast<int>(encoded.length());
+  int actual_size = 0;
+  if (offset_for_adjustment) {
+    if (*offset_for_adjustment >= encoded.length()) {
+      *offset_for_adjustment = std::wstring::npos;
+    } else if (*offset_for_adjustment != 0) {
+      // Try to adjust the offset by converting the string in two pieces and
+      // using the length of the first piece as the adjusted offset.
+      actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,
+          byte_buffer, byte_buffer_length, data,
+          static_cast<int>(*offset_for_adjustment), &status);
+      if (U_SUCCESS(status)) {
+        // Conversion succeeded, so update the offset and then fall through to
+        // appending the second half of the string.
+        data += *offset_for_adjustment;
+        length -= *offset_for_adjustment;
+        *offset_for_adjustment = actual_size / sizeof(wchar_t);
+        byte_buffer += actual_size;
+        byte_buffer_length -= actual_size;
+      } else {
+        // The offset may have been in the middle of an encoding sequence; mark
+        // it as having failed to adjust and then try to convert the entire
+        // string.
+        *offset_for_adjustment = std::wstring::npos;
+        actual_size = 0;
+        ucnv_reset(converter);
+        status = U_ZERO_ERROR;
+      }
+    }
+  }
+  actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter,
+      byte_buffer, byte_buffer_length, data, length, &status);
   ucnv_close(converter);
-
   if (!U_SUCCESS(status)) {
     wide->clear(); // Make sure the output is empty on error.
     return false;
   }
 
   // actual_size is # of bytes.
-  wide->resize(actual_size / 4);
+  wide->resize(actual_size / sizeof(wchar_t));
   return true;
 #endif // defined(WCHAR_T_IS_UTF32)
 }
 
-// Converts a string of the given codepage into UTF-16.
-// If the codepage isn't found, return false.
-bool CodepageToUTF16(const std::string& encoded,
-                     const char* codepage_name,
-                     OnStringConversionError::Type on_error,
-                     string16* utf16) {
-  utf16->clear();
-
-  UErrorCode status = U_ZERO_ERROR;
-  UConverter* converter = ucnv_open(codepage_name, &status);
-  if (!U_SUCCESS(status))
-    return false;
-
-  // Even in the worst case, the maximum length in 2-byte units of UTF-16
-  // output would be at most the same as the number of bytes in input. There
-  // is no single-byte encoding in which a character is mapped to a
-  // non-BMP character requiring two 2-byte units.
-  //
-  // Moreover, non-BMP characters in legacy multibyte encodings
-  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
-  // BOCU and SCSU, but we don't care about them.
-  size_t uchar_max_length = encoded.length() + 1;
-
-  SetUpErrorHandlerForToUChars(on_error, converter, &status);
-  int actual_size = ucnv_toUChars(converter,
-                                  WriteInto(utf16, uchar_max_length),
-                                  static_cast<int>(uchar_max_length),
-                                  encoded.data(),
-                                  static_cast<int>(encoded.length()),
-                                  &status);
-  ucnv_close(converter);
-  if (!U_SUCCESS(status)) {
-    utf16->clear(); // Make sure the output is empty on error.
-    return false;
-  }
-
-  utf16->resize(actual_size);
-  return true;
-}
-
 } // namespace base