diff options
author | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
---|---|---|
committer | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
commit | ce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch) | |
tree | 0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base/i18n | |
parent | d90684d0cf0aa16389c9202153c97d373829b7f3 (diff) | |
download | chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2 |
Fix various problems with inline autocomplete and URLs that change length during fixup:
* URLs with http auth info, which gets stripped
* URLs with IDN hosts
* URLs with escaped values that get unescaped
In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes DCHECK() in debug mode.
The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :(
There is also a fix here to the URL Fixer Upper, which was obviously modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since this isn't supposed to "fix up" things that aren't simple typos, I removed some code to "fix" bogus ports, which was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it).
BUG=4010
TEST=Covered by unittests
Review URL: http://codereview.chromium.org/372017
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/i18n')
-rw-r--r-- | base/i18n/icu_string_conversions.cc | 204 | ||||
-rw-r--r-- | base/i18n/icu_string_conversions.h | 43 | ||||
-rw-r--r-- | base/i18n/icu_string_conversions_unittest.cc | 55 |
3 files changed, 211 insertions, 91 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc index ba9f9ae..c93b103 100644 --- a/base/i18n/icu_string_conversions.cc +++ b/base/i18n/icu_string_conversions.cc @@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE"; // Codepage <-> Wide/UTF-16 --------------------------------------------------- +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. +bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); +} + +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + // Even in the worst case, the maximum length in 2-byte units of UTF-16 + // output would be at most the same as the number of bytes in input. There + // is no single-byte encoding in which a character is mapped to a + // non-BMP character requiring two 2-byte units. + // + // Moreover, non-BMP characters in legacy multibyte encodings + // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are + // BOCU and SCSU, but we don't care about them. + size_t uchar_max_length = encoded.length() + 1; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + char16* byte_buffer = WriteInto(utf16, uchar_max_length); + int byte_buffer_length = static_cast<int>(uchar_max_length); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = string16::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, + data, static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size; + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = string16::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data, + length, &status); + ucnv_close(converter); + if (!U_SUCCESS(status)) { + utf16->clear(); // Make sure the output is empty on error. + return false; + } + + utf16->resize(actual_size); + return true; +} + // Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. bool WideToCodepage(const std::wstring& wide, @@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide, #endif // defined(WCHAR_T_IS_UTF32) } -// Convert a UTF-16 string into the specified codepage_name. If the codepage -// isn't found, return false. -bool UTF16ToCodepage(const string16& utf16, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::string* encoded) { - encoded->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - return ConvertFromUTF16(converter, utf16.c_str(), - static_cast<int>(utf16.length()), on_error, encoded); -} - // Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide) { +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment) { #if defined(WCHAR_T_IS_UTF16) - return CodepageToUTF16(encoded, codepage_name, on_error, wide); + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide, + offset_for_adjustment); #elif defined(WCHAR_T_IS_UTF32) wide->clear(); @@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded, // this can be 4 times larger than actually needed. size_t wchar_max_length = encoded.length() + 1; - // The byte buffer and its length to pass to ucnv_toAlgorithimic. - char* byte_buffer = reinterpret_cast<char*>( - WriteInto(wide, wchar_max_length)); - int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), - converter, - byte_buffer, - byte_buffer_length, - encoded.data(), - static_cast<int>(encoded.length()), - &status); + char* byte_buffer = + reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = std::wstring::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, + static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size / sizeof(wchar_t); + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = std::wstring::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, length, &status); ucnv_close(converter); - if (!U_SUCCESS(status)) { wide->clear(); // Make sure the output is empty on error. return false; } // actual_size is # of bytes. - wide->resize(actual_size / 4); + wide->resize(actual_size / sizeof(wchar_t)); return true; #endif // defined(WCHAR_T_IS_UTF32) } -// Converts a string of the given codepage into UTF-16. -// If the codepage isn't found, return false. -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16) { - utf16->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - // Even in the worst case, the maximum length in 2-byte units of UTF-16 - // output would be at most the same as the number of bytes in input. There - // is no single-byte encoding in which a character is mapped to a - // non-BMP character requiring two 2-byte units. - // - // Moreover, non-BMP characters in legacy multibyte encodings - // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are - // BOCU and SCSU, but we don't care about them. - size_t uchar_max_length = encoded.length() + 1; - - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toUChars(converter, - WriteInto(utf16, uchar_max_length), - static_cast<int>(uchar_max_length), - encoded.data(), - static_cast<int>(encoded.length()), - &status); - ucnv_close(converter); - if (!U_SUCCESS(status)) { - utf16->clear(); // Make sure the output is empty on error. - return false; - } - - utf16->resize(actual_size); - return true; -} - } // namespace base diff --git a/base/i18n/icu_string_conversions.h b/base/i18n/icu_string_conversions.h index e7dac605..6f2cab7 100644 --- a/base/i18n/icu_string_conversions.h +++ b/base/i18n/icu_string_conversions.h @@ -40,6 +40,17 @@ extern const char kCodepageUTF8[]; extern const char kCodepageUTF16BE[]; extern const char kCodepageUTF16LE[]; +// Like CodepageToUTF16() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |utf16|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::string16::npos. +// |offset_for_adjustment| may be NULL. +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment); + // Converts between UTF-16 strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), // returns false. @@ -47,11 +58,24 @@ bool UTF16ToCodepage(const string16& utf16, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); +inline bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16) { + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, utf16, + NULL); +} -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16); +// Like CodepageToWide() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |wide|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::wstring::npos. +// |offset_for_adjustment| may be NULL. +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment); // Converts between wide strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), @@ -60,10 +84,13 @@ bool WideToCodepage(const std::wstring& wide, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide); +inline bool CodepageToWide(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide) { + return CodepageToWideAndAdjustOffset(encoded, codepage_name, on_error, wide, + NULL); +} } // namespace base diff --git a/base/i18n/icu_string_conversions_unittest.cc b/base/i18n/icu_string_conversions_unittest.cc index 969ddb7..0088a03 100644 --- a/base/i18n/icu_string_conversions_unittest.cc +++ b/base/i18n/icu_string_conversions_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -9,9 +9,9 @@ #include <sstream> #include "base/basictypes.h" +#include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/utf_string_conversions.h" -#include "base/i18n/icu_string_conversions.h" #include "testing/gtest/include/gtest/gtest.h" namespace base { @@ -39,7 +39,7 @@ string16 BuildString16(const wchar_t* s) { #endif } -static const wchar_t* const kConvertRoundtripCases[] = { +const wchar_t* const kConvertRoundtripCases[] = { L"Google Video", // "网页 图片 资讯更多 »" L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", @@ -68,7 +68,7 @@ static const wchar_t* const kConvertRoundtripCases[] = { } // namespace -TEST(StringUtilTest, ConvertCodepageUTF8) { +TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { // Make sure WideToCodepage works like WideToUTF8. for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { std::string expected(WideToUTF8(kConvertRoundtripCases[i])); @@ -156,7 +156,7 @@ static const struct { true, #if defined(WCHAR_T_IS_UTF16) L"\xD840\xDC00\x4E00", -#else +#elif defined(WCHAR_T_IS_UTF32) L"\x20000\x4E00", #endif L"\xD840\xDC00\x4E00"}, @@ -234,7 +234,7 @@ static const struct { NULL}, }; -TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { std::wstring wide; bool success = CodepageToWide(kConvertCodepageCases[i].encoded, @@ -296,7 +296,7 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { OnStringConversionError::SKIP, &encoded)); } -TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { string16 utf16; bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, @@ -325,4 +325,45 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { } } +static const struct { + const char* codepage_name; + const char* encoded; + size_t input_offset; + size_t u16_output_offset; + size_t wide_output_offset; +} kAdjustOffsetCases[] = { + {"gb2312", "", 0, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 0, 0, 0}, + {"gb2312", "\xC4\xE3\xBA\xC3", 2, 1, 1}, + {"gb2312", "\xC4\xE3\xBA\xC3", 4, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 1, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", std::string::npos, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 2, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 4, 2, 1}, +}; + +TEST(ICUStringConversionsTest, AdjustOffset) { + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kAdjustOffsetCases); ++i) { + string16 utf16; + size_t offset = kAdjustOffsetCases[i].input_offset; + EXPECT_TRUE(CodepageToUTF16AndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &utf16, &offset)); + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); + + std::wstring wide; + offset = kAdjustOffsetCases[i].input_offset; + CodepageToWideAndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &wide, &offset); +#if defined(WCHAR_T_IS_UTF16) + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); +#elif defined(WCHAR_T_IS_UTF32) + EXPECT_EQ(kAdjustOffsetCases[i].wide_output_offset, offset); +#endif + } +} + } // namespace base |