diff options
author | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
---|---|---|
committer | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
commit | ce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch) | |
tree | 0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base | |
parent | d90684d0cf0aa16389c9202153c97d373829b7f3 (diff) | |
download | chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2 |
Fix various problems with inline autocomplete and URLs that change length during fixup:
* URLs with http auth info, which gets stripped
* URLs with IDN hosts
* URLs with escaped values that get unescaped
In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes hit a DCHECK() in debug builds.
The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :(
There is also a fix here to the URL Fixer Upper, which was evidently modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since the fixer is only supposed to "fix up" simple typos, I removed some code that "fixed" bogus ports; that code was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it).
BUG=4010
TEST=Covered by unittests
Review URL: http://codereview.chromium.org/372017
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/base.gyp | 1 | ||||
-rw-r--r-- | base/i18n/icu_string_conversions.cc | 204 | ||||
-rw-r--r-- | base/i18n/icu_string_conversions.h | 43 | ||||
-rw-r--r-- | base/i18n/icu_string_conversions_unittest.cc | 55 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 199 | ||||
-rw-r--r-- | base/utf_string_conversions.cc | 253 | ||||
-rw-r--r-- | base/utf_string_conversions.h | 68 | ||||
-rw-r--r-- | base/utf_string_conversions_unittest.cc | 306 |
8 files changed, 720 insertions, 409 deletions
diff --git a/base/base.gyp b/base/base.gyp index 71ff640..f09e2e5 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -633,6 +633,7 @@ 'timer_unittest.cc', 'tracked_objects_unittest.cc', 'tuple_unittest.cc', + 'utf_string_conversions_unittest.cc', 'values_unittest.cc', 'version_unittest.cc', 'waitable_event_unittest.cc', diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc index ba9f9ae..c93b103 100644 --- a/base/i18n/icu_string_conversions.cc +++ b/base/i18n/icu_string_conversions.cc @@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE"; // Codepage <-> Wide/UTF-16 --------------------------------------------------- +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. +bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); +} + +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + // Even in the worst case, the maximum length in 2-byte units of UTF-16 + // output would be at most the same as the number of bytes in input. There + // is no single-byte encoding in which a character is mapped to a + // non-BMP character requiring two 2-byte units. + // + // Moreover, non-BMP characters in legacy multibyte encodings + // (e.g. EUC-JP, GB18030) take at least 2 bytes. 
The only exceptions are + // BOCU and SCSU, but we don't care about them. + size_t uchar_max_length = encoded.length() + 1; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + char16* byte_buffer = WriteInto(utf16, uchar_max_length); + int byte_buffer_length = static_cast<int>(uchar_max_length); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = string16::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, + data, static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size; + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = string16::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data, + length, &status); + ucnv_close(converter); + if (!U_SUCCESS(status)) { + utf16->clear(); // Make sure the output is empty on error. + return false; + } + + utf16->resize(actual_size); + return true; +} + // Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. 
bool WideToCodepage(const std::wstring& wide, @@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide, #endif // defined(WCHAR_T_IS_UTF32) } -// Convert a UTF-16 string into the specified codepage_name. If the codepage -// isn't found, return false. -bool UTF16ToCodepage(const string16& utf16, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::string* encoded) { - encoded->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - return ConvertFromUTF16(converter, utf16.c_str(), - static_cast<int>(utf16.length()), on_error, encoded); -} - // Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide) { +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment) { #if defined(WCHAR_T_IS_UTF16) - return CodepageToUTF16(encoded, codepage_name, on_error, wide); + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide, + offset_for_adjustment); #elif defined(WCHAR_T_IS_UTF32) wide->clear(); @@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded, // this can be 4 times larger than actually needed. size_t wchar_max_length = encoded.length() + 1; - // The byte buffer and its length to pass to ucnv_toAlgorithimic. 
- char* byte_buffer = reinterpret_cast<char*>( - WriteInto(wide, wchar_max_length)); - int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), - converter, - byte_buffer, - byte_buffer_length, - encoded.data(), - static_cast<int>(encoded.length()), - &status); + char* byte_buffer = + reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = std::wstring::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, + static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size / sizeof(wchar_t); + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. 
+ *offset_for_adjustment = std::wstring::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, length, &status); ucnv_close(converter); - if (!U_SUCCESS(status)) { wide->clear(); // Make sure the output is empty on error. return false; } // actual_size is # of bytes. - wide->resize(actual_size / 4); + wide->resize(actual_size / sizeof(wchar_t)); return true; #endif // defined(WCHAR_T_IS_UTF32) } -// Converts a string of the given codepage into UTF-16. -// If the codepage isn't found, return false. -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16) { - utf16->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - // Even in the worst case, the maximum length in 2-byte units of UTF-16 - // output would be at most the same as the number of bytes in input. There - // is no single-byte encoding in which a character is mapped to a - // non-BMP character requiring two 2-byte units. - // - // Moreover, non-BMP characters in legacy multibyte encodings - // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are - // BOCU and SCSU, but we don't care about them. - size_t uchar_max_length = encoded.length() + 1; - - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toUChars(converter, - WriteInto(utf16, uchar_max_length), - static_cast<int>(uchar_max_length), - encoded.data(), - static_cast<int>(encoded.length()), - &status); - ucnv_close(converter); - if (!U_SUCCESS(status)) { - utf16->clear(); // Make sure the output is empty on error. 
- return false; - } - - utf16->resize(actual_size); - return true; -} - } // namespace base diff --git a/base/i18n/icu_string_conversions.h b/base/i18n/icu_string_conversions.h index e7dac605..6f2cab7 100644 --- a/base/i18n/icu_string_conversions.h +++ b/base/i18n/icu_string_conversions.h @@ -40,6 +40,17 @@ extern const char kCodepageUTF8[]; extern const char kCodepageUTF16BE[]; extern const char kCodepageUTF16LE[]; +// Like CodepageToUTF16() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |utf16|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::string16::npos. +// |offset_for_adjustment| may be NULL. +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment); + // Converts between UTF-16 strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), // returns false. @@ -47,11 +58,24 @@ bool UTF16ToCodepage(const string16& utf16, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); +inline bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16) { + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, utf16, + NULL); +} -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16); +// Like CodepageToWide() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |wide|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::wstring::npos. 
+// |offset_for_adjustment| may be NULL. +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment); // Converts between wide strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), @@ -60,10 +84,13 @@ bool WideToCodepage(const std::wstring& wide, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide); +inline bool CodepageToWide(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide) { + return CodepageToWideAndAdjustOffset(encoded, codepage_name, on_error, wide, + NULL); +} } // namespace base diff --git a/base/i18n/icu_string_conversions_unittest.cc b/base/i18n/icu_string_conversions_unittest.cc index 969ddb7..0088a03 100644 --- a/base/i18n/icu_string_conversions_unittest.cc +++ b/base/i18n/icu_string_conversions_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -9,9 +9,9 @@ #include <sstream> #include "base/basictypes.h" +#include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/utf_string_conversions.h" -#include "base/i18n/icu_string_conversions.h" #include "testing/gtest/include/gtest/gtest.h" namespace base { @@ -39,7 +39,7 @@ string16 BuildString16(const wchar_t* s) { #endif } -static const wchar_t* const kConvertRoundtripCases[] = { +const wchar_t* const kConvertRoundtripCases[] = { L"Google Video", // "网页 图片 资讯更多 »" L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", @@ -68,7 +68,7 @@ static const wchar_t* const kConvertRoundtripCases[] = { } // namespace -TEST(StringUtilTest, ConvertCodepageUTF8) { +TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { // Make sure WideToCodepage works like WideToUTF8. for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { std::string expected(WideToUTF8(kConvertRoundtripCases[i])); @@ -156,7 +156,7 @@ static const struct { true, #if defined(WCHAR_T_IS_UTF16) L"\xD840\xDC00\x4E00", -#else +#elif defined(WCHAR_T_IS_UTF32) L"\x20000\x4E00", #endif L"\xD840\xDC00\x4E00"}, @@ -234,7 +234,7 @@ static const struct { NULL}, }; -TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { std::wstring wide; bool success = CodepageToWide(kConvertCodepageCases[i].encoded, @@ -296,7 +296,7 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { OnStringConversionError::SKIP, &encoded)); } -TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { string16 utf16; bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, @@ -325,4 +325,45 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { } } +static const struct { + const char* codepage_name; + const 
char* encoded; + size_t input_offset; + size_t u16_output_offset; + size_t wide_output_offset; +} kAdjustOffsetCases[] = { + {"gb2312", "", 0, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 0, 0, 0}, + {"gb2312", "\xC4\xE3\xBA\xC3", 2, 1, 1}, + {"gb2312", "\xC4\xE3\xBA\xC3", 4, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 1, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", std::string::npos, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 2, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 4, 2, 1}, +}; + +TEST(ICUStringConversionsTest, AdjustOffset) { + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kAdjustOffsetCases); ++i) { + string16 utf16; + size_t offset = kAdjustOffsetCases[i].input_offset; + EXPECT_TRUE(CodepageToUTF16AndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &utf16, &offset)); + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); + + std::wstring wide; + offset = kAdjustOffsetCases[i].input_offset; + CodepageToWideAndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &wide, &offset); +#if defined(WCHAR_T_IS_UTF16) + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); +#elif defined(WCHAR_T_IS_UTF32) + EXPECT_EQ(kAdjustOffsetCases[i].wide_output_offset, offset); +#endif + } +} + } // namespace base diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 0ccea91..d691003 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -229,203 +229,6 @@ TEST(StringUtilTest, IsStringUTF8) { EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); } -static const wchar_t* const kConvertRoundtripCases[] = { - L"Google Video", - // "网页 图片 资讯更多 »" - L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", - // "Παγκόσμιος Ιστός" - L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" - L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", - // "Поиск страниц на русском" - L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" - L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" - L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", - // "전체서비스" - L"\xc804\xccb4\xc11c\xbe44\xc2a4", - - // Test characters that take more than 16 bits. This will depend on whether - // wchar_t is 16 or 32 bits. -#if defined(WCHAR_T_IS_UTF16) - L"\xd800\xdf00", - // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) - L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", -#elif defined(WCHAR_T_IS_UTF32) - L"\x10300", - // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) - L"\x11d40\x11d41\x11d42\x11d43\x11d44", -#endif -}; - -TEST(StringUtilTest, ConvertUTF8AndWide) { - // we round-trip all the wide strings through UTF-8 to make sure everything - // agrees on the conversion. This uses the stream operators to test them - // simultaneously. - for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { - std::ostringstream utf8; - utf8 << WideToUTF8(kConvertRoundtripCases[i]); - std::wostringstream wide; - wide << UTF8ToWide(utf8.str()); - - EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); - } -} - -TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { - // An empty std::wstring should be converted to an empty std::string, - // and vice versa. 
- std::wstring wempty; - std::string empty; - EXPECT_EQ(empty, WideToUTF8(wempty)); - EXPECT_EQ(wempty, UTF8ToWide(empty)); -} - -TEST(StringUtilTest, ConvertUTF8ToWide) { - struct UTF8ToWideCase { - const char* utf8; - const wchar_t* wide; - bool success; - } convert_cases[] = { - // Regular UTF-8 input. - {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, - // Non-character is passed through. - {"\xef\xbf\xbfHello", L"\xffffHello", true}, - // Truncated UTF-8 sequence. - {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, - // Truncated off the end. - {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, - // Non-shortest-form UTF-8. - {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, - // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. - {"\xed\xb0\x80", L"", false}, - // Non-BMP characters. The second is a non-character regarded as valid. - // The result will either be in UTF-16 or UTF-32. -#if defined(WCHAR_T_IS_UTF16) - {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, -#elif defined(WCHAR_T_IS_UTF32) - {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, -#endif - }; - - for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { - std::wstring converted; - EXPECT_EQ(convert_cases[i].success, - UTF8ToWide(convert_cases[i].utf8, - strlen(convert_cases[i].utf8), - &converted)); - std::wstring expected(convert_cases[i].wide); - EXPECT_EQ(expected, converted); - } - - // Manually test an embedded NULL. - std::wstring converted; - EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); - ASSERT_EQ(3U, converted.length()); -#if defined(WCHAR_T_IS_UNSIGNED) - EXPECT_EQ(0U, converted[0]); -#else - EXPECT_EQ(0, converted[0]); -#endif - EXPECT_EQ('Z', converted[1]); - EXPECT_EQ('\t', converted[2]); - - // Make sure that conversion replaces, not appends. 
- EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); - ASSERT_EQ(1U, converted.length()); - EXPECT_EQ('B', converted[0]); -} - -#if defined(WCHAR_T_IS_UTF16) -// This test is only valid when wchar_t == UTF-16. -TEST(StringUtilTest, ConvertUTF16ToUTF8) { - struct UTF16ToUTF8Case { - const wchar_t* utf16; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular UTF-16 input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, - // Non-characters are passed through. - {L"\xffffHello", "\xEF\xBF\xBFHello", true}, - {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, - // The first character is a truncated UTF-16 character. - {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - // Truncated at the end. - {L"\x597d\xd800", "\xe5\xa5\xbd", false}, - }; - - for (int i = 0; i < arraysize(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf16, - wcslen(convert_cases[i].utf16), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} - -#elif defined(WCHAR_T_IS_UTF32) -// This test is only valid when wchar_t == UTF-32. -TEST(StringUtilTest, ConvertUTF32ToUTF8) { - struct WideToUTF8Case { - const wchar_t* utf32; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular 16-bit input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, - // Non-characters are passed through. - {L"\xffffHello", "\xEF\xBF\xBFHello", true}, - {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, - // Invalid Unicode code points. - {L"\xfffffffHello", "Hello", false}, - // The first character is a truncated UTF-16 character. 
- {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - {L"\xdc01Hello", "Hello", false}, - }; - - for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf32, - wcslen(convert_cases[i].utf32), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} -#endif // defined(WCHAR_T_IS_UTF32) - -TEST(StringUtilTest, ConvertMultiString) { - static wchar_t wmulti[] = { - L'f', L'o', L'o', L'\0', - L'b', L'a', L'r', L'\0', - L'b', L'a', L'z', L'\0', - L'\0' - }; - static char multi[] = { - 'f', 'o', 'o', '\0', - 'b', 'a', 'r', '\0', - 'b', 'a', 'z', '\0', - '\0' - }; - std::wstring wmultistring; - memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); - EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); - std::string expected; - memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); - EXPECT_EQ(arraysize(multi) - 1, expected.length()); - const std::string& converted = WideToUTF8(wmultistring); - EXPECT_EQ(arraysize(multi) - 1, converted.length()); - EXPECT_EQ(expected, converted); -} - TEST(StringUtilTest, ConvertASCII) { static const char* char_cases[] = { "Google Video", diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc index 6b25cd8..ffff50a 100644 --- a/base/utf_string_conversions.cc +++ b/base/utf_string_conversions.cc @@ -84,43 +84,50 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, // WriteUnicodeCharacter ------------------------------------------------------- -// Appends a UTF-8 character to the given 8-bit string. -void WriteUnicodeCharacter(uint32 code_point, std::string* output) { +// Appends a UTF-8 character to the given 8-bit string. Returns the number of +// bytes written. +size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) { if (code_point <= 0x7f) { // Fast path the common case of one byte. 
output->push_back(code_point); - return; + return 1; } - // U8_APPEND_UNSAFE can append up to 4 bytes. - int32 char_offset = static_cast<int32>(output->length()); + // CBU8_APPEND_UNSAFE can append up to 4 bytes. + size_t char_offset = output->length(); + size_t original_char_offset = char_offset; output->resize(char_offset + CBU8_MAX_LENGTH); CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); - // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so + // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so // it will represent the new length of the string. output->resize(char_offset); + return char_offset - original_char_offset; } -// Appends the given code point as a UTF-16 character to the STL string. -void WriteUnicodeCharacter(uint32 code_point, string16* output) { +// Appends the given code point as a UTF-16 character to the given 16-bit +// string. Returns the number of 16-bit values written. +size_t WriteUnicodeCharacter(uint32 code_point, string16* output) { if (CBU16_LENGTH(code_point) == 1) { // Thie code point is in the Basic Multilingual Plane (BMP). output->push_back(static_cast<char16>(code_point)); - } else { - // Non-BMP characters use a double-character encoding. - int32 char_offset = static_cast<int32>(output->length()); - output->resize(char_offset + CBU16_MAX_LENGTH); - CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return 1; } + // Non-BMP characters use a double-character encoding. + size_t char_offset = output->length(); + output->resize(char_offset + CBU16_MAX_LENGTH); + CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return CBU16_MAX_LENGTH; } #if defined(WCHAR_T_IS_UTF32) -// Appends the given UTF-32 character to the given 32-bit string. -inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { +// Appends the given UTF-32 character to the given 32-bit string. Returns the +// number of 32-bit values written. 
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { // This is the easy case, just append the character. output->push_back(code_point); + return 1; } #endif // defined(WCHAR_T_IS_UTF32) @@ -131,31 +138,57 @@ inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { // determine the source, and the given output STL string will be replaced by // the result. template<typename SRC_CHAR, typename DEST_STRING> -bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { - output->clear(); +bool ConvertUnicode(const SRC_CHAR* src, + size_t src_len, + DEST_STRING* output, + size_t* offset_for_adjustment) { + size_t output_offset = + (offset_for_adjustment && *offset_for_adjustment < src_len) ? + *offset_for_adjustment : DEST_STRING::npos; // ICU requires 32-bit numbers. bool success = true; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; + size_t original_i = i; + size_t chars_written = 0; if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { - WriteUnicodeCharacter(code_point, output); + chars_written = WriteUnicodeCharacter(code_point, output); } else { // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) // in place of an invalid codepoint. success = false; } + if ((output_offset != DEST_STRING::npos) && + (*offset_for_adjustment > original_i)) { + // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last + // character read, not after it (so that incrementing it in the loop + // increment will place it at the right location), so we need to account + // for that in determining the amount that was read. 
+ if (*offset_for_adjustment <= static_cast<size_t>(i)) + output_offset = DEST_STRING::npos; + else + output_offset += chars_written - (i - original_i + 1); + } } + + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; return success; } -// Guesses the length of the output in UTF-8 in bytes, and reserves that amount -// of space in the given string. We also assume that the input character types -// are unsigned, which will be true for UTF-16 and -32 on our systems. We assume -// the string length is greater than zero. +// Guesses the length of the output in UTF-8 in bytes, clears that output +// string, and reserves that amount of space. We assume that the input +// character types are unsigned, which will be true for UTF-16 and -32 on our +// systems. template<typename CHAR> -void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { +void PrepareForUTF8Output(const CHAR* src, + size_t src_len, + std::string* output) { + output->clear(); + if (src_len == 0) + return; if (src[0] < 0x80) { // Assume that the entire input will be ASCII. output->reserve(src_len); @@ -165,11 +198,15 @@ void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { } } -// Guesses the size of the output buffer (containing either UTF-16 or -32 data) -// given some UTF-8 input that will be converted to it. See ReserveUTF8Output. -// We assume the source length is > 0. +// Prepares an output buffer (containing either UTF-16 or -32 data) given some +// UTF-8 input that will be converted to it. See PrepareForUTF8Output(). template<typename STRING> -void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { +void PrepareForUTF16Or32Output(const char* src, + size_t src_len, + STRING* output) { + output->clear(); + if (src_len == 0) + return; if (static_cast<unsigned char>(src[0]) < 0x80) { // Assume the input is all ASCII, which means 1:1 correspondence. 
output->reserve(src_len); @@ -184,111 +221,121 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { // UTF-8 <-> Wide -------------------------------------------------------------- -std::string WideToUTF8(const std::wstring& wide) { - std::string ret; - if (wide.empty()) - return ret; +bool WideToUTF8AndAdjustOffset(const wchar_t* src, + size_t src_len, + std::string* output, + size_t* offset_for_adjustment) { + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode<wchar_t, std::string>(src, src_len, output, + offset_for_adjustment); +} +std::string WideToUTF8AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + std::string ret; // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. - WideToUTF8(wide.data(), wide.length(), &ret); + WideToUTF8AndAdjustOffset(wide.data(), wide.length(), &ret, + offset_for_adjustment); return ret; } -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF8Output(src, src_len, output); - return ConvertUnicode<wchar_t, std::string>(src, src_len, output); +bool UTF8ToWideAndAdjustOffset(const char* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode<char, std::wstring>(src, src_len, output, + offset_for_adjustment); } -std::wstring UTF8ToWide(const base::StringPiece& utf8) { +std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, + size_t* offset_for_adjustment) { std::wstring ret; - if (utf8.empty()) - return ret; - - UTF8ToWide(utf8.data(), utf8.length(), &ret); + UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret, + offset_for_adjustment); return ret; } -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - 
ReserveUTF16Or32Output(src, src_len, output); - return ConvertUnicode<char, std::wstring>(src, src_len, output); -} - // UTF-16 <-> Wide ------------------------------------------------------------- #if defined(WCHAR_T_IS_UTF16) // When wide == UTF-16, then conversions are a NOP. -string16 WideToUTF16(const std::wstring& wide) { - return wide; -} - -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { +bool WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment) { output->assign(src, src_len); + if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) + *offset_for_adjustment = string16::npos; return true; } -std::wstring UTF16ToWide(const string16& utf16) { - return utf16; +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + if (offset_for_adjustment && (*offset_for_adjustment >= wide.length())) + *offset_for_adjustment = string16::npos; + return wide; } -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { output->assign(src, src_len); + if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) + *offset_for_adjustment = std::wstring::npos; return true; } -#elif defined(WCHAR_T_IS_UTF32) - -string16 WideToUTF16(const std::wstring& wide) { - string16 ret; - if (wide.empty()) - return ret; - - WideToUTF16(wide.data(), wide.length(), &ret); - return ret; +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment) { + if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) + *offset_for_adjustment = std::wstring::npos; + return utf16; } -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { - if (src_len == 0) { - output->clear(); - return true; - } +#elif defined(WCHAR_T_IS_UTF32) +bool 
WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment) { + output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); - return ConvertUnicode<wchar_t, string16>(src, src_len, output); + return ConvertUnicode<wchar_t, string16>(src, src_len, output, + offset_for_adjustment); } -std::wstring UTF16ToWide(const string16& utf16) { - std::wstring ret; - if (utf16.empty()) - return ret; - - UTF16ToWide(utf16.data(), utf16.length(), &ret); +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + string16 ret; + WideToUTF16AndAdjustOffset(wide.data(), wide.length(), &ret, + offset_for_adjustment); return ret; } -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { + output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. 
output->reserve(src_len); - return ConvertUnicode<char16, std::wstring>(src, src_len, output); + return ConvertUnicode<char16, std::wstring>(src, src_len, output, + offset_for_adjustment); +} + +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment) { + std::wstring ret; + UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret, + offset_for_adjustment); + return ret; } #endif // defined(WCHAR_T_IS_UTF32) @@ -298,20 +345,12 @@ bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { #if defined(WCHAR_T_IS_UTF32) bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF16Or32Output(src, src_len, output); - return ConvertUnicode<char, string16>(src, src_len, output); + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode<char, string16>(src, src_len, output, NULL); } string16 UTF8ToUTF16(const std::string& utf8) { string16 ret; - if (utf8.empty()) - return ret; - // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. UTF8ToUTF16(utf8.data(), utf8.length(), &ret); @@ -319,20 +358,12 @@ string16 UTF8ToUTF16(const std::string& utf8) { } bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF8Output(src, src_len, output); - return ConvertUnicode<char16, std::string>(src, src_len, output); + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode<char16, std::string>(src, src_len, output, NULL); } std::string UTF16ToUTF8(const string16& utf16) { std::string ret; - if (utf16.empty()) - return ret; - // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. 
UTF16ToUTF8(utf16.data(), utf16.length(), &ret); diff --git a/base/utf_string_conversions.h b/base/utf_string_conversions.h index 89846ed..323233b 100644 --- a/base/utf_string_conversions.h +++ b/base/utf_string_conversions.h @@ -10,6 +10,37 @@ #include "base/string16.h" #include "base/string_piece.h" +// Like the conversions below, but also takes an offset into the source string, +// which will be adjusted to point at the same logical place in the result +// string. If this isn't possible because it points past the end of the source +// string or into the middle of a multibyte sequence, it will be set to +// std::wstring::npos. |offset_for_adjustment| may be NULL. +bool WideToUTF8AndAdjustOffset(const wchar_t* src, + size_t src_len, + std::string* output, + size_t* offset_for_adjustment); +std::string WideToUTF8AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment); +bool UTF8ToWideAndAdjustOffset(const char* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment); +std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, + size_t* offset_for_adjustment); + +bool WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment); +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment); +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment); +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment); + // These convert between UTF-8, -16, and -32 strings. They are potentially slow, // so avoid unnecessary conversions. The low-level versions return a boolean // indicating whether the conversion was 100% valid. In this case, it will still @@ -23,15 +54,34 @@ // the Unicode replacement character or adding |replacement_char| parameter. 
// Currently, it's skipped in the ouput, which could be problematic in // some situations. -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); -std::string WideToUTF8(const std::wstring& wide); -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); -std::wstring UTF8ToWide(const base::StringPiece& utf8); - -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output); -string16 WideToUTF16(const std::wstring& wide); -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output); -std::wstring UTF16ToWide(const string16& utf16); +inline bool WideToUTF8(const wchar_t* src, + size_t src_len, + std::string* output) { + return WideToUTF8AndAdjustOffset(src, src_len, output, NULL); +} +inline std::string WideToUTF8(const std::wstring& wide) { + return WideToUTF8AndAdjustOffset(wide, NULL); +} +inline bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + return UTF8ToWideAndAdjustOffset(src, src_len, output, NULL); +} +inline std::wstring UTF8ToWide(const base::StringPiece& utf8) { + return UTF8ToWideAndAdjustOffset(utf8, NULL); +} + +inline bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + return WideToUTF16AndAdjustOffset(src, src_len, output, NULL); +} +inline string16 WideToUTF16(const std::wstring& wide) { + return WideToUTF16AndAdjustOffset(wide, NULL); +} +inline bool UTF16ToWide(const char16* src, size_t src_len, + std::wstring* output) { + return UTF16ToWideAndAdjustOffset(src, src_len, output, NULL); +} +inline std::wstring UTF16ToWide(const string16& utf16) { + return UTF16ToWideAndAdjustOffset(utf16, NULL); +} bool UTF8ToUTF16(const char* src, size_t src_len, string16* output); string16 UTF8ToUTF16(const std::string& utf8); diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc new file mode 100644 index 0000000..67af7c3 --- /dev/null +++ b/base/utf_string_conversions_unittest.cc @@ -0,0 +1,306 @@ +// Copyright 
(c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/basictypes.h" +#include "base/string_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { + +namespace { + +// Given a null-terminated string of wchar_t with each wchar_t representing +// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. +// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) +// should be represented as a surrogate pair (two UTF-16 units) +// *even* where wchar_t is 32-bit (Linux and Mac). +// +// This is to help write tests for functions with string16 params until +// the C++ 0x UTF-16 literal is well-supported by compilers. +string16 BuildString16(const wchar_t* s) { +#if defined(WCHAR_T_IS_UTF16) + return string16(s); +#elif defined(WCHAR_T_IS_UTF32) + string16 u16; + while (*s != 0) { + DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); + u16.push_back(*s++); + } + return u16; +#endif +} + +const wchar_t* const kConvertRoundtripCases[] = { + L"Google Video", + // "网页 图片 资讯更多 »" + L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", + // "Παγκόσμιος Ιστός" + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", + // "Поиск страниц на русском" + L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" + L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" + L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", + // "전체서비스" + L"\xc804\xccb4\xc11c\xbe44\xc2a4", + + // Test characters that take more than 16 bits. This will depend on whether + // wchar_t is 16 or 32 bits. +#if defined(WCHAR_T_IS_UTF16) + L"\xd800\xdf00", + // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", +#elif defined(WCHAR_T_IS_UTF32) + L"\x10300", + // ????? 
(Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\x11d40\x11d41\x11d42\x11d43\x11d44", +#endif +}; + +} // namespace + +TEST(UTFStringConversionsTest, ConvertUTF8AndWide) { + // we round-trip all the wide strings through UTF-8 to make sure everything + // agrees on the conversion. This uses the stream operators to test them + // simultaneously. + for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { + std::ostringstream utf8; + utf8 << WideToUTF8(kConvertRoundtripCases[i]); + std::wostringstream wide; + wide << UTF8ToWide(utf8.str()); + + EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); + } +} + +TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) { + // An empty std::wstring should be converted to an empty std::string, + // and vice versa. + std::wstring wempty; + std::string empty; + EXPECT_EQ(empty, WideToUTF8(wempty)); + EXPECT_EQ(wempty, UTF8ToWide(empty)); +} + +TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"", false}, + // Non-BMP characters. The second is a non-character regarded as valid. + // The result will either be in UTF-16 or UTF-32. 
+#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::wstring converted; + EXPECT_EQ(convert_cases[i].success, + UTF8ToWide(convert_cases[i].utf8, + strlen(convert_cases[i].utf8), + &converted)); + std::wstring expected(convert_cases[i].wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. + std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3U, converted.length()); + EXPECT_EQ(static_cast<wchar_t>(0), converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1U, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#if defined(WCHAR_T_IS_UTF16) +// This test is only valid when wchar_t == UTF-16. +TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + // Truncated at the end. 
+ {L"\x597d\xd800", "\xe5\xa5\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} + +#elif defined(WCHAR_T_IS_UTF32) +// This test is only valid when wchar_t == UTF-32. +TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf32; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // Invalid Unicode code points. + {L"\xfffffffHello", "Hello", false}, + // The first character is a truncated UTF-16 character. 
+ {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + {L"\xdc01Hello", "Hello", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf32, + wcslen(convert_cases[i].utf32), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} +#endif // defined(WCHAR_T_IS_UTF32) + +TEST(UTFStringConversionsTest, ConvertMultiString) { + static wchar_t wmulti[] = { + L'f', L'o', L'o', L'\0', + L'b', L'a', L'r', L'\0', + L'b', L'a', L'z', L'\0', + L'\0' + }; + static char multi[] = { + 'f', 'o', 'o', '\0', + 'b', 'a', 'r', '\0', + 'b', 'a', 'z', '\0', + '\0' + }; + std::wstring wmultistring; + memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); + EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); + std::string expected; + memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); + EXPECT_EQ(arraysize(multi) - 1, expected.length()); + const std::string& converted = WideToUTF8(wmultistring); + EXPECT_EQ(arraysize(multi) - 1, converted.length()); + EXPECT_EQ(expected, converted); +} + +TEST(UTFStringConversionsTest, AdjustOffset) { + // Under the hood, all the functions call the same converter function, so we + // don't need to exhaustively check every case. 
+ struct WideToUTF8Case { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } wide_to_utf8_cases[] = { + {L"", 0, std::string::npos}, + {L"\x4f60\x597d", 0, 0}, + {L"\x4f60\x597d", 1, 3}, + {L"\x4f60\x597d", 2, std::string::npos}, + {L"\x4f60\x597d", std::wstring::npos, std::string::npos}, + {L"\xd800\x597dz", 1, 0}, + {L"\xd800\x597dz", 2, 3}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf8_cases); ++i) { + size_t offset = wide_to_utf8_cases[i].input_offset; + WideToUTF8AndAdjustOffset(wide_to_utf8_cases[i].wide, &offset); + EXPECT_EQ(wide_to_utf8_cases[i].output_offset, offset); + } + + struct UTF8ToWideCase { + const char* utf8; + size_t input_offset; + size_t output_offset; + } utf8_to_wide_cases[] = { + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, + {"\xed\xb0\x80z", 3, 0}, + {"A\xF0\x90\x8C\x80z", 1, 1}, + {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos}, +#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", 5, 3}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", 5, 2}, +#endif + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_wide_cases); ++i) { + size_t offset = utf8_to_wide_cases[i].input_offset; + UTF8ToWideAndAdjustOffset(utf8_to_wide_cases[i].utf8, &offset); + EXPECT_EQ(utf8_to_wide_cases[i].output_offset, offset); + } + +#if defined(WCHAR_T_IS_UTF32) + struct WideToUTF16Case { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } wide_to_utf16_cases[] = { + {L"\x4F60\x597D", 1, 1}, + {L"\x20000\x4E00", 1, 2}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf16_cases); ++i) { + size_t offset = wide_to_utf16_cases[i].input_offset; + WideToUTF16AndAdjustOffset(wide_to_utf16_cases[i].wide, &offset); + EXPECT_EQ(wide_to_utf16_cases[i].output_offset, offset); + } + + struct UTF16ToWideCase { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } utf16_to_wide_cases[] = { + {L"\xD840\xDC00\x4E00", 0, 0}, + 
{L"\xD840\xDC00\x4E00", 1, std::wstring::npos}, + {L"\xD840\xDC00\x4E00", 2, 1}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) { + size_t offset = utf16_to_wide_cases[i].input_offset; + UTF16ToWideAndAdjustOffset(BuildString16(utf16_to_wide_cases[i].wide), + &offset); + EXPECT_EQ(utf16_to_wide_cases[i].output_offset, offset); + } +#endif +} + +} // namespace base |