diff options
author | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-01 00:13:10 +0000 |
---|---|---|
committer | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-01 00:13:10 +0000 |
commit | 656e3b3857e315a4a6386944fb140ef202580f77 (patch) | |
tree | 37fc49db14f3b43a03da33ef58dddfff9b58be2f /base/string_util_unittest.cc | |
parent | 660efb2db208d7a64e04eebad1e0e1dd7b54f3b0 (diff) | |
download | chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.zip chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.tar.gz chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.tar.bz2 |
Write our own utf8<->wide conversion functions. This gives us more control over error handling instead of getting a blank string for invalid encodings. It also allows us to decrease the amount of platform-specific code.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@211 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/string_util_unittest.cc')
-rw-r--r-- | base/string_util_unittest.cc | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index c6ff622..6d19b0e 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -183,6 +183,123 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) {
   EXPECT_EQ(wempty, UTF8ToWide(empty));
 }
 
+// Tests the behavior of our UTF-8 -> wide conversion. Each case below expects
+// valid input to convert and report success, and invalid sequences to be
+// dropped from the output while the call reports failure. We may later want
+// more customized results (for example, substituting the "replacement
+// character" U+FFFD for invalid sequences).
+TEST(StringUtilTest, ConvertUTF8ToWide) {
+  struct UTF8ToWideCase {
+    const char* utf8;
+    const wchar_t* wide;
+    bool success;
+  } convert_cases[] = {
+    // Regular UTF-8 input.
+    {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
+    // Invalid Unicode code point.
+    {"\xef\xbf\xbfHello", L"Hello", false},
+    // Truncated UTF-8 sequence.
+    {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
+    // Truncated off the end.
+    {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false},
+    // Non-shortest-form UTF-8.
+    {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
+    // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
+    {"\xed\xb0\x80", L"", false},
+    // Non-BMP character. The result will either be in UTF-16 or UCS-4.
+#ifdef WIN32
+    {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+#else
+    {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+#endif
+  };
+
+  for (size_t i = 0; i < arraysize(convert_cases); i++) {
+    std::wstring converted;
+    EXPECT_EQ(convert_cases[i].success,
+              UTF8ToWide(convert_cases[i].utf8,
+                         strlen(convert_cases[i].utf8),
+                         &converted));
+    std::wstring expected(convert_cases[i].wide);
+    EXPECT_EQ(expected, converted);
+  }
+
+  // Manually test an embedded NUL; the explicit length (3) covers all bytes.
+  std::wstring converted;
+  EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted));
+  ASSERT_EQ(3U, converted.length());
+  EXPECT_EQ(0, converted[0]);
+  EXPECT_EQ('Z', converted[1]);
+  EXPECT_EQ('\t', converted[2]);
+
+  // Make sure that conversion replaces, not appends (|converted| is reused).
+  EXPECT_TRUE(UTF8ToWide("B", 1, &converted));
+  ASSERT_EQ(1U, converted.length());
+  EXPECT_EQ('B', converted[0]);
+}
+
+#ifdef WIN32
+// This test is only valid when wchar_t == UTF-16.
+TEST(StringUtilTest, ConvertUTF16ToUTF8) {
+  struct UTF16ToUTF8Case {
+    const wchar_t* utf16;
+    const char* utf8;
+    bool success;
+  } convert_cases[] = {
+    // Regular UTF-16 input.
+    {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+    // Test a non-BMP character (encoded as a surrogate pair).
+    {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
+    // Invalid Unicode code point.
+    {L"\xffffHello", "Hello", false},
+    // The first character is a truncated UTF-16 character.
+    {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+    // Truncated at the end.
+    {L"\x597d\xd800", "\xe5\xa5\xbd", false},
+  };
+
+  for (size_t i = 0; i < arraysize(convert_cases); i++) {
+    std::string converted;
+    EXPECT_EQ(convert_cases[i].success,
+              WideToUTF8(convert_cases[i].utf16,
+                         wcslen(convert_cases[i].utf16),
+                         &converted));
+    std::string expected(convert_cases[i].utf8);
+    EXPECT_EQ(expected, converted);
+  }
+}
+
+#else
+// This test is only valid when wchar_t == UCS-4.
+TEST(StringUtilTest, ConvertUCS4ToUTF8) {
+  struct UCS4ToUTF8Case {
+    const wchar_t* ucs4;
+    const char* utf8;
+    bool success;
+  } convert_cases[] = {
+    // Regular 16-bit input.
+    {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+    // Test a non-BMP character.
+    {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+    // Invalid Unicode code points.
+    {L"\xffffHello", "Hello", false},
+    {L"\xfffffffHello", "Hello", false},
+    // The first character is a lone surrogate, which is not valid UCS-4.
+    {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+  };
+
+  for (size_t i = 0; i < arraysize(convert_cases); i++) {
+    std::string converted;
+    EXPECT_EQ(convert_cases[i].success,
+              WideToUTF8(convert_cases[i].ucs4,
+                         wcslen(convert_cases[i].ucs4),
+                         &converted));
+    std::string expected(convert_cases[i].utf8);
+    EXPECT_EQ(expected, converted);
+  }
+}
+#endif
+
 TEST(StringUtilTest, ConvertMultiString) {
   static wchar_t wmulti[] = {
     L'f', L'o', L'o', L'\0', |