From 8df44a01ec210a3e0c04191fb34b392727017a2c Mon Sep 17 00:00:00 2001 From: "jshin@chromium.org" Date: Wed, 24 Jun 2009 16:44:49 +0000 Subject: Pass through non-character code points in UTF-8, UTF-16, UTF-32 and Wide conversion functions. They're structurally valid code points, unlike malformed byte/surrogate sequences. I believe it's better to leave them alone in conversion functions. This CL was triggered by a file_util_unittest failure on Linux/Mac with my upcoming change to file_util::ReplaceIllegalCharacters (a part of http://codereview.chromium.org/126223 ). In addition, the upper bound for the output length in CodepageToWide was tightened. TEST=pass string_util and file_util unittests BUG=NONE Review URL: http://codereview.chromium.org/147038 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@19132 0039d316-1c4b-4281-b951-d872f2087c98 --- base/string_util_unittest.cc | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'base/string_util_unittest.cc') diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 1087aea..6f196cc 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -309,8 +309,8 @@ TEST(StringUtilTest, ConvertUTF8ToWide) { } convert_cases[] = { // Regular UTF-8 input. {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, - // Invalid Unicode code point. - {"\xef\xbf\xbfHello", L"Hello", false}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, // Truncated UTF-8 sequence. {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, // Truncated off the end. @@ -319,11 +319,14 @@ TEST(StringUtilTest, ConvertUTF8ToWide) { {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. {"\xed\xb0\x80", L"", false}, - // Non-BMP character. The result will either be in UTF-16 or UTF-32. + // Non-BMP characters. The second is a non-character regarded as valid. 
+ // The result will either be in UTF-16 or UTF-32. #if defined(WCHAR_T_IS_UTF16) {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, #elif defined(WCHAR_T_IS_UTF32) {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, #endif }; @@ -367,8 +370,9 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) { {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, // Test a non-BMP character. {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, - // Invalid Unicode code point. - {L"\xffffHello", "Hello", false}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, // The first character is a truncated UTF-16 character. {L"\xd800\x597d", "\xe5\xa5\xbd", false}, // Truncated at the end. @@ -389,7 +393,7 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) { #elif defined(WCHAR_T_IS_UTF32) // This test is only valid when wchar_t == UTF-32. TEST(StringUtilTest, ConvertUTF32ToUTF8) { - struct UTF8ToWideCase { + struct WideToUTF8Case { const wchar_t* utf32; const char* utf8; bool success; @@ -398,11 +402,14 @@ TEST(StringUtilTest, ConvertUTF32ToUTF8) { {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, // Test a non-BMP character. {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, // Invalid Unicode code points. - {L"\xffffHello", "Hello", false}, {L"\xfffffffHello", "Hello", false}, // The first character is a truncated UTF-16 character. {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + {L"\xdc01Hello", "Hello", false}, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { -- cgit v1.1