Pass through non-character code points in the UTF-8, UTF-16, UTF-32, and Wide conversion functions.

Non-character code points are structurally valid, unlike malformed byte or surrogate sequences, so I believe it's better to leave them alone in the conversion functions. This CL was triggered by a file_util_unittest failure on Linux/Mac with my upcoming change to file_util::ReplaceIllegalCharacters (part of http://codereview.chromium.org/126223). In addition, the upper bound for the output length in CodepageToWide was tightened. TEST=pass string_util and file_util unittests BUG=NONE Review URL: http://codereview.chromium.org/147038 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@19132 0039d316-1c4b-4281-b951-d872f2087c98
author: jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-06-24 16:44:49 +0000
committer: jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-06-24 16:44:49 +0000
commit: 8df44a01ec210a3e0c04191fb34b392727017a2c (patch)
tree: c0cabca440e09bc579955a9338219ffc27309e50 /base/string_util_unittest.cc
parent: 9f9c5296b022dc280cd38ff418f5177cf71856d6 (diff)
download: chromium_src-8df44a01ec210a3e0c04191fb34b392727017a2c.zip
chromium_src-8df44a01ec210a3e0c04191fb34b392727017a2c.tar.gz
chromium_src-8df44a01ec210a3e0c04191fb34b392727017a2c.tar.bz2
1 file changed, 14 insertions, 7 deletions
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 1087aea..6f196cc 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -309,8 +309,8 @@ TEST(StringUtilTest, ConvertUTF8ToWide) {
   } convert_cases[] = {
     // Regular UTF-8 input.
     {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
-    // Invalid Unicode code point.
-    {"\xef\xbf\xbfHello", L"Hello", false},
+    // Non-character is passed through.
+    {"\xef\xbf\xbfHello", L"\xffffHello", true},
     // Truncated UTF-8 sequence.
     {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
     // Truncated off the end.
@@ -319,11 +319,14 @@ TEST(StringUtilTest, ConvertUTF8ToWide) {
     {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
     // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
     {"\xed\xb0\x80", L"", false},
-    // Non-BMP character. The result will either be in UTF-16 or UTF-32.
+    // Non-BMP characters. The second is a non-character regarded as valid.
+    // The result will either be in UTF-16 or UTF-32.
 #if defined(WCHAR_T_IS_UTF16)
     {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+    {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true},
 #elif defined(WCHAR_T_IS_UTF32)
     {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+    {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true},
 #endif
   };
 
@@ -367,8 +370,9 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) {
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
-    // Invalid Unicode code point.
-    {L"\xffffHello", "Hello", false},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xe5\xa5\xbd", false},
     // Truncated at the end.
@@ -389,7 +393,7 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) {
 #elif defined(WCHAR_T_IS_UTF32)
 // This test is only valid when wchar_t == UTF-32.
 TEST(StringUtilTest, ConvertUTF32ToUTF8) {
-  struct UTF8ToWideCase {
+  struct WideToUTF8Case {
     const wchar_t* utf32;
     const char* utf8;
     bool success;
@@ -398,11 +402,14 @@ TEST(StringUtilTest, ConvertUTF32ToUTF8) {
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // Invalid Unicode code points.
-    {L"\xffffHello", "Hello", false},
     {L"\xfffffffHello", "Hello", false},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+    {L"\xdc01Hello", "Hello", false},
   };
 
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {
author	jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-06-24 16:44:49 +0000
committer	jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-06-24 16:44:49 +0000
commit	8df44a01ec210a3e0c04191fb34b392727017a2c (patch)
tree	c0cabca440e09bc579955a9338219ffc27309e50 /base/string_util_unittest.cc
parent	9f9c5296b022dc280cd38ff418f5177cf71856d6 (diff)
download	chromium_src-8df44a01ec210a3e0c04191fb34b392727017a2c.zip chromium_src-8df44a01ec210a3e0c04191fb34b392727017a2c.tar.gz chromium_src-8df44a01ec210a3e0c04191fb34b392727017a2c.tar.bz2