If we can't read a Unicode character, write the standard replacement character (U+FFFD) in its place instead of silently dropping the invalid input. This prevents security issues where the current behaviour can be used to strip characters out of a string after it has passed some validation.

BUG=30798
TEST=utf_string_conversions_unittest.cc,utf_offset_string_conversions_unittest.cc,zip_unittest.cc
Review URL: http://codereview.chromium.org/522029
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@35430 0039d316-1c4b-4281-b951-d872f2087c98
author: cevans@chromium.org <cevans@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-01 22:16:38 +0000
committer: cevans@chromium.org <cevans@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-01 22:16:38 +0000
commit: d7a3e8ec24958958db28dba44542a2c126d94e88 (patch)
tree: 624b1ccbf82d1bd2586088d624b465c4cfa72ee8 /base
parent: 4838a195200c971b1c81bddf7e483f4b95b2017a (diff)
download: chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.zip
chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.tar.gz
chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.tar.bz2
4 files changed, 12 insertions, 14 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc
index 69b572e..4c47ef8 100644
--- a/base/utf_offset_string_conversions.cc
+++ b/base/utf_offset_string_conversions.cc
@@ -36,8 +36,7 @@ bool ConvertUnicode(const SRC_CHAR* src,
     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
       chars_written = WriteUnicodeCharacter(code_point, output);
     } else {
-      // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
-      // in place of an invalid codepoint.
+      chars_written = WriteUnicodeCharacter(0xFFFD, output);
       success = false;
     }
     if ((output_offset != std::wstring::npos) &&
diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc
index 00d87d3..4f13ab3 100644
--- a/base/utf_offset_string_conversions_unittest.cc
+++ b/base/utf_offset_string_conversions_unittest.cc
@@ -43,7 +43,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
     {"", 0, std::wstring::npos},
     {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos},
     {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
-    {"\xed\xb0\x80z", 3, 0},
+    {"\xed\xb0\x80z", 3, 1},
     {"A\xF0\x90\x8C\x80z", 1, 1},
     {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos},
 #if defined(WCHAR_T_IS_UTF16)
diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc
index 7376933..d517e1b 100644
--- a/base/utf_string_conversions.cc
+++ b/base/utf_string_conversions.cc
@@ -32,8 +32,7 @@ bool ConvertUnicode(const SRC_CHAR* src,
     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
       WriteUnicodeCharacter(code_point, output);
     } else {
-      // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
-      // in place of an invalid codepoint.
+      WriteUnicodeCharacter(0xFFFD, output);
       success = false;
     }
   }
diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc
index 19189971..6ba0b5b 100644
--- a/base/utf_string_conversions_unittest.cc
+++ b/base/utf_string_conversions_unittest.cc
@@ -94,13 +94,13 @@ TEST(UTFStringConversionsTest, ConvertUTF8ToWide) {
     // Non-character is passed through.
     {"\xef\xbf\xbfHello", L"\xffffHello", true},
     // Truncated UTF-8 sequence.
-    {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
+    {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false},
     // Truncated off the end.
-    {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false},
+    {"\xe5\xa5\xbd\xe4\xa0", L"\x597d\xfffd", false},
     // Non-shortest-form UTF-8.
-    {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
+    {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false},
     // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
-    {"\xed\xb0\x80", L"", false},
+    {"\xed\xb0\x80", L"\xfffd", false},
     // Non-BMP characters. The second is a non-character regarded as valid.
     // The result will either be in UTF-16 or UTF-32.
 #if defined(WCHAR_T_IS_UTF16)
@@ -152,9 +152,9 @@ TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) {
     {L"\xffffHello", "\xEF\xBF\xBFHello", true},
     {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // The first character is a truncated UTF-16 character.
-    {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+    {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
     // Truncated at the end.
-    {L"\x597d\xd800", "\xe5\xa5\xbd", false},
+    {L"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", false},
   };
 
   for (int i = 0; i < arraysize(convert_cases); i++) {
@@ -184,10 +184,10 @@ TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) {
     {L"\xffffHello", "\xEF\xBF\xBFHello", true},
     {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // Invalid Unicode code points.
-    {L"\xfffffffHello", "Hello", false},
+    {L"\xfffffffHello", "\xEF\xBF\xBDHello", false},
     // The first character is a truncated UTF-16 character.
-    {L"\xd800\x597d", "\xe5\xa5\xbd", false},
-    {L"\xdc01Hello", "Hello", false},
+    {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
+    {L"\xdc01Hello", "\xef\xbf\xbdHello", false},
   };
 
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {
author	cevans@chromium.org <cevans@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-01-01 22:16:38 +0000
committer	cevans@chromium.org <cevans@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-01-01 22:16:38 +0000
commit	d7a3e8ec24958958db28dba44542a2c126d94e88 (patch)
tree	624b1ccbf82d1bd2586088d624b465c4cfa72ee8 /base
parent	4838a195200c971b1c81bddf7e483f4b95b2017a (diff)
download	chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.zip chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.tar.gz chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.tar.bz2