summary | refs | log | tree | commit | diff | stats
path: root/base
diff options
context:
space:
mode:
author: cevans@chromium.org <cevans@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-01 22:16:38 +0000
committer: cevans@chromium.org <cevans@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-01 22:16:38 +0000
commit: d7a3e8ec24958958db28dba44542a2c126d94e88 (patch)
tree: 624b1ccbf82d1bd2586088d624b465c4cfa72ee8 /base
parent: 4838a195200c971b1c81bddf7e483f4b95b2017a (diff)
download: chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.zip
chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.tar.gz
chromium_src-d7a3e8ec24958958db28dba44542a2c126d94e88.tar.bz2
If we can't read a Unicode character, write the standard "unknown" replacement character (U+FFFD) in its place instead of silently dropping it. This prevents security issues where the previous behaviour (skipping the invalid sequence) could be used to strip characters out of a string after it had passed some validation.
BUG=30798
TEST=utf_string_conversions_unittest.cc,utf_offset_string_conversions_unittest.cc,zip_unittest.cc
Review URL: http://codereview.chromium.org/522029
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@35430 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- base/utf_offset_string_conversions.cc 3
-rw-r--r-- base/utf_offset_string_conversions_unittest.cc 2
-rw-r--r-- base/utf_string_conversions.cc 3
-rw-r--r-- base/utf_string_conversions_unittest.cc 18
4 files changed, 12 insertions, 14 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc
index 69b572e..4c47ef8 100644
--- a/base/utf_offset_string_conversions.cc
+++ b/base/utf_offset_string_conversions.cc
@@ -36,8 +36,7 @@ bool ConvertUnicode(const SRC_CHAR* src,
if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
chars_written = WriteUnicodeCharacter(code_point, output);
} else {
- // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
- // in place of an invalid codepoint.
+ chars_written = WriteUnicodeCharacter(0xFFFD, output);
success = false;
}
if ((output_offset != std::wstring::npos) &&
diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc
index 00d87d3..4f13ab3 100644
--- a/base/utf_offset_string_conversions_unittest.cc
+++ b/base/utf_offset_string_conversions_unittest.cc
@@ -43,7 +43,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
{"", 0, std::wstring::npos},
{"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos},
{"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
- {"\xed\xb0\x80z", 3, 0},
+ {"\xed\xb0\x80z", 3, 1},
{"A\xF0\x90\x8C\x80z", 1, 1},
{"A\xF0\x90\x8C\x80z", 2, std::wstring::npos},
#if defined(WCHAR_T_IS_UTF16)
diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc
index 7376933..d517e1b 100644
--- a/base/utf_string_conversions.cc
+++ b/base/utf_string_conversions.cc
@@ -32,8 +32,7 @@ bool ConvertUnicode(const SRC_CHAR* src,
if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
WriteUnicodeCharacter(code_point, output);
} else {
- // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
- // in place of an invalid codepoint.
+ WriteUnicodeCharacter(0xFFFD, output);
success = false;
}
}
diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc
index 19189971..6ba0b5b 100644
--- a/base/utf_string_conversions_unittest.cc
+++ b/base/utf_string_conversions_unittest.cc
@@ -94,13 +94,13 @@ TEST(UTFStringConversionsTest, ConvertUTF8ToWide) {
// Non-character is passed through.
{"\xef\xbf\xbfHello", L"\xffffHello", true},
// Truncated UTF-8 sequence.
- {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
+ {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false},
// Truncated off the end.
- {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false},
+ {"\xe5\xa5\xbd\xe4\xa0", L"\x597d\xfffd", false},
// Non-shortest-form UTF-8.
- {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
+ {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false},
// This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
- {"\xed\xb0\x80", L"", false},
+ {"\xed\xb0\x80", L"\xfffd", false},
// Non-BMP characters. The second is a non-character regarded as valid.
// The result will either be in UTF-16 or UTF-32.
#if defined(WCHAR_T_IS_UTF16)
@@ -152,9 +152,9 @@ TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) {
{L"\xffffHello", "\xEF\xBF\xBFHello", true},
{L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
// The first character is a truncated UTF-16 character.
- {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+ {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
// Truncated at the end.
- {L"\x597d\xd800", "\xe5\xa5\xbd", false},
+ {L"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", false},
};
for (int i = 0; i < arraysize(convert_cases); i++) {
@@ -184,10 +184,10 @@ TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) {
{L"\xffffHello", "\xEF\xBF\xBFHello", true},
{L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
// Invalid Unicode code points.
- {L"\xfffffffHello", "Hello", false},
+ {L"\xfffffffHello", "\xEF\xBF\xBDHello", false},
// The first character is a truncated UTF-16 character.
- {L"\xd800\x597d", "\xe5\xa5\xbd", false},
- {L"\xdc01Hello", "Hello", false},
+ {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
+ {L"\xdc01Hello", "\xef\xbf\xbdHello", false},
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {