author     jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2009-06-24 16:44:49 +0000
committer  jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2009-06-24 16:44:49 +0000
commit     8df44a01ec210a3e0c04191fb34b392727017a2c (patch)
tree       c0cabca440e09bc579955a9338219ffc27309e50 /base
parent     9f9c5296b022dc280cd38ff418f5177cf71856d6 (diff)
Pass through non-character codepoints in the UTF-8/16/32 and Wide conversion functions.
They're structurally valid code points, unlike malformed byte and surrogate sequences, so I believe it's better to leave them alone in conversion functions.
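
For reference, here is the structural-validity test this CL draws the line at (a minimal sketch mirroring the IsValidCodepoint helper added to base/string_util_icu.cc in the diff below):

    #include "base/basictypes.h"

    // Structurally valid means: not a surrogate code point
    // ([0xD800, 0xDFFF]) and not above U+10FFFF. Non-characters such as
    // U+FFFF, U+FDD0..U+FDEF and U+10FFFE pass; lone surrogates and
    // out-of-range values do not.
    inline bool IsValidCodepoint(uint32 code_point) {
      return code_point < 0xD800u ||
             (code_point >= 0xE000u && code_point <= 0x10FFFFu);
    }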
This CL was triggered by a file_util_unittest failure on Linux/Mac with my upcoming change to file_util::ReplaceIllegalCharacters (part of http://codereview.chromium.org/126223).
In addition, the upper bound for the output length in CodepageToWide was tightened.
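
To illustrate the tightened bound (a sketch of the reasoning, not code from the CL): a single-byte codepage character never maps to a non-BMP character, and a non-BMP character in a legacy multibyte encoding costs at least two input bytes, so the UTF-16 output never needs more than one 2-byte unit per input byte:

    // Hypothetical illustration; |encoded| is the codepage input string
    // as in CodepageToWide.
    // Old bound: assume every input character becomes a surrogate pair.
    size_t old_max = encoded.length() * 2 + 1;  // e.g. 10 bytes -> 21 units
    // New bound: a 1-byte char -> 1 UChar; a 4-byte GB18030 non-BMP char
    // -> 2 UChars. Either way, UChars <= input bytes (+1 for safety).
    size_t new_max = encoded.length() + 1;      // e.g. 10 bytes -> 11 units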
TEST=pass string_util and file_util unittests
BUG=NONE
Review URL: http://codereview.chromium.org/147038
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@19132 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--  base/file_util_unittest.cc    |  5
-rw-r--r--  base/string_util.h            | 14
-rw-r--r--  base/string_util_icu.cc       | 32
-rw-r--r--  base/string_util_unittest.cc  | 21

4 files changed, 56 insertions, 16 deletions
diff --git a/base/file_util_unittest.cc b/base/file_util_unittest.cc
index e0884f7..738bac8 100644
--- a/base/file_util_unittest.cc
+++ b/base/file_util_unittest.cc
@@ -839,11 +839,9 @@ static const struct goodbad_pair {
 #if defined(OS_WIN)
   {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
   {L"\t bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
-  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
 #elif defined(OS_POSIX)
   {L"bad*file?name.jpg", L"bad-file-name.jpg"},
   {L"\t bad*file?name/.jpg ", L"bad-file-name-.jpg"},
-  {L"bad\uFFFFfile-name.jpg ", L"bad-file-name.jpg"},
 #endif
   {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
   {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
@@ -851,6 +849,9 @@ static const struct goodbad_pair {
   {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
   // Unassigned codepoints are ok.
   {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
+  // Non-characters are not allowed.
+  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
+  {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
 };
 
 TEST_F(FileUtilTest, ReplaceIllegalCharactersTest) {
diff --git a/base/string_util.h b/base/string_util.h
index d17e7d7..9a033b4 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -186,6 +186,13 @@ string16 ASCIIToUTF16(const StringPiece& ascii);
 // do the best it can and put the result in the output buffer. The versions
 // that return strings ignore this error and just return the best conversion
 // possible.
+//
+// Note that only the structural validity is checked and non-character
+// and unassigned codepoints are regarded as valid.
+// TODO(jungshik): Consider replacing an invalid input sequence with
+// the Unicode replacement character or adding a |replacement_char| parameter.
+// Currently, an invalid sequence is skipped in the output, which could be
+// problematic in some situations.
 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output);
 std::string WideToUTF8(const std::wstring& wide);
 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output);
@@ -250,6 +257,13 @@ bool WideToLatin1(const std::wstring& wide, std::string* latin1);
 // string be 8-bit or UTF8? It contains only characters that are < 256 (in the
 // first case) or characters that use only 8-bits and whose 8-bit
 // representation looks like a UTF-8 string (the second case).
+//
+// Note that IsStringUTF8 checks not only if the input is structurally
+// valid but also if it doesn't contain any non-character codepoint
+// (e.g. U+FFFE). It's done on purpose because all the existing callers want
+// to have the maximum 'discriminating' power from other encodings. If
+// there's a use case for just checking the structural validity, we have to
+// add a new function for that.
 bool IsString8Bit(const std::wstring& str);
 bool IsStringUTF8(const std::string& str);
 bool IsStringWideUTF8(const std::wstring& str);
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index df3ee4f..87731de 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -16,6 +16,14 @@
 
 namespace {
 
+inline bool IsValidCodepoint(uint32 code_point) {
+  // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
+  // codepoints larger than 0x10FFFF (the highest codepoint allowed).
+  // Non-characters and unassigned codepoints are allowed.
+  return code_point < 0xD800u ||
+         (code_point >= 0xE000u && code_point <= 0x10FFFFu);
+}
+
 // ReadUnicodeCharacter -------------------------------------------------------
 
 // Reads a UTF-8 stream, placing the next code point into the given output
@@ -39,7 +47,7 @@ bool ReadUnicodeCharacter(const char* src, int32 src_len,
     (*char_index)--;
 
   // Validate the decoded value.
-  return U_IS_UNICODE_CHAR(code_point);
+  return IsValidCodepoint(code_point);
 }
 
 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.
@@ -62,7 +70,7 @@ bool ReadUnicodeCharacter(const char16* src, int32 src_len,
     *code_point = src[*char_index];
   }
 
-  return U_IS_UNICODE_CHAR(*code_point);
+  return IsValidCodepoint(*code_point);
 }
 
 #if defined(WCHAR_T_IS_UTF32)
@@ -73,7 +81,7 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
   *code_point = src[*char_index];
 
   // Validate the value.
-  return U_IS_UNICODE_CHAR(*code_point);
+  return IsValidCodepoint(*code_point);
 }
 #endif  // defined(WCHAR_T_IS_UTF32)
 
@@ -134,10 +142,13 @@ bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
   int32 src_len32 = static_cast<int32>(src_len);
   for (int32 i = 0; i < src_len32; i++) {
     uint32 code_point;
-    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point))
+    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
       WriteUnicodeCharacter(code_point, output);
-    else
+    } else {
+      // TODO(jungshik): consider adding the replacement character (U+FFFD)
+      // in place of an invalid codepoint.
       success = false;
+    }
   }
 
   return success;
 }
@@ -428,8 +439,15 @@ bool CodepageToWide(const std::string& encoded,
   if (!U_SUCCESS(status))
     return false;
 
-  // The worst case is all the input characters are non-BMP (32-bit) ones.
-  size_t uchar_max_length = encoded.length() * 2 + 1;
+  // Even in the worst case, the maximum length of the UTF-16 output in
+  // 2-byte units is at most the number of bytes in the input. There is
+  // no single-byte encoding in which a character is mapped to a non-BMP
+  // character requiring two 2-byte units.
+  //
+  // Moreover, non-BMP characters in legacy multibyte encodings
+  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
+  // BOCU and SCSU, but we don't care about them.
+  size_t uchar_max_length = encoded.length() + 1;
 
   UChar* uchar_dst;
 #if defined(WCHAR_T_IS_UTF16)
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 1087aea..6f196cc 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -309,8 +309,8 @@ TEST(StringUtilTest, ConvertUTF8ToWide) {
   } convert_cases[] = {
     // Regular UTF-8 input.
     {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
-    // Invalid Unicode code point.
-    {"\xef\xbf\xbfHello", L"Hello", false},
+    // A non-character is passed through.
+    {"\xef\xbf\xbfHello", L"\xffffHello", true},
     // Truncated UTF-8 sequence.
     {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
     // Truncated off the end.
@@ -319,11 +319,14 @@ TEST(StringUtilTest, ConvertUTF8ToWide) {
     {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
     // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
     {"\xed\xb0\x80", L"", false},
-    // Non-BMP character. The result will either be in UTF-16 or UTF-32.
+    // Non-BMP characters. The second is a non-character regarded as valid.
+    // The result will either be in UTF-16 or UTF-32.
 #if defined(WCHAR_T_IS_UTF16)
     {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+    {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true},
 #elif defined(WCHAR_T_IS_UTF32)
     {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+    {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true},
 #endif
   };
 
@@ -367,8 +370,9 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) {
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
-    // Invalid Unicode code point.
-    {L"\xffffHello", "Hello", false},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xe5\xa5\xbd", false},
     // Truncated at the end.
@@ -389,7 +393,7 @@
 #elif defined(WCHAR_T_IS_UTF32)
 // This test is only valid when wchar_t == UTF-32.
 TEST(StringUtilTest, ConvertUTF32ToUTF8) {
-  struct UTF8ToWideCase {
+  struct WideToUTF8Case {
     const wchar_t* utf32;
     const char* utf8;
     bool success;
@@ -398,11 +402,14 @@ TEST(StringUtilTest, ConvertUTF32ToUTF8) {
     {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
     // Test a non-BMP character.
     {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+    // Non-characters are passed through.
+    {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+    {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
     // Invalid Unicode code points.
-    {L"\xffffHello", "Hello", false},
     {L"\xfffffffHello", "Hello", false},
     // The first character is a truncated UTF-16 character.
     {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+    {L"\xdc01Hello", "Hello", false},
   };
 
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {
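
The new behavior in a nutshell (a hypothetical call sequence; the inputs and expected results are taken from the convert_cases tables above):

    std::wstring wide;
    // U+FFFF is a non-character but structurally valid: it is now passed
    // through and the conversion succeeds.
    bool ok = UTF8ToWide("\xEF\xBF\xBFHello", 8, &wide);  // ok == true
    // A lone surrogate is structurally invalid: it is still skipped in
    // the output and the conversion reports failure.
    ok = UTF8ToWide("\xED\xB0\x80", 3, &wide);            // ok == false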