diff options
author | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-01 00:13:10 +0000 |
---|---|---|
committer | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-01 00:13:10 +0000 |
commit | 656e3b3857e315a4a6386944fb140ef202580f77 (patch) | |
tree | 37fc49db14f3b43a03da33ef58dddfff9b58be2f /base | |
parent | 660efb2db208d7a64e04eebad1e0e1dd7b54f3b0 (diff) | |
download | chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.zip chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.tar.gz chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.tar.bz2 |
Write our own utf8<->wide conversion functions. This gives us more control over error handling instead of getting a blank string for invalid encodings. It also allows us to decrease the amount of platform-specific code.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@211 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/string_util.cc | 21 | ||||
-rw-r--r-- | base/string_util.h | 10 | ||||
-rw-r--r-- | base/string_util_icu.cc | 169 | ||||
-rw-r--r-- | base/string_util_mac.cc | 110 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 117 | ||||
-rw-r--r-- | base/string_util_win.cc | 10 |
6 files changed, 316 insertions, 121 deletions
diff --git a/base/string_util.cc b/base/string_util.cc index 2122b9f..faf5ef9 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -250,6 +250,27 @@ std::wstring ASCIIToWide(const std::string& ascii) { return std::wstring(ascii.begin(), ascii.end()); } +std::string WideToUTF8(const std::wstring& wide) { + std::string ret; + if (wide.empty()) + return ret; + + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + WideToUTF8(wide.data(), wide.length(), &ret); + return ret; +} + +// Similar to the Wide->UTF8 version above. +std::wstring UTF8ToWide(const std::string& utf8) { + std::wstring ret; + if (utf8.empty()) + return ret; + + UTF8ToWide(utf8.data(), utf8.length(), &ret); + return ret; +} + // Latin1 is just the low range of Unicode, so we can copy directly to convert. bool WideToLatin1(const std::wstring& wide, std::string* latin1) { std::string output; diff --git a/base/string_util.h b/base/string_util.h index e5fd147..340a7eb 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -155,9 +155,15 @@ std::wstring CollapseWhitespace(const std::wstring& text, std::string WideToASCII(const std::wstring& wide); std::wstring ASCIIToWide(const std::string& ascii); -// These convert between UTF8 and UTF16 strings. They are potentially slow, -// so avoid unnecessary conversions. Most things should be in UTF16. +// These convert between UTF8 and UTF16 strings. They are potentially slow, so +// avoid unnecessary conversions. Most things should be in wide. The low-level +// versions return a boolean indicating whether the conversion was 100% valid. +// In this case, it will still do the best it can and put the result in the +// output buffer. The versions that return strings ignore this error and just +// return the best conversion possible. 
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); std::string WideToUTF8(const std::wstring& wide); +bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); std::wstring UTF8ToWide(const std::string& utf8); // Converts between wide strings and whatever the native multibyte encoding diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc index 797ccbd..6df5581 100644 --- a/base/string_util_icu.cc +++ b/base/string_util_icu.cc @@ -38,6 +38,175 @@ #include "unicode/numfmt.h" #include "unicode/ustring.h" +namespace { + +// ReadUnicodeCharacter -------------------------------------------------------- + +// Reads a UTF-8 stream, placing the next code point into the given output +// |*code_point|. |src| represents the entire string to read, and |*char_index| +// is the character offset within the string to start reading at. |*char_index| +// will be updated to index the last character read, such that incrementing it +// (as in a for loop) will take the reader to the next character. +// +// Returns true on success. On false, |*code_point| will be invalid. +bool ReadUnicodeCharacter(const char* src, int32 src_len, + int32* char_index, uint32* code_point) { + U8_NEXT(src, *char_index, src_len, *code_point); + + // The ICU macro above moves to the next char, we want to point to the last + // char consumed. + (*char_index)--; + + // Validate the decoded value. + return U_IS_UNICODE_CHAR(*code_point); +} + +#ifdef WIN32 +// Reads a UTF-16 character for Windows. The usage is the same as the 8-bit +// version above. +bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, + int32* char_index, uint32* code_point) { + if (U16_IS_SURROGATE(src[*char_index])) { + if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || + *char_index + 1 >= src_len || + !U16_IS_TRAIL(src[*char_index + 1])) { + // Invalid surrogate pair. + return false; + } + + // Valid surrogate pair. 
+ *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], + src[*char_index + 1]); + (*char_index)++; + } else { + // Not a surrogate, just one 16-bit word. + *code_point = src[*char_index]; + } + + return U_IS_UNICODE_CHAR(*code_point); +} +#else +// Reads a 32-bit character for Mac and Linux systems. The usage is the same as +// the 8-bit version above. +bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, + int32* char_index, uint32* code_point) { + // Conversion is easy since the source is 32-bit. + *code_point = src[*char_index]; + + // Validate the value. + return U_IS_UNICODE_CHAR(*code_point); +} +#endif + +// WriteUnicodeCharacter ------------------------------------------------------- + +// Appends a UTF-8 character to the given 8-bit string. +void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) { + if (code_point <= 0x7f) { + // Fast path the common case of one byte. + output->push_back(code_point); + return; + } + + // U8_APPEND_UNSAFE can append up to 4 bytes. + int32 char_offset = static_cast<int32>(output->length()); + output->resize(char_offset + U8_MAX_LENGTH); + + U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + + // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so + // it will represent the new length of the string. + output->resize(char_offset); +} + +#ifdef WIN32 +// Appends the given code point as a UTF-16 character to the STL string. On +// Windows, wchar_t is UTF-16. +void WriteUnicodeCharacter(uint32 code_point, + std::basic_string<wchar_t>* output) { + if (U16_LENGTH(code_point) == 1) { + // This code point is in the Basic Multilingual Plane (BMP). + output->push_back(static_cast<wchar_t>(code_point)); + } else { + // Non-BMP characters use a double-character encoding. 
+ int32 char_offset = static_cast<int32>(output->length()); + output->resize(char_offset + U16_MAX_LENGTH); + U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + } +} +#else +// Appends the given UCS-4 character to the given 32-bit string for Linux and +// Mac where wchar_t is UCS-4. +inline void WriteUnicodeCharacter(uint32 code_point, + std::basic_string<wchar_t>* output) { + // This is the easy case, just append the character. + output->push_back(code_point); +} +#endif + +// Generalized Unicode converter ----------------------------------------------- + +// Converts the given source Unicode character type to the given destination +// Unicode character type as a STL string. The given input buffer and size +// determine the source, and the given output STL string will be replaced by +// the result. +template<typename SRC_CHAR, typename DEST_CHAR> +bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, + std::basic_string<DEST_CHAR>* output) { + output->clear(); + + // ICU requires 32-bit numbers. + bool success = true; + int32 src_len32 = static_cast<int32>(src_len); + for (int32 i = 0; i < src_len32; i++) { + uint32 code_point; + if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) + WriteUnicodeCharacter(code_point, output); + else + success = false; + } + return success; +} + +} // namespace + +// UTF-x <-> UTF-x ------------------------------------------------------------- + +bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { + if (src_len == 0) { + output->clear(); + return true; + } + + // Intelligently guess the size of the output string. When it's an ASCII + // character, assume the rest will be ASCII and use a buffer size the same as + // the input. When it's not ASCII, assume 3-bytes per character as the + // starting point. This will be resized internally later if it's too small. 
+ if (src[0] < 0x80) + output->reserve(src_len); + else + output->reserve(src_len * 3); + return ConvertUnicode<wchar_t, char>(src, src_len, output); +} + +bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + if (src_len == 0) { + output->clear(); + return true; + } + + // Intelligently guess the size of the output string. When it's an ASCII + // character, assume the rest will be ASCII and use a buffer size the same as + // the input. When it's not ASCII, assume the UTF-8 takes 2 bytes per + // character (this is more conservative than 3 which we use above when + // converting the other way). + if (src[0] < 0x80) + output->reserve(src_len); + else + output->reserve(src_len / 2); + return ConvertUnicode<char, wchar_t>(src, src_len, output); +} + // Codepage <-> Wide ----------------------------------------------------------- // Convert a unicode string into the specified codepage_name. If the codepage diff --git a/base/string_util_mac.cc b/base/string_util_mac.cc index 4c5f3dc..76b72b0 100644 --- a/base/string_util_mac.cc +++ b/base/string_util_mac.cc @@ -44,7 +44,7 @@ // routines. template<typename CharType> static inline bool StrNCpyT(CharType* dst, const CharType* src, - size_t dst_size, size_t src_size) { + size_t dst_size, size_t src_size) { // The initial value of count has room for a NUL terminator. size_t count = std::min(dst_size, src_size + 1); if (count == 0) @@ -105,114 +105,6 @@ static void InitializeStatics() { pthread_once(&pthread_once_initialized, DoInitializeStatics); } -// Convert the supplied cfsring into the specified encoding, and return it as -// an STL string of the template type. Returns an empty string on failure. 
-template<typename StringType> -static StringType CFStringToSTLStringWithEncodingT(CFStringRef cfstring, - CFStringEncoding encoding) { - CFIndex length = CFStringGetLength(cfstring); - if (length == 0) - return StringType(); - - CFRange whole_string = CFRangeMake(0, length); - CFIndex out_size; - CFIndex converted = CFStringGetBytes(cfstring, - whole_string, - encoding, - 0, // lossByte - false, // isExternalRepresentation - NULL, // buffer - 0, // maxBufLen - &out_size); - DCHECK(converted != 0 && out_size != 0); - if (converted == 0 || out_size == 0) - return StringType(); - - // out_size is the number of UInt8-sized units needed in the destination. - // A buffer allocated as UInt8 units might not be properly aligned to - // contain elements of StringType::value_type. Use a container for the - // proper value_type, and convert out_size by figuring the number of - // value_type elements per UInt8. Leave room for a NUL terminator. - typename StringType::size_type elements = - out_size * sizeof(UInt8) / sizeof(typename StringType::value_type) + 1; - - // Make sure that integer truncation didn't occur. For the conversions done - // here, it never should. - DCHECK(((out_size * sizeof(UInt8)) % - sizeof(typename StringType::value_type)) == 0); - - std::vector<typename StringType::value_type> out_buffer(elements); - converted = CFStringGetBytes(cfstring, - whole_string, - encoding, - 0, // lossByte - false, // isExternalRepresentation - reinterpret_cast<UInt8*>(&out_buffer[0]), - out_size, - NULL); // usedBufLen - DCHECK(converted != 0); - if (converted == 0) - return StringType(); - - out_buffer[elements - 1] = '\0'; - return StringType(&out_buffer[0]); -} - -// Given an STL string |in| with an encoding specified by |in_encoding|, -// convert it to |out_encoding| and return it as an STL string of the -// |OutStringType| template type. Returns an empty string on failure. 
-template<typename OutStringType, typename InStringType> -static OutStringType STLStringToSTLStringWithEncodingsT( - const InStringType& in, - CFStringEncoding in_encoding, - CFStringEncoding out_encoding) { - typename InStringType::size_type in_length = in.length(); - if (in_length == 0) - return OutStringType(); - - scoped_cftyperef<CFStringRef> cfstring( - CFStringCreateWithBytesNoCopy(NULL, - reinterpret_cast<const UInt8*>(in.c_str()), - in_length * - sizeof(typename InStringType::value_type), - in_encoding, - false, - kCFAllocatorNull)); - DCHECK(cfstring); - if (!cfstring) - return OutStringType(); - - return CFStringToSTLStringWithEncodingT<OutStringType>(cfstring, - out_encoding); -} - -// Specify the byte ordering explicitly, otherwise CFString will be confused -// when strings don't carry BOMs, as they typically won't. -static const CFStringEncoding kNarrowStringEncoding = kCFStringEncodingUTF8; -#ifdef __BIG_ENDIAN__ -#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16BE; -#else // __WCHAR_MAX__ -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32BE; -#endif // __WCHAR_MAX__ -#else // __BIG_ENDIAN__ -#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16LE; -#else // __WCHAR_MAX__ -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32LE; -#endif // __WCHAR_MAX__ -#endif // __BIG_ENDIAN__ - -std::string WideToUTF8(const std::wstring& wide) { - return STLStringToSTLStringWithEncodingsT<std::string>( - wide, kWideStringEncoding, kNarrowStringEncoding); -} - -std::wstring UTF8ToWide(const std::string& utf8) { - return STLStringToSTLStringWithEncodingsT<std::wstring>( - utf8, kNarrowStringEncoding, kWideStringEncoding); -} - // Technically, the native multibyte encoding would be the encoding returned // by CFStringGetSystemEncoding or GetApplicationTextEncoding, but 
I can't // imagine anyone needing or using that from these APIs, so just treat UTF-8 diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index c6ff622..6d19b0e 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -183,6 +183,123 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { EXPECT_EQ(wempty, UTF8ToWide(empty)); } +// This tests the current behavior of our UTF-8/UTF-16 conversion. On Windows, +// we just use the platform functions which strip invalid characters. This isn't +// necessarily the best behavior, we may want to write our own converter using +// ICU to get more customized results (for example, substituting the +// "replacement character" U+FFFD for invalid sequences). +TEST(StringUtilTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Invalid Unicode code point. + {"\xef\xbf\xbfHello", L"Hello", false}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"", false}, + // Non-BMP character. The result will either be in UTF-16 or UCS-4. +#ifdef WIN32 + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, +#else + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, +#endif + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::wstring converted; + EXPECT_EQ(convert_cases[i].success, + UTF8ToWide(convert_cases[i].utf8, + strlen(convert_cases[i].utf8), + &converted)); + std::wstring expected(convert_cases[i].wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. 
+ std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3, converted.length()); + EXPECT_EQ(0, converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#ifdef WIN32 +// This test is only valid when wchar_t == UTF-16. +TEST(StringUtilTest, ConvertUTF16ToUTF8) { + struct UTF16ToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Invalid Unicode code point. + {L"\xffffHello", "Hello", false}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + // Truncated at the end. + {L"\x597d\xd800", "\xe5\xa5\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} + +#else +// This test is only valid when wchar_t == UCS-4. +TEST(StringUtilTest, ConvertUCS4ToUTF8) { + struct UTF8ToWideCase { + const wchar_t* ucs4; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Invalid Unicode code points. + {L"\xffffHello", "Hello", false}, + {L"\xfffffffHello", "Hello", false}, + // The first character is a truncated UTF-16 character. 
+ {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].ucs4, + wcslen(convert_cases[i].ucs4), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} +#endif + TEST(StringUtilTest, ConvertMultiString) { static wchar_t wmulti[] = { L'f', L'o', L'o', L'\0', diff --git a/base/string_util_win.cc b/base/string_util_win.cc index 6cad854..53044cc 100644 --- a/base/string_util_win.cc +++ b/base/string_util_win.cc @@ -76,16 +76,6 @@ static std::wstring MultiByteToWide(const std::string& mb, UINT code_page) { return wide; } -// Wide <--> UTF-8 -std::string WideToUTF8(const std::wstring& wide) { - - return WideToMultiByte(wide, CP_UTF8); -} - -std::wstring UTF8ToWide(const std::string& utf8) { - return MultiByteToWide(utf8, CP_UTF8); -} - // Wide <--> native multibyte std::string WideToNativeMB(const std::wstring& wide) { return WideToMultiByte(wide, CP_ACP); |