diff options
-rw-r--r-- | base/string_util.h | 18 | ||||
-rw-r--r-- | base/string_util_icu.cc | 89 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 11 |
3 files changed, 94 insertions, 24 deletions
diff --git a/base/string_util.h b/base/string_util.h index 1a4080a..46269ff 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -37,6 +37,7 @@ #include <stdarg.h> // va_list #include "base/basictypes.h" +#include "base/string16.h" // Safe standard library wrappers for all platforms. @@ -152,17 +153,22 @@ std::wstring CollapseWhitespace(const std::wstring& text, std::string WideToASCII(const std::wstring& wide); std::wstring ASCIIToWide(const std::string& ascii); -// These convert between UTF8 and UTF16 strings. They are potentially slow, so -// avoid unnecessary conversions. Most things should be in wide. The low-level -// versions return a boolean indicating whether the conversion was 100% valid. -// In this case, it will still do the best it can and put the result in the -// output buffer. The versions that return strings ignore this error and just -// return the best conversion possible. +// These convert between UTF-8, -16, and -32 strings. They are potentially slow, +// so avoid unnecessary conversions. The low-level versions return a boolean +// indicating whether the conversion was 100% valid. In this case, it will still +// do the best it can and put the result in the output buffer. The versions that +// return strings ignore this error and just return the best conversion +// possible. bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); std::string WideToUTF8(const std::wstring& wide); bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); std::wstring UTF8ToWide(const std::string& utf8); +bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output); +std::string16 WideToUTF16(const std::wstring& wide); +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output); +std::wstring UTF16ToWide(const std::string16& utf8); + // Defines the error handling modes of WideToCodepage and CodepageToWide. class OnStringUtilConversionError { public: diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc index 534ca88..895a03e 100644 --- a/base/string_util_icu.cc +++ b/base/string_util_icu.cc @@ -62,9 +62,8 @@ bool ReadUnicodeCharacter(const char* src, int32 src_len, return U_IS_UNICODE_CHAR(*code_point); } -#if defined(WCHAR_T_IS_UTF16) // Reads a UTF-16 character. The usage is the same as the 8-bit version above. -bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, +bool ReadUnicodeCharacter(const char16* src, int32 src_len, int32* char_index, uint32* code_point) { if (U16_IS_SURROGATE(src[*char_index])) { if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || @@ -85,10 +84,11 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, return U_IS_UNICODE_CHAR(*code_point); } -#elif defined(WCHAR_T_IS_UTF32) + +#if defined(WCHAR_T_IS_UTF32) // Reads UTF-32 character. The usage is the same as the 8-bit version above. -bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, - int32* char_index, uint32* code_point) { +bool ReadUTF32Character(const wchar_t* src, int32 src_len, + int32* char_index, uint32* code_point) { // Conversion is easy since the source is 32-bit. *code_point = src[*char_index]; @@ -118,13 +118,12 @@ void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) { output->resize(char_offset); } -#if defined(WCHAR_T_IS_UTF16) // Appends the given code point as a UTF-16 character to the STL string. void WriteUnicodeCharacter(uint32 code_point, - std::basic_string<wchar_t>* output) { + std::basic_string<char16>* output) { if (U16_LENGTH(code_point) == 1) { // Thie code point is in the Basic Multilingual Plane (BMP). - output->push_back(static_cast<wchar_t>(code_point)); + output->push_back(static_cast<char16>(code_point)); } else { // Non-BMP characters use a double-character encoding. int32 char_offset = static_cast<int32>(output->length()); @@ -132,7 +131,8 @@ void WriteUnicodeCharacter(uint32 code_point, U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); } } -#elif defined(WCHAR_T_IS_UTF32) + +#if defined(WCHAR_T_IS_UTF32) // Appends the given UTF-32 character to the given 32-bit string. inline void WriteUnicodeCharacter(uint32 code_point, std::basic_string<wchar_t>* output) { @@ -167,7 +167,7 @@ bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, } // namespace -// UTF-x <-> UTF-x ------------------------------------------------------------- +// UTF-8 <-> Wide -------------------------------------------------------------- std::string WideToUTF8(const std::wstring& wide) { std::string ret; @@ -224,6 +224,75 @@ bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { return ConvertUnicode<char, wchar_t>(src, src_len, output); } +// UTF-16 <-> Wide ------------------------------------------------------------- + +#if defined(WCHAR_T_IS_UTF16) + +// When wide == UTF-16, then conversions are a NOP. +std::string16 WideToUTF16(const std::wstring& wide) { + return wide; +} + +bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output) { + output->assign(src, src_len); + return true; +} + +std::wstring UTF16ToWide(const std::string16& utf16) { + return utf16; +} + +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { + output->assign(src, src_len); + return true; +} + +#elif defined(WCHAR_T_IS_UTF32) + +std::string16 WideToUTF16(const std::wstring& wide) { + std::string16 ret; + if (wide.empty()) + return ret; + + UTF8ToWide(wide.data(), wide.length(), &ret); + return ret; +} + +bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output) { + if (src_len == 0) { + output->clear(); + return true; + } + + // Assume that normally we won't have any non-BMP characters so the counts + // will be the same. + output->reserve(src_len); + return ConvertUnicode<wchar_t, char16>(src, src_len, output); +} + +std::wstring UTF16ToWide(const std::string16& utf16) { + std::wstring ret; + if (utf16.empty()) + return ret; + + UTF8ToWide(utf16.data(), utf16.length(), &ret); + return ret; +} + +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { + if (src_len == 0) { + output->clear(); + return true; + } + + // Assume that normally we won't have any non-BMP characters so the counts + // will be the same. + output->reserve(src_len); + return ConvertUnicode<char16, wchar_t>(src, src_len, output); +} + +#endif // defined(WCHAR_T_IS_UTF32) + // Codepage <-> Wide ----------------------------------------------------------- // Convert a unicode string into the specified codepage_name. If the codepage diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 1112676..837e000 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -183,11 +183,6 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { EXPECT_EQ(wempty, UTF8ToWide(empty)); } -// This tests the current behavior of our UTF-8/UTF-16 conversion. On Windows, -// we just use the platform functions which strip invalid characters. This isn't -// necessarily the best behavior, we may want to write our own converter using -// ICU to get more customized results (for example, substituting the -// "replacement character" U+FFFD for invalid sequences. TEST(StringUtilTest, ConvertUTF8ToWide) { struct UTF8ToWideCase { const char* utf8; @@ -206,7 +201,7 @@ TEST(StringUtilTest, ConvertUTF8ToWide) { {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. {"\xed\xb0\x80", L"", false}, - // Non-BMP character. The result will either be in UTF-16 or UCS-4. + // Non-BMP character. The result will either be in UTF-16 or UTF-32. #if defined(WCHAR_T_IS_UTF16) {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, #elif defined(WCHAR_T_IS_UTF32) @@ -270,8 +265,8 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) { } #elif defined(WCHAR_T_IS_UTF32) -// This test is only valid when wchar_t == UCS-4. -TEST(StringUtilTest, ConvertUCS4ToUTF8) { +// This test is only valid when wchar_t == UTF-32. +TEST(StringUtilTest, ConvertUTF32ToUTF8) { struct UTF8ToWideCase { const wchar_t* ucs4; const char* utf8; |