diff options
author | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-01 00:49:29 +0000 |
---|---|---|
committer | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-01 00:49:29 +0000 |
commit | 856080bd99e86a1eb6f1eb034f65657d03d1f095 (patch) | |
tree | b3e4b2b2fd68ffc0646b414b92a26493e2ba5886 | |
parent | 7db65fdab1d5f59df3bc6c4ed49dd6e0fd4facff (diff) | |
download | chromium_src-856080bd99e86a1eb6f1eb034f65657d03d1f095.zip chromium_src-856080bd99e86a1eb6f1eb034f65657d03d1f095.tar.gz chromium_src-856080bd99e86a1eb6f1eb034f65657d03d1f095.tar.bz2 |
Revert my UTF change until we can figure out what to do with the sandbox.
BUG=1201008
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@215 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | base/string_util.cc | 21 | ||||
-rw-r--r-- | base/string_util.h | 10 | ||||
-rw-r--r-- | base/string_util_icu.cc | 169 | ||||
-rw-r--r-- | base/string_util_mac.cc | 110 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 117 | ||||
-rw-r--r-- | base/string_util_win.cc | 10 |
6 files changed, 121 insertions, 316 deletions
diff --git a/base/string_util.cc b/base/string_util.cc index faf5ef9..2122b9f 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -250,27 +250,6 @@ std::wstring ASCIIToWide(const std::string& ascii) { return std::wstring(ascii.begin(), ascii.end()); } -std::string WideToUTF8(const std::wstring& wide) { - std::string ret; - if (wide.empty()) - return ret; - - // Ignore the success flag of this call, it will do the best it can for - // invalid input, which is what we want here. - WideToUTF8(wide.data(), wide.length(), &ret); - return ret; -} - -// Similar to the Wide->UTF8 version above. -std::wstring UTF8ToWide(const std::string& utf8) { - std::wstring ret; - if (utf8.empty()) - return ret; - - UTF8ToWide(utf8.data(), utf8.length(), &ret); - return ret; -} - // Latin1 is just the low range of Unicode, so we can copy directly to convert. bool WideToLatin1(const std::wstring& wide, std::string* latin1) { std::string output; diff --git a/base/string_util.h b/base/string_util.h index 340a7eb..e5fd147 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -155,15 +155,9 @@ std::wstring CollapseWhitespace(const std::wstring& text, std::string WideToASCII(const std::wstring& wide); std::wstring ASCIIToWide(const std::string& ascii); -// These convert between UTF8 and UTF16 strings. They are potentially slow, so -// avoid unnecessary conversions. Most things should be in wide. The low-level -// versions return a boolean indicating whether the conversion was 100% valid. -// In this case, it will still do the best it can and put the result in the -// output buffer. The versions that return strings ignore this error and just -// return the best conversion possible. -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); +// These convert between UTF8 and UTF16 strings. They are potentially slow, +// so avoid unnecessary conversions. Most things should be in UTF16. std::string WideToUTF8(const std::wstring& wide); -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); std::wstring UTF8ToWide(const std::string& utf8); // Converts between wide strings and whatever the native multibyte encoding diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc index 6df5581..797ccbd 100644 --- a/base/string_util_icu.cc +++ b/base/string_util_icu.cc @@ -38,175 +38,6 @@ #include "unicode/numfmt.h" #include "unicode/ustring.h" -namespace { - -// ReadUnicodeCharacter -------------------------------------------------------- - -// Reads a UTF-8 stream, placing the next code point into the given output -// |*code_point|. |src| represents the entire string to read, and |*char_index| -// is the character offset within the string to start reading at. |*char_index| -// will be updated to index the last character read, such that incrementing it -// (as in a for loop) will take the reader to the next character. -// -// Returns true on success. On false, |*code_point| will be invalid. -bool ReadUnicodeCharacter(const char* src, int32 src_len, - int32* char_index, uint32* code_point) { - U8_NEXT(src, *char_index, src_len, *code_point); - - // The ICU macro above moves to the next char, we want to point to the last - // char consumed. - (*char_index)--; - - // Validate the decoded value. - return U_IS_UNICODE_CHAR(*code_point); -} - -#ifdef WIN32 -// Reads a UTF-16 character for Windows. The usage is the same as the 8-bit -// version above. -bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, - int32* char_index, uint32* code_point) { - if (U16_IS_SURROGATE(src[*char_index])) { - if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || - *char_index + 1 >= src_len || - !U16_IS_TRAIL(src[*char_index + 1])) { - // Invalid surrogate pair. - return false; - } - - // Valid surrogate pair. - *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], - src[*char_index + 1]); - (*char_index)++; - } else { - // Not a surrogate, just one 16-bit word. - *code_point = src[*char_index]; - } - - return U_IS_UNICODE_CHAR(*code_point); -} -#else -// Reads a 32-bit character for Mac and Linux systems. The usage is the same as -// the 8-bit version above. -bool ReadUnicodeCharacter(const wchar_t* src, in32 src_len, - int32* char_index, uint32* code_point) { - // Conversion is easy since the source is 32-bit. - *code_point = src[*char_index]; - - // Validate the value. - return U_IS_UNICODE_CHAR(*code_point); -} -#endif - -// WriteUnicodeCharacter ------------------------------------------------------- - -// Appends a UTF-8 character to the given 8-bit string. -void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) { - if (code_point <= 0x7f) { - // Fast path the common case of one byte. - output->push_back(code_point); - return; - } - - // U8_APPEND_UNSAFE can append up to 4 bytes. - int32 char_offset = static_cast<int32>(output->length()); - output->resize(char_offset + U8_MAX_LENGTH); - - U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); - - // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so - // it will represent the new length of the string. - output->resize(char_offset); -} - -#ifdef WIN32 -// Appends the given code point as a UTF-16 character to the STL string. On -// Windows, wchar_t is UTF-16. -void WriteUnicodeCharacter(uint32 code_point, - std::basic_string<wchar_t>* output) { - if (U16_LENGTH(code_point) == 1) { - // Thie code point is in the Basic Multilingual Plane (BMP). - output->push_back(static_cast<wchar_t>(code_point)); - } else { - // Non-BMP characters use a double-character encoding. - int32 char_offset = static_cast<int32>(output->length()); - output->resize(char_offset + U16_MAX_LENGTH); - U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); - } -} -#else -// Appends the given UCS-4 character to the given 32-bit string for Linux and -// Mac where wchar_t is UCS-4. -inline void WriteUnicodeCharacter(uint32 code_point, - std::basic_string<wchar_t>* output) { - // This is the easy case, just append the character. - output->push_back(code_point); -} -#endif - -// Generalized Unicode converter ----------------------------------------------- - -// Converts the given source Unicode character type to the given destination -// Unicode character type as a STL string. The given input buffer and size -// determine the source, and the given output STL string will be replaced by -// the result. -template<typename SRC_CHAR, typename DEST_CHAR> -bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, - std::basic_string<DEST_CHAR>* output) { - output->clear(); - - // ICU requires 32-bit numbers. - bool success = true; - int32 src_len32 = static_cast<int32>(src_len); - for (int32 i = 0; i < src_len32; i++) { - uint32 code_point; - if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) - WriteUnicodeCharacter(code_point, output); - else - success = false; - } - return success; -} - -} // namespace - -// UTF-x <-> UTF-x ------------------------------------------------------------- - -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - // Intelligently guess the size of the output string. When it's an ASCII - // character, assume the rest will be ASCII and use a buffer size the same as - // the input. When it's not ASCII, assume 3-bytes per character as the - // starting point. This will be resized internally later if it's too small. - if (src[0] < 0x80) - output->reserve(src_len); - else - output->reserve(src_len * 3); - return ConvertUnicode<wchar_t, char>(src, src_len, output); -} - -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - // Intelligently guess the size of the output string. When it's an ASCII - // character, assume the rest will be ASCII and use a buffer size the same as - // the input. When it's not ASCII, assume the UTF-8 takes 2 bytes per - // character (this is more conservative than 3 which we use above when - // converting the other way). - if (src[0] < 0x80) - output->reserve(src_len); - else - output->reserve(src_len / 2); - return ConvertUnicode<char, wchar_t>(src, src_len, output); -} - // Codepage <-> Wide ----------------------------------------------------------- // Convert a unicode string into the specified codepage_name. If the codepage diff --git a/base/string_util_mac.cc b/base/string_util_mac.cc index 76b72b0..4c5f3dc 100644 --- a/base/string_util_mac.cc +++ b/base/string_util_mac.cc @@ -44,7 +44,7 @@ // routines. template<typename CharType> static inline bool StrNCpyT(CharType* dst, const CharType* src, - size_t dst_size, size_t src_size) { + size_t dst_size, size_t src_size) { // The initial value of count has room for a NUL terminator. size_t count = std::min(dst_size, src_size + 1); if (count == 0) @@ -105,6 +105,114 @@ static void InitializeStatics() { pthread_once(&pthread_once_initialized, DoInitializeStatics); } +// Convert the supplied cfsring into the specified encoding, and return it as +// an STL string of the template type. Returns an empty string on failure. +template<typename StringType> +static StringType CFStringToSTLStringWithEncodingT(CFStringRef cfstring, + CFStringEncoding encoding) { + CFIndex length = CFStringGetLength(cfstring); + if (length == 0) + return StringType(); + + CFRange whole_string = CFRangeMake(0, length); + CFIndex out_size; + CFIndex converted = CFStringGetBytes(cfstring, + whole_string, + encoding, + 0, // lossByte + false, // isExternalRepresentation + NULL, // buffer + 0, // maxBufLen + &out_size); + DCHECK(converted != 0 && out_size != 0); + if (converted == 0 || out_size == 0) + return StringType(); + + // out_size is the number of UInt8-sized units needed in the destination. + // A buffer allocated as UInt8 units might not be properly aligned to + // contain elements of StringType::value_type. Use a container for the + // proper value_type, and convert out_size by figuring the number of + // value_type elements per UInt8. Leave room for a NUL terminator. + typename StringType::size_type elements = + out_size * sizeof(UInt8) / sizeof(typename StringType::value_type) + 1; + + // Make sure that integer truncation didn't occur. For the conversions done + // here, it never should. + DCHECK(((out_size * sizeof(UInt8)) % + sizeof(typename StringType::value_type)) == 0); + + std::vector<typename StringType::value_type> out_buffer(elements); + converted = CFStringGetBytes(cfstring, + whole_string, + encoding, + 0, // lossByte + false, // isExternalRepresentation + reinterpret_cast<UInt8*>(&out_buffer[0]), + out_size, + NULL); // usedBufLen + DCHECK(converted != 0); + if (converted == 0) + return StringType(); + + out_buffer[elements - 1] = '\0'; + return StringType(&out_buffer[0]); +} + +// Given an STL string |in| with an encoding specified by |in_encoding|, +// convert it to |out_encoding| and return it as an STL string of the +// |OutStringType| template type. Returns an empty string on failure. +template<typename OutStringType, typename InStringType> +static OutStringType STLStringToSTLStringWithEncodingsT( + const InStringType& in, + CFStringEncoding in_encoding, + CFStringEncoding out_encoding) { + typename InStringType::size_type in_length = in.length(); + if (in_length == 0) + return OutStringType(); + + scoped_cftyperef<CFStringRef> cfstring( + CFStringCreateWithBytesNoCopy(NULL, + reinterpret_cast<const UInt8*>(in.c_str()), + in_length * + sizeof(typename InStringType::value_type), + in_encoding, + false, + kCFAllocatorNull)); + DCHECK(cfstring); + if (!cfstring) + return OutStringType(); + + return CFStringToSTLStringWithEncodingT<OutStringType>(cfstring, + out_encoding); +} + +// Specify the byte ordering explicitly, otherwise CFString will be confused +// when strings don't carry BOMs, as they typically won't. +static const CFStringEncoding kNarrowStringEncoding = kCFStringEncodingUTF8; +#ifdef __BIG_ENDIAN__ +#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff +static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16BE; +#else // __WCHAR_MAX__ +static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32BE; +#endif // __WCHAR_MAX__ +#else // __BIG_ENDIAN__ +#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff +static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16LE; +#else // __WCHAR_MAX__ +static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32LE; +#endif // __WCHAR_MAX__ +#endif // __BIG_ENDIAN__ + +std::string WideToUTF8(const std::wstring& wide) { + return STLStringToSTLStringWithEncodingsT<std::string>( + wide, kWideStringEncoding, kNarrowStringEncoding); +} + +std::wstring UTF8ToWide(const std::string& utf8) { + return STLStringToSTLStringWithEncodingsT<std::wstring>( + utf8, kNarrowStringEncoding, kWideStringEncoding); +} + // Technically, the native multibyte encoding would be the encoding returned // by CFStringGetSystemEncoding or GetApplicationTextEncoding, but I can't // imagine anyone needing or using that from these APIs, so just treat UTF-8 diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 6d19b0e..c6ff622 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -183,123 +183,6 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { EXPECT_EQ(wempty, UTF8ToWide(empty)); } -// This tests the current behavior of our UTF-8/UTF-16 conversion. On Windows, -// we just use the platform functions which strip invalid characters. This isn't -// necessarily the best behavior, we may want to write our own converter using -// ICU to get more customized results (for example, substituting the -// "replacement character" U+FFFD for invalid sequences. -TEST(StringUtilTest, ConvertUTF8ToWide) { - struct UTF8ToWideCase { - const char* utf8; - const wchar_t* wide; - bool success; - } convert_cases[] = { - // Regular UTF-8 input. - {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, - // Invalid Unicode code point. - {"\xef\xbf\xbfHello", L"Hello", false}, - // Truncated UTF-8 sequence. - {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, - // Truncated off the end. - {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, - // Non-shortest-form UTF-8. - {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, - // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. - {"\xed\xb0\x80", L"", false}, - // Non-BMP character. The result will either be in UTF-16 or UCS-4. -#ifdef WIN32 - {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, -#else - {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, -#endif - }; - - for (int i = 0; i < arraysize(convert_cases); i++) { - std::wstring converted; - EXPECT_EQ(convert_cases[i].success, - UTF8ToWide(convert_cases[i].utf8, - strlen(convert_cases[i].utf8), - &converted)); - std::wstring expected(convert_cases[i].wide); - EXPECT_EQ(expected, converted); - } - - // Manually test an embedded NULL. - std::wstring converted; - EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); - ASSERT_EQ(3, converted.length()); - EXPECT_EQ(0, converted[0]); - EXPECT_EQ('Z', converted[1]); - EXPECT_EQ('\t', converted[2]); - - // Make sure that conversion replaces, not appends. - EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); - ASSERT_EQ(1, converted.length()); - EXPECT_EQ('B', converted[0]); -} - -#ifdef WIN32 -// This test is only valid when wchar_t == UTF-16. -TEST(StringUtilTest, ConvertUTF16ToUTF8) { - struct UTF16ToUTF8Case { - const wchar_t* utf16; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular UTF-16 input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, - // Invalid Unicode code point. - {L"\xffffHello", "Hello", false}, - // The first character is a truncated UTF-16 character. - {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - // Truncated at the end. - {L"\x597d\xd800", "\xe5\xa5\xbd", false}, - }; - - for (int i = 0; i < arraysize(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf16, - wcslen(convert_cases[i].utf16), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} - -#else -// This test is only valid when wchar_t == UCS-4. -TEST(StringUtilTest, ConvertUCS4ToUTF8) { - struct UTF8ToWideCase { - const wchar_t* ucs4; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular 16-bit input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, - // Invalid Unicode code points. - {L"\xffffHello", "Hello, false", false}, - {L"\xfffffffHello", "Hello, false", false}, - // The first character is a truncated UTF-16 character. - {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - } - - for (int i = 0; i < arraysize(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf16, - wcslen(convert_cases[i].utf16), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} -#endif - TEST(StringUtilTest, ConvertMultiString) { static wchar_t wmulti[] = { L'f', L'o', L'o', L'\0', diff --git a/base/string_util_win.cc b/base/string_util_win.cc index 53044cc..6cad854 100644 --- a/base/string_util_win.cc +++ b/base/string_util_win.cc @@ -76,6 +76,16 @@ static std::wstring MultiByteToWide(const std::string& mb, UINT code_page) { return wide; } +// Wide <--> UTF-8 +std::string WideToUTF8(const std::wstring& wide) { + + return WideToMultiByte(wide, CP_UTF8); +} + +std::wstring UTF8ToWide(const std::string& utf8) { + return MultiByteToWide(utf8, CP_UTF8); +} + // Wide <--> native multibyte std::string WideToNativeMB(const std::wstring& wide) { return WideToMultiByte(wide, CP_ACP); |