diff options
author | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-07 15:29:49 +0000 |
---|---|---|
committer | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-08-07 15:29:49 +0000 |
commit | 6b27db809e959efaf7183ea2de64c6ab3947ef3d (patch) | |
tree | ed2ed10f826f6eb40884231ee0c98d86afef44a7 /base | |
parent | 65b1094478e054ef1f924d3681f8d34ec88d9fcf (diff) | |
download | chromium_src-6b27db809e959efaf7183ea2de64c6ab3947ef3d.zip chromium_src-6b27db809e959efaf7183ea2de64c6ab3947ef3d.tar.gz chromium_src-6b27db809e959efaf7183ea2de64c6ab3947ef3d.tar.bz2 |
Remove the old NativeMB functions from string util, and use the new ones in sys_strings.h. I also removed duplicated code from the sandbox that can now use this, and fixed one case in the bug reporter that should not have been using the native multibyte encoding.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@515 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/string_util.h | 20 | ||||
-rw-r--r-- | base/string_util_icu.cc | 190 | ||||
-rw-r--r-- | base/string_util_mac.cc | 110 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 117 | ||||
-rw-r--r-- | base/string_util_win.cc | 63 |
5 files changed, 316 insertions, 184 deletions
diff --git a/base/string_util.h b/base/string_util.h index d47d5f2..129f124 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -155,21 +155,17 @@ std::wstring CollapseWhitespace(const std::wstring& text, std::string WideToASCII(const std::wstring& wide); std::wstring ASCIIToWide(const std::string& ascii); -// These convert between UTF8 and UTF16 strings. They are potentially slow, -// so avoid unnecessary conversions. Most things should be in UTF16. +// These convert between UTF8 and UTF16 strings. They are potentially slow, so +// avoid unnecessary conversions. Most things should be in wide. The low-level +// versions return a boolean indicating whether the conversion was 100% valid. +// In this case, it will still do the best it can and put the result in the +// output buffer. The versions that return strings ignore this error and just +// return the best conversion possible. +bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); std::string WideToUTF8(const std::wstring& wide); +bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); std::wstring UTF8ToWide(const std::string& utf8); -// Converts between wide strings and whatever the native multibyte encoding -// is. The native multibyte encoding on English machines will often Latin-1, -// but could be ShiftJIS or even UTF-8, among others. -// -// These functions can be dangerous. Do not use unless you are sure you are -// giving them to/getting them from somebody who expects the current platform -// 8-bit encoding. -std::string WideToNativeMB(const std::wstring& wide); -std::wstring NativeMBToWide(const std::string& native_mb); - // Defines the error handling modes of WideToCodepage and CodepageToWide. 
class OnStringUtilConversionError { public: diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc index 797ccbd..1a84be3 100644 --- a/base/string_util_icu.cc +++ b/base/string_util_icu.cc @@ -26,6 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #include "base/string_util.h" #include <string.h> @@ -38,6 +39,195 @@ #include "unicode/numfmt.h" #include "unicode/ustring.h" +namespace { + +// ReadUnicodeCharacter -------------------------------------------------------- + +// Reads a UTF-8 stream, placing the next code point into the given output +// |*code_point|. |src| represents the entire string to read, and |*char_index| +// is the character offset within the string to start reading at. |*char_index| +// will be updated to index the last character read, such that incrementing it +// (as in a for loop) will take the reader to the next character. +// +// Returns true on success. On false, |*code_point| will be invalid. +bool ReadUnicodeCharacter(const char* src, int32 src_len, + int32* char_index, uint32* code_point) { + U8_NEXT(src, *char_index, src_len, *code_point); + + // The ICU macro above moves to the next char, we want to point to the last + // char consumed. + (*char_index)--; + + // Validate the decoded value. + return U_IS_UNICODE_CHAR(*code_point); +} + +#ifdef WIN32 +// Reads a UTF-16 character for Windows. The usage is the same as the 8-bit +// version above. +bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, + int32* char_index, uint32* code_point) { + if (U16_IS_SURROGATE(src[*char_index])) { + if (!U16_IS_SURROGATE_LEAD(src[*char_index]) || + *char_index + 1 >= src_len || + !U16_IS_TRAIL(src[*char_index + 1])) { + // Invalid surrogate pair. + return false; + } + + // Valid surrogate pair. 
+ *code_point = U16_GET_SUPPLEMENTARY(src[*char_index], + src[*char_index + 1]); + (*char_index)++; + } else { + // Not a surrogate, just one 16-bit word. + *code_point = src[*char_index]; + } + + return U_IS_UNICODE_CHAR(*code_point); +} +#else +// Reads a 32-bit character for Mac and Linux systems. The usage is the same as +// the 8-bit version above. +bool ReadUnicodeCharacter(const wchar_t* src, in32 src_len, + int32* char_index, uint32* code_point) { + // Conversion is easy since the source is 32-bit. + *code_point = src[*char_index]; + + // Validate the value. + return U_IS_UNICODE_CHAR(*code_point); +} +#endif + +// WriteUnicodeCharacter ------------------------------------------------------- + +// Appends a UTF-8 character to the given 8-bit string. +void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) { + if (code_point <= 0x7f) { + // Fast path the common case of one byte. + output->push_back(code_point); + return; + } + + // U8_APPEND_UNSAFE can append up to 4 bytes. + int32 char_offset = static_cast<int32>(output->length()); + output->resize(char_offset + U8_MAX_LENGTH); + + U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + + // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so + // it will represent the new length of the string. + output->resize(char_offset); +} + +#ifdef WIN32 +// Appends the given code point as a UTF-16 character to the STL string. On +// Windows, wchar_t is UTF-16. +void WriteUnicodeCharacter(uint32 code_point, + std::basic_string<wchar_t>* output) { + if (U16_LENGTH(code_point) == 1) { + // This code point is in the Basic Multilingual Plane (BMP). + output->push_back(static_cast<wchar_t>(code_point)); + } else { + // Non-BMP characters use a double-character encoding. 
+ int32 char_offset = static_cast<int32>(output->length()); + output->resize(char_offset + U16_MAX_LENGTH); + U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + } +} +#else +// Appends the given UCS-4 character to the given 32-bit string for Linux and +// Mac where wchar_t is UCS-4. +inline void WriteUnicodeCharacter(uint32 code_point, + std::basic_string<wchar_t>* output) { + // This is the easy case, just append the character. + output->push_back(code_point); +} +#endif + +// Generalized Unicode converter ----------------------------------------------- + +// Converts the given source Unicode character type to the given destination +// Unicode character type as a STL string. The given input buffer and size +// determine the source, and the given output STL string will be replaced by +// the result. +template<typename SRC_CHAR, typename DEST_CHAR> +bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, + std::basic_string<DEST_CHAR>* output) { + output->clear(); + + // ICU requires 32-bit numbers. + bool success = true; + int32 src_len32 = static_cast<int32>(src_len); + for (int32 i = 0; i < src_len32; i++) { + uint32 code_point; + if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) + WriteUnicodeCharacter(code_point, output); + else + success = false; + } + return success; +} + +} // namespace + +// UTF-x <-> UTF-x ------------------------------------------------------------- + +std::string WideToUTF8(const std::wstring& wide) { + std::string ret; + if (wide.empty()) + return ret; + + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + WideToUTF8(wide.data(), wide.length(), &ret); + return ret; +} + +bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { + if (src_len == 0) { + output->clear(); + return true; + } + + // Intelligently guess the size of the output string. 
When it's an ASCII + // character, assume the rest will be ASCII and use a buffer size the same as + // the input. When it's not ASCII, assume 3-bytes per character as the + // starting point. This will be resized internally later if it's too small. + if (src[0] < 0x80) + output->reserve(src_len); + else + output->reserve(src_len * 3); + return ConvertUnicode<wchar_t, char>(src, src_len, output); +} + +std::wstring UTF8ToWide(const std::string& utf8) { + std::wstring ret; + if (utf8.empty()) + return ret; + + UTF8ToWide(utf8.data(), utf8.length(), &ret); + return ret; +} + +bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + if (src_len == 0) { + output->clear(); + return true; + } + + // Intelligently guess the size of the output string. When it's an ASCII + // character, assume the rest will be ASCII and use a buffer size the same as + // the input. When it's not ASCII, assume the UTF-8 takes 2 bytes per + // character (this is more conservative than 3 which we use above when + // converting the other way). + if (src[0] < 0x80) + output->reserve(src_len); + else + output->reserve(src_len / 2); + return ConvertUnicode<char, wchar_t>(src, src_len, output); +} + // Codepage <-> Wide ----------------------------------------------------------- // Convert a unicode string into the specified codepage_name. If the codepage diff --git a/base/string_util_mac.cc b/base/string_util_mac.cc index 5079da1..614bbcc 100644 --- a/base/string_util_mac.cc +++ b/base/string_util_mac.cc @@ -44,7 +44,7 @@ // routines. template<typename CharType> static inline bool StrNCpyT(CharType* dst, const CharType* src, - size_t dst_size, size_t src_size) { + size_t dst_size, size_t src_size) { // The initial value of count has room for a NUL terminator. 
size_t count = std::min(dst_size, src_size + 1); if (count == 0) @@ -105,114 +105,6 @@ static void InitializeStatics() { pthread_once(&pthread_once_initialized, DoInitializeStatics); } -// Convert the supplied cfsring into the specified encoding, and return it as -// an STL string of the template type. Returns an empty string on failure. -template<typename StringType> -static StringType CFStringToSTLStringWithEncodingT(CFStringRef cfstring, - CFStringEncoding encoding) { - CFIndex length = CFStringGetLength(cfstring); - if (length == 0) - return StringType(); - - CFRange whole_string = CFRangeMake(0, length); - CFIndex out_size; - CFIndex converted = CFStringGetBytes(cfstring, - whole_string, - encoding, - 0, // lossByte - false, // isExternalRepresentation - NULL, // buffer - 0, // maxBufLen - &out_size); - DCHECK(converted != 0 && out_size != 0); - if (converted == 0 || out_size == 0) - return StringType(); - - // out_size is the number of UInt8-sized units needed in the destination. - // A buffer allocated as UInt8 units might not be properly aligned to - // contain elements of StringType::value_type. Use a container for the - // proper value_type, and convert out_size by figuring the number of - // value_type elements per UInt8. Leave room for a NUL terminator. - typename StringType::size_type elements = - out_size * sizeof(UInt8) / sizeof(typename StringType::value_type) + 1; - - // Make sure that integer truncation didn't occur. For the conversions done - // here, it never should. 
- DCHECK(((out_size * sizeof(UInt8)) % - sizeof(typename StringType::value_type)) == 0); - - std::vector<typename StringType::value_type> out_buffer(elements); - converted = CFStringGetBytes(cfstring, - whole_string, - encoding, - 0, // lossByte - false, // isExternalRepresentation - reinterpret_cast<UInt8*>(&out_buffer[0]), - out_size, - NULL); // usedBufLen - DCHECK(converted != 0); - if (converted == 0) - return StringType(); - - out_buffer[elements - 1] = '\0'; - return StringType(&out_buffer[0]); -} - -// Given an STL string |in| with an encoding specified by |in_encoding|, -// convert it to |out_encoding| and return it as an STL string of the -// |OutStringType| template type. Returns an empty string on failure. -template<typename OutStringType, typename InStringType> -static OutStringType STLStringToSTLStringWithEncodingsT( - const InStringType& in, - CFStringEncoding in_encoding, - CFStringEncoding out_encoding) { - typename InStringType::size_type in_length = in.length(); - if (in_length == 0) - return OutStringType(); - - scoped_cftyperef<CFStringRef> cfstring( - CFStringCreateWithBytesNoCopy(NULL, - reinterpret_cast<const UInt8*>(in.c_str()), - in_length * - sizeof(typename InStringType::value_type), - in_encoding, - false, - kCFAllocatorNull)); - DCHECK(cfstring); - if (!cfstring) - return OutStringType(); - - return CFStringToSTLStringWithEncodingT<OutStringType>(cfstring, - out_encoding); -} - -// Specify the byte ordering explicitly, otherwise CFString will be confused -// when strings don't carry BOMs, as they typically won't. 
-static const CFStringEncoding kNarrowStringEncoding = kCFStringEncodingUTF8; -#ifdef __BIG_ENDIAN__ -#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16BE; -#else // __WCHAR_MAX__ -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32BE; -#endif // __WCHAR_MAX__ -#else // __BIG_ENDIAN__ -#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16LE; -#else // __WCHAR_MAX__ -static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32LE; -#endif // __WCHAR_MAX__ -#endif // __BIG_ENDIAN__ - -std::string WideToUTF8(const std::wstring& wide) { - return STLStringToSTLStringWithEncodingsT<std::string>( - wide, kWideStringEncoding, kNarrowStringEncoding); -} - -std::wstring UTF8ToWide(const std::string& utf8) { - return STLStringToSTLStringWithEncodingsT<std::wstring>( - utf8, kNarrowStringEncoding, kWideStringEncoding); -} - // Technically, the native multibyte encoding would be the encoding returned // by CFStringGetSystemEncoding or GetApplicationTextEncoding, but I can't // imagine anyone needing or using that from these APIs, so just treat UTF-8 diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 03df6de..1aa4043 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -183,6 +183,123 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { EXPECT_EQ(wempty, UTF8ToWide(empty)); } +// This tests the current behavior of our UTF-8/UTF-16 conversion. On Windows, +// we just use the platform functions which strip invalid characters. This isn't +// necessarily the best behavior, we may want to write our own converter using +// ICU to get more customized results (for example, substituting the +// "replacement character" U+FFFD for invalid sequences). 
+TEST(StringUtilTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Invalid Unicode code point. + {"\xef\xbf\xbfHello", L"Hello", false}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"", false}, + // Non-BMP character. The result will either be in UTF-16 or UCS-4. +#ifdef WIN32 + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, +#else + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, +#endif + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::wstring converted; + EXPECT_EQ(convert_cases[i].success, + UTF8ToWide(convert_cases[i].utf8, + strlen(convert_cases[i].utf8), + &converted)); + std::wstring expected(convert_cases[i].wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. + std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3, converted.length()); + EXPECT_EQ(0, converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#ifdef WIN32 +// This test is only valid when wchar_t == UTF-16. +TEST(StringUtilTest, ConvertUTF16ToUTF8) { + struct UTF16ToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Invalid Unicode code point. 
+ {L"\xffffHello", "Hello", false}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + // Truncated at the end. + {L"\x597d\xd800", "\xe5\xa5\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} + +#else +// This test is only valid when wchar_t == UCS-4. +TEST(StringUtilTest, ConvertUCS4ToUTF8) { + struct UTF8ToWideCase { + const wchar_t* ucs4; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Invalid Unicode code points. + {L"\xffffHello", "Hello, false", false}, + {L"\xfffffffHello", "Hello, false", false}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + } + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} +#endif + TEST(StringUtilTest, ConvertMultiString) { static wchar_t wmulti[] = { L'f', L'o', L'o', L'\0', diff --git a/base/string_util_win.cc b/base/string_util_win.cc index 1dac2d3..7592142 100644 --- a/base/string_util_win.cc +++ b/base/string_util_win.cc @@ -34,69 +34,6 @@ #include "unicode/numfmt.h" #include "base/logging.h" -// See WideToUTF8. 
-static std::string WideToMultiByte(const std::wstring& wide, UINT code_page) { - int wide_length = static_cast<int>(wide.length()); - if (wide_length == 0) - return std::string(); - - // compute the length of the buffer we'll need - int charcount = WideCharToMultiByte(code_page, 0, wide.data(), wide_length, - NULL, 0, NULL, NULL); - if (charcount == 0) - return std::string(); - - // convert - std::string mb; - WideCharToMultiByte(code_page, 0, wide.data(), wide_length, - WriteInto(&mb, charcount + 1), charcount, NULL, NULL); - - return mb; -} - -// Converts the given 8-bit string into a wide string, using the given -// code page. The code page identifier is one accepted by MultiByteToWideChar() -// -// Danger: do not assert in this function, as it is used by the assertion code. -// Doing so will cause an infinite loop. -static std::wstring MultiByteToWide(const std::string& mb, UINT code_page) { - int mb_length = static_cast<int>(mb.length()); - if (mb_length == 0) - return std::wstring(); - - // compute the length of the buffer - int charcount = MultiByteToWideChar(code_page, 0, mb.c_str(), mb_length, - NULL, 0); - if (charcount == 0) - return std::wstring(); - - // convert - std::wstring wide; - MultiByteToWideChar(code_page, 0, mb.c_str(), mb_length, - WriteInto(&wide, charcount + 1), charcount); - - return wide; -} - -// Wide <--> UTF-8 -std::string WideToUTF8(const std::wstring& wide) { - - return WideToMultiByte(wide, CP_UTF8); -} - -std::wstring UTF8ToWide(const std::string& utf8) { - return MultiByteToWide(utf8, CP_UTF8); -} - -// Wide <--> native multibyte -std::string WideToNativeMB(const std::wstring& wide) { - return WideToMultiByte(wide, CP_ACP); -} - -std::wstring NativeMBToWide(const std::string& native_mb) { - return MultiByteToWide(native_mb, CP_ACP); -} - NumberFormat* NumberFormatSingleton() { static NumberFormat* number_format = NULL; if (!number_format) { |