summaryrefslogtreecommitdiffstats
path: root/base
diff options
context:
space:
mode:
authorbrettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98>2008-08-01 00:13:10 +0000
committerbrettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98>2008-08-01 00:13:10 +0000
commit656e3b3857e315a4a6386944fb140ef202580f77 (patch)
tree37fc49db14f3b43a03da33ef58dddfff9b58be2f /base
parent660efb2db208d7a64e04eebad1e0e1dd7b54f3b0 (diff)
downloadchromium_src-656e3b3857e315a4a6386944fb140ef202580f77.zip
chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.tar.gz
chromium_src-656e3b3857e315a4a6386944fb140ef202580f77.tar.bz2
Write our own utf8<->wide conversion functions. This gives us more control over error handling instead of getting a blank string for invalid encodings. It also allows us to decrease the amount of platform-specific code.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@211 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--base/string_util.cc21
-rw-r--r--base/string_util.h10
-rw-r--r--base/string_util_icu.cc169
-rw-r--r--base/string_util_mac.cc110
-rw-r--r--base/string_util_unittest.cc117
-rw-r--r--base/string_util_win.cc10
6 files changed, 316 insertions, 121 deletions
diff --git a/base/string_util.cc b/base/string_util.cc
index 2122b9f..faf5ef9 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -250,6 +250,27 @@ std::wstring ASCIIToWide(const std::string& ascii) {
return std::wstring(ascii.begin(), ascii.end());
}
+std::string WideToUTF8(const std::wstring& wide) {
+  std::string utf8;
+  if (!wide.empty()) {
+    // The success flag is deliberately dropped: for invalid input the
+    // low-level converter still emits its best effort, which is exactly what
+    // this convenience overload is meant to hand back.
+    WideToUTF8(wide.data(), wide.length(), &utf8);
+  }
+  return utf8;
+}
+
+// Counterpart of the Wide->UTF8 version above: best-effort conversion.
+std::wstring UTF8ToWide(const std::string& utf8) {
+  std::wstring wide;
+  if (!utf8.empty()) {
+    // The success flag is intentionally ignored here as well.
+    UTF8ToWide(utf8.data(), utf8.length(), &wide);
+  }
+  return wide;
+}
+
// Latin1 is just the low range of Unicode, so we can copy directly to convert.
bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
std::string output;
diff --git a/base/string_util.h b/base/string_util.h
index e5fd147..340a7eb 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -155,9 +155,15 @@ std::wstring CollapseWhitespace(const std::wstring& text,
std::string WideToASCII(const std::wstring& wide);
std::wstring ASCIIToWide(const std::string& ascii);
-// These convert between UTF8 and UTF16 strings. They are potentially slow,
-// so avoid unnecessary conversions. Most things should be in UTF16.
+// These convert between UTF-8 and wide strings. They are potentially slow, so
+// avoid unnecessary conversions. Most things should be in wide. The low-level
+// versions return a boolean indicating whether the conversion was 100% valid.
+// Even when it was not, they still do the best they can and put the result in
+// the output buffer. The versions that return strings ignore this error and
+// just return the best conversion possible.
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output);
std::string WideToUTF8(const std::wstring& wide);
+bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output);
std::wstring UTF8ToWide(const std::string& utf8);
// Converts between wide strings and whatever the native multibyte encoding
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 797ccbd..6df5581 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -38,6 +38,175 @@
#include "unicode/numfmt.h"
#include "unicode/ustring.h"
+namespace {
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Decodes the next code point from a UTF-8 buffer into |*code_point|. |src|
+// is the whole string (|src_len| bytes) and |*char_index| is the offset at
+// which decoding starts. On return |*char_index| designates the last byte
+// consumed, so incrementing it (as a for loop does) moves the reader to the
+// next character.
+//
+// Returns true on success. On false, |*code_point| will be invalid.
+bool ReadUnicodeCharacter(const char* src, int32 src_len,
+                          int32* char_index, uint32* code_point) {
+  U8_NEXT(src, *char_index, src_len, *code_point);
+
+  // U8_NEXT leaves the index one past what it consumed; step back so that it
+  // refers to the final consumed byte instead.
+  --(*char_index);
+
+  // Reject anything U_IS_UNICODE_CHAR does not accept.
+  return U_IS_UNICODE_CHAR(*code_point);
+}
+
+#ifdef WIN32
+// Reads a UTF-16 character for Windows. The usage is the same as the 8-bit
+// version above.
+bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
+                          int32* char_index, uint32* code_point) {
+  const wchar_t first = src[*char_index];
+  if (!U16_IS_SURROGATE(first)) {
+    // Not a surrogate, just one 16-bit word.
+    *code_point = first;
+    return U_IS_UNICODE_CHAR(*code_point);
+  }
+
+  // A surrogate is only valid as a lead immediately followed by a trail.
+  const int32 next_index = *char_index + 1;
+  if (!U16_IS_SURROGATE_LEAD(first) || next_index >= src_len ||
+      !U16_IS_TRAIL(src[next_index])) {
+    return false;
+  }
+
+  // Valid surrogate pair: combine both units and consume the trail as well.
+  *code_point = U16_GET_SUPPLEMENTARY(first, src[next_index]);
+  *char_index = next_index;
+  return U_IS_UNICODE_CHAR(*code_point);
+}
+#else
+// Reads one 32-bit character from |src| at |*char_index| into |*code_point|
+// for Mac and Linux systems. |src_len| is unused. Returns true if it is valid.
+bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
+                          int32* char_index, uint32* code_point) {
+  // One wchar_t is one code point here, so |*char_index| is left unchanged.
+  *code_point = src[*char_index];
+
+  // Validate the value.
+  return U_IS_UNICODE_CHAR(*code_point);
+}
+#endif
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Appends |code_point|, encoded as UTF-8, to the given 8-bit string.
+void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) {
+  if (code_point > 0x7f) {
+    // Multi-byte case. U8_APPEND_UNSAFE can append up to 4 bytes, so grow the
+    // string by the maximum first and trim the unused tail afterwards.
+    int32 write_pos = static_cast<int32>(output->length());
+    output->resize(write_pos + U8_MAX_LENGTH);
+    U8_APPEND_UNSAFE(&(*output)[0], write_pos, code_point);
+
+    // U8_APPEND_UNSAFE advanced |write_pos| past the bytes it wrote, so it
+    // now equals the new length of the string.
+    output->resize(write_pos);
+    return;
+  }
+
+  // Fast path the common case of one byte.
+  output->push_back(code_point);
+}
+
+#ifdef WIN32
+// Appends the given code point as a UTF-16 character to the STL string. On
+// Windows, wchar_t is UTF-16.
+void WriteUnicodeCharacter(uint32 code_point,
+                           std::basic_string<wchar_t>* output) {
+  if (U16_LENGTH(code_point) != 1) {
+    // Non-BMP characters use a double-character encoding.
+    int32 write_pos = static_cast<int32>(output->length());
+    output->resize(write_pos + U16_MAX_LENGTH);
+    U16_APPEND_UNSAFE(&(*output)[0], write_pos, code_point);
+    return;
+  }
+  // This code point is in the Basic Multilingual Plane (BMP).
+  output->push_back(static_cast<wchar_t>(code_point));
+}
+#else
+// On Linux and Mac wchar_t is UCS-4, so the given code point maps onto exactly
+// one element of the given 32-bit string.
+inline void WriteUnicodeCharacter(uint32 code_point,
+                                  std::basic_string<wchar_t>* output) {
+  // Nothing to encode: store the value as a single wchar_t.
+  output->append(1, static_cast<wchar_t>(code_point));
+}
+#endif
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Converts |src_len| elements of |src| from the source Unicode character type
+// to the destination character type. |*output| is cleared first, so the
+// result replaces any previous contents. Characters that cannot be read are
+// skipped and make the function return false; the rest is still converted.
+template<typename SRC_CHAR, typename DEST_CHAR>
+bool ConvertUnicode(const SRC_CHAR* src, size_t src_len,
+                    std::basic_string<DEST_CHAR>* output) {
+  output->clear();
+
+  // ICU requires 32-bit numbers.
+  const int32 length = static_cast<int32>(src_len);
+  bool all_valid = true;
+  for (int32 pos = 0; pos < length; ++pos) {
+    uint32 code_point;
+    if (!ReadUnicodeCharacter(src, length, &pos, &code_point))
+      all_valid = false;
+    else
+      WriteUnicodeCharacter(code_point, output);
+  }
+  return all_valid;
+}
+
+} // namespace
+
+// UTF-x <-> UTF-x -------------------------------------------------------------
+
+// Converts |src_len| wide characters at |src| to UTF-8, replacing |*output|.
+// Returns false if any input was invalid; the rest is still converted.
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
+  if (src_len == 0) {
+    output->clear();
+    return true;
+  }
+
+  // Guess the output size from the first character. If it is ASCII, assume
+  // the rest is too and reserve one byte per input character; otherwise
+  // assume 3 bytes per character as the starting point. The string is
+  // resized internally later if the guess turns out to be too small.
+  const size_t bytes_per_char = (src[0] < 0x80) ? 1 : 3;
+  output->reserve(src_len * bytes_per_char);
+  return ConvertUnicode<wchar_t, char>(src, src_len, output);
+}
+
+// Converts |src_len| bytes of UTF-8 at |src| to wide, replacing |*output|.
+// Returns false if any input was invalid; the rest is still converted.
+bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
+  if (src_len == 0) {
+    output->clear();
+    return true;
+  }
+
+  // Guess the output size from the first byte. If it is ASCII, assume the
+  // rest is too and reserve one element per input byte; otherwise assume 2
+  // bytes of UTF-8 per character. The byte is compared as unsigned char
+  // because plain char may be signed, which would make every non-ASCII byte
+  // negative and send it down the ASCII branch.
+  const bool starts_ascii = static_cast<unsigned char>(src[0]) < 0x80;
+  output->reserve(starts_ascii ? src_len : src_len / 2);
+  return ConvertUnicode<char, wchar_t>(src, src_len, output);
+}
+
// Codepage <-> Wide -----------------------------------------------------------
// Convert a unicode string into the specified codepage_name. If the codepage
diff --git a/base/string_util_mac.cc b/base/string_util_mac.cc
index 4c5f3dc..76b72b0 100644
--- a/base/string_util_mac.cc
+++ b/base/string_util_mac.cc
@@ -44,7 +44,7 @@
// routines.
template<typename CharType>
static inline bool StrNCpyT(CharType* dst, const CharType* src,
- size_t dst_size, size_t src_size) {
+ size_t dst_size, size_t src_size) {
// The initial value of count has room for a NUL terminator.
size_t count = std::min(dst_size, src_size + 1);
if (count == 0)
@@ -105,114 +105,6 @@ static void InitializeStatics() {
pthread_once(&pthread_once_initialized, DoInitializeStatics);
}
-// Convert the supplied cfsring into the specified encoding, and return it as
-// an STL string of the template type. Returns an empty string on failure.
-template<typename StringType>
-static StringType CFStringToSTLStringWithEncodingT(CFStringRef cfstring,
- CFStringEncoding encoding) {
- CFIndex length = CFStringGetLength(cfstring);
- if (length == 0)
- return StringType();
-
- CFRange whole_string = CFRangeMake(0, length);
- CFIndex out_size;
- CFIndex converted = CFStringGetBytes(cfstring,
- whole_string,
- encoding,
- 0, // lossByte
- false, // isExternalRepresentation
- NULL, // buffer
- 0, // maxBufLen
- &out_size);
- DCHECK(converted != 0 && out_size != 0);
- if (converted == 0 || out_size == 0)
- return StringType();
-
- // out_size is the number of UInt8-sized units needed in the destination.
- // A buffer allocated as UInt8 units might not be properly aligned to
- // contain elements of StringType::value_type. Use a container for the
- // proper value_type, and convert out_size by figuring the number of
- // value_type elements per UInt8. Leave room for a NUL terminator.
- typename StringType::size_type elements =
- out_size * sizeof(UInt8) / sizeof(typename StringType::value_type) + 1;
-
- // Make sure that integer truncation didn't occur. For the conversions done
- // here, it never should.
- DCHECK(((out_size * sizeof(UInt8)) %
- sizeof(typename StringType::value_type)) == 0);
-
- std::vector<typename StringType::value_type> out_buffer(elements);
- converted = CFStringGetBytes(cfstring,
- whole_string,
- encoding,
- 0, // lossByte
- false, // isExternalRepresentation
- reinterpret_cast<UInt8*>(&out_buffer[0]),
- out_size,
- NULL); // usedBufLen
- DCHECK(converted != 0);
- if (converted == 0)
- return StringType();
-
- out_buffer[elements - 1] = '\0';
- return StringType(&out_buffer[0]);
-}
-
-// Given an STL string |in| with an encoding specified by |in_encoding|,
-// convert it to |out_encoding| and return it as an STL string of the
-// |OutStringType| template type. Returns an empty string on failure.
-template<typename OutStringType, typename InStringType>
-static OutStringType STLStringToSTLStringWithEncodingsT(
- const InStringType& in,
- CFStringEncoding in_encoding,
- CFStringEncoding out_encoding) {
- typename InStringType::size_type in_length = in.length();
- if (in_length == 0)
- return OutStringType();
-
- scoped_cftyperef<CFStringRef> cfstring(
- CFStringCreateWithBytesNoCopy(NULL,
- reinterpret_cast<const UInt8*>(in.c_str()),
- in_length *
- sizeof(typename InStringType::value_type),
- in_encoding,
- false,
- kCFAllocatorNull));
- DCHECK(cfstring);
- if (!cfstring)
- return OutStringType();
-
- return CFStringToSTLStringWithEncodingT<OutStringType>(cfstring,
- out_encoding);
-}
-
-// Specify the byte ordering explicitly, otherwise CFString will be confused
-// when strings don't carry BOMs, as they typically won't.
-static const CFStringEncoding kNarrowStringEncoding = kCFStringEncodingUTF8;
-#ifdef __BIG_ENDIAN__
-#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff
-static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16BE;
-#else // __WCHAR_MAX__
-static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32BE;
-#endif // __WCHAR_MAX__
-#else // __BIG_ENDIAN__
-#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff
-static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16LE;
-#else // __WCHAR_MAX__
-static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32LE;
-#endif // __WCHAR_MAX__
-#endif // __BIG_ENDIAN__
-
-std::string WideToUTF8(const std::wstring& wide) {
- return STLStringToSTLStringWithEncodingsT<std::string>(
- wide, kWideStringEncoding, kNarrowStringEncoding);
-}
-
-std::wstring UTF8ToWide(const std::string& utf8) {
- return STLStringToSTLStringWithEncodingsT<std::wstring>(
- utf8, kNarrowStringEncoding, kWideStringEncoding);
-}
-
// Technically, the native multibyte encoding would be the encoding returned
// by CFStringGetSystemEncoding or GetApplicationTextEncoding, but I can't
// imagine anyone needing or using that from these APIs, so just treat UTF-8
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index c6ff622..6d19b0e 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -183,6 +183,123 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) {
EXPECT_EQ(wempty, UTF8ToWide(empty));
}
+// This tests the behavior of our own UTF-8 -> wide converter. Invalid
+// sequences are dropped from the output and make the conversion report
+// failure, while the valid remainder of the input is still converted. We may
+// later want more customized results (for example, substituting the
+// "replacement character" U+FFFD for invalid sequences).
+TEST(StringUtilTest, ConvertUTF8ToWide) {
+ struct UTF8ToWideCase {
+ const char* utf8;
+ const wchar_t* wide;
+ bool success;
+ } convert_cases[] = {
+ // Regular UTF-8 input.
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
+ // Invalid Unicode code point (the first three bytes decode to U+FFFF).
+ {"\xef\xbf\xbfHello", L"Hello", false},
+ // Truncated UTF-8 sequence at the start; the character after it survives.
+ {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
+ // Truncated off the end.
+ {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false},
+ // Non-shortest-form UTF-8.
+ {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
+ // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
+ {"\xed\xb0\x80", L"", false},
+ // Non-BMP character. The result will either be in UTF-16 or UCS-4.
+#ifdef WIN32
+ {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+#else
+ {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+#endif
+ };
+
+ for (int i = 0; i < arraysize(convert_cases); i++) {
+ std::wstring converted;
+ EXPECT_EQ(convert_cases[i].success,
+ UTF8ToWide(convert_cases[i].utf8,
+ strlen(convert_cases[i].utf8),
+ &converted));
+ std::wstring expected(convert_cases[i].wide);
+ EXPECT_EQ(expected, converted);
+ }
+
+ // Manually test an embedded NUL; the length is passed explicitly.
+ std::wstring converted;
+ EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted));
+ ASSERT_EQ(3, converted.length());
+ EXPECT_EQ(0, converted[0]);
+ EXPECT_EQ('Z', converted[1]);
+ EXPECT_EQ('\t', converted[2]);
+
+ // Make sure that conversion replaces, not appends.
+ EXPECT_TRUE(UTF8ToWide("B", 1, &converted));
+ ASSERT_EQ(1, converted.length());
+ EXPECT_EQ('B', converted[0]);
+}
+
+#ifdef WIN32
+// This test is only valid when wchar_t == UTF-16.
+TEST(StringUtilTest, ConvertUTF16ToUTF8) {
+  struct UTF16ToUTF8Case {
+    const wchar_t* utf16;
+    const char* utf8;
+    bool success;
+  } convert_cases[] = {
+    // Regular UTF-16 input.
+    {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+    // A surrogate pair encoding a non-BMP character.
+    {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
+    // Invalid Unicode code point.
+    {L"\xffffHello", "Hello", false},
+    // Lead surrogate at the start with no trail surrogate after it.
+    {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+    // Lead surrogate cut off by the end of the input.
+    {L"\x597d\xd800", "\xe5\xa5\xbd", false},
+  };
+
+  for (int i = 0; i < arraysize(convert_cases); i++) {
+    const UTF16ToUTF8Case& test_case = convert_cases[i];
+    std::string actual;
+    const bool converted_ok =
+        WideToUTF8(test_case.utf16, wcslen(test_case.utf16), &actual);
+    EXPECT_EQ(test_case.success, converted_ok);
+    // Valid input around the bad sequence must still be converted.
+    EXPECT_EQ(std::string(test_case.utf8), actual);
+  }
+}
+
+#else
+// This test is only valid when wchar_t == UCS-4.
+TEST(StringUtilTest, ConvertUCS4ToUTF8) {
+  struct UCS4ToUTF8Case {
+    const wchar_t* ucs4;
+    const char* utf8;
+    bool success;
+  } convert_cases[] = {
+    // Regular 16-bit input.
+    {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+    // Test a non-BMP character.
+    {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+    // Invalid Unicode code points; the valid text after them is kept.
+    {L"\xffffHello", "Hello", false},
+    {L"\xfffffffHello", "Hello", false},
+    // A lone UTF-16 surrogate value is not a valid UCS-4 character.
+    {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+  };
+
+  for (int i = 0; i < arraysize(convert_cases); i++) {
+    std::string converted;
+    EXPECT_EQ(convert_cases[i].success,
+              WideToUTF8(convert_cases[i].ucs4,
+                         wcslen(convert_cases[i].ucs4),
+                         &converted));
+    std::string expected(convert_cases[i].utf8);
+    EXPECT_EQ(expected, converted);
+  }
+}
+#endif
+
TEST(StringUtilTest, ConvertMultiString) {
static wchar_t wmulti[] = {
L'f', L'o', L'o', L'\0',
diff --git a/base/string_util_win.cc b/base/string_util_win.cc
index 6cad854..53044cc 100644
--- a/base/string_util_win.cc
+++ b/base/string_util_win.cc
@@ -76,16 +76,6 @@ static std::wstring MultiByteToWide(const std::string& mb, UINT code_page) {
return wide;
}
-// Wide <--> UTF-8
-std::string WideToUTF8(const std::wstring& wide) {
-
- return WideToMultiByte(wide, CP_UTF8);
-}
-
-std::wstring UTF8ToWide(const std::string& utf8) {
- return MultiByteToWide(utf8, CP_UTF8);
-}
-
// Wide <--> native multibyte
std::string WideToNativeMB(const std::wstring& wide) {
return WideToMultiByte(wide, CP_ACP);