-rw-r--r--  base/string_util.cc           |  21
-rw-r--r--  base/string_util.h            |  10
-rw-r--r--  base/string_util_icu.cc       | 169
-rw-r--r--  base/string_util_mac.cc       | 110
-rw-r--r--  base/string_util_unittest.cc  | 117
-rw-r--r--  base/string_util_win.cc       |  10
6 files changed, 121 insertions(+), 316 deletions(-)
diff --git a/base/string_util.cc b/base/string_util.cc
index faf5ef9..2122b9f 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -250,27 +250,6 @@ std::wstring ASCIIToWide(const std::string& ascii) {
return std::wstring(ascii.begin(), ascii.end());
}
-std::string WideToUTF8(const std::wstring& wide) {
- std::string ret;
- if (wide.empty())
- return ret;
-
-  // Ignore the success flag of this call; it will do the best it can for
-  // invalid input, which is what we want here.
- WideToUTF8(wide.data(), wide.length(), &ret);
- return ret;
-}
-
-// Similar to the Wide->UTF8 version above.
-std::wstring UTF8ToWide(const std::string& utf8) {
- std::wstring ret;
- if (utf8.empty())
- return ret;
-
- UTF8ToWide(utf8.data(), utf8.length(), &ret);
- return ret;
-}
-
// Latin1 is just the low range of Unicode, so we can copy directly to convert.
bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
std::string output;
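
Since Latin-1 is exactly the U+0000..U+00FF range of Unicode, the conversion needs only a range check plus a direct copy. A minimal sketch of that idea (a hypothetical helper, not the function body elided above):

#include <stddef.h>
#include <string>

bool WideToLatin1Sketch(const std::wstring& wide, std::string* latin1) {
  std::string output;
  output.reserve(wide.length());
  for (size_t i = 0; i < wide.length(); ++i) {
    if (wide[i] > 0xff)
      return false;  // Outside Latin-1's U+0000..U+00FF range.
    output.push_back(static_cast<char>(wide[i]));
  }
  latin1->swap(output);
  return true;
}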
diff --git a/base/string_util.h b/base/string_util.h
index 340a7eb..e5fd147 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -155,15 +155,9 @@ std::wstring CollapseWhitespace(const std::wstring& text,
std::string WideToASCII(const std::wstring& wide);
std::wstring ASCIIToWide(const std::string& ascii);
-// These convert between UTF8 and UTF16 strings. They are potentially slow, so
-// avoid unnecessary conversions. Most things should be in wide. The low-level
-// versions return a boolean indicating whether the conversion was 100% valid.
-// In this case, it will still do the best it can and put the result in the
-// output buffer. The versions that return strings ignore this error and just
-// return the best conversion possible.
-bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output);
+// These convert between UTF8 and UTF16 strings. They are potentially slow,
+// so avoid unnecessary conversions. Most things should be in UTF16.
std::string WideToUTF8(const std::wstring& wide);
-bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output);
std::wstring UTF8ToWide(const std::string& utf8);
// Converts between wide strings and whatever the native multibyte encoding
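
With the low-level overloads removed, callers see only the string-returning forms, which convert invalid input on a best-effort basis instead of reporting it. A hedged usage sketch, assuming just the declarations above:

#include <string>

void ExampleRoundTrip() {
  // Invalid sequences are converted best-effort; no validity flag is returned.
  std::wstring original = L"caf\xe9";             // "café"
  std::string utf8 = WideToUTF8(original);        // wide -> UTF-8
  std::wstring round_tripped = UTF8ToWide(utf8);  // UTF-8 -> wide
  // For valid input, |round_tripped| should equal |original|.
}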
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 6df5581..797ccbd 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -38,175 +38,6 @@
#include "unicode/numfmt.h"
#include "unicode/ustring.h"
-namespace {
-
-// ReadUnicodeCharacter --------------------------------------------------------
-
-// Reads a UTF-8 stream, placing the next code point into the given output
-// |*code_point|. |src| represents the entire string to read, and |*char_index|
-// is the character offset within the string to start reading at. |*char_index|
-// will be updated to index the last character read, such that incrementing it
-// (as in a for loop) will take the reader to the next character.
-//
-// Returns true on success. On false, |*code_point| will be invalid.
-bool ReadUnicodeCharacter(const char* src, int32 src_len,
- int32* char_index, uint32* code_point) {
- U8_NEXT(src, *char_index, src_len, *code_point);
-
- // The ICU macro above moves to the next char, we want to point to the last
- // char consumed.
- (*char_index)--;
-
- // Validate the decoded value.
- return U_IS_UNICODE_CHAR(*code_point);
-}
-
-#ifdef WIN32
-// Reads a UTF-16 character for Windows. The usage is the same as the 8-bit
-// version above.
-bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
- int32* char_index, uint32* code_point) {
- if (U16_IS_SURROGATE(src[*char_index])) {
- if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||
- *char_index + 1 >= src_len ||
- !U16_IS_TRAIL(src[*char_index + 1])) {
- // Invalid surrogate pair.
- return false;
- }
-
- // Valid surrogate pair.
- *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],
- src[*char_index + 1]);
- (*char_index)++;
- } else {
- // Not a surrogate, just one 16-bit word.
- *code_point = src[*char_index];
- }
-
- return U_IS_UNICODE_CHAR(*code_point);
-}
-#else
-// Reads a 32-bit character for Mac and Linux systems. The usage is the same as
-// the 8-bit version above.
-bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
- int32* char_index, uint32* code_point) {
- // Conversion is easy since the source is 32-bit.
- *code_point = src[*char_index];
-
- // Validate the value.
- return U_IS_UNICODE_CHAR(*code_point);
-}
-#endif
-
-// WriteUnicodeCharacter -------------------------------------------------------
-
-// Appends a UTF-8 character to the given 8-bit string.
-void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) {
- if (code_point <= 0x7f) {
- // Fast path the common case of one byte.
- output->push_back(code_point);
- return;
- }
-
- // U8_APPEND_UNSAFE can append up to 4 bytes.
- int32 char_offset = static_cast<int32>(output->length());
- output->resize(char_offset + U8_MAX_LENGTH);
-
- U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
-
- // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so
- // it will represent the new length of the string.
- output->resize(char_offset);
-}
-
-#ifdef WIN32
-// Appends the given code point as a UTF-16 character to the STL string. On
-// Windows, wchar_t is UTF-16.
-void WriteUnicodeCharacter(uint32 code_point,
- std::basic_string<wchar_t>* output) {
- if (U16_LENGTH(code_point) == 1) {
-    // The code point is in the Basic Multilingual Plane (BMP).
- output->push_back(static_cast<wchar_t>(code_point));
- } else {
- // Non-BMP characters use a double-character encoding.
- int32 char_offset = static_cast<int32>(output->length());
- output->resize(char_offset + U16_MAX_LENGTH);
- U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
- }
-}
-#else
-// Appends the given UCS-4 character to the given 32-bit string for Linux and
-// Mac where wchar_t is UCS-4.
-inline void WriteUnicodeCharacter(uint32 code_point,
- std::basic_string<wchar_t>* output) {
- // This is the easy case, just append the character.
- output->push_back(code_point);
-}
-#endif
-
-// Generalized Unicode converter -----------------------------------------------
-
-// Converts the given source Unicode character type to the given destination
-// Unicode character type as a STL string. The given input buffer and size
-// determine the source, and the given output STL string will be replaced by
-// the result.
-template<typename SRC_CHAR, typename DEST_CHAR>
-bool ConvertUnicode(const SRC_CHAR* src, size_t src_len,
- std::basic_string<DEST_CHAR>* output) {
- output->clear();
-
- // ICU requires 32-bit numbers.
- bool success = true;
- int32 src_len32 = static_cast<int32>(src_len);
- for (int32 i = 0; i < src_len32; i++) {
- uint32 code_point;
- if (ReadUnicodeCharacter(src, src_len32, &i, &code_point))
- WriteUnicodeCharacter(code_point, output);
- else
- success = false;
- }
- return success;
-}
-
-} // namespace
-
-// UTF-x <-> UTF-x -------------------------------------------------------------
-
-bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- // Intelligently guess the size of the output string. When it's an ASCII
- // character, assume the rest will be ASCII and use a buffer size the same as
- // the input. When it's not ASCII, assume 3-bytes per character as the
- // starting point. This will be resized internally later if it's too small.
- if (src[0] < 0x80)
- output->reserve(src_len);
- else
- output->reserve(src_len * 3);
- return ConvertUnicode<wchar_t, char>(src, src_len, output);
-}
-
-bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- // Intelligently guess the size of the output string. When it's an ASCII
- // character, assume the rest will be ASCII and use a buffer size the same as
- // the input. When it's not ASCII, assume the UTF-8 takes 2 bytes per
- // character (this is more conservative than 3 which we use above when
- // converting the other way).
- if (src[0] < 0x80)
- output->reserve(src_len);
- else
- output->reserve(src_len / 2);
- return ConvertUnicode<char, wchar_t>(src, src_len, output);
-}
-
// Codepage <-> Wide -----------------------------------------------------------
// Convert a unicode string into the specified codepage_name. If the codepage
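
The core of the converter removed above is the UTF-16 surrogate handling that the ICU U16_* macros perform; ConvertUnicode then just loops the read/write steps over the input. For reference, the decode step written out by hand, as a standalone sketch with hypothetical names (not the deleted code itself):

#include <stddef.h>
#include <stdint.h>

// Decodes one code point from UTF-16, advancing |*i| so that the caller's
// loop increment moves to the next character, as in ConvertUnicode above.
bool DecodeUTF16(const uint16_t* src, size_t src_len,
                 size_t* i, uint32_t* code_point) {
  uint16_t w1 = src[*i];
  if (w1 < 0xD800 || w1 > 0xDFFF) {
    *code_point = w1;  // BMP character: a single 16-bit unit.
    return true;
  }
  if (w1 > 0xDBFF || *i + 1 >= src_len)
    return false;  // Lone trail surrogate, or a lead with nothing after it.
  uint16_t w2 = src[*i + 1];
  if (w2 < 0xDC00 || w2 > 0xDFFF)
    return false;  // Lead surrogate not followed by a trail surrogate.
  *code_point = 0x10000 + (((w1 - 0xD800) << 10) | (w2 - 0xDC00));
  ++(*i);  // A surrogate pair consumes two units.
  return true;
}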
diff --git a/base/string_util_mac.cc b/base/string_util_mac.cc
index 76b72b0..4c5f3dc 100644
--- a/base/string_util_mac.cc
+++ b/base/string_util_mac.cc
@@ -44,7 +44,7 @@
// routines.
template<typename CharType>
static inline bool StrNCpyT(CharType* dst, const CharType* src,
- size_t dst_size, size_t src_size) {
+ size_t dst_size, size_t src_size) {
// The initial value of count has room for a NUL terminator.
size_t count = std::min(dst_size, src_size + 1);
if (count == 0)
@@ -105,6 +105,114 @@ static void InitializeStatics() {
pthread_once(&pthread_once_initialized, DoInitializeStatics);
}
+// Convert the supplied cfstring into the specified encoding, and return it as
+// an STL string of the template type. Returns an empty string on failure.
+template<typename StringType>
+static StringType CFStringToSTLStringWithEncodingT(CFStringRef cfstring,
+ CFStringEncoding encoding) {
+ CFIndex length = CFStringGetLength(cfstring);
+ if (length == 0)
+ return StringType();
+
+ CFRange whole_string = CFRangeMake(0, length);
+ CFIndex out_size;
+ CFIndex converted = CFStringGetBytes(cfstring,
+ whole_string,
+ encoding,
+ 0, // lossByte
+ false, // isExternalRepresentation
+ NULL, // buffer
+ 0, // maxBufLen
+ &out_size);
+ DCHECK(converted != 0 && out_size != 0);
+ if (converted == 0 || out_size == 0)
+ return StringType();
+
+ // out_size is the number of UInt8-sized units needed in the destination.
+ // A buffer allocated as UInt8 units might not be properly aligned to
+ // contain elements of StringType::value_type. Use a container for the
+ // proper value_type, and convert out_size by figuring the number of
+ // value_type elements per UInt8. Leave room for a NUL terminator.
+ typename StringType::size_type elements =
+ out_size * sizeof(UInt8) / sizeof(typename StringType::value_type) + 1;
+
+ // Make sure that integer truncation didn't occur. For the conversions done
+ // here, it never should.
+ DCHECK(((out_size * sizeof(UInt8)) %
+ sizeof(typename StringType::value_type)) == 0);
+
+ std::vector<typename StringType::value_type> out_buffer(elements);
+ converted = CFStringGetBytes(cfstring,
+ whole_string,
+ encoding,
+ 0, // lossByte
+ false, // isExternalRepresentation
+ reinterpret_cast<UInt8*>(&out_buffer[0]),
+ out_size,
+ NULL); // usedBufLen
+ DCHECK(converted != 0);
+ if (converted == 0)
+ return StringType();
+
+ out_buffer[elements - 1] = '\0';
+ return StringType(&out_buffer[0]);
+}
+
+// Given an STL string |in| with an encoding specified by |in_encoding|,
+// convert it to |out_encoding| and return it as an STL string of the
+// |OutStringType| template type. Returns an empty string on failure.
+template<typename OutStringType, typename InStringType>
+static OutStringType STLStringToSTLStringWithEncodingsT(
+ const InStringType& in,
+ CFStringEncoding in_encoding,
+ CFStringEncoding out_encoding) {
+ typename InStringType::size_type in_length = in.length();
+ if (in_length == 0)
+ return OutStringType();
+
+ scoped_cftyperef<CFStringRef> cfstring(
+ CFStringCreateWithBytesNoCopy(NULL,
+ reinterpret_cast<const UInt8*>(in.c_str()),
+ in_length *
+ sizeof(typename InStringType::value_type),
+ in_encoding,
+ false,
+ kCFAllocatorNull));
+ DCHECK(cfstring);
+ if (!cfstring)
+ return OutStringType();
+
+ return CFStringToSTLStringWithEncodingT<OutStringType>(cfstring,
+ out_encoding);
+}
+
+// Specify the byte ordering explicitly, otherwise CFString will be confused
+// when strings don't carry BOMs, as they typically won't.
+static const CFStringEncoding kNarrowStringEncoding = kCFStringEncodingUTF8;
+#ifdef __BIG_ENDIAN__
+#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff
+static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16BE;
+#else // __WCHAR_MAX__
+static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32BE;
+#endif // __WCHAR_MAX__
+#else // __BIG_ENDIAN__
+#if defined(__WCHAR_MAX__) && __WCHAR_MAX__ == 0xffff
+static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF16LE;
+#else // __WCHAR_MAX__
+static const CFStringEncoding kWideStringEncoding = kCFStringEncodingUTF32LE;
+#endif // __WCHAR_MAX__
+#endif // __BIG_ENDIAN__
+
+std::string WideToUTF8(const std::wstring& wide) {
+ return STLStringToSTLStringWithEncodingsT<std::string>(
+ wide, kWideStringEncoding, kNarrowStringEncoding);
+}
+
+std::wstring UTF8ToWide(const std::string& utf8) {
+ return STLStringToSTLStringWithEncodingsT<std::wstring>(
+ utf8, kNarrowStringEncoding, kWideStringEncoding);
+}
+
// Technically, the native multibyte encoding would be the encoding returned
// by CFStringGetSystemEncoding or GetApplicationTextEncoding, but I can't
// imagine anyone needing or using that from these APIs, so just treat UTF-8
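
The template added above follows the usual CoreFoundation measure-then-convert pattern: a first CFStringGetBytes call with a NULL buffer reports the byte count needed, and a second call fills a buffer of that size. The pattern in isolation, as a condensed and hypothetical wrapper (UTF-8 only, without the alignment handling above):

#include <CoreFoundation/CoreFoundation.h>
#include <string>
#include <vector>

std::string CFStringToUTF8Sketch(CFStringRef cfstring) {
  CFRange all = CFRangeMake(0, CFStringGetLength(cfstring));
  CFIndex size = 0;
  // Pass 1: NULL buffer; only the required size is computed.
  if (CFStringGetBytes(cfstring, all, kCFStringEncodingUTF8, 0, false,
                       NULL, 0, &size) == 0 || size == 0)
    return std::string();
  std::vector<UInt8> buffer(size);
  // Pass 2: convert for real into the correctly sized buffer.
  CFStringGetBytes(cfstring, all, kCFStringEncodingUTF8, 0, false,
                   &buffer[0], size, NULL);
  return std::string(reinterpret_cast<char*>(&buffer[0]), size);
}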
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 6d19b0e..c6ff622 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -183,123 +183,6 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) {
EXPECT_EQ(wempty, UTF8ToWide(empty));
}
-// This tests the current behavior of our UTF-8/UTF-16 conversion. On Windows,
-// we just use the platform functions, which strip invalid characters. This
-// isn't necessarily the best behavior; we may want to write our own converter
-// using ICU to get more customized results (for example, substituting the
-// "replacement character" U+FFFD for invalid sequences).
-TEST(StringUtilTest, ConvertUTF8ToWide) {
- struct UTF8ToWideCase {
- const char* utf8;
- const wchar_t* wide;
- bool success;
- } convert_cases[] = {
- // Regular UTF-8 input.
- {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
- // Invalid Unicode code point.
- {"\xef\xbf\xbfHello", L"Hello", false},
- // Truncated UTF-8 sequence.
- {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
- // Truncated off the end.
- {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false},
- // Non-shortest-form UTF-8.
- {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
- // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
- {"\xed\xb0\x80", L"", false},
- // Non-BMP character. The result will either be in UTF-16 or UCS-4.
-#ifdef WIN32
- {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
-#else
- {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
-#endif
- };
-
- for (int i = 0; i < arraysize(convert_cases); i++) {
- std::wstring converted;
- EXPECT_EQ(convert_cases[i].success,
- UTF8ToWide(convert_cases[i].utf8,
- strlen(convert_cases[i].utf8),
- &converted));
- std::wstring expected(convert_cases[i].wide);
- EXPECT_EQ(expected, converted);
- }
-
- // Manually test an embedded NULL.
- std::wstring converted;
- EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted));
- ASSERT_EQ(3, converted.length());
- EXPECT_EQ(0, converted[0]);
- EXPECT_EQ('Z', converted[1]);
- EXPECT_EQ('\t', converted[2]);
-
- // Make sure that conversion replaces, not appends.
- EXPECT_TRUE(UTF8ToWide("B", 1, &converted));
- ASSERT_EQ(1, converted.length());
- EXPECT_EQ('B', converted[0]);
-}
-
-#ifdef WIN32
-// This test is only valid when wchar_t == UTF-16.
-TEST(StringUtilTest, ConvertUTF16ToUTF8) {
- struct UTF16ToUTF8Case {
- const wchar_t* utf16;
- const char* utf8;
- bool success;
- } convert_cases[] = {
- // Regular UTF-16 input.
- {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
- // Test a non-BMP character.
- {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
- // Invalid Unicode code point.
- {L"\xffffHello", "Hello", false},
- // The first character is a truncated UTF-16 character.
- {L"\xd800\x597d", "\xe5\xa5\xbd", false},
- // Truncated at the end.
- {L"\x597d\xd800", "\xe5\xa5\xbd", false},
- };
-
- for (int i = 0; i < arraysize(convert_cases); i++) {
- std::string converted;
- EXPECT_EQ(convert_cases[i].success,
- WideToUTF8(convert_cases[i].utf16,
- wcslen(convert_cases[i].utf16),
- &converted));
- std::string expected(convert_cases[i].utf8);
- EXPECT_EQ(expected, converted);
- }
-}
-
-#else
-// This test is only valid when wchar_t == UCS-4.
-TEST(StringUtilTest, ConvertUCS4ToUTF8) {
-  struct UCS4ToUTF8Case {
- const wchar_t* ucs4;
- const char* utf8;
- bool success;
- } convert_cases[] = {
- // Regular 16-bit input.
- {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
- // Test a non-BMP character.
- {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
- // Invalid Unicode code points.
-    {L"\xffffHello", "Hello", false},
-    {L"\xfffffffHello", "Hello", false},
- // The first character is a truncated UTF-16 character.
- {L"\xd800\x597d", "\xe5\xa5\xbd", false},
-  };
-
- for (int i = 0; i < arraysize(convert_cases); i++) {
- std::string converted;
- EXPECT_EQ(convert_cases[i].success,
-              WideToUTF8(convert_cases[i].ucs4,
-                         wcslen(convert_cases[i].ucs4),
- &converted));
- std::string expected(convert_cases[i].utf8);
- EXPECT_EQ(expected, converted);
- }
-}
-#endif
-
TEST(StringUtilTest, ConvertMultiString) {
static wchar_t wmulti[] = {
L'f', L'o', L'o', L'\0',
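
Coverage equivalent to the deleted tests would now have to go through the string-returning forms, which drop the validity flag. A sketch of what such a test might look like (hypothetical expectations, since the platform converters treat invalid input differently):

TEST(StringUtilTest, ConvertUTF8AndWideRoundTrip) {
  // Valid input should survive a round trip exactly.
  const std::wstring wide = L"\x4f60\x597d";
  EXPECT_EQ(wide, UTF8ToWide(WideToUTF8(wide)));
  // Empty strings should stay empty in both directions.
  EXPECT_EQ(std::string(), WideToUTF8(std::wstring()));
  EXPECT_EQ(std::wstring(), UTF8ToWide(std::string()));
}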
diff --git a/base/string_util_win.cc b/base/string_util_win.cc
index 53044cc..6cad854 100644
--- a/base/string_util_win.cc
+++ b/base/string_util_win.cc
@@ -76,6 +76,16 @@ static std::wstring MultiByteToWide(const std::string& mb, UINT code_page) {
return wide;
}
+// Wide <--> UTF-8
+std::string WideToUTF8(const std::wstring& wide) {
+ return WideToMultiByte(wide, CP_UTF8);
+}
+
+std::wstring UTF8ToWide(const std::string& utf8) {
+ return MultiByteToWide(utf8, CP_UTF8);
+}
+
// Wide <--> native multibyte
std::string WideToNativeMB(const std::wstring& wide) {
return WideToMultiByte(wide, CP_ACP);
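
WideToMultiByte and MultiByteToWide, used above, wrap the standard Win32 two-call pattern: WideCharToMultiByte with a zero-sized output buffer returns the required byte count, and a second call performs the conversion. A minimal standalone sketch of that pattern (a hypothetical function, not the helper in this file):

#include <windows.h>
#include <string>

std::string WideToUTF8Sketch(const std::wstring& wide) {
  if (wide.empty())
    return std::string();
  // Pass 1: a zero output length asks for the required byte count.
  int size = WideCharToMultiByte(CP_UTF8, 0, wide.data(),
                                 static_cast<int>(wide.length()),
                                 NULL, 0, NULL, NULL);
  if (size <= 0)
    return std::string();
  std::string utf8(size, '\0');
  // Pass 2: convert into the correctly sized buffer.
  WideCharToMultiByte(CP_UTF8, 0, wide.data(),
                      static_cast<int>(wide.length()),
                      &utf8[0], size, NULL, NULL);
  return utf8;
}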