3 files changed, 94 insertions, 24 deletions
diff --git a/base/string_util.h b/base/string_util.h
index 1a4080a..46269ff 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -37,6 +37,7 @@
 #include <stdarg.h>   // va_list
 
 #include "base/basictypes.h"
+#include "base/string16.h"
 
 // Safe standard library wrappers for all platforms.
 
@@ -152,17 +153,22 @@ std::wstring CollapseWhitespace(const std::wstring& text,
 std::string WideToASCII(const std::wstring& wide);
 std::wstring ASCIIToWide(const std::string& ascii);
 
-// These convert between UTF8 and UTF16 strings. They are potentially slow, so
-// avoid unnecessary conversions. Most things should be in wide. The low-level
-// versions return a boolean indicating whether the conversion was 100% valid.
-// In this case, it will still do the best it can and put the result in the
-// output buffer. The versions that return strings ignore this error and just
-// return the best conversion possible.
+// These convert between UTF-8, -16, and -32 strings. They are potentially slow,
+// so avoid unnecessary conversions. The low-level versions return a boolean
+// indicating whether the conversion was 100% valid. If it was not, they still
+// do the best they can and put the result in the output buffer. The versions
+// that return strings ignore this error and just return the best conversion
+// possible.
 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output);
 std::string WideToUTF8(const std::wstring& wide);
 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output);
 std::wstring UTF8ToWide(const std::string& utf8);
 
+bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output);
+std::string16 WideToUTF16(const std::wstring& wide);
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output);
+std::wstring UTF16ToWide(const std::string16& utf16);
+
 // Defines the error handling modes of WideToCodepage and CodepageToWide.
 class OnStringUtilConversionError {
  public:
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 534ca88..895a03e 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -62,9 +62,8 @@ bool ReadUnicodeCharacter(const char* src, int32 src_len,
   return U_IS_UNICODE_CHAR(*code_point);
 }
 
-#if defined(WCHAR_T_IS_UTF16)
 // Reads a UTF-16 character. The usage is the same as the 8-bit version above.
-bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
+bool ReadUnicodeCharacter(const char16* src, int32 src_len,
                           int32* char_index, uint32* code_point) {
   if (U16_IS_SURROGATE(src[*char_index])) {
     if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||
@@ -85,10 +84,11 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
 
   return U_IS_UNICODE_CHAR(*code_point);
 }
-#elif defined(WCHAR_T_IS_UTF32)
+
+#if defined(WCHAR_T_IS_UTF32)
 // Reads UTF-32 character. The usage is the same as the 8-bit version above.
-bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
-                          int32* char_index, uint32* code_point) {
+bool ReadUTF32Character(const wchar_t* src, int32 src_len,
+                        int32* char_index, uint32* code_point) {
   // Conversion is easy since the source is 32-bit.
   *code_point = src[*char_index];
 
@@ -118,13 +118,12 @@ void WriteUnicodeCharacter(uint32 code_point, std::basic_string<char>* output) {
   output->resize(char_offset);
 }
 
-#if defined(WCHAR_T_IS_UTF16)
 // Appends the given code point as a UTF-16 character to the STL string.
 void WriteUnicodeCharacter(uint32 code_point,
-                           std::basic_string<wchar_t>* output) {
+                           std::basic_string<char16>* output) {
   if (U16_LENGTH(code_point) == 1) {
     // Thie code point is in the Basic Multilingual Plane (BMP).
-    output->push_back(static_cast<wchar_t>(code_point));
+    output->push_back(static_cast<char16>(code_point));
   } else {
     // Non-BMP characters use a double-character encoding.
     int32 char_offset = static_cast<int32>(output->length());
@@ -132,7 +131,8 @@ void WriteUnicodeCharacter(uint32 code_point,
     U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
   }
 }
-#elif defined(WCHAR_T_IS_UTF32)
+
+#if defined(WCHAR_T_IS_UTF32)
 // Appends the given UTF-32 character to the given 32-bit string.
 inline void WriteUnicodeCharacter(uint32 code_point,
                                   std::basic_string<wchar_t>* output) {
@@ -167,7 +167,7 @@ bool ConvertUnicode(const SRC_CHAR* src, size_t src_len,
 
 }  // namespace
 
-// UTF-x <-> UTF-x -------------------------------------------------------------
+// UTF-8 <-> Wide --------------------------------------------------------------
 
 std::string WideToUTF8(const std::wstring& wide) {
   std::string ret;
@@ -224,6 +224,75 @@ bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
   return ConvertUnicode<char, wchar_t>(src, src_len, output);
 }
 
+// UTF-16 <-> Wide -------------------------------------------------------------
+
+#if defined(WCHAR_T_IS_UTF16)
+
+// When wide == UTF-16, then conversions are a NOP.
+std::string16 WideToUTF16(const std::wstring& wide) {
+  return wide;  // Presumably string16 is wstring here, so this only copies.
+}
+
+bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output) {
+  output->assign(src, src_len);  // Replaces any previous contents of |output|.
+  return true;  // Nothing is validated here, so the copy always "succeeds".
+}
+
+std::wstring UTF16ToWide(const std::string16& utf16) {
+  return utf16;  // Presumably string16 is wstring here, so this only copies.
+}
+
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+  output->assign(src, src_len);  // Replaces any previous contents of |output|.
+  return true;  // Nothing is validated here, so the copy always "succeeds".
+}
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+std::string16 WideToUTF16(const std::wstring& wide) {
+  std::string16 ret;
+  if (wide.empty())
+    return ret;
+  // Best effort: the validity flag of the low-level overload is ignored.
+  WideToUTF16(wide.data(), wide.length(), &ret);
+  return ret;
+}
+
+bool WideToUTF16(const wchar_t* src, size_t src_len, std::string16* output) {
+  if (src_len == 0) {
+    output->clear();
+    return true;
+  }
+  // Non-empty input: an empty source was reported as a valid conversion above.
+  // Reserve one UTF-16 unit per input character, which is exact when the text
+  // has no non-BMP characters (the expected common case).
+  output->reserve(src_len);
+  return ConvertUnicode<wchar_t, char16>(src, src_len, output);
+}
+
+std::wstring UTF16ToWide(const std::string16& utf16) {
+  std::wstring ret;
+  if (utf16.empty())
+    return ret;
+  // Best effort: the validity flag of the low-level overload is ignored.
+  UTF16ToWide(utf16.data(), utf16.length(), &ret);
+  return ret;
+}
+
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+  if (src_len == 0) {
+    output->clear();
+    return true;
+  }
+  // Non-empty input: an empty source was reported as a valid conversion above.
+  // Reserve one wide character per UTF-16 unit, which is exact when the text
+  // has no non-BMP characters (the expected common case).
+  output->reserve(src_len);
+  return ConvertUnicode<char16, wchar_t>(src, src_len, output);
+}
+
+#endif  // defined(WCHAR_T_IS_UTF32)
+
 // Codepage <-> Wide -----------------------------------------------------------
 
 // Convert a unicode string into the specified codepage_name.  If the codepage
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 1112676..837e000 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -183,11 +183,6 @@ TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) {
   EXPECT_EQ(wempty, UTF8ToWide(empty));
 }
 
-// This tests the current behavior of our UTF-8/UTF-16 conversion. On Windows,
-// we just use the platform functions which strip invalid characters. This isn't
-// necessarily the best behavior, we may want to write our own converter using
-// ICU to get more customized results (for example, substituting the
-// "replacement character" U+FFFD for invalid sequences.
 TEST(StringUtilTest, ConvertUTF8ToWide) {
   struct UTF8ToWideCase {
     const char* utf8;
@@ -206,7 +201,7 @@ TEST(StringUtilTest, ConvertUTF8ToWide) {
     {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
     // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
     {"\xed\xb0\x80", L"", false},
-    // Non-BMP character. The result will either be in UTF-16 or UCS-4.
+    // Non-BMP character. The result will either be in UTF-16 or UTF-32.
 #if defined(WCHAR_T_IS_UTF16)
     {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
 #elif defined(WCHAR_T_IS_UTF32)
@@ -270,8 +265,8 @@ TEST(StringUtilTest, ConvertUTF16ToUTF8) {
 }
 
 #elif defined(WCHAR_T_IS_UTF32)
-// This test is only valid when wchar_t == UCS-4.
-TEST(StringUtilTest, ConvertUCS4ToUTF8) {
+// This test is only valid when wchar_t == UTF-32.
+TEST(StringUtilTest, ConvertUTF32ToUTF8) {
   struct UTF8ToWideCase {
     const wchar_t* ucs4;
     const char* utf8;