11 files changed, 645 insertions, 252 deletions
diff --git a/base/string_util.h b/base/string_util.h
index 9a033b4..c7f3115 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -221,7 +221,8 @@ std::string UTF16ToUTF8(const string16& utf16);
 # define UTF16ToWideHack UTF16ToWide
 #endif
 
-// Defines the error handling modes of WideToCodepage and CodepageToWide.
+// Defines the error handling modes of UTF16ToCodepage, CodepageToUTF16,
+// WideToCodepage and CodepageToWide.
 class OnStringUtilConversionError {
  public:
   enum Type {
@@ -231,12 +232,30 @@ class OnStringUtilConversionError {
     // The offending characters are skipped and the conversion will proceed as
     // if they did not exist.
     SKIP,
+
+    // When converting to Unicode, the offending byte sequences are replaced
+    // with the Unicode replacement character (U+FFFD). When converting from
+    // Unicode, this is the same as SKIP.
+    SUBSTITUTE,
   };
 
  private:
   OnStringUtilConversionError();
 };
 
+// Converts between UTF-16 strings and the encoding specified.  If the
+// encoding doesn't exist or the encoding fails (when on_error is FAIL),
+// returns false.
+bool UTF16ToCodepage(const string16& utf16,
+                     const char* codepage_name,
+                     OnStringUtilConversionError::Type on_error,
+                     std::string* encoded);
+
+bool CodepageToUTF16(const std::string& encoded,
+                     const char* codepage_name,
+                     OnStringUtilConversionError::Type on_error,
+                     string16* utf16);
+
 // Converts between wide strings and the encoding specified.  If the
 // encoding doesn't exist or the encoding fails (when on_error is FAIL),
 // returns false.
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 87731de..3bd6f9b 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -10,8 +10,10 @@
 #include "base/basictypes.h"
 #include "base/logging.h"
 #include "base/singleton.h"
-#include "unicode/ucnv.h"
 #include "unicode/numfmt.h"
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/ucnv_err.h"
 #include "unicode/ustring.h"
 
 namespace {
@@ -24,6 +26,64 @@ inline bool IsValidCodepoint(uint32 code_point) {
          (code_point >= 0xE000u && code_point <= 0x10FFFFu);
 }
 
+// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
+// in source/common/ucnv_err.c.
+
+// Copyright (c) 1995-2006 International Business Machines Corporation
+// and others
+//
+// All rights reserved.
+//
+
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, and/or
+// sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, provided that the above copyright notice(s) and
+// this permission notice appear in all copies of the Software and that
+// both the above copyright notice(s) and this permission notice appear in
+// supporting documentation.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+// OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Except as contained in this notice, the name of a copyright holder
+// shall not be used in advertising or otherwise to promote the sale, use
+// or other dealings in this Software without prior written authorization
+// of the copyright holder.
+
+//  ___________________________________________________________________________
+//
+// All trademarks and registered trademarks mentioned herein are the property
+// of their respective owners.
+
+void ToUnicodeCallbackSubstitute(const void* context,
+                                 UConverterToUnicodeArgs *to_args,
+                                 const char* code_units,
+                                 int32_t length,
+                                 UConverterCallbackReason reason,
+                                 UErrorCode * err) {
+  static const UChar kReplacementChar = 0xFFFD;
+  if (reason <= UCNV_IRREGULAR) {
+      if (context == NULL ||
+          (*(reinterpret_cast<const char*>(context)) == 'i' &&
+           reason == UCNV_UNASSIGNED)) {
+        *err = U_ZERO_ERROR;
+        ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
+      }
+      // else the caller must have set the error code accordingly.
+  }
+  // else ignore the reset, close and clone calls.
+}
+
 // ReadUnicodeCharacter --------------------------------------------------------
 
 // Reads a UTF-8 stream, placing the next code point into the given output
@@ -76,7 +136,7 @@ bool ReadUnicodeCharacter(const char16* src, int32 src_len,
 #if defined(WCHAR_T_IS_UTF32)
 // Reads UTF-32 character. The usage is the same as the 8-bit version above.
 bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
-                        int32* char_index, uint32* code_point) {
+                          int32* char_index, uint32* code_point) {
   // Conversion is easy since the source is 32-bit.
   *code_point = src[*char_index];
 
@@ -184,6 +244,70 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
   }
 }
 
+bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
+                      int uchar_len, OnStringUtilConversionError::Type on_error,
+                      std::string* encoded) {
+  int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
+      ucnv_getMaxCharSize(converter));
+  encoded->resize(encoded_max_length);
+
+  UErrorCode status = U_ZERO_ERROR;
+
+  // Setup our error handler.
+  switch (on_error) {
+    case OnStringUtilConversionError::FAIL:
+      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
+                            NULL, NULL, &status);
+      break;
+    case OnStringUtilConversionError::SKIP:
+    case OnStringUtilConversionError::SUBSTITUTE:
+      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
+                            NULL, NULL, &status);
+      break;
+    default:
+      NOTREACHED();
+  }
+
+  // ucnv_fromUChars returns size not including terminating null
+  int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
+      encoded_max_length, uchar_src, uchar_len, &status);
+  encoded->resize(actual_size);
+  ucnv_close(converter);
+  if (U_SUCCESS(status))
+    return true;
+  encoded->clear();  // Make sure the output is empty on error.
+  return false;
+}
+
+// Sets up the to-Unicode error handler on |converter| according to |on_error|.
+void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error,
+                                  UConverter* converter, UErrorCode* status) {
+  switch (on_error) {
+    case OnStringUtilConversionError::FAIL:
+      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
+                          NULL, NULL, status);
+      break;
+    case OnStringUtilConversionError::SKIP:
+      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
+                          NULL, NULL, status);
+      break;
+    case OnStringUtilConversionError::SUBSTITUTE:
+      ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0,
+                          NULL, NULL, status);
+      break;
+    default:
+      NOTREACHED();
+  }
+}
+
+inline UConverterType utf32_platform_endian() {
+#if U_IS_BIG_ENDIAN
+  return UCNV_UTF32_BigEndian;
+#else
+  return UCNV_UTF32_LittleEndian;
+#endif
+}
+
 }  // namespace
 
 // UTF-8 <-> Wide --------------------------------------------------------------
@@ -364,14 +488,17 @@ std::string UTF16ToUTF8(const string16& utf16) {
 
 #endif
 
-// Codepage <-> Wide -----------------------------------------------------------
+// Codepage <-> Wide/UTF-16  ---------------------------------------------------
 
-// Convert a unicode string into the specified codepage_name.  If the codepage
+// Convert a wstring into the specified codepage_name.  If the codepage
 // isn't found, return false.
 bool WideToCodepage(const std::wstring& wide,
                     const char* codepage_name,
                     OnStringUtilConversionError::Type on_error,
                     std::string* encoded) {
+#if defined(WCHAR_T_IS_UTF16)
+  return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
+#elif defined(WCHAR_T_IS_UTF32)
   encoded->clear();
 
   UErrorCode status = U_ZERO_ERROR;
@@ -379,59 +506,47 @@ bool WideToCodepage(const std::wstring& wide,
   if (!U_SUCCESS(status))
     return false;
 
-  const UChar* uchar_src;
-  int uchar_len;
-#if defined(WCHAR_T_IS_UTF16)
-  uchar_src = wide.c_str();
-  uchar_len = static_cast<int>(wide.length());
-#elif defined(WCHAR_T_IS_UTF32)
+  int utf16_len;
   // When wchar_t is wider than UChar (16 bits), transform |wide| into a
   // UChar* string.  Size the UChar* buffer to be large enough to hold twice
-  // as many UTF-16 code points as there are UTF-16 characters, in case each
-  // character translates to a UTF-16 surrogate pair, and leave room for a NUL
-  // terminator.
-  std::vector<UChar> wide_uchar(wide.length() * 2 + 1);
-  u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len,
+  // as many UTF-16 code units (UChar's) as there are Unicode code points,
+  // in case each code point translates to a UTF-16 surrogate pair,
+  // and leave room for a NUL terminator.
+  std::vector<UChar> utf16(wide.length() * 2 + 1);
+  u_strFromWCS(&utf16[0], utf16.size(), &utf16_len,
                wide.c_str(), wide.length(), &status);
-  uchar_src = &wide_uchar[0];
   DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
+
+  return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);
 #endif  // defined(WCHAR_T_IS_UTF32)
+}
 
-  int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
-    ucnv_getMaxCharSize(converter));
-  encoded->resize(encoded_max_length);
+// Convert a UTF-16 string into the specified codepage_name.  If the codepage
+// isn't found, return false.
+bool UTF16ToCodepage(const string16& utf16,
+                    const char* codepage_name,
+                    OnStringUtilConversionError::Type on_error,
+                    std::string* encoded) {
+  encoded->clear();
 
-  // Setup our error handler.
-  switch (on_error) {
-    case OnStringUtilConversionError::FAIL:
-      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
-                            NULL, NULL, &status);
-      break;
-    case OnStringUtilConversionError::SKIP:
-      ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
-                            NULL, NULL, &status);
-      break;
-    default:
-      NOTREACHED();
-  }
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
 
-  // ucnv_fromUChars returns size not including terminating null
-  int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
-    encoded_max_length, uchar_src, uchar_len, &status);
-  encoded->resize(actual_size);
-  ucnv_close(converter);
-  if (U_SUCCESS(status))
-    return true;
-  encoded->clear();  // Make sure the output is empty on error.
-  return false;
+  return ConvertFromUTF16(converter, utf16.c_str(),
+                          static_cast<int>(utf16.length()), on_error, encoded);
 }
 
-// Converts a string of the given codepage into unicode.
+// Converts a string of the given codepage into wstring.
 // If the codepage isn't found, return false.
 bool CodepageToWide(const std::string& encoded,
                     const char* codepage_name,
                     OnStringUtilConversionError::Type on_error,
                     std::wstring* wide) {
+#if defined(WCHAR_T_IS_UTF16)
+  return CodepageToUTF16(encoded, codepage_name, on_error, wide);
+#elif defined(WCHAR_T_IS_UTF32)
   wide->clear();
 
   UErrorCode status = U_ZERO_ERROR;
@@ -439,6 +554,51 @@ bool CodepageToWide(const std::string& encoded,
   if (!U_SUCCESS(status))
     return false;
 
+  // The maximum length in 4-byte units of UTF-32 output would be
+  // at most the same as the number of bytes in input. In the worst
+  // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),
+  // this can be 4 times larger than actually needed.
+  size_t wchar_max_length = encoded.length() + 1;
+
+  // The byte buffer and its length to pass to ucnv_toAlgorithmic.
+  char* byte_buffer = reinterpret_cast<char*>(
+      WriteInto(wide, wchar_max_length));
+  int byte_buffer_length = static_cast<int>(wchar_max_length) * 4;
+
+  SetUpErrorHandlerForToUChars(on_error, converter, &status);
+  int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(),
+                                       converter,
+                                       byte_buffer,
+                                       byte_buffer_length,
+                                       encoded.data(),
+                                       static_cast<int>(encoded.length()),
+                                       &status);
+  ucnv_close(converter);
+
+  if (!U_SUCCESS(status)) {
+    wide->clear();  // Make sure the output is empty on error.
+    return false;
+  }
+
+  // actual_size is # of bytes.
+  wide->resize(actual_size / 4);
+  return true;
+#endif  // defined(WCHAR_T_IS_UTF32)
+}
+
+// Converts a string of the given codepage into UTF-16.
+// If the codepage isn't found, return false.
+bool CodepageToUTF16(const std::string& encoded,
+                     const char* codepage_name,
+                     OnStringUtilConversionError::Type on_error,
+                     string16* utf16) {
+  utf16->clear();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UConverter* converter = ucnv_open(codepage_name, &status);
+  if (!U_SUCCESS(status))
+    return false;
+
   // Even in the worst case, the maximum length in 2-byte units of UTF-16
   // output would be at most the same as the number of bytes in input. There
   // is no single-byte encoding in which a character is mapped to a
@@ -449,53 +609,20 @@ bool CodepageToWide(const std::string& encoded,
   // BOCU and SCSU, but we don't care about them.
   size_t uchar_max_length = encoded.length() + 1;
 
-  UChar* uchar_dst;
-#if defined(WCHAR_T_IS_UTF16)
-  uchar_dst = WriteInto(wide, uchar_max_length);
-#elif defined(WCHAR_T_IS_UTF32)
-  // When wchar_t is wider than UChar (16 bits), convert into a temporary
-  // UChar* buffer.
-  std::vector<UChar> wide_uchar(uchar_max_length);
-  uchar_dst = &wide_uchar[0];
-#endif  // defined(WCHAR_T_IS_UTF32)
-
-  // Setup our error handler.
-  switch (on_error) {
-    case OnStringUtilConversionError::FAIL:
-      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
-                          NULL, NULL, &status);
-      break;
-    case OnStringUtilConversionError::SKIP:
-      ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
-                          NULL, NULL, &status);
-      break;
-    default:
-      NOTREACHED();
-  }
-
+  SetUpErrorHandlerForToUChars(on_error, converter, &status);
   int actual_size = ucnv_toUChars(converter,
-                                  uchar_dst,
+                                  WriteInto(utf16, uchar_max_length),
                                   static_cast<int>(uchar_max_length),
                                   encoded.data(),
                                   static_cast<int>(encoded.length()),
                                   &status);
   ucnv_close(converter);
   if (!U_SUCCESS(status)) {
-    wide->clear();  // Make sure the output is empty on error.
+    utf16->clear();  // Make sure the output is empty on error.
     return false;
   }
 
-#ifdef WCHAR_T_IS_UTF32
-  // When wchar_t is wider than UChar (16 bits), it's not possible to wind up
-  // with any more wchar_t elements than UChar elements.  ucnv_toUChars
-  // returns the number of UChar elements not including the NUL terminator, so
-  // leave extra room for that.
-  u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size,
-             uchar_dst, actual_size, &status);
-  DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring";
-#endif  // WCHAR_T_IS_UTF32
-
-  wide->resize(actual_size);
+  utf16->resize(actual_size);
   return true;
 }
 
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 6f196cc..4968950 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -13,8 +13,30 @@
 #include "testing/gtest/include/gtest/gtest.h"
 
 namespace {
+
+// Given a null-terminated string of wchar_t with each wchar_t representing
+// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
+// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
+// should be represented as a surrogate pair (two UTF-16 units)
+// *even* where wchar_t is 32-bit (Linux and Mac).
+//
+// This is to help write tests for functions with string16 params until
+// the C++ 0x UTF-16 literal is well-supported by compilers.
+string16 BuildString16(const wchar_t* s) {
+#if defined(WCHAR_T_IS_UTF16)
+  return string16(s);
+#elif defined(WCHAR_T_IS_UTF32)
+  string16 u16;
+  while (*s != 0) {
+    DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu);
+    u16.push_back(*s++);
+  }
+  return u16;
+#endif
 }
 
+}  // namespace
+
 static const struct trim_case {
   const wchar_t* input;
   const TrimPositions positions;
@@ -459,104 +481,162 @@ TEST(StringUtilTest, ConvertCodepageUTF8) {
   }
 }
 
-TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
-  static const struct {
-    const char* codepage_name;
-    const char* encoded;
-    OnStringUtilConversionError::Type on_error;
-    bool success;
-    const wchar_t* wide;
-  } kConvertCodepageCases[] = {
-    // Test a case where the input can no be decoded, using both SKIP and FAIL
-    // error handling rules. "A7 41" is valid, but "A6" isn't.
-    {"big5",
-     "\xA7\x41\xA6",
-     OnStringUtilConversionError::FAIL,
-     false,
-     L""},
-    {"big5",
-     "\xA7\x41\xA6",
-     OnStringUtilConversionError::SKIP,
-     true,
-     L"\x4F60"},
-    // Arabic (ISO-8859)
-    {"iso-8859-6",
-     "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
-     "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
-     L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652"},
-    // Chinese Simplified (GB2312)
-    {"gb2312",
-     "\xC4\xE3\xBA\xC3",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x4F60\x597D"},
-    // Chinese Traditional (BIG5)
-    {"big5",
-     "\xA7\x41\xA6\x6E",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x4F60\x597D"},
-    // Greek (ISO-8859)
-    {"iso-8859-7",
-     "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5"},
-    // Hebrew (Windows)
-    {"windows-1255", /* to be replaced with "iso-8859-8-I"? */
-     "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD"},
-    // Hindi Devanagari (ISCII)
-    {"iscii-dev",
-     "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930"},
-    // Korean (EUC)
-    {"euc-kr",
-     "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\xC548\xB155\xD558\xC138\xC694"},
-    // Japanese (EUC)
-    {"euc-jp",
-     "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x3053\x3093\x306B\x3061\x306F"},
-    // Japanese (ISO-2022)
-    {"iso-2022-jp",
-     "\x1B\x24\x42" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F" "\x1B\x28\x42",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x3053\x3093\x306B\x3061\x306F"},
-    // Japanese (Shift-JIS)
-    {"sjis",
-     "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x3053\x3093\x306B\x3061\x306F"},
-    // Russian (KOI8)
-    {"koi8-r",
-     "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
-     L"\x0443\x0439\x0442\x0435"},
-    // Thai (ISO-8859)
-    {"windows-874", /* to be replaced with "iso-8859-11". */
-     "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
-     OnStringUtilConversionError::FAIL,
-     true,
-     L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
-     L"\x0E04\x0E23\x0e31\x0E1A"},
-  };
+// kConvertCodepageCases is not comprehensive. There are a number of cases
+// to add if we really want to have comprehensive coverage of various
+// codepages and their 'idiosyncrasies'. Currently, the only implementation
+// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive
+// set of tests for the charset conversion. So, we can get away with a
+// relatively small number of cases listed below.
+//
+// Note about |u16_wide| in the following struct.
+// On Windows, the field is always identical to |wide|. On Mac and Linux,
+// it's identical as long as there's no character outside the
+// BMP (<= U+FFFF). When there is, it is different from |wide| and
+// is not a real wide string (UTF-32 string) in that each wchar_t in
+// the string is a UTF-16 code unit zero-extended to be 32-bit
+// even when the code unit belongs to a surrogate pair.
+// For instance, a Unicode string (U+0041 U+010000) is represented as
+// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000".
+// To avoid the clutter, |u16_wide| will be set to NULL
+// if it's identical to |wide| on *all* platforms.
+
+static const struct {
+  const char* codepage_name;
+  const char* encoded;
+  OnStringUtilConversionError::Type on_error;
+  bool success;
+  const wchar_t* wide;
+  const wchar_t* u16_wide;
+} kConvertCodepageCases[] = {
+  // Test a case where the input cannot be decoded, using SKIP, FAIL
+  // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't.
+  {"big5",
+   "\xA7\x41\xA6",
+   OnStringUtilConversionError::FAIL,
+   false,
+   L"",
+   NULL},
+  {"big5",
+   "\xA7\x41\xA6",
+   OnStringUtilConversionError::SKIP,
+   true,
+   L"\x4F60",
+   NULL},
+  {"big5",
+   "\xA7\x41\xA6",
+   OnStringUtilConversionError::SUBSTITUTE,
+   true,
+   L"\x4F60\xFFFD",
+   NULL},
+  // Arabic (ISO-8859)
+  {"iso-8859-6",
+   "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
+   "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
+   L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652",
+   NULL},
+  // Chinese Simplified (GB2312)
+  {"gb2312",
+   "\xC4\xE3\xBA\xC3",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x4F60\x597D",
+   NULL},
+  // Chinese (GB18030) : 4 byte sequences mapped to BMP characters
+  {"gb18030",
+   "\x81\x30\x84\x36\xA1\xA7",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x00A5\x00A8",
+   NULL},
+  // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000)
+  {"gb18030",
+   "\x95\x32\x82\x36\xD2\xBB",
+   OnStringUtilConversionError::FAIL,
+   true,
+#if defined(WCHAR_T_IS_UTF16)
+   L"\xD840\xDC00\x4E00",
+#else
+   L"\x20000\x4E00",
+#endif
+   L"\xD840\xDC00\x4E00"},
+  {"big5",
+   "\xA7\x41\xA6\x6E",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x4F60\x597D",
+   NULL},
+  // Greek (ISO-8859)
+  {"iso-8859-7",
+   "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5",
+   NULL},
+  // Hebrew (Windows)
+  {"windows-1255",
+   "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD",
+   NULL},
+  // Hindi Devanagari (ISCII)
+  {"iscii-dev",
+   "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930",
+   NULL},
+  // Korean (EUC)
+  {"euc-kr",
+   "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\xC548\xB155\xD558\xC138\xC694",
+   NULL},
+  // Japanese (EUC)
+  {"euc-jp",
+   "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8F\xB0\xA1\x8E\xA6",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x3053\x3093\x306B\x3061\x306F\x4E00\x4E02\xFF66",
+   NULL},
+  // Japanese (ISO-2022)
+  {"iso-2022-jp",
+   "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B"
+   "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$",
+   NULL},
+  // Japanese (Shift-JIS)
+  {"sjis",
+   "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66",
+   NULL},
+  // Russian (KOI8)
+  {"koi8-r",
+   "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+   L"\x0443\x0439\x0442\x0435",
+   NULL},
+  // Thai (windows-874)
+  {"windows-874",
+   "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
+   OnStringUtilConversionError::FAIL,
+   true,
+   L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
+   L"\x0E04\x0E23\x0e31\x0E1A",
+   NULL},
+};
 
+TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) {
     std::wstring wide;
     bool success = CodepageToWide(kConvertCodepageCases[i].encoded,
@@ -567,7 +647,9 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
     EXPECT_EQ(kConvertCodepageCases[i].wide, wide);
 
     // When decoding was successful and nothing was skipped, we also check the
-    // reverse conversion.
+    // reverse conversion. Not all conversions are round-trippable, but
+    // kConvertCodepageCases does not have any one-way conversion at the
+    // moment.
     if (success &&
         kConvertCodepageCases[i].on_error ==
             OnStringUtilConversionError::FAIL) {
@@ -590,6 +672,11 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
   EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1",
                              OnStringUtilConversionError::SKIP, &encoded));
   EXPECT_STREQ("Chinese", encoded.c_str());
+  // From Unicode, SUBSTITUTE is the same as SKIP for now.
+  EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1",
+                             OnStringUtilConversionError::SUBSTITUTE,
+                             &encoded));
+  EXPECT_STREQ("Chinese", encoded.c_str());
 
 #if defined(WCHAR_T_IS_UTF16)
   // When we're in UTF-16 mode, test an invalid UTF-16 character in the input.
@@ -611,6 +698,36 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
                               OnStringUtilConversionError::SKIP, &encoded));
 }
 
+TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) {
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) {
+    string16 utf16;
+    bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded,
+                                   kConvertCodepageCases[i].codepage_name,
+                                   kConvertCodepageCases[i].on_error,
+                                   &utf16);
+    string16 utf16_expected;
+    if (kConvertCodepageCases[i].u16_wide == NULL)
+      utf16_expected = BuildString16(kConvertCodepageCases[i].wide);
+    else
+      utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide);
+    EXPECT_EQ(kConvertCodepageCases[i].success, success);
+    EXPECT_EQ(utf16_expected, utf16);
+
+    // When decoding was successful and nothing was skipped, we also check the
+    // reverse conversion. See also the corresponding comment in
+    // ConvertBetweenCodepageAndWide.
+    if (success &&
+        kConvertCodepageCases[i].on_error ==
+            OnStringUtilConversionError::FAIL) {
+      std::string encoded;
+      success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name,
+                                kConvertCodepageCases[i].on_error, &encoded);
+      EXPECT_EQ(kConvertCodepageCases[i].success, success);
+      EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded);
+    }
+  }
+}
+
 TEST(StringUtilTest, ConvertASCII) {
   static const char* char_cases[] = {
     "Google Video",
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 2e6292c..00beb4e 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -860,7 +860,7 @@ std::string CanonicalizeHost(const std::wstring& host,
   return CanonicalizeHost(converted_host, host_info);
 }
 
-std::string GetDirectoryListingHeader(const std::string& title) {
+std::string GetDirectoryListingHeader(const string16& title) {
   static const StringPiece header(NetModule::GetResource(IDR_DIR_HEADER_HTML));
   if (header.empty()) {
     NOTREACHED() << "expected resource not found";
@@ -874,15 +874,21 @@ std::string GetDirectoryListingHeader(const std::string& title) {
   return result;
 }
 
-std::string GetDirectoryListingEntry(const std::string& name,
+std::string GetDirectoryListingEntry(const string16& name,
+                                     const std::string& raw_bytes,
                                      bool is_dir,
                                      int64 size,
-                                     const Time& modified) {
+                                     Time modified) {
   std::string result;
   result.append("<script>addRow(");
   string_escape::JsonDoubleQuote(name, true, &result);
   result.append(",");
-  string_escape::JsonDoubleQuote(EscapePath(name), true, &result);
+  if (raw_bytes.empty()) {
+    string_escape::JsonDoubleQuote(EscapePath(UTF16ToUTF8(name)),
+                                   true, &result);
+  } else {
+    string_escape::JsonDoubleQuote(EscapePath(raw_bytes), true, &result);
+  }
   if (is_dir) {
     result.append(",1,");
   } else {
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 40df770..4320e1c 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -14,6 +14,7 @@
 #include <string>
 
 #include "base/basictypes.h"
+#include "base/string16.h"
 #include "net/base/escape.h"
 
 struct addrinfo;
@@ -147,12 +148,24 @@ std::string CanonicalizeHost(const std::string& host,
 std::string CanonicalizeHost(const std::wstring& host,
                              url_canon::CanonHostInfo* host_info);
 
-// Call these functions to get the html for a directory listing.
-// They will pass non-7bit-ascii characters unescaped, allowing
-// the browser to interpret the encoding (utf8, etc).
-std::string GetDirectoryListingHeader(const std::string& title);
-std::string GetDirectoryListingEntry(const std::string& name, bool is_dir,
-                                     int64 size, const base::Time& modified);
+// Call these functions to get the html snippet for a directory listing.
+// The return values of both functions are in UTF-8.
+std::string GetDirectoryListingHeader(const string16& title);
+
+// Given the name of a file in a directory (ftp or local) and
+// other information (is_dir, size, modification time), it returns
+// the html snippet to add the entry for the file to the directory listing.
+// Currently, it's a script tag containing a call to a Javascript function
+// |addRow|.
+//
+// Its 1st parameter is derived from |name| and is the Javascript-string
+// escaped form of |name| (i.e. \uXXXX). The 2nd parameter is the url-escaped
+// |raw_bytes| if it's not empty. If empty, the 2nd parameter is the
+// url-escaped |name| in UTF-8.
+std::string GetDirectoryListingEntry(const string16& name,
+                                     const std::string& raw_bytes,
+                                     bool is_dir, int64 size,
+                                     base::Time modified);
 
 // If text starts with "www." it is removed, otherwise text is returned
 // unmodified.
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
index 78f7ab9..f346e92 100644
--- a/net/base/net_util_unittest.cc
+++ b/net/base/net_util_unittest.cc
@@ -407,18 +407,32 @@ TEST(NetUtilTest, FileURLConversion) {
      "file://some%20computer/foo/bar.txt"}, // UNC
     {L"D:\\Name;with%some symbols*#",
      "file:///D:/Name%3Bwith%25some%20symbols*%23"},
+    // issue 14153: To be tested with the OS default codepage other than 1252.
+    {L"D:\\latin1\\caf\x00E9\x00DD.txt",
+     "file:///D:/latin1/caf%C3%A9%C3%9D.txt"},
+    {L"D:\\otherlatin\\caf\x0119.txt",
+     "file:///D:/otherlatin/caf%C4%99.txt"},
+    {L"D:\\greek\\\x03B1\x03B2\x03B3.txt",
+     "file:///D:/greek/%CE%B1%CE%B2%CE%B3.txt"},
     {L"D:\\Chinese\\\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc",
      "file:///D:/Chinese/%E6%89%80%E6%9C%89%E4%B8%AD%E6%96%87%E7%BD%91"
          "%E9%A1%B5.doc"},
+    {L"D:\\plane1\\\xD835\xDC00\xD835\xDC01.txt",  // Math alphabet "AB"
+     "file:///D:/plane1/%F0%9D%90%80%F0%9D%90%81.txt"},
 #elif defined(OS_POSIX)
     {L"/foo/bar.txt", "file:///foo/bar.txt"},
     {L"/foo/BAR.txt", "file:///foo/BAR.txt"},
     {L"/C:/foo/bar.txt", "file:///C:/foo/bar.txt"},
     {L"/some computer/foo/bar.txt", "file:///some%20computer/foo/bar.txt"},
     {L"/Name;with%some symbols*#", "file:///Name%3Bwith%25some%20symbols*%23"},
+    {L"/latin1/caf\x00E9\x00DD.txt", "file:///latin1/caf%C3%A9%C3%9D.txt"},
+    {L"/otherlatin/caf\x0119.txt", "file:///otherlatin/caf%C4%99.txt"},
+    {L"/greek/\x03B1\x03B2\x03B3.txt", "file:///greek/%CE%B1%CE%B2%CE%B3.txt"},
     {L"/Chinese/\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc",
      "file:///Chinese/%E6%89%80%E6%9C%89%E4%B8%AD%E6%96%87%E7%BD"
          "%91%E9%A1%B5.doc"},
+    {L"/plane1/\x1D400\x1D401.txt",  // Math alphabet "AB"
+     "file:///plane1/%F0%9D%90%80%F0%9D%90%81.txt"},
 #endif
   };
 
@@ -474,21 +488,6 @@ TEST(NetUtilTest, FileURLConversion) {
     EXPECT_EQ(url_cases[i].file, output.ToWStringHack());
   }
 
-  // Here, we test that UTF-8 encoded strings get decoded properly, even when
-  // they might be stored with wide characters.  On posix systems, just treat
-  // this as a stream of bytes.
-  const wchar_t utf8[] = L"file:///d:/Chinese/\xe6\x89\x80\xe6\x9c\x89\xe4\xb8"
-                         L"\xad\xe6\x96\x87\xe7\xbd\x91\xe9\xa1\xb5.doc";
-#if defined(OS_WIN)
-  const wchar_t wide[] =
-      L"D:\\Chinese\\\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc";
-#elif defined(OS_POSIX)
-  const wchar_t wide[] = L"/d:/Chinese/\xe6\x89\x80\xe6\x9c\x89\xe4\xb8\xad\xe6"
-                         L"\x96\x87\xe7\xbd\x91\xe9\xa1\xb5.doc";
-#endif
-  EXPECT_TRUE(net::FileURLToFilePath(GURL(WideToUTF8(utf8)), &output));
-  EXPECT_EQ(wide, output.ToWStringHack());
-
   // Unfortunately, UTF8ToWide discards invalid UTF8 input.
 #ifdef BUG_878908_IS_FIXED
   // Test that no conversion happens if the UTF-8 input is invalid, and that
@@ -862,7 +861,8 @@ TEST(NetUtilTest, GetSuggestedFilename) {
 namespace {
 
 struct GetDirectoryListingEntryCase {
-  const char* name;
+  const wchar_t* name;
+  const char* raw_bytes;
   bool is_dir;
   int64 filesize;
   base::Time time;
@@ -872,22 +872,50 @@ struct GetDirectoryListingEntryCase {
 }  // namespace
 TEST(NetUtilTest, GetDirectoryListingEntry) {
   const GetDirectoryListingEntryCase test_cases[] = {
-    {"Foo",
+    {L"Foo",
+     "",
      false,
      10000,
      base::Time(),
      "<script>addRow(\"Foo\",\"Foo\",0,\"9.8 kB\",\"\");</script>\n"},
-    {"quo\"tes",
+    {L"quo\"tes",
+     "",
+     false,
+     10000,
+     base::Time(),
+     "<script>addRow(\"quo\\\"tes\",\"quo%22tes\",0,\"9.8 kB\",\"\");</script>"
+         "\n"},
+    {L"quo\"tes",
+     "quo\"tes",
      false,
      10000,
      base::Time(),
      "<script>addRow(\"quo\\\"tes\",\"quo%22tes\",0,\"9.8 kB\",\"\");</script>"
          "\n"},
+    // U+D55C U+AE00. raw_bytes is empty (either a local file with
+    // UTF-8/UTF-16 encoding or a remote file on an ftp server using UTF-8).
+    {L"\xD55C\xAE00.txt",
+     "",
+     false,
+     10000,
+     base::Time(),
+     "<script>addRow(\"\\uD55C\\uAE00.txt\",\"%ED%95%9C%EA%B8%80.txt\""
+         ",0,\"9.8 kB\",\"\");</script>\n"},
+    // U+D55C U+AE00. raw_bytes is the corresponding EUC-KR sequence:
+    // a local or remote file in EUC-KR.
+    {L"\xD55C\xAE00.txt",
+     "\xC7\xD1\xB1\xDB.txt",
+     false,
+     10000,
+     base::Time(),
+     "<script>addRow(\"\\uD55C\\uAE00.txt\",\"%C7%D1%B1%DB.txt\""
+         ",0,\"9.8 kB\",\"\");</script>\n"},
   };
 
   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(test_cases); ++i) {
     const std::string results = net::GetDirectoryListingEntry(
-        test_cases[i].name,
+        WideToUTF16(test_cases[i].name),
+        test_cases[i].raw_bytes,
         test_cases[i].is_dir,
         test_cases[i].filesize,
         test_cases[i].time);
diff --git a/net/base/net_util_win.cc b/net/base/net_util_win.cc
index effb212..244f4ad 100644
--- a/net/base/net_util_win.cc
+++ b/net/base/net_util_win.cc
@@ -57,33 +57,13 @@ bool FileURLToFilePath(const GURL& url, FilePath* file_path) {
   }
   file_path_str.assign(UTF8ToWide(path));
 
-  // Now we have an unescaped filename, but are still not sure about its
-  // encoding. For example, each character could be part of a UTF-8 string.
-  if (file_path_str.empty() || !IsString8Bit(file_path_str)) {
-    // assume our 16-bit encoding is correct if it won't fit into an 8-bit
-    // string
-    return true;
-  }
-
-  // Convert our narrow string into the native wide path.
-  std::string narrow;
-  if (!WideToLatin1(file_path_str, &narrow)) {
-    NOTREACHED() << "Should have filtered out non-8-bit strings above.";
-    return false;
-  }
-  if (IsStringUTF8(narrow)) {
-    // Our string actually looks like it could be UTF-8, convert to 8-bit
-    // UTF-8 and then to the corresponding wide string.
-    file_path_str = UTF8ToWide(narrow);
-  } else {
-    // Our wide string contains only 8-bit characters and it's not UTF-8, so
-    // we assume it's in the native codepage.
-    file_path_str = base::SysNativeMBToWide(narrow);
-  }
-
-  // Fail if 8-bit -> wide conversion failed and gave us an empty string back
-  // (we already filtered out empty strings above).
-  return !file_path_str.empty();
+  // We used to try too hard and see if |path| made up entirely of
+  // the 1st 256 characters in the Unicode was a zero-extended UTF-16.
+  // If so, we converted it to 'Latin-1' and checked if the result was UTF-8.
+  // If the check passed, we converted the result from UTF-8 to UTF-16.
+  // Otherwise, we treated the result as the native OS encoding.
+  // However, that led to http://crbug.com/4619 and http://crbug.com/14153
+  return true;
 }
 
 }  // namespace net
diff --git a/net/url_request/url_request_file_dir_job.cc b/net/url_request/url_request_file_dir_job.cc
index c242ef9..ecdf014 100644
--- a/net/url_request/url_request_file_dir_job.cc
+++ b/net/url_request/url_request_file_dir_job.cc
@@ -7,6 +7,7 @@
 #include "base/file_util.h"
 #include "base/message_loop.h"
 #include "base/string_util.h"
+#include "base/sys_string_conversions.h"
 #include "base/time.h"
 #include "googleurl/src/gurl.h"
 #include "net/base/io_buffer.h"
@@ -104,9 +105,15 @@ void URLRequestFileDirJob::OnListFile(
   // can catch errors from DirectoryLister and show an error page.
   if (!wrote_header_) {
 #if defined(OS_WIN)
-    const std::string& title = WideToUTF8(dir_path_.value());
+    const string16& title = dir_path_.value();
 #elif defined(OS_POSIX)
-    const std::string& title = dir_path_.value();
+    // TODO(jungshik): Add SysNativeMBToUTF16 to sys_string_conversions.
+    // On Mac, need to add NFKC->NFC conversion either here or in file_path.
+    // On Linux, the file system encoding is not defined, but we assume that
+    // SysNativeMBToWide takes care of it at least for now. We can try something
+    // more sophisticated if necessary later.
+    const string16& title = WideToUTF16(
+        base::SysNativeMBToWide(dir_path_.value()));
 #endif
     data_.append(net::GetDirectoryListingHeader(title));
     wrote_header_ = true;
@@ -119,14 +126,16 @@ void URLRequestFileDirJob::OnListFile(
       data.nFileSizeLow;
 
   data_.append(net::GetDirectoryListingEntry(
-      WideToUTF8(data.cFileName),
+      data.cFileName, std::string(),
       (data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ? true : false,
       size,
       base::Time::FromFileTime(local_time)));
 
 #elif defined(OS_POSIX)
+  // TODO(jungshik): The same issue as for the directory name.
   data_.append(net::GetDirectoryListingEntry(
-      data.filename.c_str(),
+      WideToUTF16(base::SysNativeMBToWide(data.filename)),
+      data.filename,
       S_ISDIR(data.stat.st_mode),
       data.stat.st_size,
       base::Time::FromTimeT(data.stat.st_mtime)));
diff --git a/net/url_request/url_request_ftp_job.cc b/net/url_request/url_request_ftp_job.cc
index bdfb0b3..c7cb333 100644
--- a/net/url_request/url_request_ftp_job.cc
+++ b/net/url_request/url_request_ftp_job.cc
@@ -9,6 +9,7 @@
 
 #include "base/message_loop.h"
 #include "base/string_util.h"
+#include "base/sys_string_conversions.h"
 #include "base/time.h"
 #include "net/base/auth.h"
 #include "net/base/escape.h"
@@ -388,11 +389,21 @@ void URLRequestFtpJob::OnFindFile(DWORD last_error) {
         (static_cast<unsigned __int64>(find_data_.nFileSizeHigh) << 32) |
         find_data_.nFileSizeLow;
 
-    // We don't know the encoding, and can't assume utf8, so pass the 8bit
-    // directly to the browser for it to decide.
+    // We don't know the encoding used on an FTP server, but we
+    // use FtpFindFirstFileA, which I guess does NOT preserve
+    // the raw byte sequence because it's implemented in terms
+    // of FtpFindFirstFileW. Without the raw byte sequence, we
+    // can't apply the encoding detection or other heuristics
+    // to determine/guess the encoding. Neither can we use UTF-8
+    // used by a RFC-2640-compliant FTP server. In some cases (e.g.
+    // when the default code page is an SBCS with almost all bytes
+    // assigned or, with some luck, even a DBCS), it's possible
+    // to recover the raw byte sequence. We can do
+    // some more here, but it's not worth the effort because we're
+    // going to replace this class with URLRequestNewFtpJob.
     string file_entry = net::GetDirectoryListingEntry(
-        find_data_.cFileName, false, size,
-        base::Time::FromFileTime(find_data_.ftLastWriteTime));
+        base::SysNativeMBToWide(find_data_.cFileName), std::string(),
+        false, size, base::Time::FromFileTime(find_data_.ftLastWriteTime));
     WriteData(&file_entry, true);
 
     FindNextFile();
@@ -407,14 +418,20 @@ void URLRequestFtpJob::OnStartDirectoryTraversal() {
   state_ = GETTING_DIRECTORY;
 
   // Unescape the URL path and pass the raw 8bit directly to the browser.
+  //
+  // Here we can try to detect the encoding although it may not be very
+  // reliable because it's not likely to be long enough. Because this class
+  // will be replaced by URLRequestNewFtpJob and is used only on Windows,
+  // we use SysNativeMBToWide as a stopgap measure.
   string html = net::GetDirectoryListingHeader(
-      UnescapeURLComponent(request_->url().path(),
-          UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
+      base::SysNativeMBToWide(UnescapeURLComponent(request_->url().path(),
+          UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS)));
 
   // If this isn't top level directory (i.e. the path isn't "/",) add a link to
   // the parent directory.
   if (request_->url().path().length() > 1)
-    html.append(net::GetDirectoryListingEntry("..", false, 0, base::Time()));
+    html.append(net::GetDirectoryListingEntry(L"..", std::string(),
+          false, 0, base::Time()));
 
   WriteData(&html, true);
 
diff --git a/net/url_request/url_request_new_ftp_job.cc b/net/url_request/url_request_new_ftp_job.cc
index d3a0c3e..d9f1d27 100644
--- a/net/url_request/url_request_new_ftp_job.cc
+++ b/net/url_request/url_request_new_ftp_job.cc
@@ -7,6 +7,7 @@
 #include "base/compiler_specific.h"
 #include "base/file_version_info.h"
 #include "base/message_loop.h"
+#include "base/sys_string_conversions.h"
 #include "net/base/escape.h"
 #include "net/base/net_errors.h"
 #include "net/base/net_util.h"
@@ -16,6 +17,46 @@
 #include "net/url_request/url_request.h"
 #include "net/url_request/url_request_context.h"
 #include "net/url_request/url_request_error_job.h"
+#include "unicode/ucsdet.h"
+
+namespace {
+
+// A very simple-minded character encoding detection. Returns the detected
+// encoding name, or an empty string for all-ASCII input or on failure.
+// TODO(jungshik): Apply more heuristics (e.g. hints like TLD, the UI
+// language/default encoding of a client, etc). If so, this should move
+// somewhere in base because there can be other use cases.
+std::string DetectEncoding(const char* input, size_t len) {
+  if (IsStringASCII(std::string(input, len)))
+    return std::string();
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector* detector = ucsdet_open(&status);
+  ucsdet_setText(detector, input, static_cast<int32_t>(len), &status);
+  const UCharsetMatch* match = ucsdet_detect(detector, &status);
+  const char* encoding = ucsdet_getName(match, &status);
+  // Should we check the quality of the match? A rather arbitrary number is
+  // assigned by ICU and it's hard to come up with a lower limit.
+  std::string name((U_FAILURE(status) || !encoding) ? "" : encoding);
+  ucsdet_close(detector);  // Also frees |match|; the name is copied above.
+  return name;
+}
+
+string16 RawByteSequenceToFilename(const char* raw_filename,
+                                   const std::string& encoding) {
+  if (encoding.empty())  // No encoding to decode with: convert as ASCII.
+    return ASCIIToUTF16(raw_filename);
+  // Decode with |encoding| first. Only if that fails is the native codepage
+  // used, which does not make much sense but is all we can resort to.
+  string16 decoded;
+  const bool converted =
+      CodepageToUTF16(raw_filename, encoding.c_str(),
+                      OnStringUtilConversionError::SUBSTITUTE, &decoded);
+  if (!converted)
+    decoded = WideToUTF16Hack(base::SysNativeMBToWide(raw_filename));
+  return decoded;
+}
+
+}  // namespace
 
 URLRequestNewFtpJob::URLRequestNewFtpJob(URLRequest* request)
     : URLRequestJob(request),
@@ -69,17 +110,36 @@ bool URLRequestNewFtpJob::ReadRawData(net::IOBuffer* buf,
   if (response_info_ == NULL) {
      response_info_ = transaction_->GetResponseInfo();
     if (response_info_->is_directory_listing) {
-      // Unescape the URL path and pass the raw 8bit directly to the browser.
-      directory_html_ = net::GetDirectoryListingHeader(
+      std::string escaped_path =
           UnescapeURLComponent(request_->url().path(),
-          UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
+          UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
+      string16 path_utf16;
+      // Per RFC 2640, FTP servers should use UTF-8 or its proper subset ASCII,
+      // but many old FTP servers use legacy encodings. Try UTF-8 first and,
+      // if the path is not valid UTF-8, detect the encoding.
+      if (IsStringUTF8(escaped_path)) {
+        path_utf16 = UTF8ToUTF16(escaped_path);
+      } else {
+        std::string encoding = DetectEncoding(escaped_path.c_str(),
+                                              escaped_path.size());
+        // Try the detected encoding. If it fails, resort to the
+        // OS native encoding.
+        if (encoding.empty() ||
+            !CodepageToUTF16(escaped_path, encoding.c_str(),
+                             OnStringUtilConversionError::SUBSTITUTE,
+                             &path_utf16))
+          path_utf16 = WideToUTF16Hack(base::SysNativeMBToWide(escaped_path));
+      }
+
+      directory_html_ = net::GetDirectoryListingHeader(path_utf16);
       // If this isn't top level directory (i.e. the path isn't "/",)
       // add a link to the parent directory.
       if (request_->url().path().length() > 1)
-        directory_html_.append(net::GetDirectoryListingEntry("..",
-                                                             false,
-                                                             0,
-                                                             base::Time()));
+        directory_html_.append(
+            net::GetDirectoryListingEntry(ASCIIToUTF16(".."),
+                                          std::string(),
+                                          false, 0,
+                                          base::Time()));
     }
   }
   if (!directory_html_.empty()) {
@@ -121,6 +181,20 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf,
   std::string file_entry;
   std::string line;
   buf->data()[bytes_read] = 0;
+
+  // If all we've seen so far is ASCII, encoding_ is empty. Try to detect the
+  // encoding. We don't do the separate UTF-8 check here because the encoding
+  // detection with a longer chunk (as opposed to the relatively short path
+  // component of the url) is unlikely to mistake UTF-8 for a legacy encoding.
+  // If it turns out to be wrong, a separate UTF-8 check has to be added.
+  //
+  // TODO(jungshik): UTF-8 has to be 'enforced' without any heuristics when
+  // we're talking to an FTP server compliant to RFC 2640 (that is, its response
+  // to FEAT command includes 'UTF8').
+  // See http://wiki.filezilla-project.org/Character_Set
+  if (encoding_.empty())
+    encoding_ = DetectEncoding(buf->data(), bytes_read);
+
   int64 file_size;
   std::istringstream iss(buf->data());
   while (getline(iss, line)) {
@@ -144,6 +218,7 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf,
         et.day_of_week = result.fe_time.tm_wday;
 
         file_entry.append(net::GetDirectoryListingEntry(
+            RawByteSequenceToFilename(result.fe_fname, encoding_),
             result.fe_fname, true, 0, base::Time::FromLocalExploded(et)));
         break;
       case net::FTP_TYPE_FILE:
@@ -163,6 +238,7 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf,
         // It returns wrong date/time (Differnce is 1 day and 17 Hours).
         if (StringToInt64(result.fe_size, &file_size))
           file_entry.append(net::GetDirectoryListingEntry(
+              RawByteSequenceToFilename(result.fe_fname, encoding_),
               result.fe_fname, false, file_size,
               base::Time::FromLocalExploded(et)));
         break;
diff --git a/net/url_request/url_request_new_ftp_job.h b/net/url_request/url_request_new_ftp_job.h
index a74a265..69c1fef 100644
--- a/net/url_request/url_request_new_ftp_job.h
+++ b/net/url_request/url_request_new_ftp_job.h
@@ -59,6 +59,7 @@ class URLRequestNewFtpJob : public URLRequestJob {
 
   std::string directory_html_;
   bool read_in_progress_;
+  std::string encoding_;
 
   // Keep a reference to the url request context to be sure it's not deleted
   // before us.