summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--base/string_util.h21
-rw-r--r--base/string_util_icu.cc285
-rw-r--r--base/string_util_unittest.cc313
-rw-r--r--net/base/net_util.cc14
-rw-r--r--net/base/net_util.h25
-rw-r--r--net/base/net_util_unittest.cc66
-rw-r--r--net/base/net_util_win.cc34
-rw-r--r--net/url_request/url_request_file_dir_job.cc17
-rw-r--r--net/url_request/url_request_ftp_job.cc31
-rw-r--r--net/url_request/url_request_new_ftp_job.cc90
-rw-r--r--net/url_request/url_request_new_ftp_job.h1
11 files changed, 645 insertions, 252 deletions
diff --git a/base/string_util.h b/base/string_util.h
index 9a033b4..c7f3115 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -221,7 +221,8 @@ std::string UTF16ToUTF8(const string16& utf16);
# define UTF16ToWideHack UTF16ToWide
#endif
-// Defines the error handling modes of WideToCodepage and CodepageToWide.
+// Defines the error handling modes of UTF16ToCodepage, CodepageToUTF16,
+// WideToCodepage and CodepageToWide.
class OnStringUtilConversionError {
public:
enum Type {
@@ -231,12 +232,30 @@ class OnStringUtilConversionError {
// The offending characters are skipped and the conversion will proceed as
// if they did not exist.
SKIP,
+
+ // When converting to Unicode, the offending byte sequences are substituted
+ // by Unicode replacement character (U+FFFD). When converting from Unicode,
+ // this is the same as SKIP.
+ SUBSTITUTE,
};
private:
OnStringUtilConversionError();
};
+// Converts between UTF-16 strings and the encoding specified. If the
+// encoding doesn't exist or the encoding fails (when on_error is FAIL),
+// returns false.
+bool UTF16ToCodepage(const string16& utf16,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::string* encoded);
+
+bool CodepageToUTF16(const std::string& encoded,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ string16* utf16);
+
// Converts between wide strings and the encoding specified. If the
// encoding doesn't exist or the encoding fails (when on_error is FAIL),
// returns false.
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 87731de..3bd6f9b 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -10,8 +10,10 @@
#include "base/basictypes.h"
#include "base/logging.h"
#include "base/singleton.h"
-#include "unicode/ucnv.h"
#include "unicode/numfmt.h"
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/ucnv_err.h"
#include "unicode/ustring.h"
namespace {
@@ -24,6 +26,64 @@ inline bool IsValidCodepoint(uint32 code_point) {
(code_point >= 0xE000u && code_point <= 0x10FFFFu);
}
+// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
+// in source/common/ucnv_err.c.
+
+// Copyright (c) 1995-2006 International Business Machines Corporation
+// and others
+//
+// All rights reserved.
+//
+
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, and/or
+// sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, provided that the above copyright notice(s) and
+// this permission notice appear in all copies of the Software and that
+// both the above copyright notice(s) and this permission notice appear in
+// supporting documentation.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+// OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Except as contained in this notice, the name of a copyright holder
+// shall not be used in advertising or otherwise to promote the sale, use
+// or other dealings in this Software without prior written authorization
+// of the copyright holder.
+
+// ___________________________________________________________________________
+//
+// All trademarks and registered trademarks mentioned herein are the property
+// of their respective owners.
+
+void ToUnicodeCallbackSubstitute(const void* context,
+ UConverterToUnicodeArgs *to_args,
+ const char* code_units,
+ int32_t length,
+ UConverterCallbackReason reason,
+ UErrorCode * err) {
+ static const UChar kReplacementChar = 0xFFFD;
+ if (reason <= UCNV_IRREGULAR) {
+ if (context == NULL ||
+ (*(reinterpret_cast<const char*>(context)) == 'i' &&
+ reason == UCNV_UNASSIGNED)) {
+ *err = U_ZERO_ERROR;
+ ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
+ }
+ // else the caller must have set the error code accordingly.
+ }
+ // else ignore the reset, close and clone calls.
+}
+
// ReadUnicodeCharacter --------------------------------------------------------
// Reads a UTF-8 stream, placing the next code point into the given output
@@ -76,7 +136,7 @@ bool ReadUnicodeCharacter(const char16* src, int32 src_len,
#if defined(WCHAR_T_IS_UTF32)
// Reads UTF-32 character. The usage is the same as the 8-bit version above.
bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
- int32* char_index, uint32* code_point) {
+ int32* char_index, uint32* code_point) {
// Conversion is easy since the source is 32-bit.
*code_point = src[*char_index];
@@ -184,6 +244,70 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
}
}
+bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
+ int uchar_len, OnStringUtilConversionError::Type on_error,
+ std::string* encoded) {
+ int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
+ ucnv_getMaxCharSize(converter));
+ encoded->resize(encoded_max_length);
+
+ UErrorCode status = U_ZERO_ERROR;
+
+ // Setup our error handler.
+ switch (on_error) {
+ case OnStringUtilConversionError::FAIL:
+ ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
+ NULL, NULL, &status);
+ break;
+ case OnStringUtilConversionError::SKIP:
+ case OnStringUtilConversionError::SUBSTITUTE:
+ ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
+ NULL, NULL, &status);
+ break;
+ default:
+ NOTREACHED();
+ }
+
+ // ucnv_fromUChars returns size not including terminating null
+ int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
+ encoded_max_length, uchar_src, uchar_len, &status);
+ encoded->resize(actual_size);
+ ucnv_close(converter);
+ if (U_SUCCESS(status))
+ return true;
+ encoded->clear(); // Make sure the output is empty on error.
+ return false;
+}
+
+// Set up our error handler for ToUTF-16 converters
+void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error,
+ UConverter* converter, UErrorCode* status) {
+ switch (on_error) {
+ case OnStringUtilConversionError::FAIL:
+ ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
+ NULL, NULL, status);
+ break;
+ case OnStringUtilConversionError::SKIP:
+ ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
+ NULL, NULL, status);
+ break;
+ case OnStringUtilConversionError::SUBSTITUTE:
+ ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0,
+ NULL, NULL, status);
+ break;
+ default:
+ NOTREACHED();
+ }
+}
+
+inline UConverterType utf32_platform_endian() {
+#if U_IS_BIG_ENDIAN
+ return UCNV_UTF32_BigEndian;
+#else
+ return UCNV_UTF32_LittleEndian;
+#endif
+}
+
} // namespace
// UTF-8 <-> Wide --------------------------------------------------------------
@@ -364,14 +488,17 @@ std::string UTF16ToUTF8(const string16& utf16) {
#endif
-// Codepage <-> Wide -----------------------------------------------------------
+// Codepage <-> Wide/UTF-16 ---------------------------------------------------
-// Convert a unicode string into the specified codepage_name. If the codepage
+// Convert a wstring into the specified codepage_name. If the codepage
// isn't found, return false.
bool WideToCodepage(const std::wstring& wide,
const char* codepage_name,
OnStringUtilConversionError::Type on_error,
std::string* encoded) {
+#if defined(WCHAR_T_IS_UTF16)
+ return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
+#elif defined(WCHAR_T_IS_UTF32)
encoded->clear();
UErrorCode status = U_ZERO_ERROR;
@@ -379,59 +506,47 @@ bool WideToCodepage(const std::wstring& wide,
if (!U_SUCCESS(status))
return false;
- const UChar* uchar_src;
- int uchar_len;
-#if defined(WCHAR_T_IS_UTF16)
- uchar_src = wide.c_str();
- uchar_len = static_cast<int>(wide.length());
-#elif defined(WCHAR_T_IS_UTF32)
+ int utf16_len;
// When wchar_t is wider than UChar (16 bits), transform |wide| into a
// UChar* string. Size the UChar* buffer to be large enough to hold twice
- // as many UTF-16 code points as there are UTF-16 characters, in case each
- // character translates to a UTF-16 surrogate pair, and leave room for a NUL
- // terminator.
- std::vector<UChar> wide_uchar(wide.length() * 2 + 1);
- u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len,
+ // as many UTF-16 code units (UChar's) as there are Unicode code points,
+ // in case each code point translates to a UTF-16 surrogate pair,
+ // and leave room for a NUL terminator.
+ std::vector<UChar> utf16(wide.length() * 2 + 1);
+ u_strFromWCS(&utf16[0], utf16.size(), &utf16_len,
wide.c_str(), wide.length(), &status);
- uchar_src = &wide_uchar[0];
DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
+
+ return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);
#endif // defined(WCHAR_T_IS_UTF32)
+}
- int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
- ucnv_getMaxCharSize(converter));
- encoded->resize(encoded_max_length);
+// Convert a UTF-16 string into the specified codepage_name. If the codepage
+// isn't found, return false.
+bool UTF16ToCodepage(const string16& utf16,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::string* encoded) {
+ encoded->clear();
- // Setup our error handler.
- switch (on_error) {
- case OnStringUtilConversionError::FAIL:
- ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
- NULL, NULL, &status);
- break;
- case OnStringUtilConversionError::SKIP:
- ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
- NULL, NULL, &status);
- break;
- default:
- NOTREACHED();
- }
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter* converter = ucnv_open(codepage_name, &status);
+ if (!U_SUCCESS(status))
+ return false;
- // ucnv_fromUChars returns size not including terminating null
- int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
- encoded_max_length, uchar_src, uchar_len, &status);
- encoded->resize(actual_size);
- ucnv_close(converter);
- if (U_SUCCESS(status))
- return true;
- encoded->clear(); // Make sure the output is empty on error.
- return false;
+ return ConvertFromUTF16(converter, utf16.c_str(),
+ static_cast<int>(utf16.length()), on_error, encoded);
}
-// Converts a string of the given codepage into unicode.
+// Converts a string of the given codepage into wstring.
// If the codepage isn't found, return false.
bool CodepageToWide(const std::string& encoded,
const char* codepage_name,
OnStringUtilConversionError::Type on_error,
std::wstring* wide) {
+#if defined(WCHAR_T_IS_UTF16)
+ return CodepageToUTF16(encoded, codepage_name, on_error, wide);
+#elif defined(WCHAR_T_IS_UTF32)
wide->clear();
UErrorCode status = U_ZERO_ERROR;
@@ -439,6 +554,51 @@ bool CodepageToWide(const std::string& encoded,
if (!U_SUCCESS(status))
return false;
+ // The maximum length in 4 byte unit of UTF-32 output would be
+ // at most the same as the number of bytes in input. In the worst
+ // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),
+ // this can be 4 times larger than actually needed.
+ size_t wchar_max_length = encoded.length() + 1;
+
+ // The byte buffer and its length to pass to ucnv_toAlgorithmic.
+ char* byte_buffer = reinterpret_cast<char*>(
+ WriteInto(wide, wchar_max_length));
+ int byte_buffer_length = static_cast<int>(wchar_max_length) * 4;
+
+ SetUpErrorHandlerForToUChars(on_error, converter, &status);
+ int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(),
+ converter,
+ byte_buffer,
+ byte_buffer_length,
+ encoded.data(),
+ static_cast<int>(encoded.length()),
+ &status);
+ ucnv_close(converter);
+
+ if (!U_SUCCESS(status)) {
+ wide->clear(); // Make sure the output is empty on error.
+ return false;
+ }
+
+ // actual_size is # of bytes.
+ wide->resize(actual_size / 4);
+ return true;
+#endif // defined(WCHAR_T_IS_UTF32)
+}
+
+// Converts a string of the given codepage into UTF-16.
+// If the codepage isn't found, return false.
+bool CodepageToUTF16(const std::string& encoded,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ string16* utf16) {
+ utf16->clear();
+
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter* converter = ucnv_open(codepage_name, &status);
+ if (!U_SUCCESS(status))
+ return false;
+
// Even in the worst case, the maximum length in 2-byte units of UTF-16
// output would be at most the same as the number of bytes in input. There
// is no single-byte encoding in which a character is mapped to a
@@ -449,53 +609,20 @@ bool CodepageToWide(const std::string& encoded,
// BOCU and SCSU, but we don't care about them.
size_t uchar_max_length = encoded.length() + 1;
- UChar* uchar_dst;
-#if defined(WCHAR_T_IS_UTF16)
- uchar_dst = WriteInto(wide, uchar_max_length);
-#elif defined(WCHAR_T_IS_UTF32)
- // When wchar_t is wider than UChar (16 bits), convert into a temporary
- // UChar* buffer.
- std::vector<UChar> wide_uchar(uchar_max_length);
- uchar_dst = &wide_uchar[0];
-#endif // defined(WCHAR_T_IS_UTF32)
-
- // Setup our error handler.
- switch (on_error) {
- case OnStringUtilConversionError::FAIL:
- ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
- NULL, NULL, &status);
- break;
- case OnStringUtilConversionError::SKIP:
- ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
- NULL, NULL, &status);
- break;
- default:
- NOTREACHED();
- }
-
+ SetUpErrorHandlerForToUChars(on_error, converter, &status);
int actual_size = ucnv_toUChars(converter,
- uchar_dst,
+ WriteInto(utf16, uchar_max_length),
static_cast<int>(uchar_max_length),
encoded.data(),
static_cast<int>(encoded.length()),
&status);
ucnv_close(converter);
if (!U_SUCCESS(status)) {
- wide->clear(); // Make sure the output is empty on error.
+ utf16->clear(); // Make sure the output is empty on error.
return false;
}
-#ifdef WCHAR_T_IS_UTF32
- // When wchar_t is wider than UChar (16 bits), it's not possible to wind up
- // with any more wchar_t elements than UChar elements. ucnv_toUChars
- // returns the number of UChar elements not including the NUL terminator, so
- // leave extra room for that.
- u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size,
- uchar_dst, actual_size, &status);
- DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring";
-#endif // WCHAR_T_IS_UTF32
-
- wide->resize(actual_size);
+ utf16->resize(actual_size);
return true;
}
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 6f196cc..4968950 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -13,8 +13,30 @@
#include "testing/gtest/include/gtest/gtest.h"
namespace {
+
+// Given a null-terminated string of wchar_t with each wchar_t representing
+// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
+// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
+// should be represented as a surrogate pair (two UTF-16 units)
+// *even* where wchar_t is 32-bit (Linux and Mac).
+//
+// This is to help write tests for functions with string16 params until
+// the C++ 0x UTF-16 literal is well-supported by compilers.
+string16 BuildString16(const wchar_t* s) {
+#if defined(WCHAR_T_IS_UTF16)
+ return string16(s);
+#elif defined(WCHAR_T_IS_UTF32)
+ string16 u16;
+ while (*s != 0) {
+ DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu);
+ u16.push_back(*s++);
+ }
+ return u16;
+#endif
}
+} // namespace
+
static const struct trim_case {
const wchar_t* input;
const TrimPositions positions;
@@ -459,104 +481,162 @@ TEST(StringUtilTest, ConvertCodepageUTF8) {
}
}
-TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
- static const struct {
- const char* codepage_name;
- const char* encoded;
- OnStringUtilConversionError::Type on_error;
- bool success;
- const wchar_t* wide;
- } kConvertCodepageCases[] = {
- // Test a case where the input can no be decoded, using both SKIP and FAIL
- // error handling rules. "A7 41" is valid, but "A6" isn't.
- {"big5",
- "\xA7\x41\xA6",
- OnStringUtilConversionError::FAIL,
- false,
- L""},
- {"big5",
- "\xA7\x41\xA6",
- OnStringUtilConversionError::SKIP,
- true,
- L"\x4F60"},
- // Arabic (ISO-8859)
- {"iso-8859-6",
- "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
- "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
- L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652"},
- // Chinese Simplified (GB2312)
- {"gb2312",
- "\xC4\xE3\xBA\xC3",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x4F60\x597D"},
- // Chinese Traditional (BIG5)
- {"big5",
- "\xA7\x41\xA6\x6E",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x4F60\x597D"},
- // Greek (ISO-8859)
- {"iso-8859-7",
- "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5"},
- // Hebrew (Windows)
- {"windows-1255", /* to be replaced with "iso-8859-8-I"? */
- "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD"},
- // Hindi Devanagari (ISCII)
- {"iscii-dev",
- "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930"},
- // Korean (EUC)
- {"euc-kr",
- "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
- OnStringUtilConversionError::FAIL,
- true,
- L"\xC548\xB155\xD558\xC138\xC694"},
- // Japanese (EUC)
- {"euc-jp",
- "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x3053\x3093\x306B\x3061\x306F"},
- // Japanese (ISO-2022)
- {"iso-2022-jp",
- "\x1B\x24\x42" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F" "\x1B\x28\x42",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x3053\x3093\x306B\x3061\x306F"},
- // Japanese (Shift-JIS)
- {"sjis",
- "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x3053\x3093\x306B\x3061\x306F"},
- // Russian (KOI8)
- {"koi8-r",
- "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
- L"\x0443\x0439\x0442\x0435"},
- // Thai (ISO-8859)
- {"windows-874", /* to be replaced with "iso-8859-11". */
- "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
- L"\x0E04\x0E23\x0e31\x0E1A"},
- };
+// kConvertCodepageCases is not comprehensive. There are a number of cases
+// to add if we really want to have a comprehensive coverage of various
+// codepages and their 'idiosyncrasies'. Currently, the only implementation
+// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive
+// set of tests for the charset conversion. So, we can get away with a
+// relatively small number of cases listed below.
+//
+// Note about |u16_wide| in the following struct.
+// On Windows, the field is always identical to |wide|. On Mac and Linux,
+// it's identical as long as there's no character outside the
+// BMP (<= U+FFFF). When there is, it is different from |wide| and
+// is not a real wide string (UTF-32 string) in that each wchar_t in
+// the string is a UTF-16 code unit zero-extended to be 32-bit
+// even when the code unit belongs to a surrogate pair.
+// For instance, a Unicode string (U+0041 U+010000) is represented as
+// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000".
+// To avoid the clutter, |u16_wide| will be set to NULL
+// if it's identical to |wide| on *all* platforms.
+
+static const struct {
+ const char* codepage_name;
+ const char* encoded;
+ OnStringUtilConversionError::Type on_error;
+ bool success;
+ const wchar_t* wide;
+ const wchar_t* u16_wide;
+} kConvertCodepageCases[] = {
+ // Test a case where the input cannot be decoded, using SKIP, FAIL
+ // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't.
+ {"big5",
+ "\xA7\x41\xA6",
+ OnStringUtilConversionError::FAIL,
+ false,
+ L"",
+ NULL},
+ {"big5",
+ "\xA7\x41\xA6",
+ OnStringUtilConversionError::SKIP,
+ true,
+ L"\x4F60",
+ NULL},
+ {"big5",
+ "\xA7\x41\xA6",
+ OnStringUtilConversionError::SUBSTITUTE,
+ true,
+ L"\x4F60\xFFFD",
+ NULL},
+ // Arabic (ISO-8859)
+ {"iso-8859-6",
+ "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
+ "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
+ L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652",
+ NULL},
+ // Chinese Simplified (GB2312)
+ {"gb2312",
+ "\xC4\xE3\xBA\xC3",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x4F60\x597D",
+ NULL},
+ // Chinese (GB18030) : 4 byte sequences mapped to BMP characters
+ {"gb18030",
+ "\x81\x30\x84\x36\xA1\xA7",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x00A5\x00A8",
+ NULL},
+ // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000)
+ {"gb18030",
+ "\x95\x32\x82\x36\xD2\xBB",
+ OnStringUtilConversionError::FAIL,
+ true,
+#if defined(WCHAR_T_IS_UTF16)
+ L"\xD840\xDC00\x4E00",
+#else
+ L"\x20000\x4E00",
+#endif
+ L"\xD840\xDC00\x4E00"},
+ {"big5",
+ "\xA7\x41\xA6\x6E",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x4F60\x597D",
+ NULL},
+ // Greek (ISO-8859)
+ {"iso-8859-7",
+ "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5",
+ NULL},
+ // Hebrew (Windows)
+ {"windows-1255",
+ "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD",
+ NULL},
+ // Hindi Devanagari (ISCII)
+ {"iscii-dev",
+ "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930",
+ NULL},
+ // Korean (EUC)
+ {"euc-kr",
+ "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\xC548\xB155\xD558\xC138\xC694",
+ NULL},
+ // Japanese (EUC)
+ {"euc-jp",
+ "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8F\xB0\xA1\x8E\xA6",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x3053\x3093\x306B\x3061\x306F\x4E00\x4E02\xFF66",
+ NULL},
+ // Japanese (ISO-2022)
+ {"iso-2022-jp",
+ "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B"
+ "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$",
+ NULL},
+ // Japanese (Shift-JIS)
+ {"sjis",
+ "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66",
+ NULL},
+ // Russian (KOI8)
+ {"koi8-r",
+ "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+ L"\x0443\x0439\x0442\x0435",
+ NULL},
+ // Thai (windows-874)
+ {"windows-874",
+ "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
+ L"\x0E04\x0E23\x0e31\x0E1A",
+ NULL},
+};
+TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) {
std::wstring wide;
bool success = CodepageToWide(kConvertCodepageCases[i].encoded,
@@ -567,7 +647,9 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
EXPECT_EQ(kConvertCodepageCases[i].wide, wide);
// When decoding was successful and nothing was skipped, we also check the
- // reverse conversion.
+ // reverse conversion. Not all conversions are round-trippable, but
+ // kConvertCodepageCases does not have any one-way conversion at the
+ // moment.
if (success &&
kConvertCodepageCases[i].on_error ==
OnStringUtilConversionError::FAIL) {
@@ -590,6 +672,11 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1",
OnStringUtilConversionError::SKIP, &encoded));
EXPECT_STREQ("Chinese", encoded.c_str());
+ // From Unicode, SUBSTITUTE is the same as SKIP for now.
+ EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1",
+ OnStringUtilConversionError::SUBSTITUTE,
+ &encoded));
+ EXPECT_STREQ("Chinese", encoded.c_str());
#if defined(WCHAR_T_IS_UTF16)
// When we're in UTF-16 mode, test an invalid UTF-16 character in the input.
@@ -611,6 +698,36 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
OnStringUtilConversionError::SKIP, &encoded));
}
+TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) {
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) {
+ string16 utf16;
+ bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded,
+ kConvertCodepageCases[i].codepage_name,
+ kConvertCodepageCases[i].on_error,
+ &utf16);
+ string16 utf16_expected;
+ if (kConvertCodepageCases[i].u16_wide == NULL)
+ utf16_expected = BuildString16(kConvertCodepageCases[i].wide);
+ else
+ utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide);
+ EXPECT_EQ(kConvertCodepageCases[i].success, success);
+ EXPECT_EQ(utf16_expected, utf16);
+
+ // When decoding was successful and nothing was skipped, we also check the
+ // reverse conversion. See also the corresponding comment in
+ // ConvertBetweenCodepageAndWide.
+ if (success &&
+ kConvertCodepageCases[i].on_error ==
+ OnStringUtilConversionError::FAIL) {
+ std::string encoded;
+ success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name,
+ kConvertCodepageCases[i].on_error, &encoded);
+ EXPECT_EQ(kConvertCodepageCases[i].success, success);
+ EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded);
+ }
+ }
+}
+
TEST(StringUtilTest, ConvertASCII) {
static const char* char_cases[] = {
"Google Video",
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 2e6292c..00beb4e 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -860,7 +860,7 @@ std::string CanonicalizeHost(const std::wstring& host,
return CanonicalizeHost(converted_host, host_info);
}
-std::string GetDirectoryListingHeader(const std::string& title) {
+std::string GetDirectoryListingHeader(const string16& title) {
static const StringPiece header(NetModule::GetResource(IDR_DIR_HEADER_HTML));
if (header.empty()) {
NOTREACHED() << "expected resource not found";
@@ -874,15 +874,21 @@ std::string GetDirectoryListingHeader(const std::string& title) {
return result;
}
-std::string GetDirectoryListingEntry(const std::string& name,
+std::string GetDirectoryListingEntry(const string16& name,
+ const std::string& raw_bytes,
bool is_dir,
int64 size,
- const Time& modified) {
+ Time modified) {
std::string result;
result.append("<script>addRow(");
string_escape::JsonDoubleQuote(name, true, &result);
result.append(",");
- string_escape::JsonDoubleQuote(EscapePath(name), true, &result);
+ if (raw_bytes.empty()) {
+ string_escape::JsonDoubleQuote(EscapePath(UTF16ToUTF8(name)),
+ true, &result);
+ } else {
+ string_escape::JsonDoubleQuote(EscapePath(raw_bytes), true, &result);
+ }
if (is_dir) {
result.append(",1,");
} else {
diff --git a/net/base/net_util.h b/net/base/net_util.h
index 40df770..4320e1c 100644
--- a/net/base/net_util.h
+++ b/net/base/net_util.h
@@ -14,6 +14,7 @@
#include <string>
#include "base/basictypes.h"
+#include "base/string16.h"
#include "net/base/escape.h"
struct addrinfo;
@@ -147,12 +148,24 @@ std::string CanonicalizeHost(const std::string& host,
std::string CanonicalizeHost(const std::wstring& host,
url_canon::CanonHostInfo* host_info);
-// Call these functions to get the html for a directory listing.
-// They will pass non-7bit-ascii characters unescaped, allowing
-// the browser to interpret the encoding (utf8, etc).
-std::string GetDirectoryListingHeader(const std::string& title);
-std::string GetDirectoryListingEntry(const std::string& name, bool is_dir,
- int64 size, const base::Time& modified);
+// Call these functions to get the html snippet for a directory listing.
+// The return values of both functions are in UTF-8.
+std::string GetDirectoryListingHeader(const string16& title);
+
+// Given the name of a file in a directory (ftp or local) and
+// other information (is_dir, size, modification time), it returns
+// the html snippet to add the entry for the file to the directory listing.
+// Currently, it's a script tag containing a call to a Javascript function
+// |addRow|.
+//
+// Its 1st parameter is derived from |name| and is the Javascript-string
+// escaped form of |name| (i.e \uXXXX). The 2nd parameter is the url-escaped
+// |raw_bytes| if it's not empty. If empty, the 2nd parameter is the
+// url-escaped |name| in UTF-8.
+std::string GetDirectoryListingEntry(const string16& name,
+ const std::string& raw_bytes,
+ bool is_dir, int64 size,
+ base::Time modified);
// If text starts with "www." it is removed, otherwise text is returned
// unmodified.
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
index 78f7ab9..f346e92 100644
--- a/net/base/net_util_unittest.cc
+++ b/net/base/net_util_unittest.cc
@@ -407,18 +407,32 @@ TEST(NetUtilTest, FileURLConversion) {
"file://some%20computer/foo/bar.txt"}, // UNC
{L"D:\\Name;with%some symbols*#",
"file:///D:/Name%3Bwith%25some%20symbols*%23"},
+ // issue 14153: To be tested with the OS default codepage other than 1252.
+ {L"D:\\latin1\\caf\x00E9\x00DD.txt",
+ "file:///D:/latin1/caf%C3%A9%C3%9D.txt"},
+ {L"D:\\otherlatin\\caf\x0119.txt",
+ "file:///D:/otherlatin/caf%C4%99.txt"},
+ {L"D:\\greek\\\x03B1\x03B2\x03B3.txt",
+ "file:///D:/greek/%CE%B1%CE%B2%CE%B3.txt"},
{L"D:\\Chinese\\\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc",
"file:///D:/Chinese/%E6%89%80%E6%9C%89%E4%B8%AD%E6%96%87%E7%BD%91"
"%E9%A1%B5.doc"},
+ {L"D:\\plane1\\\xD835\xDC00\xD835\xDC01.txt", // Math alphabet "AB"
+ "file:///D:/plane1/%F0%9D%90%80%F0%9D%90%81.txt"},
#elif defined(OS_POSIX)
{L"/foo/bar.txt", "file:///foo/bar.txt"},
{L"/foo/BAR.txt", "file:///foo/BAR.txt"},
{L"/C:/foo/bar.txt", "file:///C:/foo/bar.txt"},
{L"/some computer/foo/bar.txt", "file:///some%20computer/foo/bar.txt"},
{L"/Name;with%some symbols*#", "file:///Name%3Bwith%25some%20symbols*%23"},
+ {L"/latin1/caf\x00E9\x00DD.txt", "file:///latin1/caf%C3%A9%C3%9D.txt"},
+ {L"/otherlatin/caf\x0119.txt", "file:///otherlatin/caf%C4%99.txt"},
+ {L"/greek/\x03B1\x03B2\x03B3.txt", "file:///greek/%CE%B1%CE%B2%CE%B3.txt"},
{L"/Chinese/\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc",
"file:///Chinese/%E6%89%80%E6%9C%89%E4%B8%AD%E6%96%87%E7%BD"
"%91%E9%A1%B5.doc"},
+ {L"/plane1/\x1D400\x1D401.txt", // Math alphabet "AB"
+ "file:///plane1/%F0%9D%90%80%F0%9D%90%81.txt"},
#endif
};
@@ -474,21 +488,6 @@ TEST(NetUtilTest, FileURLConversion) {
EXPECT_EQ(url_cases[i].file, output.ToWStringHack());
}
- // Here, we test that UTF-8 encoded strings get decoded properly, even when
- // they might be stored with wide characters. On posix systems, just treat
- // this as a stream of bytes.
- const wchar_t utf8[] = L"file:///d:/Chinese/\xe6\x89\x80\xe6\x9c\x89\xe4\xb8"
- L"\xad\xe6\x96\x87\xe7\xbd\x91\xe9\xa1\xb5.doc";
-#if defined(OS_WIN)
- const wchar_t wide[] =
- L"D:\\Chinese\\\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc";
-#elif defined(OS_POSIX)
- const wchar_t wide[] = L"/d:/Chinese/\xe6\x89\x80\xe6\x9c\x89\xe4\xb8\xad\xe6"
- L"\x96\x87\xe7\xbd\x91\xe9\xa1\xb5.doc";
-#endif
- EXPECT_TRUE(net::FileURLToFilePath(GURL(WideToUTF8(utf8)), &output));
- EXPECT_EQ(wide, output.ToWStringHack());
-
// Unfortunately, UTF8ToWide discards invalid UTF8 input.
#ifdef BUG_878908_IS_FIXED
// Test that no conversion happens if the UTF-8 input is invalid, and that
@@ -862,7 +861,8 @@ TEST(NetUtilTest, GetSuggestedFilename) {
namespace {
struct GetDirectoryListingEntryCase {
- const char* name;
+ const wchar_t* name;
+ const char* raw_bytes;
bool is_dir;
int64 filesize;
base::Time time;
@@ -872,22 +872,50 @@ struct GetDirectoryListingEntryCase {
} // namespace
TEST(NetUtilTest, GetDirectoryListingEntry) {
const GetDirectoryListingEntryCase test_cases[] = {
- {"Foo",
+ {L"Foo",
+ "",
false,
10000,
base::Time(),
"<script>addRow(\"Foo\",\"Foo\",0,\"9.8 kB\",\"\");</script>\n"},
- {"quo\"tes",
+ {L"quo\"tes",
+ "",
+ false,
+ 10000,
+ base::Time(),
+ "<script>addRow(\"quo\\\"tes\",\"quo%22tes\",0,\"9.8 kB\",\"\");</script>"
+ "\n"},
+ {L"quo\"tes",
+ "quo\"tes",
false,
10000,
base::Time(),
"<script>addRow(\"quo\\\"tes\",\"quo%22tes\",0,\"9.8 kB\",\"\");</script>"
"\n"},
+ // U+D55C U+AE00. raw_bytes is empty (either a local file with
+ // UTF-8/UTF-16 encoding or a remote file on an ftp server using UTF-8).
+ {L"\xD55C\xAE00.txt",
+ "",
+ false,
+ 10000,
+ base::Time(),
+ "<script>addRow(\"\\uD55C\\uAE00.txt\",\"%ED%95%9C%EA%B8%80.txt\""
+ ",0,\"9.8 kB\",\"\");</script>\n"},
+ // U+D55C U+AE00. raw_bytes is the corresponding EUC-KR sequence:
+ // a local or remote file in EUC-KR.
+ {L"\xD55C\xAE00.txt",
+ "\xC7\xD1\xB1\xDB.txt",
+ false,
+ 10000,
+ base::Time(),
+ "<script>addRow(\"\\uD55C\\uAE00.txt\",\"%C7%D1%B1%DB.txt\""
+ ",0,\"9.8 kB\",\"\");</script>\n"},
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(test_cases); ++i) {
const std::string results = net::GetDirectoryListingEntry(
- test_cases[i].name,
+ WideToUTF16(test_cases[i].name),
+ test_cases[i].raw_bytes,
test_cases[i].is_dir,
test_cases[i].filesize,
test_cases[i].time);
diff --git a/net/base/net_util_win.cc b/net/base/net_util_win.cc
index effb212..244f4ad 100644
--- a/net/base/net_util_win.cc
+++ b/net/base/net_util_win.cc
@@ -57,33 +57,13 @@ bool FileURLToFilePath(const GURL& url, FilePath* file_path) {
}
file_path_str.assign(UTF8ToWide(path));
- // Now we have an unescaped filename, but are still not sure about its
- // encoding. For example, each character could be part of a UTF-8 string.
- if (file_path_str.empty() || !IsString8Bit(file_path_str)) {
- // assume our 16-bit encoding is correct if it won't fit into an 8-bit
- // string
- return true;
- }
-
- // Convert our narrow string into the native wide path.
- std::string narrow;
- if (!WideToLatin1(file_path_str, &narrow)) {
- NOTREACHED() << "Should have filtered out non-8-bit strings above.";
- return false;
- }
- if (IsStringUTF8(narrow)) {
- // Our string actually looks like it could be UTF-8, convert to 8-bit
- // UTF-8 and then to the corresponding wide string.
- file_path_str = UTF8ToWide(narrow);
- } else {
- // Our wide string contains only 8-bit characters and it's not UTF-8, so
- // we assume it's in the native codepage.
- file_path_str = base::SysNativeMBToWide(narrow);
- }
-
- // Fail if 8-bit -> wide conversion failed and gave us an empty string back
- // (we already filtered out empty strings above).
- return !file_path_str.empty();
+ // We used to try too hard: if |path| consisted entirely of the first 256
+ // Unicode code points, we treated it as a zero-extended 8-bit string,
+ // converted it to 'Latin-1' and checked whether the result was valid UTF-8.
+ // If the check passed, we decoded the result as UTF-8.
+ // Otherwise, we treated the result as being in the native OS encoding.
+ // However, that led to http://crbug.com/4619 and http://crbug.com/14153
+ return true;
}
} // namespace net
diff --git a/net/url_request/url_request_file_dir_job.cc b/net/url_request/url_request_file_dir_job.cc
index c242ef9..ecdf014 100644
--- a/net/url_request/url_request_file_dir_job.cc
+++ b/net/url_request/url_request_file_dir_job.cc
@@ -7,6 +7,7 @@
#include "base/file_util.h"
#include "base/message_loop.h"
#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
#include "base/time.h"
#include "googleurl/src/gurl.h"
#include "net/base/io_buffer.h"
@@ -104,9 +105,15 @@ void URLRequestFileDirJob::OnListFile(
// can catch errors from DirectoryLister and show an error page.
if (!wrote_header_) {
#if defined(OS_WIN)
- const std::string& title = WideToUTF8(dir_path_.value());
+ const string16& title = dir_path_.value();
#elif defined(OS_POSIX)
- const std::string& title = dir_path_.value();
+ // TODO(jungshik): Add SysNativeMBToUTF16 to sys_string_conversions.
+ // On Mac, need to add NFKC->NFC conversion either here or in file_path.
+ // On Linux, the file system encoding is not defined, but we assume that
+ // SysNativeMBToWide takes care of it at least for now. We can try something
+ // more sophisticated if necessary later.
+ const string16& title = WideToUTF16(
+ base::SysNativeMBToWide(dir_path_.value()));
#endif
data_.append(net::GetDirectoryListingHeader(title));
wrote_header_ = true;
@@ -119,14 +126,16 @@ void URLRequestFileDirJob::OnListFile(
data.nFileSizeLow;
data_.append(net::GetDirectoryListingEntry(
- WideToUTF8(data.cFileName),
+ data.cFileName, std::string(),
(data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ? true : false,
size,
base::Time::FromFileTime(local_time)));
#elif defined(OS_POSIX)
+ // TODO(jungshik): The same issue as for the directory name.
data_.append(net::GetDirectoryListingEntry(
- data.filename.c_str(),
+ WideToUTF16(base::SysNativeMBToWide(data.filename)),
+ data.filename,
S_ISDIR(data.stat.st_mode),
data.stat.st_size,
base::Time::FromTimeT(data.stat.st_mtime)));
diff --git a/net/url_request/url_request_ftp_job.cc b/net/url_request/url_request_ftp_job.cc
index bdfb0b3..c7cb333 100644
--- a/net/url_request/url_request_ftp_job.cc
+++ b/net/url_request/url_request_ftp_job.cc
@@ -9,6 +9,7 @@
#include "base/message_loop.h"
#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
#include "base/time.h"
#include "net/base/auth.h"
#include "net/base/escape.h"
@@ -388,11 +389,21 @@ void URLRequestFtpJob::OnFindFile(DWORD last_error) {
(static_cast<unsigned __int64>(find_data_.nFileSizeHigh) << 32) |
find_data_.nFileSizeLow;
- // We don't know the encoding, and can't assume utf8, so pass the 8bit
- // directly to the browser for it to decide.
+ // We don't know the encoding used on an FTP server, but we
+ // use FtpFindFirstFileA, which I guess does NOT preserve
+ // the raw byte sequence because it's implemented in terms
+ // of FtpFindFirstFileW. Without the raw byte sequence, we
+ // can't apply the encoding detection or other heuristics
+ // to determine/guess the encoding. Neither can we use UTF-8
+ // used by an RFC-2640-compliant FTP server. In some cases (e.g.
+ // when the default code page is an SBCS with almost all bytes
+ // assigned or, with luck, even a DBCS), it's possible to recover
+ // the raw byte sequence most of the time. We could do
+ // some more here, but it's not worth the effort because we're
+ // going to replace this class with URLRequestNewFtpJob.
string file_entry = net::GetDirectoryListingEntry(
- find_data_.cFileName, false, size,
- base::Time::FromFileTime(find_data_.ftLastWriteTime));
+ base::SysNativeMBToWide(find_data_.cFileName), std::string(),
+ false, size, base::Time::FromFileTime(find_data_.ftLastWriteTime));
WriteData(&file_entry, true);
FindNextFile();
@@ -407,14 +418,20 @@ void URLRequestFtpJob::OnStartDirectoryTraversal() {
state_ = GETTING_DIRECTORY;
// Unescape the URL path and pass the raw 8bit directly to the browser.
+ //
+ // Here we can try to detect the encoding although it may not be very
+ // reliable because it's not likely to be long enough. Because this class
+ // will be replaced by URLRequestNewFtpJob and is used only on Windows,
+ // we use SysNativeMBToWide as a stopgap measure.
string html = net::GetDirectoryListingHeader(
- UnescapeURLComponent(request_->url().path(),
- UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
+ base::SysNativeMBToWide(UnescapeURLComponent(request_->url().path(),
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS)));
// If this isn't top level directory (i.e. the path isn't "/",) add a link to
// the parent directory.
if (request_->url().path().length() > 1)
- html.append(net::GetDirectoryListingEntry("..", false, 0, base::Time()));
+ html.append(net::GetDirectoryListingEntry(L"..", std::string(),
+ false, 0, base::Time()));
WriteData(&html, true);
diff --git a/net/url_request/url_request_new_ftp_job.cc b/net/url_request/url_request_new_ftp_job.cc
index d3a0c3e..d9f1d27 100644
--- a/net/url_request/url_request_new_ftp_job.cc
+++ b/net/url_request/url_request_new_ftp_job.cc
@@ -7,6 +7,7 @@
#include "base/compiler_specific.h"
#include "base/file_version_info.h"
#include "base/message_loop.h"
+#include "base/sys_string_conversions.h"
#include "net/base/escape.h"
#include "net/base/net_errors.h"
#include "net/base/net_util.h"
@@ -16,6 +17,46 @@
#include "net/url_request/url_request.h"
#include "net/url_request/url_request_context.h"
#include "net/url_request/url_request_error_job.h"
+#include "unicode/ucsdet.h"
+
+namespace {
+
+// A very simple-minded character encoding detection.
+//
+// Runs ICU's charset detector over the |len| bytes starting at |input| and
+// returns the name of the best match. Returns an empty string when the input
+// is pure ASCII (nothing to detect) or when any ICU call reports failure.
+//
+// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
+// like TLD, the UI language/default encoding of a client, etc). In that case,
+// this should be pulled out of here and moved somewhere in base because there
+// can be other use cases.
+std::string DetectEncoding(const char* input, size_t len) {
+  if (IsStringASCII(std::string(input, len)))
+    return std::string();
+
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector* detector = ucsdet_open(&status);
+  // Nothing was allocated if the detector could not be opened.
+  if (U_FAILURE(status))
+    return std::string();
+
+  ucsdet_setText(detector, input, static_cast<int32_t>(len), &status);
+  const UCharsetMatch* match = ucsdet_detect(detector, &status);
+  const char* encoding = ucsdet_getName(match, &status);
+
+  // Should we check the quality of the match? A rather arbitrary number is
+  // assigned by ICU and it's hard to come up with a lower limit.
+  //
+  // Copy the name before releasing the detector: ICU documents the name as
+  // owned by the match, which in turn belongs to the detector.
+  std::string result;
+  if (U_SUCCESS(status) && encoding)
+    result = encoding;
+
+  // The detector must be closed on every path; it was previously leaked.
+  ucsdet_close(detector);
+  return result;
+}
+
+// Converts the raw bytes of a file name, as received from the server, to
+// UTF-16 using |encoding|. An empty |encoding| means no encoding was detected
+// and the name is passed through ASCIIToUTF16.
+string16 RawByteSequenceToFilename(const char* raw_filename,
+                                   const std::string& encoding) {
+  if (encoding.empty())
+    return ASCIIToUTF16(raw_filename);
+
+  // First choice: decode with the detected encoding, substituting any
+  // undecodable byte sequences.
+  string16 converted;
+  const bool decoded =
+      CodepageToUTF16(raw_filename, encoding.c_str(),
+                      OnStringUtilConversionError::SUBSTITUTE, &converted);
+  if (decoded)
+    return converted;
+
+  // Last resort: the native codepage. That does not make much sense, but we
+  // don't have much else to resort to.
+  return WideToUTF16Hack(base::SysNativeMBToWide(raw_filename));
+}
+
+} // namespace
URLRequestNewFtpJob::URLRequestNewFtpJob(URLRequest* request)
: URLRequestJob(request),
@@ -69,17 +110,36 @@ bool URLRequestNewFtpJob::ReadRawData(net::IOBuffer* buf,
if (response_info_ == NULL) {
response_info_ = transaction_->GetResponseInfo();
if (response_info_->is_directory_listing) {
- // Unescape the URL path and pass the raw 8bit directly to the browser.
- directory_html_ = net::GetDirectoryListingHeader(
+ std::string escaped_path =
UnescapeURLComponent(request_->url().path(),
- UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
+ string16 path_utf16;
+ // Per RFC 2640, FTP servers should use UTF-8 or its proper subset ASCII,
+ // but many old FTP servers use legacy encodings. Try UTF-8 first and
+ // detect the encoding.
+ if (IsStringUTF8(escaped_path)) {
+ path_utf16 = UTF8ToUTF16(escaped_path);
+ } else {
+ std::string encoding = DetectEncoding(escaped_path.c_str(),
+ escaped_path.size());
+ // Try the detected encoding. If it fails, resort to the
+ // OS native encoding.
+ if (encoding.empty() ||
+ !CodepageToUTF16(escaped_path, encoding.c_str(),
+ OnStringUtilConversionError::SUBSTITUTE,
+ &path_utf16))
+ path_utf16 = WideToUTF16Hack(base::SysNativeMBToWide(escaped_path));
+ }
+
+ directory_html_ = net::GetDirectoryListingHeader(path_utf16);
// If this isn't top level directory (i.e. the path isn't "/",)
// add a link to the parent directory.
if (request_->url().path().length() > 1)
- directory_html_.append(net::GetDirectoryListingEntry("..",
- false,
- 0,
- base::Time()));
+ directory_html_.append(
+ net::GetDirectoryListingEntry(ASCIIToUTF16(".."),
+ std::string(),
+ false, 0,
+ base::Time()));
}
}
if (!directory_html_.empty()) {
@@ -121,6 +181,20 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf,
std::string file_entry;
std::string line;
buf->data()[bytes_read] = 0;
+
+ // If all we've seen so far is ASCII, encoding_ is empty. Try to detect the
+ // encoding. We don't do the separate UTF-8 check here because the encoding
+ // detection with a longer chunk (as opposed to the relatively short path
+ // component of the url) is unlikely to mistake UTF-8 for a legacy encoding.
+ // If it turns out to be wrong, a separate UTF-8 check has to be added.
+ //
+ // TODO(jungshik): UTF-8 has to be 'enforced' without any heuristics when
+ // we're talking to an FTP server compliant to RFC 2640 (that is, its response
+ // to FEAT command includes 'UTF8').
+ // See http://wiki.filezilla-project.org/Character_Set
+ if (encoding_.empty())
+ encoding_ = DetectEncoding(buf->data(), bytes_read);
+
int64 file_size;
std::istringstream iss(buf->data());
while (getline(iss, line)) {
@@ -144,6 +218,7 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf,
et.day_of_week = result.fe_time.tm_wday;
file_entry.append(net::GetDirectoryListingEntry(
+ RawByteSequenceToFilename(result.fe_fname, encoding_),
result.fe_fname, true, 0, base::Time::FromLocalExploded(et)));
break;
case net::FTP_TYPE_FILE:
@@ -163,6 +238,7 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf,
// It returns wrong date/time (Differnce is 1 day and 17 Hours).
if (StringToInt64(result.fe_size, &file_size))
file_entry.append(net::GetDirectoryListingEntry(
+ RawByteSequenceToFilename(result.fe_fname, encoding_),
result.fe_fname, false, file_size,
base::Time::FromLocalExploded(et)));
break;
diff --git a/net/url_request/url_request_new_ftp_job.h b/net/url_request/url_request_new_ftp_job.h
index a74a265..69c1fef 100644
--- a/net/url_request/url_request_new_ftp_job.h
+++ b/net/url_request/url_request_new_ftp_job.h
@@ -59,6 +59,7 @@ class URLRequestNewFtpJob : public URLRequestJob {
std::string directory_html_;
bool read_in_progress_;
+ std::string encoding_;
// Keep a reference to the url request context to be sure it's not deleted
// before us.