diff options
-rw-r--r-- | base/string_util.h | 21 | ||||
-rw-r--r-- | base/string_util_icu.cc | 285 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 313 | ||||
-rw-r--r-- | net/base/net_util.cc | 14 | ||||
-rw-r--r-- | net/base/net_util.h | 25 | ||||
-rw-r--r-- | net/base/net_util_unittest.cc | 66 | ||||
-rw-r--r-- | net/base/net_util_win.cc | 34 | ||||
-rw-r--r-- | net/url_request/url_request_file_dir_job.cc | 17 | ||||
-rw-r--r-- | net/url_request/url_request_ftp_job.cc | 31 | ||||
-rw-r--r-- | net/url_request/url_request_new_ftp_job.cc | 90 | ||||
-rw-r--r-- | net/url_request/url_request_new_ftp_job.h | 1 |
11 files changed, 645 insertions, 252 deletions
diff --git a/base/string_util.h b/base/string_util.h index 9a033b4..c7f3115 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -221,7 +221,8 @@ std::string UTF16ToUTF8(const string16& utf16); # define UTF16ToWideHack UTF16ToWide #endif -// Defines the error handling modes of WideToCodepage and CodepageToWide. +// Defines the error handling modes of UTF16ToCodepage, CodepageToUTF16, +// WideToCodepage and CodepageToWide. class OnStringUtilConversionError { public: enum Type { @@ -231,12 +232,30 @@ class OnStringUtilConversionError { // The offending characters are skipped and the conversion will proceed as // if they did not exist. SKIP, + + // When converting to Unicode, the offending byte sequences are substituted + // by Unicode replacement character (U+FFFD). When converting from Unicode, + // this is the same as SKIP. + SUBSTITUTE, }; private: OnStringUtilConversionError(); }; +// Converts between UTF-16 strings and the encoding specified. If the +// encoding doesn't exist or the encoding fails (when on_error is FAIL), +// returns false. +bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringUtilConversionError::Type on_error, + std::string* encoded); + +bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringUtilConversionError::Type on_error, + string16* utf16); + // Converts between wide strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), // returns false. 
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc index 87731de..3bd6f9b 100644 --- a/base/string_util_icu.cc +++ b/base/string_util_icu.cc @@ -10,8 +10,10 @@ #include "base/basictypes.h" #include "base/logging.h" #include "base/singleton.h" -#include "unicode/ucnv.h" #include "unicode/numfmt.h" +#include "unicode/ucnv.h" +#include "unicode/ucnv_cb.h" +#include "unicode/ucnv_err.h" #include "unicode/ustring.h" namespace { @@ -24,6 +26,64 @@ inline bool IsValidCodepoint(uint32 code_point) { (code_point >= 0xE000u && code_point <= 0x10FFFFu); } +// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE +// in source/common/ucnv_err.c. + +// Copyright (c) 1995-2006 International Business Machines Corporation +// and others +// +// All rights reserved. +// + +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, and/or +// sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, provided that the above copyright notice(s) and +// this permission notice appear in all copies of the Software and that +// both the above copyright notice(s) and this permission notice appear in +// supporting documentation. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +// OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS +// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT +// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +// OR PERFORMANCE OF THIS SOFTWARE. +// +// Except as contained in this notice, the name of a copyright holder +// shall not be used in advertising or otherwise to promote the sale, use +// or other dealings in this Software without prior written authorization +// of the copyright holder. + +// ___________________________________________________________________________ +// +// All trademarks and registered trademarks mentioned herein are the property +// of their respective owners. + +void ToUnicodeCallbackSubstitute(const void* context, + UConverterToUnicodeArgs *to_args, + const char* code_units, + int32_t length, + UConverterCallbackReason reason, + UErrorCode * err) { + static const UChar kReplacementChar = 0xFFFD; + if (reason <= UCNV_IRREGULAR) { + if (context == NULL || + (*(reinterpret_cast<const char*>(context)) == 'i' && + reason == UCNV_UNASSIGNED)) { + *err = U_ZERO_ERROR; + ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); + } + // else the caller must have set the error code accordingly. + } + // else ignore the reset, close and clone calls. +} + // ReadUnicodeCharacter -------------------------------------------------------- // Reads a UTF-8 stream, placing the next code point into the given output @@ -76,7 +136,7 @@ bool ReadUnicodeCharacter(const char16* src, int32 src_len, #if defined(WCHAR_T_IS_UTF32) // Reads UTF-32 character. The usage is the same as the 8-bit version above. bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, - int32* char_index, uint32* code_point) { + int32* char_index, uint32* code_point) { // Conversion is easy since the source is 32-bit. 
*code_point = src[*char_index]; @@ -184,6 +244,70 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { } } +bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, + int uchar_len, OnStringUtilConversionError::Type on_error, + std::string* encoded) { + int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, + ucnv_getMaxCharSize(converter)); + encoded->resize(encoded_max_length); + + UErrorCode status = U_ZERO_ERROR; + + // Setup our error handler. + switch (on_error) { + case OnStringUtilConversionError::FAIL: + ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, + NULL, NULL, &status); + break; + case OnStringUtilConversionError::SKIP: + case OnStringUtilConversionError::SUBSTITUTE: + ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, + NULL, NULL, &status); + break; + default: + NOTREACHED(); + } + + // ucnv_fromUChars returns size not including terminating null + int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], + encoded_max_length, uchar_src, uchar_len, &status); + encoded->resize(actual_size); + ucnv_close(converter); + if (U_SUCCESS(status)) + return true; + encoded->clear(); // Make sure the output is empty on error. 
+ return false; +} + +// Set up our error handler for ToUTF-16 converters +void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error, + UConverter* converter, UErrorCode* status) { + switch (on_error) { + case OnStringUtilConversionError::FAIL: + ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, + NULL, NULL, status); + break; + case OnStringUtilConversionError::SKIP: + ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, + NULL, NULL, status); + break; + case OnStringUtilConversionError::SUBSTITUTE: + ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, + NULL, NULL, status); + break; + default: + NOTREACHED(); + } +} + +inline UConverterType utf32_platform_endian() { +#if U_IS_BIG_ENDIAN + return UCNV_UTF32_BigEndian; +#else + return UCNV_UTF32_LittleEndian; +#endif +} + } // namespace // UTF-8 <-> Wide -------------------------------------------------------------- @@ -364,14 +488,17 @@ std::string UTF16ToUTF8(const string16& utf16) { #endif -// Codepage <-> Wide ----------------------------------------------------------- +// Codepage <-> Wide/UTF-16 --------------------------------------------------- -// Convert a unicode string into the specified codepage_name. If the codepage +// Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. 
bool WideToCodepage(const std::wstring& wide, const char* codepage_name, OnStringUtilConversionError::Type on_error, std::string* encoded) { +#if defined(WCHAR_T_IS_UTF16) + return UTF16ToCodepage(wide, codepage_name, on_error, encoded); +#elif defined(WCHAR_T_IS_UTF32) encoded->clear(); UErrorCode status = U_ZERO_ERROR; @@ -379,59 +506,47 @@ bool WideToCodepage(const std::wstring& wide, if (!U_SUCCESS(status)) return false; - const UChar* uchar_src; - int uchar_len; -#if defined(WCHAR_T_IS_UTF16) - uchar_src = wide.c_str(); - uchar_len = static_cast<int>(wide.length()); -#elif defined(WCHAR_T_IS_UTF32) + int utf16_len; // When wchar_t is wider than UChar (16 bits), transform |wide| into a // UChar* string. Size the UChar* buffer to be large enough to hold twice - // as many UTF-16 code points as there are UTF-16 characters, in case each - // character translates to a UTF-16 surrogate pair, and leave room for a NUL - // terminator. - std::vector<UChar> wide_uchar(wide.length() * 2 + 1); - u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len, + // as many UTF-16 code units (UChar's) as there are Unicode code points, + // in case each code points translates to a UTF-16 surrogate pair, + // and leave room for a NUL terminator. + std::vector<UChar> utf16(wide.length() * 2 + 1); + u_strFromWCS(&utf16[0], utf16.size(), &utf16_len, wide.c_str(), wide.length(), &status); - uchar_src = &wide_uchar[0]; DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; + + return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); #endif // defined(WCHAR_T_IS_UTF32) +} - int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, - ucnv_getMaxCharSize(converter)); - encoded->resize(encoded_max_length); +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. 
+bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringUtilConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); - // Setup our error handler. - switch (on_error) { - case OnStringUtilConversionError::FAIL: - ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, - NULL, NULL, &status); - break; - case OnStringUtilConversionError::SKIP: - ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, - NULL, NULL, &status); - break; - default: - NOTREACHED(); - } + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; - // ucnv_fromUChars returns size not including terminating null - int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], - encoded_max_length, uchar_src, uchar_len, &status); - encoded->resize(actual_size); - ucnv_close(converter); - if (U_SUCCESS(status)) - return true; - encoded->clear(); // Make sure the output is empty on error. - return false; + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); } -// Converts a string of the given codepage into unicode. +// Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. bool CodepageToWide(const std::string& encoded, const char* codepage_name, OnStringUtilConversionError::Type on_error, std::wstring* wide) { +#if defined(WCHAR_T_IS_UTF16) + return CodepageToUTF16(encoded, codepage_name, on_error, wide); +#elif defined(WCHAR_T_IS_UTF32) wide->clear(); UErrorCode status = U_ZERO_ERROR; @@ -439,6 +554,51 @@ bool CodepageToWide(const std::string& encoded, if (!U_SUCCESS(status)) return false; + // The maximum length in 4 byte unit of UTF-32 output would be + // at most the same as the number of bytes in input. In the worst + // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP), + // this can be 4 times larger than actually needed. 
+ size_t wchar_max_length = encoded.length() + 1; + + // The byte buffer and its length to pass to ucnv_toAlgorithmic. + char* byte_buffer = reinterpret_cast<char*>( + WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), + converter, + byte_buffer, + byte_buffer_length, + encoded.data(), + static_cast<int>(encoded.length()), + &status); + ucnv_close(converter); + + if (!U_SUCCESS(status)) { + wide->clear(); // Make sure the output is empty on error. + return false; + } + + // actual_size is # of bytes. + wide->resize(actual_size / 4); + return true; +#endif // defined(WCHAR_T_IS_UTF32) +} + +// Converts a string of the given codepage into UTF-16. +// If the codepage isn't found, return false. +bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringUtilConversionError::Type on_error, + string16* utf16) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + // Even in the worst case, the maximum length in 2-byte units of UTF-16 // output would be at most the same as the number of bytes in input. There // is no single-byte encoding in which a character is mapped to a @@ -449,53 +609,20 @@ bool CodepageToWide(const std::string& encoded, // BOCU and SCSU, but we don't care about them. size_t uchar_max_length = encoded.length() + 1; - UChar* uchar_dst; -#if defined(WCHAR_T_IS_UTF16) - uchar_dst = WriteInto(wide, uchar_max_length); -#elif defined(WCHAR_T_IS_UTF32) - // When wchar_t is wider than UChar (16 bits), convert into a temporary - // UChar* buffer. - std::vector<UChar> wide_uchar(uchar_max_length); - uchar_dst = &wide_uchar[0]; -#endif // defined(WCHAR_T_IS_UTF32) - - // Setup our error handler. 
- switch (on_error) { - case OnStringUtilConversionError::FAIL: - ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, - NULL, NULL, &status); - break; - case OnStringUtilConversionError::SKIP: - ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, - NULL, NULL, &status); - break; - default: - NOTREACHED(); - } - + SetUpErrorHandlerForToUChars(on_error, converter, &status); int actual_size = ucnv_toUChars(converter, - uchar_dst, + WriteInto(utf16, uchar_max_length), static_cast<int>(uchar_max_length), encoded.data(), static_cast<int>(encoded.length()), &status); ucnv_close(converter); if (!U_SUCCESS(status)) { - wide->clear(); // Make sure the output is empty on error. + utf16->clear(); // Make sure the output is empty on error. return false; } -#ifdef WCHAR_T_IS_UTF32 - // When wchar_t is wider than UChar (16 bits), it's not possible to wind up - // with any more wchar_t elements than UChar elements. ucnv_toUChars - // returns the number of UChar elements not including the NUL terminator, so - // leave extra room for that. - u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size, - uchar_dst, actual_size, &status); - DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring"; -#endif // WCHAR_T_IS_UTF32 - - wide->resize(actual_size); + utf16->resize(actual_size); return true; } diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 6f196cc..4968950 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -13,8 +13,30 @@ #include "testing/gtest/include/gtest/gtest.h" namespace { + +// Given a null-terminated string of wchar_t with each wchar_t representing +// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. +// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) +// should be represented as a surrogate pair (two UTF-16 units) +// *even* where wchar_t is 32-bit (Linux and Mac). 
+// +// This is to help write tests for functions with string16 params until +// the C++ 0x UTF-16 literal is well-supported by compilers. +string16 BuildString16(const wchar_t* s) { +#if defined(WCHAR_T_IS_UTF16) + return string16(s); +#elif defined(WCHAR_T_IS_UTF32) + string16 u16; + while (*s != 0) { + DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); + u16.push_back(*s++); + } + return u16; +#endif } +} // namespace + static const struct trim_case { const wchar_t* input; const TrimPositions positions; @@ -459,104 +481,162 @@ TEST(StringUtilTest, ConvertCodepageUTF8) { } } -TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { - static const struct { - const char* codepage_name; - const char* encoded; - OnStringUtilConversionError::Type on_error; - bool success; - const wchar_t* wide; - } kConvertCodepageCases[] = { - // Test a case where the input can no be decoded, using both SKIP and FAIL - // error handling rules. "A7 41" is valid, but "A6" isn't. - {"big5", - "\xA7\x41\xA6", - OnStringUtilConversionError::FAIL, - false, - L""}, - {"big5", - "\xA7\x41\xA6", - OnStringUtilConversionError::SKIP, - true, - L"\x4F60"}, - // Arabic (ISO-8859) - {"iso-8859-6", - "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " " - "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2", - OnStringUtilConversionError::FAIL, - true, - L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" " - L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652"}, - // Chinese Simplified (GB2312) - {"gb2312", - "\xC4\xE3\xBA\xC3", - OnStringUtilConversionError::FAIL, - true, - L"\x4F60\x597D"}, - // Chinese Traditional (BIG5) - {"big5", - "\xA7\x41\xA6\x6E", - OnStringUtilConversionError::FAIL, - true, - L"\x4F60\x597D"}, - // Greek (ISO-8859) - {"iso-8859-7", - "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5", - OnStringUtilConversionError::FAIL, - true, - L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5"}, - // Hebrew (Windows) - {"windows-1255", /* to be replaced with "iso-8859-8-I"? 
*/ - "\xF9\xD1\xC8\xEC\xE5\xC9\xED", - OnStringUtilConversionError::FAIL, - true, - L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD"}, - // Hindi Devanagari (ISCII) - {"iscii-dev", - "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF", - OnStringUtilConversionError::FAIL, - true, - L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930"}, - // Korean (EUC) - {"euc-kr", - "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4", - OnStringUtilConversionError::FAIL, - true, - L"\xC548\xB155\xD558\xC138\xC694"}, - // Japanese (EUC) - {"euc-jp", - "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF", - OnStringUtilConversionError::FAIL, - true, - L"\x3053\x3093\x306B\x3061\x306F"}, - // Japanese (ISO-2022) - {"iso-2022-jp", - "\x1B\x24\x42" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F" "\x1B\x28\x42", - OnStringUtilConversionError::FAIL, - true, - L"\x3053\x3093\x306B\x3061\x306F"}, - // Japanese (Shift-JIS) - {"sjis", - "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD", - OnStringUtilConversionError::FAIL, - true, - L"\x3053\x3093\x306B\x3061\x306F"}, - // Russian (KOI8) - {"koi8-r", - "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5", - OnStringUtilConversionError::FAIL, - true, - L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" - L"\x0443\x0439\x0442\x0435"}, - // Thai (ISO-8859) - {"windows-874", /* to be replaced with "iso-8859-11". */ - "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", - OnStringUtilConversionError::FAIL, - true, - L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" - L"\x0E04\x0E23\x0e31\x0E1A"}, - }; +// kConverterCodepageCases is not comprehensive. There are a number of cases +// to add if we really want to have a comprehensive coverage of various +// codepages and their 'idiosyncrasies'. Currently, the only implementation +// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive +// set of tests for the charset conversion. So, we can get away with a +// relatively small number of cases listed below. +// +// Note about |u16_wide| in the following struct. 
+// On Windows, the field is always identical to |wide|. On Mac and Linux, +// it's identical as long as there's no character outside the +// BMP (<= U+FFFF). When there is, it is different from |wide| and +// is not a real wide string (UTF-32 string) in that each wchar_t in +// the string is a UTF-16 code unit zero-extended to be 32-bit +// even when the code unit belongs to a surrogate pair. +// For instance, a Unicode string (U+0041 U+010000) is represented as +// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000". +// To avoid the clutter, |u16_wide| will be set to NULL +// if it's identical to |wide| on *all* platforms. + +static const struct { + const char* codepage_name; + const char* encoded; + OnStringUtilConversionError::Type on_error; + bool success; + const wchar_t* wide; + const wchar_t* u16_wide; +} kConvertCodepageCases[] = { + // Test a case where the input cannot be decoded, using SKIP, FAIL + // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't. + {"big5", + "\xA7\x41\xA6", + OnStringUtilConversionError::FAIL, + false, + L"", + NULL}, + {"big5", + "\xA7\x41\xA6", + OnStringUtilConversionError::SKIP, + true, + L"\x4F60", + NULL}, + {"big5", + "\xA7\x41\xA6", + OnStringUtilConversionError::SUBSTITUTE, + true, + L"\x4F60\xFFFD", + NULL}, + // Arabic (ISO-8859) + {"iso-8859-6", + "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " " + "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2", + OnStringUtilConversionError::FAIL, + true, + L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" " + L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652", + NULL}, + // Chinese Simplified (GB2312) + {"gb2312", + "\xC4\xE3\xBA\xC3", + OnStringUtilConversionError::FAIL, + true, + L"\x4F60\x597D", + NULL}, + // Chinese (GB18030) : 4 byte sequences mapped to BMP characters + {"gb18030", + "\x81\x30\x84\x36\xA1\xA7", + OnStringUtilConversionError::FAIL, + true, + L"\x00A5\x00A8", + NULL}, + // Chinese (GB18030) : A 4 byte sequence mapped to 
plane 2 (U+20000) + {"gb18030", + "\x95\x32\x82\x36\xD2\xBB", + OnStringUtilConversionError::FAIL, + true, +#if defined(WCHAR_T_IS_UTF16) + L"\xD840\xDC00\x4E00", +#else + L"\x20000\x4E00", +#endif + L"\xD840\xDC00\x4E00"}, + {"big5", + "\xA7\x41\xA6\x6E", + OnStringUtilConversionError::FAIL, + true, + L"\x4F60\x597D", + NULL}, + // Greek (ISO-8859) + {"iso-8859-7", + "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5", + OnStringUtilConversionError::FAIL, + true, + L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", + NULL}, + // Hebrew (Windows) + {"windows-1255", + "\xF9\xD1\xC8\xEC\xE5\xC9\xED", + OnStringUtilConversionError::FAIL, + true, + L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD", + NULL}, + // Hindi Devanagari (ISCII) + {"iscii-dev", + "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF", + OnStringUtilConversionError::FAIL, + true, + L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930", + NULL}, + // Korean (EUC) + {"euc-kr", + "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4", + OnStringUtilConversionError::FAIL, + true, + L"\xC548\xB155\xD558\xC138\xC694", + NULL}, + // Japanese (EUC) + {"euc-jp", + "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8F\xB0\xA1\x8E\xA6", + OnStringUtilConversionError::FAIL, + true, + L"\x3053\x3093\x306B\x3061\x306F\x4E00\x4E02\xFF66", + NULL}, + // Japanese (ISO-2022) + {"iso-2022-jp", + "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B" + "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B", + OnStringUtilConversionError::FAIL, + true, + L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$", + NULL}, + // Japanese (Shift-JIS) + {"sjis", + "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6", + OnStringUtilConversionError::FAIL, + true, + L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66", + NULL}, + // Russian (KOI8) + {"koi8-r", + "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5", + OnStringUtilConversionError::FAIL, + true, + L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" + L"\x0443\x0439\x0442\x0435", + NULL}, + // Thai (windows-874) + 
{"windows-874", + "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA", + OnStringUtilConversionError::FAIL, + true, + L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35" + L"\x0E04\x0E23\x0e31\x0E1A", + NULL}, +}; +TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { std::wstring wide; bool success = CodepageToWide(kConvertCodepageCases[i].encoded, @@ -567,7 +647,9 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { EXPECT_EQ(kConvertCodepageCases[i].wide, wide); // When decoding was successful and nothing was skipped, we also check the - // reverse conversion. + // reverse conversion. Not all conversions are round-trippable, but + // kConverterCodepageCases does not have any one-way conversion at the + // moment. if (success && kConvertCodepageCases[i].on_error == OnStringUtilConversionError::FAIL) { @@ -590,6 +672,11 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", OnStringUtilConversionError::SKIP, &encoded)); EXPECT_STREQ("Chinese", encoded.c_str()); + // From Unicode, SUBSTITUTE is the same as SKIP for now. + EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1", + OnStringUtilConversionError::SUBSTITUTE, + &encoded)); + EXPECT_STREQ("Chinese", encoded.c_str()); #if defined(WCHAR_T_IS_UTF16) // When we're in UTF-16 mode, test an invalid UTF-16 character in the input. 
@@ -611,6 +698,36 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { OnStringUtilConversionError::SKIP, &encoded)); } +TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { + string16 utf16; + bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, + kConvertCodepageCases[i].codepage_name, + kConvertCodepageCases[i].on_error, + &utf16); + string16 utf16_expected; + if (kConvertCodepageCases[i].u16_wide == NULL) + utf16_expected = BuildString16(kConvertCodepageCases[i].wide); + else + utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide); + EXPECT_EQ(kConvertCodepageCases[i].success, success); + EXPECT_EQ(utf16_expected, utf16); + + // When decoding was successful and nothing was skipped, we also check the + // reverse conversion. See also the corresponding comment in + // ConvertBetweenCodepageAndWide. + if (success && + kConvertCodepageCases[i].on_error == + OnStringUtilConversionError::FAIL) { + std::string encoded; + success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name, + kConvertCodepageCases[i].on_error, &encoded); + EXPECT_EQ(kConvertCodepageCases[i].success, success); + EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded); + } + } +} + TEST(StringUtilTest, ConvertASCII) { static const char* char_cases[] = { "Google Video", diff --git a/net/base/net_util.cc b/net/base/net_util.cc index 2e6292c..00beb4e 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -860,7 +860,7 @@ std::string CanonicalizeHost(const std::wstring& host, return CanonicalizeHost(converted_host, host_info); } -std::string GetDirectoryListingHeader(const std::string& title) { +std::string GetDirectoryListingHeader(const string16& title) { static const StringPiece header(NetModule::GetResource(IDR_DIR_HEADER_HTML)); if (header.empty()) { NOTREACHED() << "expected resource not found"; @@ -874,15 +874,21 @@ std::string GetDirectoryListingHeader(const std::string& 
title) { return result; } -std::string GetDirectoryListingEntry(const std::string& name, +std::string GetDirectoryListingEntry(const string16& name, + const std::string& raw_bytes, bool is_dir, int64 size, - const Time& modified) { + Time modified) { std::string result; result.append("<script>addRow("); string_escape::JsonDoubleQuote(name, true, &result); result.append(","); - string_escape::JsonDoubleQuote(EscapePath(name), true, &result); + if (raw_bytes.empty()) { + string_escape::JsonDoubleQuote(EscapePath(UTF16ToUTF8(name)), + true, &result); + } else { + string_escape::JsonDoubleQuote(EscapePath(raw_bytes), true, &result); + } if (is_dir) { result.append(",1,"); } else { diff --git a/net/base/net_util.h b/net/base/net_util.h index 40df770..4320e1c 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -14,6 +14,7 @@ #include <string> #include "base/basictypes.h" +#include "base/string16.h" #include "net/base/escape.h" struct addrinfo; @@ -147,12 +148,24 @@ std::string CanonicalizeHost(const std::string& host, std::string CanonicalizeHost(const std::wstring& host, url_canon::CanonHostInfo* host_info); -// Call these functions to get the html for a directory listing. -// They will pass non-7bit-ascii characters unescaped, allowing -// the browser to interpret the encoding (utf8, etc). -std::string GetDirectoryListingHeader(const std::string& title); -std::string GetDirectoryListingEntry(const std::string& name, bool is_dir, - int64 size, const base::Time& modified); +// Call these functions to get the html snippet for a directory listing. +// The return values of both functions are in UTF-8. +std::string GetDirectoryListingHeader(const string16& title); + +// Given the name of a file in a directory (ftp or local) and +// other information (is_dir, size, modification time), it returns +// the html snippet to add the entry for the file to the directory listing. +// Currently, it's a script tag containing a call to a Javascript function +// |addRow|. 
+// +// Its 1st parameter is derived from |name| and is the Javascript-string +// escaped form of |name| (i.e \uXXXX). The 2nd parameter is the url-escaped +// |raw_bytes| if it's not empty. If empty, the 2nd parameter is the +// url-escaped |name| in UTF-8. +std::string GetDirectoryListingEntry(const string16& name, + const std::string& raw_bytes, + bool is_dir, int64 size, + base::Time modified); // If text starts with "www." it is removed, otherwise text is returned // unmodified. diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index 78f7ab9..f346e92 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -407,18 +407,32 @@ TEST(NetUtilTest, FileURLConversion) { "file://some%20computer/foo/bar.txt"}, // UNC {L"D:\\Name;with%some symbols*#", "file:///D:/Name%3Bwith%25some%20symbols*%23"}, + // issue 14153: To be tested with the OS default codepage other than 1252. + {L"D:\\latin1\\caf\x00E9\x00DD.txt", + "file:///D:/latin1/caf%C3%A9%C3%9D.txt"}, + {L"D:\\otherlatin\\caf\x0119.txt", + "file:///D:/otherlatin/caf%C4%99.txt"}, + {L"D:\\greek\\\x03B1\x03B2\x03B3.txt", + "file:///D:/greek/%CE%B1%CE%B2%CE%B3.txt"}, {L"D:\\Chinese\\\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc", "file:///D:/Chinese/%E6%89%80%E6%9C%89%E4%B8%AD%E6%96%87%E7%BD%91" "%E9%A1%B5.doc"}, + {L"D:\\plane1\\\xD835\xDC00\xD835\xDC01.txt", // Math alphabet "AB" + "file:///D:/plane1/%F0%9D%90%80%F0%9D%90%81.txt"}, #elif defined(OS_POSIX) {L"/foo/bar.txt", "file:///foo/bar.txt"}, {L"/foo/BAR.txt", "file:///foo/BAR.txt"}, {L"/C:/foo/bar.txt", "file:///C:/foo/bar.txt"}, {L"/some computer/foo/bar.txt", "file:///some%20computer/foo/bar.txt"}, {L"/Name;with%some symbols*#", "file:///Name%3Bwith%25some%20symbols*%23"}, + {L"/latin1/caf\x00E9\x00DD.txt", "file:///latin1/caf%C3%A9%C3%9D.txt"}, + {L"/otherlatin/caf\x0119.txt", "file:///otherlatin/caf%C4%99.txt"}, + {L"/greek/\x03B1\x03B2\x03B3.txt", "file:///greek/%CE%B1%CE%B2%CE%B3.txt"}, 
{L"/Chinese/\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc", "file:///Chinese/%E6%89%80%E6%9C%89%E4%B8%AD%E6%96%87%E7%BD" "%91%E9%A1%B5.doc"}, + {L"/plane1/\x1D400\x1D401.txt", // Math alphabet "AB" + "file:///plane1/%F0%9D%90%80%F0%9D%90%81.txt"}, #endif }; @@ -474,21 +488,6 @@ TEST(NetUtilTest, FileURLConversion) { EXPECT_EQ(url_cases[i].file, output.ToWStringHack()); } - // Here, we test that UTF-8 encoded strings get decoded properly, even when - // they might be stored with wide characters. On posix systems, just treat - // this as a stream of bytes. - const wchar_t utf8[] = L"file:///d:/Chinese/\xe6\x89\x80\xe6\x9c\x89\xe4\xb8" - L"\xad\xe6\x96\x87\xe7\xbd\x91\xe9\xa1\xb5.doc"; -#if defined(OS_WIN) - const wchar_t wide[] = - L"D:\\Chinese\\\x6240\x6709\x4e2d\x6587\x7f51\x9875.doc"; -#elif defined(OS_POSIX) - const wchar_t wide[] = L"/d:/Chinese/\xe6\x89\x80\xe6\x9c\x89\xe4\xb8\xad\xe6" - L"\x96\x87\xe7\xbd\x91\xe9\xa1\xb5.doc"; -#endif - EXPECT_TRUE(net::FileURLToFilePath(GURL(WideToUTF8(utf8)), &output)); - EXPECT_EQ(wide, output.ToWStringHack()); - // Unfortunately, UTF8ToWide discards invalid UTF8 input. 
#ifdef BUG_878908_IS_FIXED // Test that no conversion happens if the UTF-8 input is invalid, and that @@ -862,7 +861,8 @@ TEST(NetUtilTest, GetSuggestedFilename) { namespace { struct GetDirectoryListingEntryCase { - const char* name; + const wchar_t* name; + const char* raw_bytes; bool is_dir; int64 filesize; base::Time time; @@ -872,22 +872,50 @@ struct GetDirectoryListingEntryCase { } // namespace TEST(NetUtilTest, GetDirectoryListingEntry) { const GetDirectoryListingEntryCase test_cases[] = { - {"Foo", + {L"Foo", + "", false, 10000, base::Time(), "<script>addRow(\"Foo\",\"Foo\",0,\"9.8 kB\",\"\");</script>\n"}, - {"quo\"tes", + {L"quo\"tes", + "", + false, + 10000, + base::Time(), + "<script>addRow(\"quo\\\"tes\",\"quo%22tes\",0,\"9.8 kB\",\"\");</script>" + "\n"}, + {L"quo\"tes", + "quo\"tes", false, 10000, base::Time(), "<script>addRow(\"quo\\\"tes\",\"quo%22tes\",0,\"9.8 kB\",\"\");</script>" "\n"}, + // U+D55C0 U+AE00. raw_bytes is empty (either a local file with + // UTF-8/UTF-16 encoding or a remote file on an ftp server using UTF-8 + {L"\xD55C\xAE00.txt", + "", + false, + 10000, + base::Time(), + "<script>addRow(\"\\uD55C\\uAE00.txt\",\"%ED%95%9C%EA%B8%80.txt\"" + ",0,\"9.8 kB\",\"\");</script>\n"}, + // U+D55C0 U+AE00. raw_bytes is the corresponding EUC-KR sequence: + // a local or remote file in EUC-KR. 
+ {L"\xD55C\xAE00.txt", + "\xC7\xD1\xB1\xDB.txt", + false, + 10000, + base::Time(), + "<script>addRow(\"\\uD55C\\uAE00.txt\",\"%C7%D1%B1%DB.txt\"" + ",0,\"9.8 kB\",\"\");</script>\n"}, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(test_cases); ++i) { const std::string results = net::GetDirectoryListingEntry( - test_cases[i].name, + WideToUTF16(test_cases[i].name), + test_cases[i].raw_bytes, test_cases[i].is_dir, test_cases[i].filesize, test_cases[i].time); diff --git a/net/base/net_util_win.cc b/net/base/net_util_win.cc index effb212..244f4ad 100644 --- a/net/base/net_util_win.cc +++ b/net/base/net_util_win.cc @@ -57,33 +57,13 @@ bool FileURLToFilePath(const GURL& url, FilePath* file_path) { } file_path_str.assign(UTF8ToWide(path)); - // Now we have an unescaped filename, but are still not sure about its - // encoding. For example, each character could be part of a UTF-8 string. - if (file_path_str.empty() || !IsString8Bit(file_path_str)) { - // assume our 16-bit encoding is correct if it won't fit into an 8-bit - // string - return true; - } - - // Convert our narrow string into the native wide path. - std::string narrow; - if (!WideToLatin1(file_path_str, &narrow)) { - NOTREACHED() << "Should have filtered out non-8-bit strings above."; - return false; - } - if (IsStringUTF8(narrow)) { - // Our string actually looks like it could be UTF-8, convert to 8-bit - // UTF-8 and then to the corresponding wide string. - file_path_str = UTF8ToWide(narrow); - } else { - // Our wide string contains only 8-bit characters and it's not UTF-8, so - // we assume it's in the native codepage. - file_path_str = base::SysNativeMBToWide(narrow); - } - - // Fail if 8-bit -> wide conversion failed and gave us an empty string back - // (we already filtered out empty strings above). - return !file_path_str.empty(); + // We used to try too hard and see if |path| made up entirely of + // the 1st 256 characters in the Unicode was a zero-extended UTF-16. 
+ // If so, we converted it to 'Latin-1' and checked if the result was UTF-8. + // If the check passed, we converted the result to UTF-8. + // Otherwise, we treated the result as the native OS encoding. + // However, that led to http://crbug.com/4619 and http://crbug.com/14153 + return true; } } // namespace net diff --git a/net/url_request/url_request_file_dir_job.cc b/net/url_request/url_request_file_dir_job.cc index c242ef9..ecdf014 100644 --- a/net/url_request/url_request_file_dir_job.cc +++ b/net/url_request/url_request_file_dir_job.cc @@ -7,6 +7,7 @@ #include "base/file_util.h" #include "base/message_loop.h" #include "base/string_util.h" +#include "base/sys_string_conversions.h" #include "base/time.h" #include "googleurl/src/gurl.h" #include "net/base/io_buffer.h" @@ -104,9 +105,15 @@ void URLRequestFileDirJob::OnListFile( // can catch errors from DirectoryLister and show an error page. if (!wrote_header_) { #if defined(OS_WIN) - const std::string& title = WideToUTF8(dir_path_.value()); + const string16& title = dir_path_.value(); #elif defined(OS_POSIX) - const std::string& title = dir_path_.value(); + // TODO(jungshik): Add SysNativeMBToUTF16 to sys_string_conversions. + // On Mac, need to add NFKC->NFC conversion either here or in file_path. + // On Linux, the file system encoding is not defined, but we assume that + // SysNativeMBToWide takes care of it at least for now. We can try something + // more sophisticated if necessary later. + const string16& title = WideToUTF16( + base::SysNativeMBToWide(dir_path_.value())); #endif data_.append(net::GetDirectoryListingHeader(title)); wrote_header_ = true; @@ -119,14 +126,16 @@ void URLRequestFileDirJob::OnListFile( data.nFileSizeLow; data_.append(net::GetDirectoryListingEntry( - WideToUTF8(data.cFileName), + data.cFileName, std::string(), (data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ? 
true : false, size, base::Time::FromFileTime(local_time))); #elif defined(OS_POSIX) + // TOOD(jungshik): The same issue as for the directory name. data_.append(net::GetDirectoryListingEntry( - data.filename.c_str(), + WideToUTF16(base::SysNativeMBToWide(data.filename)), + data.filename, S_ISDIR(data.stat.st_mode), data.stat.st_size, base::Time::FromTimeT(data.stat.st_mtime))); diff --git a/net/url_request/url_request_ftp_job.cc b/net/url_request/url_request_ftp_job.cc index bdfb0b3..c7cb333 100644 --- a/net/url_request/url_request_ftp_job.cc +++ b/net/url_request/url_request_ftp_job.cc @@ -9,6 +9,7 @@ #include "base/message_loop.h" #include "base/string_util.h" +#include "base/sys_string_conversions.h" #include "base/time.h" #include "net/base/auth.h" #include "net/base/escape.h" @@ -388,11 +389,21 @@ void URLRequestFtpJob::OnFindFile(DWORD last_error) { (static_cast<unsigned __int64>(find_data_.nFileSizeHigh) << 32) | find_data_.nFileSizeLow; - // We don't know the encoding, and can't assume utf8, so pass the 8bit - // directly to the browser for it to decide. + // We don't know the encoding used on an FTP server, but we + // use FtpFindFirstFileA, which I guess does NOT preserve + // the raw byte sequence because it's implemented in terms + // of FtpFindFirstFileW. Without the raw byte sequence, we + // can't apply the encoding detection or other heuristics + // to determine/guess the encoding. Neither can we use UTF-8 + // used by a RFC-2640-compliant FTP server. In some cases (e.g. + // the default code page is an SBCS with almost all bytes assigned. + // In lucky cases, it's even possible with a DBCS), it's possible + // to recover the raw byte sequence in most cases. We can do + // some more here, but it's not worth the effort because we're + // going to replace this class with URLRequestNewFtpJob. 
string file_entry = net::GetDirectoryListingEntry( - find_data_.cFileName, false, size, - base::Time::FromFileTime(find_data_.ftLastWriteTime)); + base::SysNativeMBToWide(find_data_.cFileName), std::string(), + false, size, base::Time::FromFileTime(find_data_.ftLastWriteTime)); WriteData(&file_entry, true); FindNextFile(); @@ -407,14 +418,20 @@ void URLRequestFtpJob::OnStartDirectoryTraversal() { state_ = GETTING_DIRECTORY; // Unescape the URL path and pass the raw 8bit directly to the browser. + // + // Here we can try to detect the encoding although it may not be very + // reliable because it's not likely to be long enough. Because this class + // will be replaced by URLRequestNewFtpJob and is used only on Windows, + // we use SysNativeMBToWide as a stopgap measure. string html = net::GetDirectoryListingHeader( - UnescapeURLComponent(request_->url().path(), - UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS)); + base::SysNativeMBToWide(UnescapeURLComponent(request_->url().path(), + UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS))); // If this isn't top level directory (i.e. the path isn't "/",) add a link to // the parent directory. 
if (request_->url().path().length() > 1) - html.append(net::GetDirectoryListingEntry("..", false, 0, base::Time())); + html.append(net::GetDirectoryListingEntry(L"..", std::string(), + false, 0, base::Time())); WriteData(&html, true); diff --git a/net/url_request/url_request_new_ftp_job.cc b/net/url_request/url_request_new_ftp_job.cc index d3a0c3e..d9f1d27 100644 --- a/net/url_request/url_request_new_ftp_job.cc +++ b/net/url_request/url_request_new_ftp_job.cc @@ -7,6 +7,7 @@ #include "base/compiler_specific.h" #include "base/file_version_info.h" #include "base/message_loop.h" +#include "base/sys_string_conversions.h" #include "net/base/escape.h" #include "net/base/net_errors.h" #include "net/base/net_util.h" @@ -16,6 +17,46 @@ #include "net/url_request/url_request.h" #include "net/url_request/url_request_context.h" #include "net/url_request/url_request_error_job.h" +#include "unicode/ucsdet.h" + +namespace { + +// A very simple-minded character encoding detection. +// TODO(jungshik): We can apply more heuristics here (e.g. using various hints +// like TLD, the UI language/default encoding of a client, etc). In that case, +// this should be pulled out of here and moved somewhere in base because there +// can be other use cases. +std::string DetectEncoding(const char*input, size_t len) { + if (IsStringASCII(std::string(input, len))) + return std::string(); + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open(&status); + ucsdet_setText(detector, input, static_cast<int32_t>(len), &status); + const UCharsetMatch* match = ucsdet_detect(detector, &status); + const char* encoding = ucsdet_getName(match, &status); + // Should we check the quality of the match? A rather arbitrary number is + // assigned by ICU and it's hard to come up with a lower limit. 
+ if (U_FAILURE(status)) + return std::string(); + return encoding; +} + +string16 RawByteSequenceToFilename(const char* raw_filename, + const std::string& encoding) { + if (encoding.empty()) + return ASCIIToUTF16(raw_filename); + + // Try the detected encoding before falling back to the native codepage. + // Using the native codepage does not make much sense, but we don't have + // much else to resort to. + string16 filename; + if (!CodepageToUTF16(raw_filename, encoding.c_str(), + OnStringUtilConversionError::SUBSTITUTE, &filename)) + filename = WideToUTF16Hack(base::SysNativeMBToWide(raw_filename)); + return filename; +} + +} // namespace URLRequestNewFtpJob::URLRequestNewFtpJob(URLRequest* request) : URLRequestJob(request), @@ -69,17 +110,36 @@ bool URLRequestNewFtpJob::ReadRawData(net::IOBuffer* buf, if (response_info_ == NULL) { response_info_ = transaction_->GetResponseInfo(); if (response_info_->is_directory_listing) { - // Unescape the URL path and pass the raw 8bit directly to the browser. - directory_html_ = net::GetDirectoryListingHeader( + std::string escaped_path = UnescapeURLComponent(request_->url().path(), - UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS)); + UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); + string16 path_utf16; + // Per RFC 2640, FTP servers should use UTF-8 or its proper subset ASCII, + // but many old FTP servers use legacy encodings. Try UTF-8 first and + // detect the encoding. + if (IsStringUTF8(escaped_path)) { + path_utf16 = UTF8ToUTF16(escaped_path); + } else { + std::string encoding = DetectEncoding(escaped_path.c_str(), + escaped_path.size()); + // Try the detected encoding. If it fails, resort to the + // OS native encoding. 
+ if (encoding.empty() || + !CodepageToUTF16(escaped_path, encoding.c_str(), + OnStringUtilConversionError::SUBSTITUTE, + &path_utf16)) + path_utf16 = WideToUTF16Hack(base::SysNativeMBToWide(escaped_path)); + } + + directory_html_ = net::GetDirectoryListingHeader(path_utf16); // If this isn't top level directory (i.e. the path isn't "/",) // add a link to the parent directory. if (request_->url().path().length() > 1) - directory_html_.append(net::GetDirectoryListingEntry("..", - false, - 0, - base::Time())); + directory_html_.append( + net::GetDirectoryListingEntry(ASCIIToUTF16(".."), + std::string(), + false, 0, + base::Time())); } } if (!directory_html_.empty()) { @@ -121,6 +181,20 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf, std::string file_entry; std::string line; buf->data()[bytes_read] = 0; + + // If all we've seen so far is ASCII, encoding_ is empty. Try to detect the + // encoding. We don't do the separate UTF-8 check here because the encoding + // detection with a longer chunk (as opposed to the relatively short path + // component of the url) is unlikely to mistake UTF-8 for a legacy encoding. + // If it turns out to be wrong, a separate UTF-8 check has to be added. + // + // TODO(jungshik): UTF-8 has to be 'enforced' without any heuristics when + // we're talking to an FTP server compliant to RFC 2640 (that is, its response + // to FEAT command includes 'UTF8'). 
+ // See http://wiki.filezilla-project.org/Character_Set + if (encoding_.empty()) + encoding_ = DetectEncoding(buf->data(), bytes_read); + int64 file_size; std::istringstream iss(buf->data()); while (getline(iss, line)) { @@ -144,6 +218,7 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf, et.day_of_week = result.fe_time.tm_wday; file_entry.append(net::GetDirectoryListingEntry( + RawByteSequenceToFilename(result.fe_fname, encoding_), result.fe_fname, true, 0, base::Time::FromLocalExploded(et))); break; case net::FTP_TYPE_FILE: @@ -163,6 +238,7 @@ int URLRequestNewFtpJob::ProcessFtpDir(net::IOBuffer *buf, // It returns wrong date/time (Differnce is 1 day and 17 Hours). if (StringToInt64(result.fe_size, &file_size)) file_entry.append(net::GetDirectoryListingEntry( + RawByteSequenceToFilename(result.fe_fname, encoding_), result.fe_fname, false, file_size, base::Time::FromLocalExploded(et))); break; diff --git a/net/url_request/url_request_new_ftp_job.h b/net/url_request/url_request_new_ftp_job.h index a74a265..69c1fef 100644 --- a/net/url_request/url_request_new_ftp_job.h +++ b/net/url_request/url_request_new_ftp_job.h @@ -59,6 +59,7 @@ class URLRequestNewFtpJob : public URLRequestJob { std::string directory_html_; bool read_in_progress_; + std::string encoding_; // Keep a reference to the url request context to be sure it's not deleted // before us. |