diff options
author | jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-07-09 22:48:16 +0000 |
---|---|---|
committer | jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-07-09 22:48:16 +0000 |
commit | 5420bc1e4fa6d861107a5c847843ac7bd25fb3c4 (patch) | |
tree | 7cf5fdfbbb128ec57462450e3c7167e017351bfd /base/string_util_icu.cc | |
parent | 8f82f9d9ae8dfd23ab63fb9e63c6246da71d29fd (diff) | |
download | chromium_src-5420bc1e4fa6d861107a5c847843ac7bd25fb3c4.zip chromium_src-5420bc1e4fa6d861107a5c847843ac7bd25fb3c4.tar.gz chromium_src-5420bc1e4fa6d861107a5c847843ac7bd25fb3c4.tar.bz2 |
Fix the local directory listing, FTP directory listing and the local file handling (drag'n'drop and opening from the file list).
For the local file listing, use the OS file system encoding.
For the FTP directory listing, use ICU's encoding detector. GetDirectoryListingEntry and GetDirectoryListingHeader were changed to accept string16 for file/directory names. To the former, a new parameter (|raw_bytes|) was added. It can be used to make an FTP request to a file with a non-ASCII name encoded in a legacy encoding.
For the local file handling on Windows, get rid of the code for 'doubly converted' UTF-8 in FileURLToFilePath, which led to issue 4619 and add a few cases to NetUtil*.FileURLConversion* test.
In addition, add
CodepageToUTF16 and UTF16ToCodepage along with a new unittest (ConvertBetweenCodepageAndUTF16) that shares the same set of cases as ConvertBetweenCodepageAndWide. The test cases were expanded and revised a bit.
BUG=2939,13229,4619
http://crbug.com/2939 http://crbug.com/13229 http://crbug.com/4619
TEST=1. Pass URLRequest*.FTP* (net_unittests)
2. Pass StringUtilTest.ConvertBetweenCode*
3. Pass NetUtil*.GetDirectoryLis* (net_unittests)
4. Open a local directory containing files with non-ASCII names and they're displayed correctly in the directory list. On Windows and Mac OS X, it should always work. On Linux, your locale encoding (as returned by nl_langinfo(CODESET)) should match the actual encoding used in your filename.
5a. Pass NetUtil*.FileURL* (net_unittests) with the default codepage set to 1252 and 932.
5b. Make a file named 'café.txt' on Windows and see if it can be opened both by clicking in the directory listing page of Chrome and by drag'n'drop. Test this with the default OS code pages set to Windows-1252, Windows-1251 (Russian) and Windows-932 (Japanese).
Review URL: http://codereview.chromium.org/151065
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@20331 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/string_util_icu.cc')
-rw-r--r-- | base/string_util_icu.cc | 285 |
1 files changed, 206 insertions, 79 deletions
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc index 87731de..3bd6f9b 100644 --- a/base/string_util_icu.cc +++ b/base/string_util_icu.cc @@ -10,8 +10,10 @@ #include "base/basictypes.h" #include "base/logging.h" #include "base/singleton.h" -#include "unicode/ucnv.h" #include "unicode/numfmt.h" +#include "unicode/ucnv.h" +#include "unicode/ucnv_cb.h" +#include "unicode/ucnv_err.h" #include "unicode/ustring.h" namespace { @@ -24,6 +26,64 @@ inline bool IsValidCodepoint(uint32 code_point) { (code_point >= 0xE000u && code_point <= 0x10FFFFu); } +// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUSBSTITUTE +// in source/common/ucnv_err.c. + +// Copyright (c) 1995-2006 International Business Machines Corporation +// and others +// +// All rights reserved. +// + +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, and/or +// sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, provided that the above copyright notice(s) and +// this permission notice appear in all copies of the Software and that +// both the above copyright notice(s) and this permission notice appear in +// supporting documentation. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +// OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS +// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT +// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +// OR PERFORMANCE OF THIS SOFTWARE. +// +// Except as contained in this notice, the name of a copyright holder +// shall not be used in advertising or otherwise to promote the sale, use +// or other dealings in this Software without prior written authorization +// of the copyright holder. + +// ___________________________________________________________________________ +// +// All trademarks and registered trademarks mentioned herein are the property +// of their respective owners. + +void ToUnicodeCallbackSubstitute(const void* context, + UConverterToUnicodeArgs *to_args, + const char* code_units, + int32_t length, + UConverterCallbackReason reason, + UErrorCode * err) { + static const UChar kReplacementChar = 0xFFFD; + if (reason <= UCNV_IRREGULAR) { + if (context == NULL || + (*(reinterpret_cast<const char*>(context)) == 'i' && + reason == UCNV_UNASSIGNED)) { + *err = U_ZERO_ERROR; + ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err); + } + // else the caller must have set the error code accordingly. + } + // else ignore the reset, close and clone calls. +} + // ReadUnicodeCharacter -------------------------------------------------------- // Reads a UTF-8 stream, placing the next code point into the given output @@ -76,7 +136,7 @@ bool ReadUnicodeCharacter(const char16* src, int32 src_len, #if defined(WCHAR_T_IS_UTF32) // Reads UTF-32 character. The usage is the same as the 8-bit version above. bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, - int32* char_index, uint32* code_point) { + int32* char_index, uint32* code_point) { // Conversion is easy since the source is 32-bit. 
*code_point = src[*char_index]; @@ -184,6 +244,70 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { } } +bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src, + int uchar_len, OnStringUtilConversionError::Type on_error, + std::string* encoded) { + int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, + ucnv_getMaxCharSize(converter)); + encoded->resize(encoded_max_length); + + UErrorCode status = U_ZERO_ERROR; + + // Setup our error handler. + switch (on_error) { + case OnStringUtilConversionError::FAIL: + ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, + NULL, NULL, &status); + break; + case OnStringUtilConversionError::SKIP: + case OnStringUtilConversionError::SUBSTITUTE: + ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, + NULL, NULL, &status); + break; + default: + NOTREACHED(); + } + + // ucnv_fromUChars returns size not including terminating null + int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], + encoded_max_length, uchar_src, uchar_len, &status); + encoded->resize(actual_size); + ucnv_close(converter); + if (U_SUCCESS(status)) + return true; + encoded->clear(); // Make sure the output is empty on error. 
+ return false; +} + +// Set up our error handler for ToUTF-16 converters +void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error, + UConverter* converter, UErrorCode* status) { + switch (on_error) { + case OnStringUtilConversionError::FAIL: + ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, + NULL, NULL, status); + break; + case OnStringUtilConversionError::SKIP: + ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, + NULL, NULL, status); + break; + case OnStringUtilConversionError::SUBSTITUTE: + ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0, + NULL, NULL, status); + break; + default: + NOTREACHED(); + } +} + +inline UConverterType utf32_platform_endian() { +#if U_IS_BIG_ENDIAN + return UCNV_UTF32_BigEndian; +#else + return UCNV_UTF32_LittleEndian; +#endif +} + } // namespace // UTF-8 <-> Wide -------------------------------------------------------------- @@ -364,14 +488,17 @@ std::string UTF16ToUTF8(const string16& utf16) { #endif -// Codepage <-> Wide ----------------------------------------------------------- +// Codepage <-> Wide/UTF-16 --------------------------------------------------- -// Convert a unicode string into the specified codepage_name. If the codepage +// Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. 
bool WideToCodepage(const std::wstring& wide, const char* codepage_name, OnStringUtilConversionError::Type on_error, std::string* encoded) { +#if defined(WCHAR_T_IS_UTF16) + return UTF16ToCodepage(wide, codepage_name, on_error, encoded); +#elif defined(WCHAR_T_IS_UTF32) encoded->clear(); UErrorCode status = U_ZERO_ERROR; @@ -379,59 +506,47 @@ bool WideToCodepage(const std::wstring& wide, if (!U_SUCCESS(status)) return false; - const UChar* uchar_src; - int uchar_len; -#if defined(WCHAR_T_IS_UTF16) - uchar_src = wide.c_str(); - uchar_len = static_cast<int>(wide.length()); -#elif defined(WCHAR_T_IS_UTF32) + int utf16_len; // When wchar_t is wider than UChar (16 bits), transform |wide| into a // UChar* string. Size the UChar* buffer to be large enough to hold twice - // as many UTF-16 code points as there are UTF-16 characters, in case each - // character translates to a UTF-16 surrogate pair, and leave room for a NUL - // terminator. - std::vector<UChar> wide_uchar(wide.length() * 2 + 1); - u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len, + // as many UTF-16 code units (UChar's) as there are Unicode code points, + // in case each code points translates to a UTF-16 surrogate pair, + // and leave room for a NUL terminator. + std::vector<UChar> utf16(wide.length() * 2 + 1); + u_strFromWCS(&utf16[0], utf16.size(), &utf16_len, wide.c_str(), wide.length(), &status); - uchar_src = &wide_uchar[0]; DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*"; + + return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded); #endif // defined(WCHAR_T_IS_UTF32) +} - int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len, - ucnv_getMaxCharSize(converter)); - encoded->resize(encoded_max_length); +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. 
+bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringUtilConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); - // Setup our error handler. - switch (on_error) { - case OnStringUtilConversionError::FAIL: - ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0, - NULL, NULL, &status); - break; - case OnStringUtilConversionError::SKIP: - ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0, - NULL, NULL, &status); - break; - default: - NOTREACHED(); - } + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; - // ucnv_fromUChars returns size not including terminating null - int actual_size = ucnv_fromUChars(converter, &(*encoded)[0], - encoded_max_length, uchar_src, uchar_len, &status); - encoded->resize(actual_size); - ucnv_close(converter); - if (U_SUCCESS(status)) - return true; - encoded->clear(); // Make sure the output is empty on error. - return false; + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); } -// Converts a string of the given codepage into unicode. +// Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. bool CodepageToWide(const std::string& encoded, const char* codepage_name, OnStringUtilConversionError::Type on_error, std::wstring* wide) { +#if defined(WCHAR_T_IS_UTF16) + return CodepageToUTF16(encoded, codepage_name, on_error, wide); +#elif defined(WCHAR_T_IS_UTF32) wide->clear(); UErrorCode status = U_ZERO_ERROR; @@ -439,6 +554,51 @@ bool CodepageToWide(const std::string& encoded, if (!U_SUCCESS(status)) return false; + // The maximum length in 4 byte unit of UTF-32 output would be + // at most the same as the number of bytes in input. In the worst + // case of GB18030 (excluding escaped-based encodings like ISO-2022-JP), + // this can be 4 times larger than actually needed. 
+ size_t wchar_max_length = encoded.length() + 1; + + // The byte buffer and its length to pass to ucnv_toAlgorithimic. + char* byte_buffer = reinterpret_cast<char*>( + WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), + converter, + byte_buffer, + byte_buffer_length, + encoded.data(), + static_cast<int>(encoded.length()), + &status); + ucnv_close(converter); + + if (!U_SUCCESS(status)) { + wide->clear(); // Make sure the output is empty on error. + return false; + } + + // actual_size is # of bytes. + wide->resize(actual_size / 4); + return true; +#endif // defined(WCHAR_T_IS_UTF32) +} + +// Converts a string of the given codepage into UTF-16. +// If the codepage isn't found, return false. +bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringUtilConversionError::Type on_error, + string16* utf16) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + // Even in the worst case, the maximum length in 2-byte units of UTF-16 // output would be at most the same as the number of bytes in input. There // is no single-byte encoding in which a character is mapped to a @@ -449,53 +609,20 @@ bool CodepageToWide(const std::string& encoded, // BOCU and SCSU, but we don't care about them. size_t uchar_max_length = encoded.length() + 1; - UChar* uchar_dst; -#if defined(WCHAR_T_IS_UTF16) - uchar_dst = WriteInto(wide, uchar_max_length); -#elif defined(WCHAR_T_IS_UTF32) - // When wchar_t is wider than UChar (16 bits), convert into a temporary - // UChar* buffer. - std::vector<UChar> wide_uchar(uchar_max_length); - uchar_dst = &wide_uchar[0]; -#endif // defined(WCHAR_T_IS_UTF32) - - // Setup our error handler. 
- switch (on_error) { - case OnStringUtilConversionError::FAIL: - ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0, - NULL, NULL, &status); - break; - case OnStringUtilConversionError::SKIP: - ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0, - NULL, NULL, &status); - break; - default: - NOTREACHED(); - } - + SetUpErrorHandlerForToUChars(on_error, converter, &status); int actual_size = ucnv_toUChars(converter, - uchar_dst, + WriteInto(utf16, uchar_max_length), static_cast<int>(uchar_max_length), encoded.data(), static_cast<int>(encoded.length()), &status); ucnv_close(converter); if (!U_SUCCESS(status)) { - wide->clear(); // Make sure the output is empty on error. + utf16->clear(); // Make sure the output is empty on error. return false; } -#ifdef WCHAR_T_IS_UTF32 - // When wchar_t is wider than UChar (16 bits), it's not possible to wind up - // with any more wchar_t elements than UChar elements. ucnv_toUChars - // returns the number of UChar elements not including the NUL terminator, so - // leave extra room for that. - u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size, - uchar_dst, actual_size, &status); - DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring"; -#endif // WCHAR_T_IS_UTF32 - - wide->resize(actual_size); + utf16->resize(actual_size); return true; } |