summaryrefslogtreecommitdiffstats
path: root/base
diff options
context:
space:
mode:
authorjshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-07-09 22:48:16 +0000
committerjshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-07-09 22:48:16 +0000
commit5420bc1e4fa6d861107a5c847843ac7bd25fb3c4 (patch)
tree7cf5fdfbbb128ec57462450e3c7167e017351bfd /base
parent8f82f9d9ae8dfd23ab63fb9e63c6246da71d29fd (diff)
downloadchromium_src-5420bc1e4fa6d861107a5c847843ac7bd25fb3c4.zip
chromium_src-5420bc1e4fa6d861107a5c847843ac7bd25fb3c4.tar.gz
chromium_src-5420bc1e4fa6d861107a5c847843ac7bd25fb3c4.tar.bz2
Fix the local directory listing, FTP directory listing and the local file handling (drag'n'drop and opening from the file list).
For the local file listing, use the OS file system encoding. For the FTP directory listing, use ICU's encoding detector. GetDirectoryListingEntry and GetDirectoryListingHeader were changed to accept string16 for file/directory names. To the former, a new parameter (|raw_bytes|) was added. It can be used to make an FTP request to a file with a non-ASCII name encoded in a legacy encoding. For the local file handling on Windows, get rid of the code for 'doubly converted' UTF-8 in FileURLToFilePath, which led to issue 4619, and add a few cases to NetUtil*.FileURLConversion* test. In addition, add CodepageToUTF16 and UTF16ToCodepage along with a new unittest (ConvertBetweenCodepageAndUTF16) that shares the same set of cases as ConvertBetweenCodepageAndWide. The test cases were expanded and revised a bit. BUG=2939,13229,4619 http://crbug.com/2939 http://crbug.com/13229 http://crbug.com/4619 TEST=1. Pass URLRequest*.FTP* (net_unittests) 2. Pass StringUtilTest.ConvertBetweenCode* 3. Pass NetUtil*.GetDirectoryLis* (net_unittests) 4. Open a local directory containing files with non-ASCII names and they're displayed correctly in the directory list. On Windows and Mac OS X, it should always work. On Linux, your locale encoding (as returned by nl_langinfo(CODESET)) should match the actual encoding used in your filename. 5a. Pass NetUtil*.FileURL* (net_unittests) with the default codepage set to 1252 and 932. 5b. Make a file named 'café.txt' on Windows and see if it can be opened both by clicking in the directory listing page of Chrome and by drag'n'drop. Test this with the default OS code pages set to Windows-1252, Windows-1251 (Russian) and Windows-932 (Japanese). Review URL: http://codereview.chromium.org/151065 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@20331 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--base/string_util.h21
-rw-r--r--base/string_util_icu.cc285
-rw-r--r--base/string_util_unittest.cc313
3 files changed, 441 insertions, 178 deletions
diff --git a/base/string_util.h b/base/string_util.h
index 9a033b4..c7f3115 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -221,7 +221,8 @@ std::string UTF16ToUTF8(const string16& utf16);
# define UTF16ToWideHack UTF16ToWide
#endif
-// Defines the error handling modes of WideToCodepage and CodepageToWide.
+// Defines the error handling modes of UTF16ToCodepage, CodepageToUTF16,
+// WideToCodepage and CodepageToWide.
class OnStringUtilConversionError {
public:
enum Type {
@@ -231,12 +232,30 @@ class OnStringUtilConversionError {
// The offending characters are skipped and the conversion will proceed as
// if they did not exist.
SKIP,
+
+ // When converting to Unicode, the offending byte sequences are substituted
+ // by Unicode replacement character (U+FFFD). When converting from Unicode,
+ // this is the same as SKIP.
+ SUBSTITUTE,
};
private:
OnStringUtilConversionError();
};
+// Converts between UTF-16 strings and the encoding specified. If the
+// encoding doesn't exist or the encoding fails (when on_error is FAIL),
+// returns false.
+bool UTF16ToCodepage(const string16& utf16,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::string* encoded);
+
+bool CodepageToUTF16(const std::string& encoded,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ string16* utf16);
+
// Converts between wide strings and the encoding specified. If the
// encoding doesn't exist or the encoding fails (when on_error is FAIL),
// returns false.
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 87731de..3bd6f9b 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -10,8 +10,10 @@
#include "base/basictypes.h"
#include "base/logging.h"
#include "base/singleton.h"
-#include "unicode/ucnv.h"
#include "unicode/numfmt.h"
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/ucnv_err.h"
#include "unicode/ustring.h"
namespace {
@@ -24,6 +26,64 @@ inline bool IsValidCodepoint(uint32 code_point) {
(code_point >= 0xE000u && code_point <= 0x10FFFFu);
}
+// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
+// in source/common/ucnv_err.c.
+
+// Copyright (c) 1995-2006 International Business Machines Corporation
+// and others
+//
+// All rights reserved.
+//
+
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, and/or
+// sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, provided that the above copyright notice(s) and
+// this permission notice appear in all copies of the Software and that
+// both the above copyright notice(s) and this permission notice appear in
+// supporting documentation.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+// OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Except as contained in this notice, the name of a copyright holder
+// shall not be used in advertising or otherwise to promote the sale, use
+// or other dealings in this Software without prior written authorization
+// of the copyright holder.
+
+// ___________________________________________________________________________
+//
+// All trademarks and registered trademarks mentioned herein are the property
+// of their respective owners.
+
+void ToUnicodeCallbackSubstitute(const void* context,
+ UConverterToUnicodeArgs *to_args,
+ const char* code_units,
+ int32_t length,
+ UConverterCallbackReason reason,
+ UErrorCode * err) {
+ static const UChar kReplacementChar = 0xFFFD;
+ if (reason <= UCNV_IRREGULAR) {
+ if (context == NULL ||
+ (*(reinterpret_cast<const char*>(context)) == 'i' &&
+ reason == UCNV_UNASSIGNED)) {
+ *err = U_ZERO_ERROR;
+ ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
+ }
+ // else the caller must have set the error code accordingly.
+ }
+ // else ignore the reset, close and clone calls.
+}
+
// ReadUnicodeCharacter --------------------------------------------------------
// Reads a UTF-8 stream, placing the next code point into the given output
@@ -76,7 +136,7 @@ bool ReadUnicodeCharacter(const char16* src, int32 src_len,
#if defined(WCHAR_T_IS_UTF32)
// Reads UTF-32 character. The usage is the same as the 8-bit version above.
bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
- int32* char_index, uint32* code_point) {
+ int32* char_index, uint32* code_point) {
// Conversion is easy since the source is 32-bit.
*code_point = src[*char_index];
@@ -184,6 +244,70 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
}
}
+bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
+ int uchar_len, OnStringUtilConversionError::Type on_error,
+ std::string* encoded) {
+ int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
+ ucnv_getMaxCharSize(converter));
+ encoded->resize(encoded_max_length);
+
+ UErrorCode status = U_ZERO_ERROR;
+
+ // Setup our error handler.
+ switch (on_error) {
+ case OnStringUtilConversionError::FAIL:
+ ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
+ NULL, NULL, &status);
+ break;
+ case OnStringUtilConversionError::SKIP:
+ case OnStringUtilConversionError::SUBSTITUTE:
+ ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
+ NULL, NULL, &status);
+ break;
+ default:
+ NOTREACHED();
+ }
+
+ // ucnv_fromUChars returns size not including terminating null
+ int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
+ encoded_max_length, uchar_src, uchar_len, &status);
+ encoded->resize(actual_size);
+ ucnv_close(converter);
+ if (U_SUCCESS(status))
+ return true;
+ encoded->clear(); // Make sure the output is empty on error.
+ return false;
+}
+
+// Set up our error handler for ToUTF-16 converters
+void SetUpErrorHandlerForToUChars(OnStringUtilConversionError::Type on_error,
+ UConverter* converter, UErrorCode* status) {
+ switch (on_error) {
+ case OnStringUtilConversionError::FAIL:
+ ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
+ NULL, NULL, status);
+ break;
+ case OnStringUtilConversionError::SKIP:
+ ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
+ NULL, NULL, status);
+ break;
+ case OnStringUtilConversionError::SUBSTITUTE:
+ ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, 0,
+ NULL, NULL, status);
+ break;
+ default:
+ NOTREACHED();
+ }
+}
+
+inline UConverterType utf32_platform_endian() {
+#if U_IS_BIG_ENDIAN
+ return UCNV_UTF32_BigEndian;
+#else
+ return UCNV_UTF32_LittleEndian;
+#endif
+}
+
} // namespace
// UTF-8 <-> Wide --------------------------------------------------------------
@@ -364,14 +488,17 @@ std::string UTF16ToUTF8(const string16& utf16) {
#endif
-// Codepage <-> Wide -----------------------------------------------------------
+// Codepage <-> Wide/UTF-16 ---------------------------------------------------
-// Convert a unicode string into the specified codepage_name. If the codepage
+// Convert a wstring into the specified codepage_name. If the codepage
// isn't found, return false.
bool WideToCodepage(const std::wstring& wide,
const char* codepage_name,
OnStringUtilConversionError::Type on_error,
std::string* encoded) {
+#if defined(WCHAR_T_IS_UTF16)
+ return UTF16ToCodepage(wide, codepage_name, on_error, encoded);
+#elif defined(WCHAR_T_IS_UTF32)
encoded->clear();
UErrorCode status = U_ZERO_ERROR;
@@ -379,59 +506,47 @@ bool WideToCodepage(const std::wstring& wide,
if (!U_SUCCESS(status))
return false;
- const UChar* uchar_src;
- int uchar_len;
-#if defined(WCHAR_T_IS_UTF16)
- uchar_src = wide.c_str();
- uchar_len = static_cast<int>(wide.length());
-#elif defined(WCHAR_T_IS_UTF32)
+ int utf16_len;
// When wchar_t is wider than UChar (16 bits), transform |wide| into a
// UChar* string. Size the UChar* buffer to be large enough to hold twice
- // as many UTF-16 code points as there are UTF-16 characters, in case each
- // character translates to a UTF-16 surrogate pair, and leave room for a NUL
- // terminator.
- std::vector<UChar> wide_uchar(wide.length() * 2 + 1);
- u_strFromWCS(&wide_uchar[0], wide_uchar.size(), &uchar_len,
+ // as many UTF-16 code units (UChar's) as there are Unicode code points,
+ // in case each code point translates to a UTF-16 surrogate pair,
+ // and leave room for a NUL terminator.
+ std::vector<UChar> utf16(wide.length() * 2 + 1);
+ u_strFromWCS(&utf16[0], utf16.size(), &utf16_len,
wide.c_str(), wide.length(), &status);
- uchar_src = &wide_uchar[0];
DCHECK(U_SUCCESS(status)) << "failed to convert wstring to UChar*";
+
+ return ConvertFromUTF16(converter, &utf16[0], utf16_len, on_error, encoded);
#endif // defined(WCHAR_T_IS_UTF32)
+}
- int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_len,
- ucnv_getMaxCharSize(converter));
- encoded->resize(encoded_max_length);
+// Convert a UTF-16 string into the specified codepage_name. If the codepage
+// isn't found, return false.
+bool UTF16ToCodepage(const string16& utf16,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::string* encoded) {
+ encoded->clear();
- // Setup our error handler.
- switch (on_error) {
- case OnStringUtilConversionError::FAIL:
- ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, 0,
- NULL, NULL, &status);
- break;
- case OnStringUtilConversionError::SKIP:
- ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, 0,
- NULL, NULL, &status);
- break;
- default:
- NOTREACHED();
- }
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter* converter = ucnv_open(codepage_name, &status);
+ if (!U_SUCCESS(status))
+ return false;
- // ucnv_fromUChars returns size not including terminating null
- int actual_size = ucnv_fromUChars(converter, &(*encoded)[0],
- encoded_max_length, uchar_src, uchar_len, &status);
- encoded->resize(actual_size);
- ucnv_close(converter);
- if (U_SUCCESS(status))
- return true;
- encoded->clear(); // Make sure the output is empty on error.
- return false;
+ return ConvertFromUTF16(converter, utf16.c_str(),
+ static_cast<int>(utf16.length()), on_error, encoded);
}
-// Converts a string of the given codepage into unicode.
+// Converts a string of the given codepage into wstring.
// If the codepage isn't found, return false.
bool CodepageToWide(const std::string& encoded,
const char* codepage_name,
OnStringUtilConversionError::Type on_error,
std::wstring* wide) {
+#if defined(WCHAR_T_IS_UTF16)
+ return CodepageToUTF16(encoded, codepage_name, on_error, wide);
+#elif defined(WCHAR_T_IS_UTF32)
wide->clear();
UErrorCode status = U_ZERO_ERROR;
@@ -439,6 +554,51 @@ bool CodepageToWide(const std::string& encoded,
if (!U_SUCCESS(status))
return false;
+ // The maximum length in 4-byte units of UTF-32 output would be
+ // at most the same as the number of bytes in input. In the worst
+ // case of GB18030 (excluding escape-based encodings like ISO-2022-JP),
+ // this can be 4 times larger than actually needed.
+ size_t wchar_max_length = encoded.length() + 1;
+
+ // The byte buffer and its length to pass to ucnv_toAlgorithmic.
+ char* byte_buffer = reinterpret_cast<char*>(
+ WriteInto(wide, wchar_max_length));
+ int byte_buffer_length = static_cast<int>(wchar_max_length) * 4;
+
+ SetUpErrorHandlerForToUChars(on_error, converter, &status);
+ int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(),
+ converter,
+ byte_buffer,
+ byte_buffer_length,
+ encoded.data(),
+ static_cast<int>(encoded.length()),
+ &status);
+ ucnv_close(converter);
+
+ if (!U_SUCCESS(status)) {
+ wide->clear(); // Make sure the output is empty on error.
+ return false;
+ }
+
+ // actual_size is # of bytes.
+ wide->resize(actual_size / 4);
+ return true;
+#endif // defined(WCHAR_T_IS_UTF32)
+}
+
+// Converts a string of the given codepage into UTF-16.
+// If the codepage isn't found, return false.
+bool CodepageToUTF16(const std::string& encoded,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ string16* utf16) {
+ utf16->clear();
+
+ UErrorCode status = U_ZERO_ERROR;
+ UConverter* converter = ucnv_open(codepage_name, &status);
+ if (!U_SUCCESS(status))
+ return false;
+
// Even in the worst case, the maximum length in 2-byte units of UTF-16
// output would be at most the same as the number of bytes in input. There
// is no single-byte encoding in which a character is mapped to a
@@ -449,53 +609,20 @@ bool CodepageToWide(const std::string& encoded,
// BOCU and SCSU, but we don't care about them.
size_t uchar_max_length = encoded.length() + 1;
- UChar* uchar_dst;
-#if defined(WCHAR_T_IS_UTF16)
- uchar_dst = WriteInto(wide, uchar_max_length);
-#elif defined(WCHAR_T_IS_UTF32)
- // When wchar_t is wider than UChar (16 bits), convert into a temporary
- // UChar* buffer.
- std::vector<UChar> wide_uchar(uchar_max_length);
- uchar_dst = &wide_uchar[0];
-#endif // defined(WCHAR_T_IS_UTF32)
-
- // Setup our error handler.
- switch (on_error) {
- case OnStringUtilConversionError::FAIL:
- ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, 0,
- NULL, NULL, &status);
- break;
- case OnStringUtilConversionError::SKIP:
- ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, 0,
- NULL, NULL, &status);
- break;
- default:
- NOTREACHED();
- }
-
+ SetUpErrorHandlerForToUChars(on_error, converter, &status);
int actual_size = ucnv_toUChars(converter,
- uchar_dst,
+ WriteInto(utf16, uchar_max_length),
static_cast<int>(uchar_max_length),
encoded.data(),
static_cast<int>(encoded.length()),
&status);
ucnv_close(converter);
if (!U_SUCCESS(status)) {
- wide->clear(); // Make sure the output is empty on error.
+ utf16->clear(); // Make sure the output is empty on error.
return false;
}
-#ifdef WCHAR_T_IS_UTF32
- // When wchar_t is wider than UChar (16 bits), it's not possible to wind up
- // with any more wchar_t elements than UChar elements. ucnv_toUChars
- // returns the number of UChar elements not including the NUL terminator, so
- // leave extra room for that.
- u_strToWCS(WriteInto(wide, actual_size + 1), actual_size + 1, &actual_size,
- uchar_dst, actual_size, &status);
- DCHECK(U_SUCCESS(status)) << "failed to convert UChar* to wstring";
-#endif // WCHAR_T_IS_UTF32
-
- wide->resize(actual_size);
+ utf16->resize(actual_size);
return true;
}
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 6f196cc..4968950 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -13,8 +13,30 @@
#include "testing/gtest/include/gtest/gtest.h"
namespace {
+
+// Given a null-terminated string of wchar_t with each wchar_t representing
+// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
+// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
+// should be represented as a surrogate pair (two UTF-16 units)
+// *even* where wchar_t is 32-bit (Linux and Mac).
+//
+// This is to help write tests for functions with string16 params until
+// the C++ 0x UTF-16 literal is well-supported by compilers.
+string16 BuildString16(const wchar_t* s) {
+#if defined(WCHAR_T_IS_UTF16)
+ return string16(s);
+#elif defined(WCHAR_T_IS_UTF32)
+ string16 u16;
+ while (*s != 0) {
+ DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu);
+ u16.push_back(*s++);
+ }
+ return u16;
+#endif
}
+} // namespace
+
static const struct trim_case {
const wchar_t* input;
const TrimPositions positions;
@@ -459,104 +481,162 @@ TEST(StringUtilTest, ConvertCodepageUTF8) {
}
}
-TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
- static const struct {
- const char* codepage_name;
- const char* encoded;
- OnStringUtilConversionError::Type on_error;
- bool success;
- const wchar_t* wide;
- } kConvertCodepageCases[] = {
- // Test a case where the input can no be decoded, using both SKIP and FAIL
- // error handling rules. "A7 41" is valid, but "A6" isn't.
- {"big5",
- "\xA7\x41\xA6",
- OnStringUtilConversionError::FAIL,
- false,
- L""},
- {"big5",
- "\xA7\x41\xA6",
- OnStringUtilConversionError::SKIP,
- true,
- L"\x4F60"},
- // Arabic (ISO-8859)
- {"iso-8859-6",
- "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
- "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
- L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652"},
- // Chinese Simplified (GB2312)
- {"gb2312",
- "\xC4\xE3\xBA\xC3",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x4F60\x597D"},
- // Chinese Traditional (BIG5)
- {"big5",
- "\xA7\x41\xA6\x6E",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x4F60\x597D"},
- // Greek (ISO-8859)
- {"iso-8859-7",
- "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5"},
- // Hebrew (Windows)
- {"windows-1255", /* to be replaced with "iso-8859-8-I"? */
- "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD"},
- // Hindi Devanagari (ISCII)
- {"iscii-dev",
- "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930"},
- // Korean (EUC)
- {"euc-kr",
- "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
- OnStringUtilConversionError::FAIL,
- true,
- L"\xC548\xB155\xD558\xC138\xC694"},
- // Japanese (EUC)
- {"euc-jp",
- "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x3053\x3093\x306B\x3061\x306F"},
- // Japanese (ISO-2022)
- {"iso-2022-jp",
- "\x1B\x24\x42" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F" "\x1B\x28\x42",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x3053\x3093\x306B\x3061\x306F"},
- // Japanese (Shift-JIS)
- {"sjis",
- "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x3053\x3093\x306B\x3061\x306F"},
- // Russian (KOI8)
- {"koi8-r",
- "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
- L"\x0443\x0439\x0442\x0435"},
- // Thai (ISO-8859)
- {"windows-874", /* to be replaced with "iso-8859-11". */
- "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
- OnStringUtilConversionError::FAIL,
- true,
- L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
- L"\x0E04\x0E23\x0e31\x0E1A"},
- };
+// kConvertCodepageCases is not comprehensive. There are a number of cases
+// to add if we really want to have comprehensive coverage of various
+// codepages and their 'idiosyncrasies'. Currently, the only implementation
+// for CodepageTo* and *ToCodepage uses ICU, which has a very extensive
+// set of tests for the charset conversion. So, we can get away with a
+// relatively small number of cases listed below.
+//
+// Note about |u16_wide| in the following struct.
+// On Windows, the field is always identical to |wide|. On Mac and Linux,
+// it's identical as long as there's no character outside the
+// BMP (i.e. > U+FFFF). When there is, it is different from |wide| and
+// is not a real wide string (UTF-32 string) in that each wchar_t in
+// the string is a UTF-16 code unit zero-extended to be 32-bit
+// even when the code unit belongs to a surrogate pair.
+// For instance, a Unicode string (U+0041 U+010000) is represented as
+// L"\x0041\xD800\xDC00" instead of L"\x0041\x10000".
+// To avoid the clutter, |u16_wide| will be set to NULL
+// if it's identical to |wide| on *all* platforms.
+
+static const struct {
+ const char* codepage_name;
+ const char* encoded;
+ OnStringUtilConversionError::Type on_error;
+ bool success;
+ const wchar_t* wide;
+ const wchar_t* u16_wide;
+} kConvertCodepageCases[] = {
+ // Test a case where the input cannot be decoded, using SKIP, FAIL
+ // and SUBSTITUTE error handling rules. "A7 41" is valid, but "A6" isn't.
+ {"big5",
+ "\xA7\x41\xA6",
+ OnStringUtilConversionError::FAIL,
+ false,
+ L"",
+ NULL},
+ {"big5",
+ "\xA7\x41\xA6",
+ OnStringUtilConversionError::SKIP,
+ true,
+ L"\x4F60",
+ NULL},
+ {"big5",
+ "\xA7\x41\xA6",
+ OnStringUtilConversionError::SUBSTITUTE,
+ true,
+ L"\x4F60\xFFFD",
+ NULL},
+ // Arabic (ISO-8859)
+ {"iso-8859-6",
+ "\xC7\xEE\xE4\xD3\xF1\xEE\xE4\xC7\xE5\xEF" " "
+ "\xD9\xEE\xE4\xEE\xEA\xF2\xE3\xEF\xE5\xF2",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0627\x064E\x0644\x0633\x0651\x064E\x0644\x0627\x0645\x064F" L" "
+ L"\x0639\x064E\x0644\x064E\x064A\x0652\x0643\x064F\x0645\x0652",
+ NULL},
+ // Chinese Simplified (GB2312)
+ {"gb2312",
+ "\xC4\xE3\xBA\xC3",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x4F60\x597D",
+ NULL},
+ // Chinese (GB18030) : 4 byte sequences mapped to BMP characters
+ {"gb18030",
+ "\x81\x30\x84\x36\xA1\xA7",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x00A5\x00A8",
+ NULL},
+ // Chinese (GB18030) : A 4 byte sequence mapped to plane 2 (U+20000)
+ {"gb18030",
+ "\x95\x32\x82\x36\xD2\xBB",
+ OnStringUtilConversionError::FAIL,
+ true,
+#if defined(WCHAR_T_IS_UTF16)
+ L"\xD840\xDC00\x4E00",
+#else
+ L"\x20000\x4E00",
+#endif
+ L"\xD840\xDC00\x4E00"},
+ {"big5",
+ "\xA7\x41\xA6\x6E",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x4F60\x597D",
+ NULL},
+ // Greek (ISO-8859)
+ {"iso-8859-7",
+ "\xE3\xE5\xE9\xDC" " " "\xF3\xEF\xF5",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5",
+ NULL},
+ // Hebrew (Windows)
+ {"windows-1255",
+ "\xF9\xD1\xC8\xEC\xE5\xC9\xED",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x05E9\x05C1\x05B8\x05DC\x05D5\x05B9\x05DD",
+ NULL},
+ // Hindi Devanagari (ISCII)
+ {"iscii-dev",
+ "\xEF\x42" "\xC6\xCC\xD7\xE8\xB3\xDA\xCF",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0928\x092E\x0938\x094D\x0915\x093E\x0930",
+ NULL},
+ // Korean (EUC)
+ {"euc-kr",
+ "\xBE\xC8\xB3\xE7\xC7\xCF\xBC\xBC\xBF\xE4",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\xC548\xB155\xD558\xC138\xC694",
+ NULL},
+ // Japanese (EUC)
+ {"euc-jp",
+ "\xA4\xB3\xA4\xF3\xA4\xCB\xA4\xC1\xA4\xCF\xB0\xEC\x8F\xB0\xA1\x8E\xA6",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x3053\x3093\x306B\x3061\x306F\x4E00\x4E02\xFF66",
+ NULL},
+ // Japanese (ISO-2022)
+ {"iso-2022-jp",
+ "\x1B$B" "\x24\x33\x24\x73\x24\x4B\x24\x41\x24\x4F\x30\x6C" "\x1B(B"
+ "ab" "\x1B(J" "\x5C\x7E#$" "\x1B(B",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x3053\x3093\x306B\x3061\x306F\x4E00" L"ab\x00A5\x203E#$",
+ NULL},
+ // Japanese (Shift-JIS)
+ {"sjis",
+ "\x82\xB1\x82\xF1\x82\xC9\x82\xBF\x82\xCD\x88\xEA\xA6",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x3053\x3093\x306B\x3061\x306F\x4E00\xFF66",
+ NULL},
+ // Russian (KOI8)
+ {"koi8-r",
+ "\xDA\xC4\xD2\xC1\xD7\xD3\xD4\xD7\xD5\xCA\xD4\xC5",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+ L"\x0443\x0439\x0442\x0435",
+ NULL},
+ // Thai (windows-874)
+ {"windows-874",
+ "\xCA\xC7\xD1\xCA\xB4\xD5" "\xA4\xC3\xD1\xBA",
+ OnStringUtilConversionError::FAIL,
+ true,
+ L"\x0E2A\x0E27\x0E31\x0E2A\x0E14\x0E35"
+ L"\x0E04\x0E23\x0e31\x0E1A",
+ NULL},
+};
+TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) {
std::wstring wide;
bool success = CodepageToWide(kConvertCodepageCases[i].encoded,
@@ -567,7 +647,9 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
EXPECT_EQ(kConvertCodepageCases[i].wide, wide);
// When decoding was successful and nothing was skipped, we also check the
- // reverse conversion.
+ // reverse conversion. Not all conversions are round-trippable, but
+ // kConvertCodepageCases does not have any one-way conversion at the
+ // moment.
if (success &&
kConvertCodepageCases[i].on_error ==
OnStringUtilConversionError::FAIL) {
@@ -590,6 +672,11 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1",
OnStringUtilConversionError::SKIP, &encoded));
EXPECT_STREQ("Chinese", encoded.c_str());
+ // From Unicode, SUBSTITUTE is the same as SKIP for now.
+ EXPECT_TRUE(WideToCodepage(L"Chinese\xff27", "iso-8859-1",
+ OnStringUtilConversionError::SUBSTITUTE,
+ &encoded));
+ EXPECT_STREQ("Chinese", encoded.c_str());
#if defined(WCHAR_T_IS_UTF16)
// When we're in UTF-16 mode, test an invalid UTF-16 character in the input.
@@ -611,6 +698,36 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) {
OnStringUtilConversionError::SKIP, &encoded));
}
+TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) {
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) {
+ string16 utf16;
+ bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded,
+ kConvertCodepageCases[i].codepage_name,
+ kConvertCodepageCases[i].on_error,
+ &utf16);
+ string16 utf16_expected;
+ if (kConvertCodepageCases[i].u16_wide == NULL)
+ utf16_expected = BuildString16(kConvertCodepageCases[i].wide);
+ else
+ utf16_expected = BuildString16(kConvertCodepageCases[i].u16_wide);
+ EXPECT_EQ(kConvertCodepageCases[i].success, success);
+ EXPECT_EQ(utf16_expected, utf16);
+
+ // When decoding was successful and nothing was skipped, we also check the
+ // reverse conversion. See also the corresponding comment in
+ // ConvertBetweenCodepageAndWide.
+ if (success &&
+ kConvertCodepageCases[i].on_error ==
+ OnStringUtilConversionError::FAIL) {
+ std::string encoded;
+ success = UTF16ToCodepage(utf16, kConvertCodepageCases[i].codepage_name,
+ kConvertCodepageCases[i].on_error, &encoded);
+ EXPECT_EQ(kConvertCodepageCases[i].success, success);
+ EXPECT_EQ(kConvertCodepageCases[i].encoded, encoded);
+ }
+ }
+}
+
TEST(StringUtilTest, ConvertASCII) {
static const char* char_cases[] = {
"Google Video",