summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbrettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-10-07 02:10:20 +0000
committerbrettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-10-07 02:10:20 +0000
commit047a03f4cefa75a67070f08b3f6b727f7ea702d5 (patch)
treed00ccbd9e59106de8fd904b06720be59219d61fe
parent0511c153260e5d402d7552ff7b47a2acb17bdf2b (diff)
downloadchromium_src-047a03f4cefa75a67070f08b3f6b727f7ea702d5.zip
chromium_src-047a03f4cefa75a67070f08b3f6b727f7ea702d5.tar.gz
chromium_src-047a03f4cefa75a67070f08b3f6b727f7ea702d5.tar.bz2
Copy the relevant parts of ICU to a new file base/third_party/icu/icu_utf.*
so we can do basic UTF8/16/32 conversions without linking all of ICU. Change callers who used to call SysUTF8ToWide/SysWideToUTF8 in base to using these new functions. I will remove the Sys versions of these functions in a later patch. TEST=none BUG=none Review URL: http://codereview.chromium.org/243102 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28219 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--base/base.gyp12
-rw-r--r--base/file_util.cc5
-rw-r--r--base/file_util_posix.cc1
-rw-r--r--base/file_util_unittest.cc2
-rw-r--r--base/file_version_info_mac.mm21
-rw-r--r--base/i18n/icu_string_conversions.cc (renamed from base/i18n/string_conversions.cc)340
-rw-r--r--base/i18n/icu_string_conversions.h60
-rw-r--r--base/json_reader.cc7
-rw-r--r--base/json_writer.cc3
-rw-r--r--base/logging.cc6
-rw-r--r--base/platform_file_posix.cc4
-rw-r--r--base/process_util_mac.mm9
-rw-r--r--base/stats_table.cc5
-rw-r--r--base/string16.cc3
-rw-r--r--base/string16_unittest.cc52
-rw-r--r--base/string_util.h3
-rw-r--r--base/sys_info_posix.cc8
-rw-r--r--base/system_monitor_unittest.cc8
-rw-r--r--base/third_party/icu/README8
-rw-r--r--base/third_party/icu/icu_utf.cc228
-rw-r--r--base/third_party/icu/icu_utf.h358
-rw-r--r--base/trace_event.cc1
-rw-r--r--base/utf_string_conversions.cc361
-rw-r--r--base/utf_string_conversions.h (renamed from base/i18n/string_conversions.h)53
-rw-r--r--base/values.cc3
-rw-r--r--chrome/browser/download/download_manager_unittest.cc18
26 files changed, 1144 insertions, 435 deletions
diff --git a/base/base.gyp b/base/base.gyp
index 0ad3f16..c1d7a8b 100644
--- a/base/base.gyp
+++ b/base/base.gyp
@@ -34,6 +34,8 @@
'third_party/dmg_fp/dmg_fp.h',
'third_party/dmg_fp/dtoa.cc',
'third_party/dmg_fp/g_fmt.cc',
+ 'third_party/icu/icu_utf.cc',
+ 'third_party/icu/icu_utf.h',
'third_party/nspr/prcpucfg.h',
'third_party/nspr/prcpucfg_win.h',
'third_party/nspr/prtime.cc',
@@ -138,8 +140,8 @@
'hmac_mac.cc',
'hmac_nss.cc',
'hmac_win.cc',
- 'i18n/string_conversions.cc',
- 'i18n/string_conversions.h',
+ 'i18n/icu_string_conversions.cc',
+ 'i18n/icu_string_conversions.h',
'iat_patch.cc',
'iat_patch.h',
'icu_util.cc',
@@ -151,8 +153,8 @@
'json_reader.h',
'json_writer.cc',
'json_writer.h',
- 'keyboard_code_conversion_gtk.cc',
- 'keyboard_code_conversion_gtk.h',
+ 'keyboard_code_conversion_gtk.cc',
+ 'keyboard_code_conversion_gtk.h',
'keyboard_codes.h',
'keyboard_codes_win.h',
'keyboard_codes_posix.h',
@@ -323,6 +325,8 @@
'tracked_objects.cc',
'tracked_objects.h',
'tuple.h',
+ 'utf_string_conversions.cc',
+ 'utf_string_conversions.h',
'unix_domain_socket_posix.cc',
'values.cc',
'values.h',
diff --git a/base/file_util.cc b/base/file_util.cc
index 1ee7abc..d3a989b5 100644
--- a/base/file_util.cc
+++ b/base/file_util.cc
@@ -13,10 +13,9 @@
#include "base/file_path.h"
#include "base/logging.h"
-#include "base/string_util.h"
-
#include "base/string_piece.h"
-#include "base/sys_string_conversions.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
namespace {
diff --git a/base/file_util_posix.cc b/base/file_util_posix.cc
index 7274d76..27adbfa 100644
--- a/base/file_util_posix.cc
+++ b/base/file_util_posix.cc
@@ -34,6 +34,7 @@
#include "base/string_util.h"
#include "base/sys_string_conversions.h"
#include "base/time.h"
+#include "base/utf_string_conversions.h"
#include "unicode/coll.h"
diff --git a/base/file_util_unittest.cc b/base/file_util_unittest.cc
index 57190c5..5b606c9 100644
--- a/base/file_util_unittest.cc
+++ b/base/file_util_unittest.cc
@@ -20,8 +20,8 @@
#include "base/logging.h"
#include "base/path_service.h"
#include "base/platform_thread.h"
-#include "base/string_util.h"
#include "base/time.h"
+#include "base/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "testing/platform_test.h"
diff --git a/base/file_version_info_mac.mm b/base/file_version_info_mac.mm
index ae6603f..f177bca 100644
--- a/base/file_version_info_mac.mm
+++ b/base/file_version_info_mac.mm
@@ -9,6 +9,7 @@
#include "base/file_path.h"
#include "base/logging.h"
#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
FileVersionInfo::FileVersionInfo(NSBundle *bundle) : bundle_(bundle) {
[bundle_ retain];
@@ -43,15 +44,15 @@ FileVersionInfo* FileVersionInfo::CreateFileVersionInfo(
}
std::wstring FileVersionInfo::company_name() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::company_short_name() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::internal_name() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::product_name() {
@@ -63,7 +64,7 @@ std::wstring FileVersionInfo::product_short_name() {
}
std::wstring FileVersionInfo::comments() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::legal_copyright() {
@@ -75,22 +76,22 @@ std::wstring FileVersionInfo::product_version() {
}
std::wstring FileVersionInfo::file_description() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::legal_trademarks() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::private_build() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::file_version() {
// CFBundleVersion has limitations that may not be honored by a
// proper Chromium version number, so try KSVersion first.
std::wstring version = GetStringValue(L"KSVersion");
- if (version == L"")
+ if (version.empty())
version = GetStringValue(L"CFBundleVersion");
return version;
}
@@ -100,7 +101,7 @@ std::wstring FileVersionInfo::original_filename() {
}
std::wstring FileVersionInfo::special_build() {
- return L"";
+ return std::wstring();
}
std::wstring FileVersionInfo::last_change() {
@@ -132,5 +133,5 @@ std::wstring FileVersionInfo::GetStringValue(const wchar_t* name) {
std::wstring str;
if (GetValue(name, &str))
return str;
- return L"";
+ return std::wstring();
}
diff --git a/base/i18n/string_conversions.cc b/base/i18n/icu_string_conversions.cc
index 35c9d6d..225fe0b 100644
--- a/base/i18n/string_conversions.cc
+++ b/base/i18n/icu_string_conversions.cc
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "base/i18n/string_conversions.h"
+#include "base/i18n/icu_string_conversions.h"
#include <vector>
@@ -82,166 +82,6 @@ void ToUnicodeCallbackSubstitute(const void* context,
// else ignore the reset, close and clone calls.
}
-// ReadUnicodeCharacter --------------------------------------------------------
-
-// Reads a UTF-8 stream, placing the next code point into the given output
-// |*code_point|. |src| represents the entire string to read, and |*char_index|
-// is the character offset within the string to start reading at. |*char_index|
-// will be updated to index the last character read, such that incrementing it
-// (as in a for loop) will take the reader to the next character.
-//
-// Returns true on success. On false, |*code_point| will be invalid.
-bool ReadUnicodeCharacter(const char* src, int32 src_len,
- int32* char_index, uint32* code_point_out) {
- // U8_NEXT expects to be able to use -1 to signal an error, so we must
- // use a signed type for code_point. But this function returns false
- // on error anyway, so code_point_out is unsigned.
- int32 code_point;
- U8_NEXT(src, *char_index, src_len, code_point);
- *code_point_out = static_cast<uint32>(code_point);
-
- // The ICU macro above moves to the next char, we want to point to the last
- // char consumed.
- (*char_index)--;
-
- // Validate the decoded value.
- return IsValidCodepoint(code_point);
-}
-
-// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
-bool ReadUnicodeCharacter(const char16* src, int32 src_len,
- int32* char_index, uint32* code_point) {
- if (U16_IS_SURROGATE(src[*char_index])) {
- if (!U16_IS_SURROGATE_LEAD(src[*char_index]) ||
- *char_index + 1 >= src_len ||
- !U16_IS_TRAIL(src[*char_index + 1])) {
- // Invalid surrogate pair.
- return false;
- }
-
- // Valid surrogate pair.
- *code_point = U16_GET_SUPPLEMENTARY(src[*char_index],
- src[*char_index + 1]);
- (*char_index)++;
- } else {
- // Not a surrogate, just one 16-bit word.
- *code_point = src[*char_index];
- }
-
- return IsValidCodepoint(*code_point);
-}
-
-#if defined(WCHAR_T_IS_UTF32)
-// Reads UTF-32 character. The usage is the same as the 8-bit version above.
-bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
- int32* char_index, uint32* code_point) {
- // Conversion is easy since the source is 32-bit.
- *code_point = src[*char_index];
-
- // Validate the value.
- return IsValidCodepoint(*code_point);
-}
-#endif // defined(WCHAR_T_IS_UTF32)
-
-// WriteUnicodeCharacter -------------------------------------------------------
-
-// Appends a UTF-8 character to the given 8-bit string.
-void WriteUnicodeCharacter(uint32 code_point, std::string* output) {
- if (code_point <= 0x7f) {
- // Fast path the common case of one byte.
- output->push_back(code_point);
- return;
- }
-
- // U8_APPEND_UNSAFE can append up to 4 bytes.
- int32 char_offset = static_cast<int32>(output->length());
- output->resize(char_offset + U8_MAX_LENGTH);
-
- U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
-
- // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so
- // it will represent the new length of the string.
- output->resize(char_offset);
-}
-
-// Appends the given code point as a UTF-16 character to the STL string.
-void WriteUnicodeCharacter(uint32 code_point, string16* output) {
- if (U16_LENGTH(code_point) == 1) {
- // Thie code point is in the Basic Multilingual Plane (BMP).
- output->push_back(static_cast<char16>(code_point));
- } else {
- // Non-BMP characters use a double-character encoding.
- int32 char_offset = static_cast<int32>(output->length());
- output->resize(char_offset + U16_MAX_LENGTH);
- U16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
- }
-}
-
-#if defined(WCHAR_T_IS_UTF32)
-// Appends the given UTF-32 character to the given 32-bit string.
-inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
- // This is the easy case, just append the character.
- output->push_back(code_point);
-}
-#endif // defined(WCHAR_T_IS_UTF32)
-
-// Generalized Unicode converter -----------------------------------------------
-
-// Converts the given source Unicode character type to the given destination
-// Unicode character type as a STL string. The given input buffer and size
-// determine the source, and the given output STL string will be replaced by
-// the result.
-template<typename SRC_CHAR, typename DEST_STRING>
-bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
- output->clear();
-
- // ICU requires 32-bit numbers.
- bool success = true;
- int32 src_len32 = static_cast<int32>(src_len);
- for (int32 i = 0; i < src_len32; i++) {
- uint32 code_point;
- if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
- WriteUnicodeCharacter(code_point, output);
- } else {
- // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
- // in place of an invalid codepoint.
- success = false;
- }
- }
- return success;
-}
-
-
-// Guesses the length of the output in UTF-8 in bytes, and reserves that amount
-// of space in the given string. We also assume that the input character types
-// are unsigned, which will be true for UTF-16 and -32 on our systems. We assume
-// the string length is greater than zero.
-template<typename CHAR>
-void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
- if (src[0] < 0x80) {
- // Assume that the entire input will be ASCII.
- output->reserve(src_len);
- } else {
- // Assume that the entire input is non-ASCII and will have 3 bytes per char.
- output->reserve(src_len * 3);
- }
-}
-
-// Guesses the size of the output buffer (containing either UTF-16 or -32 data)
-// given some UTF-8 input that will be converted to it. See ReserveUTF8Output.
-// We assume the source length is > 0.
-template<typename STRING>
-void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
- if (static_cast<unsigned char>(src[0]) < 0x80) {
- // Assume the input is all ASCII, which means 1:1 correspondence.
- output->reserve(src_len);
- } else {
- // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
- // character.
- output->reserve(src_len / 2);
- }
-}
-
bool ConvertFromUTF16(UConverter* converter, const UChar* uchar_src,
int uchar_len, OnStringUtilConversionError::Type on_error,
std::string* encoded) {
@@ -308,184 +148,6 @@ inline UConverterType utf32_platform_endian() {
} // namespace
-// UTF-8 <-> Wide --------------------------------------------------------------
-
-std::string WideToUTF8(const std::wstring& wide) {
- std::string ret;
- if (wide.empty())
- return ret;
-
- // Ignore the success flag of this call, it will do the best it can for
- // invalid input, which is what we want here.
- WideToUTF8(wide.data(), wide.length(), &ret);
- return ret;
-}
-
-bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF8Output(src, src_len, output);
- return ConvertUnicode<wchar_t, std::string>(src, src_len, output);
-}
-
-std::wstring UTF8ToWide(const base::StringPiece& utf8) {
- std::wstring ret;
- if (utf8.empty())
- return ret;
-
- UTF8ToWide(utf8.data(), utf8.length(), &ret);
- return ret;
-}
-
-bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF16Or32Output(src, src_len, output);
- return ConvertUnicode<char, std::wstring>(src, src_len, output);
-}
-
-// UTF-16 <-> Wide -------------------------------------------------------------
-
-#if defined(WCHAR_T_IS_UTF16)
-
-// When wide == UTF-16, then conversions are a NOP.
-string16 WideToUTF16(const std::wstring& wide) {
- return wide;
-}
-
-bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
- output->assign(src, src_len);
- return true;
-}
-
-std::wstring UTF16ToWide(const string16& utf16) {
- return utf16;
-}
-
-bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
- output->assign(src, src_len);
- return true;
-}
-
-#elif defined(WCHAR_T_IS_UTF32)
-
-string16 WideToUTF16(const std::wstring& wide) {
- string16 ret;
- if (wide.empty())
- return ret;
-
- WideToUTF16(wide.data(), wide.length(), &ret);
- return ret;
-}
-
-bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- // Assume that normally we won't have any non-BMP characters so the counts
- // will be the same.
- output->reserve(src_len);
- return ConvertUnicode<wchar_t, string16>(src, src_len, output);
-}
-
-std::wstring UTF16ToWide(const string16& utf16) {
- std::wstring ret;
- if (utf16.empty())
- return ret;
-
- UTF16ToWide(utf16.data(), utf16.length(), &ret);
- return ret;
-}
-
-bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- // Assume that normally we won't have any non-BMP characters so the counts
- // will be the same.
- output->reserve(src_len);
- return ConvertUnicode<char16, std::wstring>(src, src_len, output);
-}
-
-#endif // defined(WCHAR_T_IS_UTF32)
-
-// UTF16 <-> UTF8 --------------------------------------------------------------
-
-#if defined(WCHAR_T_IS_UTF32)
-
-bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF16Or32Output(src, src_len, output);
- return ConvertUnicode<char, string16>(src, src_len, output);
-}
-
-string16 UTF8ToUTF16(const std::string& utf8) {
- string16 ret;
- if (utf8.empty())
- return ret;
-
- // Ignore the success flag of this call, it will do the best it can for
- // invalid input, which is what we want here.
- UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
- return ret;
-}
-
-bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
- if (src_len == 0) {
- output->clear();
- return true;
- }
-
- ReserveUTF8Output(src, src_len, output);
- return ConvertUnicode<char16, std::string>(src, src_len, output);
-}
-
-std::string UTF16ToUTF8(const string16& utf16) {
- std::string ret;
- if (utf16.empty())
- return ret;
-
- // Ignore the success flag of this call, it will do the best it can for
- // invalid input, which is what we want here.
- UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
- return ret;
-}
-
-#elif defined(WCHAR_T_IS_UTF16)
-// Easy case since we can use the "wide" versions we already wrote above.
-
-bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
- return UTF8ToWide(src, src_len, output);
-}
-
-string16 UTF8ToUTF16(const std::string& utf8) {
- return UTF8ToWide(utf8);
-}
-
-bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
- return WideToUTF8(src, src_len, output);
-}
-
-std::string UTF16ToUTF8(const string16& utf16) {
- return WideToUTF8(utf16);
-}
-
-#endif
-
// Codepage <-> Wide/UTF-16 ---------------------------------------------------
// Convert a wstring into the specified codepage_name. If the codepage
diff --git a/base/i18n/icu_string_conversions.h b/base/i18n/icu_string_conversions.h
new file mode 100644
index 0000000..d849c71
--- /dev/null
+++ b/base/i18n/icu_string_conversions.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_ICU_STRING_CONVERSIONS_H_
+#define BASE_I18N_ICU_STRING_CONVERSIONS_H_
+
+#include <string>
+
+#include "base/string16.h"
+#include "base/string_piece.h"
+
+// Defines the error handling modes of UTF16ToCodepage, CodepageToUTF16,
+// WideToCodepage and CodepageToWide.
+class OnStringUtilConversionError {
+ public:
+ enum Type {
+ // The function will return failure. The output buffer will be empty.
+ FAIL,
+
+ // The offending characters are skipped and the conversion will proceed as
+ // if they did not exist.
+ SKIP,
+
+ // When converting to Unicode, the offending byte sequences are substituted
+ // by Unicode replacement character (U+FFFD). When converting from Unicode,
+ // this is the same as SKIP.
+ SUBSTITUTE,
+ };
+
+ private:
+ OnStringUtilConversionError();
+};
+
+// Converts between UTF-16 strings and the encoding specified. If the
+// encoding doesn't exist or the encoding fails (when on_error is FAIL),
+// returns false.
+bool UTF16ToCodepage(const string16& utf16,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::string* encoded);
+
+bool CodepageToUTF16(const std::string& encoded,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ string16* utf16);
+
+// Converts between wide strings and the encoding specified. If the
+// encoding doesn't exist or the encoding fails (when on_error is FAIL),
+// returns false.
+bool WideToCodepage(const std::wstring& wide,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::string* encoded);
+bool CodepageToWide(const std::string& encoded,
+ const char* codepage_name,
+ OnStringUtilConversionError::Type on_error,
+ std::wstring* wide);
+
+#endif // BASE_I18N_ICU_STRING_CONVERSIONS_H_
diff --git a/base/json_reader.cc b/base/json_reader.cc
index 2c3ab0b..ca33cb2 100644
--- a/base/json_reader.cc
+++ b/base/json_reader.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -8,6 +8,7 @@
#include "base/logging.h"
#include "base/scoped_ptr.h"
#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
#include "base/values.h"
static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
@@ -118,8 +119,8 @@ std::string JSONReader::FormatErrorMessage(int line, int column,
}
JSONReader::JSONReader()
- : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
- allow_trailing_comma_(false) {}
+ : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
+ allow_trailing_comma_(false) {}
Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
bool allow_trailing_comma) {
diff --git a/base/json_writer.cc b/base/json_writer.cc
index 1a9f1b6..25df120 100644
--- a/base/json_writer.cc
+++ b/base/json_writer.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -8,6 +8,7 @@
#include "base/string_util.h"
#include "base/values.h"
#include "base/string_escape.h"
+#include "base/utf_string_conversions.h"
#if defined(OS_WIN)
static const char kPrettyPrintLineEnding[] = "\r\n";
diff --git a/base/logging.cc b/base/logging.cc
index d35cfdb..7981310 100644
--- a/base/logging.cc
+++ b/base/logging.cc
@@ -39,7 +39,7 @@ typedef pthread_mutex_t* MutexHandle;
#include "base/lock_impl.h"
#include "base/string_piece.h"
#include "base/string_util.h"
-#include "base/sys_string_conversions.h"
+#include "base/utf_string_conversions.h"
namespace logging {
@@ -328,7 +328,7 @@ void DisplayDebugMessage(const std::string& str) {
backslash[1] = 0;
wcscat_s(prog_name, MAX_PATH, L"debug_message.exe");
- std::wstring cmdline = base::SysUTF8ToWide(str);
+ std::wstring cmdline = UTF8ToWide(str);
if (cmdline.empty())
return;
@@ -578,5 +578,5 @@ void CloseLogFile() {
} // namespace logging
std::ostream& operator<<(std::ostream& out, const wchar_t* wstr) {
- return out << base::SysWideToUTF8(std::wstring(wstr));
+ return out << WideToUTF8(std::wstring(wstr));
}
diff --git a/base/platform_file_posix.cc b/base/platform_file_posix.cc
index f964c62..623223c 100644
--- a/base/platform_file_posix.cc
+++ b/base/platform_file_posix.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -9,7 +9,7 @@
#include <sys/stat.h>
#include "base/logging.h"
-#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
namespace base {
diff --git a/base/process_util_mac.mm b/base/process_util_mac.mm
index 183fe31..61029c0 100644
--- a/base/process_util_mac.mm
+++ b/base/process_util_mac.mm
@@ -19,6 +19,7 @@
#include "base/eintr_wrapper.h"
#include "base/logging.h"
#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
#include "base/time.h"
namespace base {
@@ -42,9 +43,9 @@ void RestoreDefaultExceptionHandler() {
NamedProcessIterator::NamedProcessIterator(const std::wstring& executable_name,
const ProcessFilter* filter)
- : executable_name_(executable_name),
- index_of_kinfo_proc_(0),
- filter_(filter) {
+ : executable_name_(executable_name),
+ index_of_kinfo_proc_(0),
+ filter_(filter) {
// Get a snapshot of all of my processes (yes, as we loop it can go stale, but
// but trying to find where we were in a constantly changing list is basically
// impossible.
@@ -111,7 +112,7 @@ const ProcessEntry* NamedProcessIterator::NextProcessEntry() {
}
bool NamedProcessIterator::CheckForNextProcess() {
- std::string executable_name_utf8(WideToUTF8(executable_name_));
+ std::string executable_name_utf8(base::SysWideToUTF8(executable_name_));
std::string data;
std::string exec_name;
diff --git a/base/stats_table.cc b/base/stats_table.cc
index c175551..522db5a 100644
--- a/base/stats_table.cc
+++ b/base/stats_table.cc
@@ -11,8 +11,8 @@
#include "base/shared_memory.h"
#include "base/string_piece.h"
#include "base/string_util.h"
-#include "base/sys_string_conversions.h"
#include "base/thread_local_storage.h"
+#include "base/utf_string_conversions.h"
#if defined(OS_POSIX)
#include "errno.h"
@@ -170,8 +170,7 @@ StatsTablePrivate* StatsTablePrivate::New(const std::string& name,
int max_threads,
int max_counters) {
scoped_ptr<StatsTablePrivate> priv(new StatsTablePrivate());
- if (!priv->shared_memory_.Create(base::SysUTF8ToWide(name), false, true,
- size))
+ if (!priv->shared_memory_.Create(UTF8ToWide(name), false, true, size))
return NULL;
if (!priv->shared_memory_.Map(size))
return NULL;
diff --git a/base/string16.cc b/base/string16.cc
index ca45fba..d1d0908 100644
--- a/base/string16.cc
+++ b/base/string16.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -14,6 +14,7 @@
#elif defined(WCHAR_T_IS_UTF32)
#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
namespace base {
diff --git a/base/string16_unittest.cc b/base/string16_unittest.cc
new file mode 100644
index 0000000..69eed4b
--- /dev/null
+++ b/base/string16_unittest.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <sstream>
+
+#include "base/string16.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+#if defined(WCHAR_T_IS_UTF32)
+
+// We define a custom operator<< for string16 so we can use it with logging.
+// This tests that conversion.
+TEST(String16Test, OutputStream) {
+ // Basic stream test: an empty string16 and a plain ASCII string16.
+ {
+ std::ostringstream stream;
+ stream << "Empty '" << string16() << "' standard '"
+ << string16(ASCIIToUTF16("Hello, world")) << "'";
+ EXPECT_STREQ("Empty '' standard 'Hello, world'",
+ stream.str().c_str());
+ }
+
+ // Edge cases: lone surrogates, a valid pair, and an unterminated surrogate.
+ {
+ // Each lone surrogate should become U+FFFD, i.e. EF BF BD in UTF-8.
+ string16 initial_surrogate;
+ initial_surrogate.push_back(0xd800);
+ string16 final_surrogate;
+ final_surrogate.push_back(0xdc00);
+
+ // Old italic A = U+10300, will get converted to: F0 90 8C 80 'z'.
+ string16 surrogate_pair;
+ surrogate_pair.push_back(0xd800);
+ surrogate_pair.push_back(0xdf00);
+ surrogate_pair.push_back('z');
+
+ // Will get converted to the invalid char + 's': EF BF BD 's'.
+ string16 unterminated_surrogate;
+ unterminated_surrogate.push_back(0xd800);
+ unterminated_surrogate.push_back('s');
+
+ std::ostringstream stream;
+ stream << initial_surrogate << "," << final_surrogate << ","
+ << surrogate_pair << "," << unterminated_surrogate;
+
+ EXPECT_STREQ("\xef\xbf\xbd,\xef\xbf\xbd,\xf0\x90\x8c\x80z,\xef\xbf\xbds",
+ stream.str().c_str());
+ }
+}
+
+#endif
diff --git a/base/string_util.h b/base/string_util.h
index 52c2a84..254e18f 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -18,7 +18,8 @@
// TODO(brettw) this dependency should be removed and callers that need
// these functions should include this file directly.
-#include "base/i18n/string_conversions.h"
+#include "base/utf_string_conversions.h"
+#include "base/i18n/icu_string_conversions.h"
// Safe standard library wrappers for all platforms.
diff --git a/base/sys_info_posix.cc b/base/sys_info_posix.cc
index 06f7526..74a10ac 100644
--- a/base/sys_info_posix.cc
+++ b/base/sys_info_posix.cc
@@ -2,9 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "base/file_util.h"
#include "base/sys_info.h"
-#include "base/basictypes.h"
#include <errno.h>
#include <string.h>
@@ -23,8 +21,10 @@
#include <sys/sysctl.h>
#endif
+#include "base/basictypes.h"
+#include "base/file_util.h"
#include "base/logging.h"
-#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
namespace base {
@@ -105,7 +105,7 @@ std::wstring SysInfo::GetEnvVar(const wchar_t* var) {
std::string var_utf8 = WideToUTF8(std::wstring(var));
char* value = getenv(var_utf8.c_str());
if (!value) {
- return L"";
+ return std::wstring();
} else {
return UTF8ToWide(value);
}
diff --git a/base/system_monitor_unittest.cc b/base/system_monitor_unittest.cc
index ff39d87..7ba3a6b 100644
--- a/base/system_monitor_unittest.cc
+++ b/base/system_monitor_unittest.cc
@@ -8,10 +8,10 @@
class PowerTest : public base::SystemMonitor::PowerObserver {
public:
PowerTest()
- : battery_(false),
- power_state_changes_(0),
- suspends_(0),
- resumes_(0) {};
+ : battery_(false),
+ power_state_changes_(0),
+ suspends_(0),
+ resumes_(0) {};
// PowerObserver callbacks.
void OnPowerStateChange(bool on_battery_power) {
diff --git a/base/third_party/icu/README b/base/third_party/icu/README
new file mode 100644
index 0000000..faeb5ef
--- /dev/null
+++ b/base/third_party/icu/README
@@ -0,0 +1,8 @@
+This directory has the relevant components from ICU copied to handle basic
+UTF8/16/32 conversions. Components are copied from utf.h, utf8.h, utf16.h and
+utf_impl.c.
+
+The main change is that U_/U8_/U16_ prefixes have been replaced with
+CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU
+macros should ICU be in use on the system. For the same reason, the functions
+and types have been put in the "base_icu" namespace.
diff --git a/base/third_party/icu/icu_utf.cc b/base/third_party/icu/icu_utf.cc
new file mode 100644
index 0000000..b47c8ac
--- /dev/null
+++ b/base/third_party/icu/icu_utf.cc
@@ -0,0 +1,228 @@
+/*
+******************************************************************************
+*
+* Copyright (C) 1999-2006, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: utf_impl.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep13
+* created by: Markus W. Scherer
+*
+* This file provides implementation functions for macros in the utfXX.h
+* that would otherwise be too long as macros.
+*/
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace base_icu {
+
+/**
+ * UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8,
+ * which need 1 or 2 bytes in UTF-8:
+ * \code
+ * U+0015 = NAK = Negative Acknowledge, C0 control character
+ * U+009f = highest C1 control character
+ * \endcode
+ *
+ * These are used by UTF8_..._SAFE macros so that they can return an error value
+ * that needs the same number of code units (bytes) as were seen by
+ * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
+ *
+ * @deprecated ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define CBUTF8_ERROR_VALUE_1 0x15
+
+/**
+ * See documentation on UTF8_ERROR_VALUE_1 for details.
+ *
+ * @deprecated ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define CBUTF8_ERROR_VALUE_2 0x9f
+
+
+/**
+ * Error value for all UTFs. This code point value will be set by macros with
+ * error checking if an error is detected.
+ *
+ * @deprecated ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define CBUTF_ERROR_VALUE 0xffff
+
+/*
+ * This table could be replaced on many machines by
+ * a few lines of assembler code using an
+ * "index of first 0-bit from msb" instruction and
+ * one or two more integer instructions.
+ *
+ * For example, on an i386, do something like
+ * - MOV AL, leadByte
+ * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)
+ * - MOV AH, 0
+ * - BSR BX, AX (16-bit)
+ * - MOV AX, 6 (result)
+ * - JZ finish (ZF==1 if leadByte==0xff)
+ * - SUB AX, BX (result)
+ * -finish:
+ * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
+ *
+ * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
+ * lead bytes above 0xf4 are illegal.
+ * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
+ */
+const uint8
+utf8_countTrailBytes[256]={
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3,
+ 3, 3, 3, /* illegal in Unicode */
+ 4, 4, 4, 4, /* illegal in Unicode */
+ 5, 5, /* illegal in Unicode */
+ 0, 0 /* illegal bytes 0xfe and 0xff */
+};
+
+static const UChar32
+utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
+
+static const UChar32
+utf8_errorValue[6]={
+ CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff,
+ 0x3ffffff, 0x7fffffff
+};
+
+/*
+ * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
+ * UTF8_NEXT_CHAR_SAFE().
+ *
+ * The "strict" parameter controls the error behavior:
+ * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
+ * code point result.
+ * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
+ * All illegal byte sequences yield a positive code point such that this
+ * result code point would be encoded with the same number of bytes as
+ * the illegal sequence.
+ * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
+ * Same as the obsolete "safe" behavior, but non-characters are also treated
+ * like illegal sequences.
+ *
+ * The special negative (<0) value -2 is used for lenient treatment of surrogate
+ * code points as legal. Some implementations use this for roundtripping of
+ * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
+ * contain unpaired surrogates.
+ *
+ * Note that a UBool is the same as an int8_t.
+ */
+UChar32
+utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict) {
+ int32 i=*pi;
+ uint8 count=CBU8_COUNT_TRAIL_BYTES(c);
+ if((i)+count<=(length)) {
+ uint8 trail, illegal=0;
+
+ CBU8_MASK_LEAD_BYTE((c), count);
+ /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+ switch(count) {
+ /* each branch falls through to the next one */
+ case 5:
+ case 4:
+ /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
+ illegal=1;
+ break;
+ case 3:
+ trail=s[(i)++];
+ (c)=((c)<<6)|(trail&0x3f);
+ if(c<0x110) {
+ illegal|=(trail&0xc0)^0x80;
+ } else {
+ /* code point>0x10ffff, outside Unicode */
+ illegal=1;
+ break;
+ }
+ case 2:
+ trail=s[(i)++];
+ (c)=((c)<<6)|(trail&0x3f);
+ illegal|=(trail&0xc0)^0x80;
+ case 1:
+ trail=s[(i)++];
+ (c)=((c)<<6)|(trail&0x3f);
+ illegal|=(trail&0xc0)^0x80;
+ break;
+ case 0:
+ if(strict>=0) {
+ return CBUTF8_ERROR_VALUE_1;
+ } else {
+ return CBU_SENTINEL;
+ }
+ /* no default branch to optimize switch() - all values are covered */
+ }
+
+ /*
+ * All the error handling should return a value
+ * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
+ *
+ * Starting with Unicode 3.0.1, non-shortest forms are illegal.
+ * Starting with Unicode 3.2, surrogate code points must not be
+ * encoded in UTF-8, and there are no irregular sequences any more.
+ *
+ * U8_ macros (new in ICU 2.4) return negative values for error conditions.
+ */
+
+ /* correct sequence - all trail bytes have (b7..b6)==(10)? */
+ /* illegal is also set if count>=4 */
+ if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict!=-2)) {
+ /* error handling */
+ uint8 errorCount=count;
+ /* don't go beyond this sequence */
+ i=*pi;
+ while(count>0 && CBU8_IS_TRAIL(s[i])) {
+ ++(i);
+ --count;
+ }
+ if(strict>=0) {
+ c=utf8_errorValue[errorCount-count];
+ } else {
+ c=CBU_SENTINEL;
+ }
+ } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) {
+ /* strict: forbid non-characters like U+fffe */
+ c=utf8_errorValue[count];
+ }
+ } else /* too few bytes left */ {
+ /* error handling */
+ int32 i0=i;
+ /* don't just set (i)=(length) in case there is an illegal sequence */
+ while((i)<(length) && CBU8_IS_TRAIL(s[i])) {
+ ++(i);
+ }
+ if(strict>=0) {
+ c=utf8_errorValue[i-i0];
+ } else {
+ c=CBU_SENTINEL;
+ }
+ }
+ *pi=i;
+ return c;
+}
+
+} // namespace base_icu
diff --git a/base/third_party/icu/icu_utf.h b/base/third_party/icu/icu_utf.h
new file mode 100644
index 0000000..050a84b
--- /dev/null
+++ b/base/third_party/icu/icu_utf.h
@@ -0,0 +1,358 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: utf.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep09
+* created by: Markus W. Scherer
+*/
+
+#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
+#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
+
+#include "base/basictypes.h"
+
+namespace base_icu {
+
+// A single Unicode code point. Signed, as in ICU (where UChar32 is int32_t),
+// so that CBU_SENTINEL (-1) and the "negative value on error" contract of
+// CBU8_NEXT are representable in this type.
+typedef int32 UChar32;
+typedef int8 UBool;
+
+// General ---------------------------------------------------------------------
+// from utf.h
+
+/**
+ * This value is intended for sentinel values for APIs that
+ * (take or) return single code points (UChar32).
+ * It is outside of the Unicode code point range 0..0x10ffff.
+ *
+ * For example, a "done" or "error" value in a new API
+ * could be indicated with CBU_SENTINEL.
+ *
+ * ICU APIs designed before ICU 2.4 usually define service-specific "done"
+ * values, mostly 0xffff.
+ * Those may need to be distinguished from
+ * actual U+ffff text contents by calling functions like
+ * CharacterIterator::hasNext() or UnicodeString::length().
+ *
+ * @return -1
+ * @see UChar32
+ * @stable ICU 2.4
+ */
+#define CBU_SENTINEL (-1)
+
+/**
+ * Is this code point a Unicode noncharacter?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_UNICODE_NONCHAR(c) \
+ ((c)>=0xfdd0 && \
+ ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
+ (uint32)(c)<=0x10ffff)
+
+/**
+ * Is c a Unicode code point value (0..U+10ffff)
+ * that can be assigned a character?
+ *
+ * Code points that are not characters include:
+ * - single surrogate code points (U+d800..U+dfff, 2048 code points)
+ * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
+ * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
+ * - the highest Unicode code point value is U+10ffff
+ *
+ * This means that all code points below U+d800 are character code points,
+ * and that boundary is tested first for performance.
+ *
+ * Uses the CBU_-prefixed noncharacter test so that this header does not
+ * depend on the real ICU macros being available.
+ *
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_UNICODE_CHAR(c) \
+ ((uint32)(c)<0xd800 || \
+ ((uint32)(c)>0xdfff && \
+ (uint32)(c)<=0x10ffff && \
+ !CBU_IS_UNICODE_NONCHAR(c)))
+
+/**
+ * Is this code point a surrogate (U+d800..U+dfff)?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
+
+/**
+ * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+
+// UTF-8 macros ----------------------------------------------------------------
+// from utf8.h
+
+extern const uint8 utf8_countTrailBytes[256];
+
+/**
+ * Count the trail bytes for a UTF-8 lead byte.
+ * The argument is parenthesized before the cast so that an expression
+ * argument (e.g. a+b) is converted as a whole rather than only its first
+ * operand.
+ * @internal
+ */
+#define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)(leadByte)])
+
+/**
+ * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
+ * @internal
+ */
+#define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
+
+/**
+ * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_SINGLE(c) (((c)&0x80)==0)
+
+/**
+ * Is this code unit (byte) a UTF-8 lead byte?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e)
+
+/**
+ * Is this code unit (byte) a UTF-8 trail byte?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_TRAIL(c) (((c)&0xc0)==0x80)
+
+/**
+ * How many code units (bytes) are used for the UTF-8 encoding
+ * of this Unicode code point?
+ * @param c 32-bit code point
+ * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
+ * @stable ICU 2.4
+ */
+#define CBU8_LENGTH(c) \
+ ((uint32)(c)<=0x7f ? 1 : \
+ ((uint32)(c)<=0x7ff ? 2 : \
+ ((uint32)(c)<=0xd7ff ? 3 : \
+ ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \
+ ((uint32)(c)<=0xffff ? 3 : 4)\
+ ) \
+ ) \
+ ) \
+ )
+
+/**
+ * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
+ * @return 4
+ * @stable ICU 2.4
+ */
+#define CBU8_MAX_LENGTH 4
+
+/**
+ * Function for handling "next code point" with error-checking.
+ * @internal
+ */
+UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict);
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to a negative value.
+ *
+ * @param s const uint8 * string
+ * @param i string offset, i<length
+ * @param length string length
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see CBU8_NEXT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU8_NEXT(s, i, length, c) { \
+ (c)=(s)[(i)++]; \
+ if(((uint8)(c))>=0x80) { \
+ if(CBU8_IS_LEAD(c)) { \
+ (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)s, &(i), (int32)(length), c, -1); \
+ } else { \
+ (c)=CBU_SENTINEL; \
+ } \
+ } \
+}
+
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const uint8 * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see CBU8_APPEND
+ * @stable ICU 2.4
+ */
+#define CBU8_APPEND_UNSAFE(s, i, c) { \
+ if((uint32)(c)<=0x7f) { \
+ (s)[(i)++]=(uint8)(c); \
+ } else { \
+ if((uint32)(c)<=0x7ff) { \
+ (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \
+ } else { \
+ if((uint32)(c)<=0xffff) { \
+ (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \
+ } else { \
+ (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \
+ (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \
+ } \
+}
+
+// UTF-16 macros ---------------------------------------------------------------
+// from utf16.h
+
+/**
+ * Does this code unit alone encode a code point (BMP, not a surrogate)?
+ * Implemented with the CBU_-prefixed surrogate test defined in this header;
+ * the expansion is parenthesized so it composes safely inside larger
+ * expressions.
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SINGLE(c) (!CBU_IS_SURROGATE(c))
+
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
+
+/**
+ * Is this code unit a surrogate (U+d800..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)
+
+/**
+ * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+/**
+ * Helper constant for CBU16_GET_SUPPLEMENTARY.
+ * @internal
+ */
+#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+
+/**
+ * Get a supplementary code point value (U+10000..U+10ffff)
+ * from its lead and trail surrogates.
+ * The result is undefined if the input values are not
+ * lead and trail surrogates.
+ *
+ * @param lead lead surrogate (U+d800..U+dbff)
+ * @param trail trail surrogate (U+dc00..U+dfff)
+ * @return supplementary code point (U+10000..U+10ffff)
+ * @stable ICU 2.4
+ */
+#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
+ (((base_icu::UChar32)(lead)<<10UL)+(base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET)
+
+
+/**
+ * Get the lead surrogate (0xd800..0xdbff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * The result is cast to uint16, the same 16-bit code unit type used by
+ * CBU16_APPEND_UNSAFE, because this header declares no UChar type.
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return lead surrogate (U+d800..U+dbff) for supplementary
+ * @stable ICU 2.4
+ */
+#define CBU16_LEAD(supplementary) (uint16)(((supplementary)>>10)+0xd7c0)
+
+/**
+ * Get the trail surrogate (0xdc00..0xdfff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * The result is cast to uint16, the same 16-bit code unit type used by
+ * CBU16_APPEND_UNSAFE, because this header declares no UChar type.
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return trail surrogate (U+dc00..U+dfff) for supplementary
+ * @stable ICU 2.4
+ */
+#define CBU16_TRAIL(supplementary) (uint16)(((supplementary)&0x3ff)|0xdc00)
+
+/**
+ * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
+ * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
+ * @param c 32-bit code point
+ * @return 1 or 2
+ * @stable ICU 2.4
+ */
+#define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2)
+
+/**
+ * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
+ * @return 2
+ * @stable ICU 2.4
+ */
+#define CBU16_MAX_LENGTH 2
+
+/**
+ * Append a code point to a string, overwriting 1 or 2 code units.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const UChar * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see CBU16_APPEND
+ * @stable ICU 2.4
+ */
+#define CBU16_APPEND_UNSAFE(s, i, c) { \
+ if((uint32)(c)<=0xffff) { \
+ (s)[(i)++]=(uint16)(c); \
+ } else { \
+ (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \
+ (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \
+ } \
+}
+
+} // namespace base_icu
+
+#endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_
diff --git a/base/trace_event.cc b/base/trace_event.cc
index 6c79825..be2fbaa 100644
--- a/base/trace_event.cc
+++ b/base/trace_event.cc
@@ -11,6 +11,7 @@
#include "base/platform_thread.h"
#include "base/process_util.h"
#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
#include "base/time.h"
#define USE_UNRELIABLE_NOW
diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc
new file mode 100644
index 0000000..6b25cd8
--- /dev/null
+++ b/base/utf_string_conversions.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/utf_string_conversions.h"
+
+#include <vector>
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "base/third_party/icu/icu_utf.h"
+
+namespace {
+
+inline bool IsValidCodepoint(uint32 code_point) {
+ // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
+ // codepoints larger than 0x10FFFF (the highest codepoint allowed).
+ // Non-characters and unassigned codepoints are allowed.
+ return code_point < 0xD800u ||
+ (code_point >= 0xE000u && code_point <= 0x10FFFFu);
+}
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Reads a UTF-8 stream, placing the next code point into the given output
+// |*code_point_out|. |src| represents the entire string to read, |src_len| is
+// its length in bytes, and |*char_index| is the byte offset within the string
+// to start reading at. |*char_index| will be updated to index the last byte
+// read, such that incrementing it (as in a for loop) will take the reader to
+// the next character.
+//
+// Returns true on success. On false, |*code_point_out| will be invalid.
+bool ReadUnicodeCharacter(const char* src, int32 src_len,
+ int32* char_index, uint32* code_point_out) {
+ // U8_NEXT expects to be able to use -1 to signal an error, so we must
+ // use a signed type for code_point. But this function returns false
+ // on error anyway, so code_point_out is unsigned.
+ int32 code_point;
+ CBU8_NEXT(src, *char_index, src_len, code_point);
+ *code_point_out = static_cast<uint32>(code_point);
+
+ // The ICU macro above moves to the next char, we want to point to the last
+ // char consumed.
+ (*char_index)--;
+
+ // Validate the decoded value. IsValidCodepoint takes a uint32, so the -1
+ // error sentinel converts to a value above 0x10FFFF and is rejected.
+ return IsValidCodepoint(code_point);
+}
+
+// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
+// On an unpaired or out-of-order surrogate this returns false without writing
+// |*code_point| and without advancing |*char_index|.
+bool ReadUnicodeCharacter(const char16* src, int32 src_len,
+ int32* char_index, uint32* code_point) {
+ if (CBU16_IS_SURROGATE(src[*char_index])) {
+ // A surrogate is only valid as a lead unit that is immediately followed,
+ // within the buffer, by a trail unit.
+ if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
+ *char_index + 1 >= src_len ||
+ !CBU16_IS_TRAIL(src[*char_index + 1])) {
+ // Invalid surrogate pair.
+ return false;
+ }
+
+ // Valid surrogate pair.
+ *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index],
+ src[*char_index + 1]);
+ // Step onto the trail unit so the index points at the last unit consumed.
+ (*char_index)++;
+ } else {
+ // Not a surrogate, just one 16-bit word.
+ *code_point = src[*char_index];
+ }
+
+ return IsValidCodepoint(*code_point);
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+// Reads UTF-32 character. The usage is the same as the 8-bit version above.
+bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len,
+ int32* char_index, uint32* code_point) {
+ // Conversion is easy since the source is 32-bit.
+ *code_point = src[*char_index];
+
+ // Validate the value.
+ return IsValidCodepoint(*code_point);
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Appends a UTF-8 character to the given 8-bit string.
+void WriteUnicodeCharacter(uint32 code_point, std::string* output) {
+ if (code_point <= 0x7f) {
+ // Fast path the common case of one byte.
+ output->push_back(code_point);
+ return;
+ }
+
+ // U8_APPEND_UNSAFE can append up to 4 bytes.
+ int32 char_offset = static_cast<int32>(output->length());
+ output->resize(char_offset + CBU8_MAX_LENGTH);
+
+ CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+
+ // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so
+ // it will represent the new length of the string.
+ output->resize(char_offset);
+}
+
+// Appends the given code point as a UTF-16 character to the STL string.
+void WriteUnicodeCharacter(uint32 code_point, string16* output) {
+ if (CBU16_LENGTH(code_point) == 1) {
+ // This code point is in the Basic Multilingual Plane (BMP).
+ output->push_back(static_cast<char16>(code_point));
+ } else {
+ // Non-BMP characters use a double-character encoding.
+ // Grow the string by two code units first; the append macro then writes
+ // the surrogate pair into the new space starting at the old length.
+ int32 char_offset = static_cast<int32>(output->length());
+ output->resize(char_offset + CBU16_MAX_LENGTH);
+ CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+ }
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+// Appends the given UTF-32 character to the given 32-bit string.
+inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
+ // This is the easy case, just append the character.
+ output->push_back(code_point);
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Converts the given source Unicode character type to the given destination
+// Unicode character type as a STL string. The given input buffer and size
+// determine the source, and the given output STL string will be replaced by
+// the result.
+//
+// Returns true only if every character read from the input was valid. Input
+// that fails to decode is left out of the output and the conversion continues
+// with the following data.
+template<typename SRC_CHAR, typename DEST_STRING>
+bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
+ output->clear();
+
+ // ICU requires 32-bit numbers.
+ bool success = true;
+ int32 src_len32 = static_cast<int32>(src_len);
+ // ReadUnicodeCharacter may move |i| forward itself when a character spans
+ // several code units; the loop increment then steps past the last one.
+ for (int32 i = 0; i < src_len32; i++) {
+ uint32 code_point;
+ if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
+ WriteUnicodeCharacter(code_point, output);
+ } else {
+ // TODO(jungshik): consider adding 'Replacement character' (U+FFFD)
+ // in place of an invalid codepoint.
+ success = false;
+ }
+ }
+ return success;
+}
+
+// Guesses the length of the output in UTF-8 in bytes, and reserves that amount
+// of space in the given string. We also assume that the input character types
+// are unsigned, which will be true for UTF-16 and -32 on our systems. We assume
+// the string length is greater than zero.
+template<typename CHAR>
+void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) {
+ if (src[0] < 0x80) {
+ // Assume that the entire input will be ASCII.
+ output->reserve(src_len);
+ } else {
+ // Assume that the entire input is non-ASCII and will have 3 bytes per char.
+ output->reserve(src_len * 3);
+ }
+}
+
+// Guesses the size of the output buffer (containing either UTF-16 or -32 data)
+// given some UTF-8 input that will be converted to it. See ReserveUTF8Output.
+// We assume the source length is > 0.
+template<typename STRING>
+void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) {
+ if (static_cast<unsigned char>(src[0]) < 0x80) {
+ // Assume the input is all ASCII, which means 1:1 correspondence.
+ output->reserve(src_len);
+ } else {
+ // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
+ // character.
+ output->reserve(src_len / 2);
+ }
+}
+
+} // namespace
+
+// UTF-8 <-> Wide --------------------------------------------------------------
+
+std::string WideToUTF8(const std::wstring& wide) {
+ std::string ret;
+ if (wide.empty())
+ return ret;
+
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ WideToUTF8(wide.data(), wide.length(), &ret);
+ return ret;
+}
+
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
+ if (src_len == 0) {
+ output->clear();
+ return true;
+ }
+
+ ReserveUTF8Output(src, src_len, output);
+ return ConvertUnicode<wchar_t, std::string>(src, src_len, output);
+}
+
+std::wstring UTF8ToWide(const base::StringPiece& utf8) {
+ std::wstring ret;
+ if (utf8.empty())
+ return ret;
+
+ UTF8ToWide(utf8.data(), utf8.length(), &ret);
+ return ret;
+}
+
+bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
+ if (src_len == 0) {
+ output->clear();
+ return true;
+ }
+
+ ReserveUTF16Or32Output(src, src_len, output);
+ return ConvertUnicode<char, std::wstring>(src, src_len, output);
+}
+
+// UTF-16 <-> Wide -------------------------------------------------------------
+
+#if defined(WCHAR_T_IS_UTF16)
+
+// When wide == UTF-16, then conversions are a NOP.
+string16 WideToUTF16(const std::wstring& wide) {
+ return wide;
+}
+
+bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
+ output->assign(src, src_len);
+ return true;
+}
+
+std::wstring UTF16ToWide(const string16& utf16) {
+ return utf16;
+}
+
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+ output->assign(src, src_len);
+ return true;
+}
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+string16 WideToUTF16(const std::wstring& wide) {
+ string16 ret;
+ if (wide.empty())
+ return ret;
+
+ WideToUTF16(wide.data(), wide.length(), &ret);
+ return ret;
+}
+
+bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
+ if (src_len == 0) {
+ output->clear();
+ return true;
+ }
+
+ // Assume that normally we won't have any non-BMP characters so the counts
+ // will be the same.
+ output->reserve(src_len);
+ return ConvertUnicode<wchar_t, string16>(src, src_len, output);
+}
+
+std::wstring UTF16ToWide(const string16& utf16) {
+ std::wstring ret;
+ if (utf16.empty())
+ return ret;
+
+ UTF16ToWide(utf16.data(), utf16.length(), &ret);
+ return ret;
+}
+
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+ if (src_len == 0) {
+ output->clear();
+ return true;
+ }
+
+ // Assume that normally we won't have any non-BMP characters so the counts
+ // will be the same.
+ output->reserve(src_len);
+ return ConvertUnicode<char16, std::wstring>(src, src_len, output);
+}
+
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// UTF16 <-> UTF8 --------------------------------------------------------------
+
+#if defined(WCHAR_T_IS_UTF32)
+
+bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
+ if (src_len == 0) {
+ output->clear();
+ return true;
+ }
+
+ ReserveUTF16Or32Output(src, src_len, output);
+ return ConvertUnicode<char, string16>(src, src_len, output);
+}
+
+string16 UTF8ToUTF16(const std::string& utf8) {
+ string16 ret;
+ if (utf8.empty())
+ return ret;
+
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ UTF8ToUTF16(utf8.data(), utf8.length(), &ret);
+ return ret;
+}
+
+bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
+ if (src_len == 0) {
+ output->clear();
+ return true;
+ }
+
+ ReserveUTF8Output(src, src_len, output);
+ return ConvertUnicode<char16, std::string>(src, src_len, output);
+}
+
+std::string UTF16ToUTF8(const string16& utf16) {
+ std::string ret;
+ if (utf16.empty())
+ return ret;
+
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
+ return ret;
+}
+
+#elif defined(WCHAR_T_IS_UTF16)
+// Easy case since we can use the "wide" versions we already wrote above.
+
+bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
+ return UTF8ToWide(src, src_len, output);
+}
+
+string16 UTF8ToUTF16(const std::string& utf8) {
+ return UTF8ToWide(utf8);
+}
+
+bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
+ return WideToUTF8(src, src_len, output);
+}
+
+std::string UTF16ToUTF8(const string16& utf16) {
+ return WideToUTF8(utf16);
+}
+
+#endif
diff --git a/base/i18n/string_conversions.h b/base/utf_string_conversions.h
index c055bb1..89846ed 100644
--- a/base/i18n/string_conversions.h
+++ b/base/utf_string_conversions.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#ifndef BASE_I18N_STRING_CONVERSIONS_H_
-#define BASE_I18N_STRING_CONVERSIONS_H_
+#ifndef BASE_UTF_STRING_CONVERSIONS_H_
+#define BASE_UTF_STRING_CONVERSIONS_H_
#include <string>
@@ -51,51 +51,4 @@ std::string UTF16ToUTF8(const string16& utf16);
# define UTF16ToWideHack UTF16ToWide
#endif
-// Defines the error handling modes of UTF16ToCodepage, CodepageToUTF16,
-// WideToCodepage and CodepageToWide.
-class OnStringUtilConversionError {
- public:
- enum Type {
- // The function will return failure. The output buffer will be empty.
- FAIL,
-
- // The offending characters are skipped and the conversion will proceed as
- // if they did not exist.
- SKIP,
-
- // When converting to Unicode, the offending byte sequences are substituted
- // by Unicode replacement character (U+FFFD). When converting from Unicode,
- // this is the same as SKIP.
- SUBSTITUTE,
- };
-
- private:
- OnStringUtilConversionError();
-};
-
-// Converts between UTF-16 strings and the encoding specified. If the
-// encoding doesn't exist or the encoding fails (when on_error is FAIL),
-// returns false.
-bool UTF16ToCodepage(const string16& utf16,
- const char* codepage_name,
- OnStringUtilConversionError::Type on_error,
- std::string* encoded);
-
-bool CodepageToUTF16(const std::string& encoded,
- const char* codepage_name,
- OnStringUtilConversionError::Type on_error,
- string16* utf16);
-
-// Converts between wide strings and the encoding specified. If the
-// encoding doesn't exist or the encoding fails (when on_error is FAIL),
-// returns false.
-bool WideToCodepage(const std::wstring& wide,
- const char* codepage_name,
- OnStringUtilConversionError::Type on_error,
- std::string* encoded);
-bool CodepageToWide(const std::string& encoded,
- const char* codepage_name,
- OnStringUtilConversionError::Type on_error,
- std::wstring* wide);
-
-#endif // BASE_I18N_STRING_CONVERSIONS_H_
+#endif // BASE_UTF_STRING_CONVERSIONS_H_
diff --git a/base/values.cc b/base/values.cc
index 51e68a7..305f1cb 100644
--- a/base/values.cc
+++ b/base/values.cc
@@ -1,9 +1,10 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/logging.h"
#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
#include "base/values.h"
///////////////////// Value ////////////////////
diff --git a/chrome/browser/download/download_manager_unittest.cc b/chrome/browser/download/download_manager_unittest.cc
index 164c92b..a5058cf 100644
--- a/chrome/browser/download/download_manager_unittest.cc
+++ b/chrome/browser/download/download_manager_unittest.cc
@@ -1,13 +1,19 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <string>
+#include "base/string_util.h"
+#include "build/build_config.h"
#include "chrome/browser/download/download_manager.h"
#include "chrome/browser/download/download_util.h"
#include "testing/gtest/include/gtest/gtest.h"
+#if defined(OS_LINUX)
+#include <locale.h>
+#endif
+
class DownloadManagerTest : public testing::Test {
public:
DownloadManagerTest() {
@@ -455,6 +461,16 @@ const struct {
// Tests to ensure that the file names we generate from hints from the server
// (content-disposition, URL name, etc) don't cause security holes.
TEST_F(DownloadManagerTest, TestDownloadFilename) {
+#if defined(OS_LINUX)
+ // This test doesn't run when the locale is not UTF-8 because some of the
+ // string conversions fail. This is OK (we have the default value) but they
+ // don't match our expectations.
+ std::string locale = setlocale(LC_CTYPE, NULL);
+ StringToLowerASCII(&locale);
+ ASSERT_NE(std::string::npos, locale.find("utf-8"))
+ << "Your locale must be set to UTF-8 for this test to pass!";
+#endif
+
std::wstring file_name;
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kGeneratedFiles); ++i) {
GetGeneratedFilename(kGeneratedFiles[i].disposition,