diff options
author | brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-06 21:32:31 +0000 |
---|---|---|
committer | brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-06 21:32:31 +0000 |
commit | 7ec490096f5508b0cae67cf1cc0664eaabd090f9 (patch) | |
tree | 7b5b2191e6bc52f5c04ef0437f687780a96067b5 /base/strings | |
parent | 12e60fdb9c4c0c5cdb95c5fabebb15b4c161718c (diff) | |
download | chromium_src-7ec490096f5508b0cae67cf1cc0664eaabd090f9.zip chromium_src-7ec490096f5508b0cae67cf1cc0664eaabd090f9.tar.gz chromium_src-7ec490096f5508b0cae67cf1cc0664eaabd090f9.tar.bz2 |
Move utf_string_conversions to strings/ and add namespace.
This keeps a forwarding header in the old location and adds "using" statements to avoid changing all callers.
BUG=None
R=tfarina@chromium.org
Review URL: https://codereview.chromium.org/14126006
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@198542 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/strings')
-rw-r--r-- | base/strings/utf_string_conversions.cc | 185 | ||||
-rw-r--r-- | base/strings/utf_string_conversions.h | 72 | ||||
-rw-r--r-- | base/strings/utf_string_conversions_unittest.cc | 211 |
3 files changed, 468 insertions, 0 deletions
diff --git a/base/strings/utf_string_conversions.cc b/base/strings/utf_string_conversions.cc new file mode 100644 index 0000000..b75ed0c --- /dev/null +++ b/base/strings/utf_string_conversions.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/utf_string_conversions.h" + +#include "base/string_util.h" +#include "base/strings/string_piece.h" +#include "base/strings/utf_string_conversion_utils.h" + +namespace base { + +namespace { + +// Generalized Unicode converter ----------------------------------------------- + +// Converts the given source Unicode character type to the given destination +// Unicode character type as a STL string. The given input buffer and size +// determine the source, and the given output STL string will be replaced by +// the result. +template<typename SRC_CHAR, typename DEST_STRING> +bool ConvertUnicode(const SRC_CHAR* src, + size_t src_len, + DEST_STRING* output) { + // ICU requires 32-bit numbers. + bool success = true; + int32 src_len32 = static_cast<int32>(src_len); + for (int32 i = 0; i < src_len32; i++) { + uint32 code_point; + if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { + WriteUnicodeCharacter(code_point, output); + } else { + WriteUnicodeCharacter(0xFFFD, output); + success = false; + } + } + + return success; +} + +} // namespace + +// UTF-8 <-> Wide -------------------------------------------------------------- + +bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode(src, src_len, output); +} + +std::string WideToUTF8(const std::wstring& wide) { + std::string ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + WideToUTF8(wide.data(), wide.length(), &ret); + return ret; +} + +bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode(src, src_len, output); +} + +std::wstring UTF8ToWide(const StringPiece& utf8) { + std::wstring ret; + UTF8ToWide(utf8.data(), utf8.length(), &ret); + return ret; +} + +// UTF-16 <-> Wide ------------------------------------------------------------- + +#if defined(WCHAR_T_IS_UTF16) + +// When wide == UTF-16, then conversions are a NOP. +bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + output->assign(src, src_len); + return true; +} + +string16 WideToUTF16(const std::wstring& wide) { + return wide; +} + +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { + output->assign(src, src_len); + return true; +} + +std::wstring UTF16ToWide(const string16& utf16) { + return utf16; +} + +#elif defined(WCHAR_T_IS_UTF32) + +bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + output->clear(); + // Assume that normally we won't have any non-BMP characters so the counts + // will be the same. + output->reserve(src_len); + return ConvertUnicode(src, src_len, output); +} + +string16 WideToUTF16(const std::wstring& wide) { + string16 ret; + WideToUTF16(wide.data(), wide.length(), &ret); + return ret; +} + +bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { + output->clear(); + // Assume that normally we won't have any non-BMP characters so the counts + // will be the same. + output->reserve(src_len); + return ConvertUnicode(src, src_len, output); +} + +std::wstring UTF16ToWide(const string16& utf16) { + std::wstring ret; + UTF16ToWide(utf16.data(), utf16.length(), &ret); + return ret; +} + +#endif // defined(WCHAR_T_IS_UTF32) + +// UTF16 <-> UTF8 -------------------------------------------------------------- + +#if defined(WCHAR_T_IS_UTF32) + +bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode(src, src_len, output); +} + +string16 UTF8ToUTF16(const StringPiece& utf8) { + string16 ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + UTF8ToUTF16(utf8.data(), utf8.length(), &ret); + return ret; +} + +bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode(src, src_len, output); +} + +std::string UTF16ToUTF8(const string16& utf16) { + std::string ret; + // Ignore the success flag of this call, it will do the best it can for + // invalid input, which is what we want here. + UTF16ToUTF8(utf16.data(), utf16.length(), &ret); + return ret; +} + +#elif defined(WCHAR_T_IS_UTF16) +// Easy case since we can use the "wide" versions we already wrote above. + +bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { + return UTF8ToWide(src, src_len, output); +} + +string16 UTF8ToUTF16(const StringPiece& utf8) { + return UTF8ToWide(utf8); +} + +bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { + return WideToUTF8(src, src_len, output); +} + +std::string UTF16ToUTF8(const string16& utf16) { + return WideToUTF8(utf16); +} + +#endif + +std::wstring ASCIIToWide(const StringPiece& ascii) { + DCHECK(IsStringASCII(ascii)) << ascii; + return std::wstring(ascii.begin(), ascii.end()); +} + +string16 ASCIIToUTF16(const StringPiece& ascii) { + DCHECK(IsStringASCII(ascii)) << ascii; + return string16(ascii.begin(), ascii.end()); +} + +} // namespace base diff --git a/base/strings/utf_string_conversions.h b/base/strings/utf_string_conversions.h new file mode 100644 index 0000000..ec6a87f --- /dev/null +++ b/base/strings/utf_string_conversions.h @@ -0,0 +1,72 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_UTF_STRING_CONVERSIONS_H_ +#define BASE_STRINGS_UTF_STRING_CONVERSIONS_H_ + +#include <string> + +#include "base/base_export.h" +#include "base/string16.h" +#include "base/strings/string_piece.h" + +namespace base { + +// These convert between UTF-8, -16, and -32 strings. They are potentially slow, +// so avoid unnecessary conversions. The low-level versions return a boolean +// indicating whether the conversion was 100% valid. In this case, it will still +// do the best it can and put the result in the output buffer. The versions that +// return strings ignore this error and just return the best conversion +// possible. +BASE_EXPORT bool WideToUTF8(const wchar_t* src, size_t src_len, + std::string* output); +BASE_EXPORT std::string WideToUTF8(const std::wstring& wide); +BASE_EXPORT bool UTF8ToWide(const char* src, size_t src_len, + std::wstring* output); +BASE_EXPORT std::wstring UTF8ToWide(const StringPiece& utf8); + +BASE_EXPORT bool WideToUTF16(const wchar_t* src, size_t src_len, + string16* output); +BASE_EXPORT string16 WideToUTF16(const std::wstring& wide); +BASE_EXPORT bool UTF16ToWide(const char16* src, size_t src_len, + std::wstring* output); +BASE_EXPORT std::wstring UTF16ToWide(const string16& utf16); + +BASE_EXPORT bool UTF8ToUTF16(const char* src, size_t src_len, string16* output); +BASE_EXPORT string16 UTF8ToUTF16(const StringPiece& utf8); +BASE_EXPORT bool UTF16ToUTF8(const char16* src, size_t src_len, + std::string* output); +BASE_EXPORT std::string UTF16ToUTF8(const string16& utf16); + +// We are trying to get rid of wstring as much as possible, but it's too big +// a mess to do it all at once. These conversions should be used when we +// really should just be passing a string16 around, but we haven't finished +// porting whatever module uses wstring and the conversion is being used as a +// stopcock. This makes it easy to grep for the ones that should be removed. +#if defined(OS_WIN) +# define WideToUTF16Hack +# define UTF16ToWideHack +#else +# define WideToUTF16Hack WideToUTF16 +# define UTF16ToWideHack UTF16ToWide +#endif + +// These convert an ASCII string, typically a hardcoded constant, to a +// UTF16/Wide string. +BASE_EXPORT std::wstring ASCIIToWide(const StringPiece& ascii); +BASE_EXPORT string16 ASCIIToUTF16(const StringPiece& ascii); + +} // namespace base + +// TODO(brettw) remove these when callers are fixed up. +using base::WideToUTF8; +using base::UTF8ToWide; +using base::WideToUTF16; +using base::UTF16ToWide; +using base::UTF8ToUTF16; +using base::UTF16ToUTF8; +using base::ASCIIToWide; +using base::ASCIIToUTF16; + +#endif // BASE_STRINGS_UTF_STRING_CONVERSIONS_H_ diff --git a/base/strings/utf_string_conversions_unittest.cc b/base/strings/utf_string_conversions_unittest.cc new file mode 100644 index 0000000..974184e --- /dev/null +++ b/base/strings/utf_string_conversions_unittest.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/string_util.h" +#include "base/strings/string_piece.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { + +namespace { + +const wchar_t* const kConvertRoundtripCases[] = { + L"Google Video", + // "网页 图片 资讯更多 »" + L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", + // "Παγκόσμιος Ιστός" + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", + // "Поиск страниц на русском" + L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" + L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" + L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", + // "전체서비스" + L"\xc804\xccb4\xc11c\xbe44\xc2a4", + + // Test characters that take more than 16 bits. This will depend on whether + // wchar_t is 16 or 32 bits. +#if defined(WCHAR_T_IS_UTF16) + L"\xd800\xdf00", + // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", +#elif defined(WCHAR_T_IS_UTF32) + L"\x10300", + // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\x11d40\x11d41\x11d42\x11d43\x11d44", +#endif +}; + +} // namespace + +TEST(UTFStringConversionsTest, ConvertUTF8AndWide) { + // we round-trip all the wide strings through UTF-8 to make sure everything + // agrees on the conversion. This uses the stream operators to test them + // simultaneously. + for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { + std::ostringstream utf8; + utf8 << WideToUTF8(kConvertRoundtripCases[i]); + std::wostringstream wide; + wide << UTF8ToWide(utf8.str()); + + EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); + } +} + +TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) { + // An empty std::wstring should be converted to an empty std::string, + // and vice versa. + std::wstring wempty; + std::string empty; + EXPECT_EQ(empty, WideToUTF8(wempty)); + EXPECT_EQ(wempty, UTF8ToWide(empty)); +} + +TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d\xfffd", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"\xfffd", false}, + // Non-BMP characters. The second is a non-character regarded as valid. + // The result will either be in UTF-16 or UTF-32. +#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::wstring converted; + EXPECT_EQ(convert_cases[i].success, + UTF8ToWide(convert_cases[i].utf8, + strlen(convert_cases[i].utf8), + &converted)); + std::wstring expected(convert_cases[i].wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. + std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3U, converted.length()); + EXPECT_EQ(static_cast<wchar_t>(0), converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1U, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#if defined(WCHAR_T_IS_UTF16) +// This test is only valid when wchar_t == UTF-16. +TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false}, + // Truncated at the end. + {L"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} + +#elif defined(WCHAR_T_IS_UTF32) +// This test is only valid when wchar_t == UTF-32. +TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf32; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // Invalid Unicode code points. + {L"\xfffffffHello", "\xEF\xBF\xBDHello", false}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false}, + {L"\xdc01Hello", "\xef\xbf\xbdHello", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf32, + wcslen(convert_cases[i].utf32), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} +#endif // defined(WCHAR_T_IS_UTF32) + +TEST(UTFStringConversionsTest, ConvertMultiString) { + static wchar_t wmulti[] = { + L'f', L'o', L'o', L'\0', + L'b', L'a', L'r', L'\0', + L'b', L'a', L'z', L'\0', + L'\0' + }; + static char multi[] = { + 'f', 'o', 'o', '\0', + 'b', 'a', 'r', '\0', + 'b', 'a', 'z', '\0', + '\0' + }; + std::wstring wmultistring; + memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); + EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); + std::string expected; + memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); + EXPECT_EQ(arraysize(multi) - 1, expected.length()); + const std::string& converted = WideToUTF8(wmultistring); + EXPECT_EQ(arraysize(multi) - 1, converted.length()); + EXPECT_EQ(expected, converted); +} + +} // base |