summaryrefslogtreecommitdiffstats
path: root/base/utf_string_conversions_unittest.cc
diff options
context:
space:
mode:
authorpkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-11-07 01:34:53 +0000
committerpkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-11-07 01:34:53 +0000
commitce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch)
tree0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 /base/utf_string_conversions_unittest.cc
parentd90684d0cf0aa16389c9202153c97d373829b7f3 (diff)
downloadchromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip
chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz
chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2
Fix various problems with inline autocomplete and URLs that change length during fixup:
* URLs with http auth info, which gets stripped * URLs with IDN hosts * URLs with escaped values that get unescaped In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes DCHECK() in debug mode. The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :( There is also a fix here to the URL Fixer Upper, which was obviously modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since this isn't supposed to "fix up" things that aren't simple typos, I removed some code to "fix" bogus ports, which was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it). BUG=4010 TEST=Covered by unittests Review URL: http://codereview.chromium.org/372017 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/utf_string_conversions_unittest.cc')
-rw-r--r--base/utf_string_conversions_unittest.cc306
1 files changed, 306 insertions, 0 deletions
diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc
new file mode 100644
index 0000000..67af7c3
--- /dev/null
+++ b/base/utf_string_conversions_unittest.cc
@@ -0,0 +1,306 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/basictypes.h"
+#include "base/string_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace base {
+
+namespace {
+
+// Given a null-terminated string of wchar_t with each wchar_t representing
+// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
+// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
+// should be represented as a surrogate pair (two UTF-16 units)
+// *even* where wchar_t is 32-bit (Linux and Mac).
+//
+// This is to help write tests for functions with string16 params until
+// the C++ 0x UTF-16 literal is well-supported by compilers.
+string16 BuildString16(const wchar_t* s) {
+#if defined(WCHAR_T_IS_UTF16)
+ return string16(s);
+#elif defined(WCHAR_T_IS_UTF32)
+ string16 u16;
+ while (*s != 0) {
+ DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu);
+ u16.push_back(*s++);
+ }
+ return u16;
+#endif
+}
+
+const wchar_t* const kConvertRoundtripCases[] = {
+ L"Google Video",
+ // "网页 图片 资讯更多 »"
+ L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
+ // "Παγκόσμιος Ιστός"
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
+ // "Поиск страниц на русском"
+ L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
+ L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
+ L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
+ // "전체서비스"
+ L"\xc804\xccb4\xc11c\xbe44\xc2a4",
+
+ // Test characters that take more than 16 bits. This will depend on whether
+ // wchar_t is 16 or 32 bits.
+#if defined(WCHAR_T_IS_UTF16)
+ L"\xd800\xdf00",
+ // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
+ L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
+#elif defined(WCHAR_T_IS_UTF32)
+ L"\x10300",
+ // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E)
+ L"\x11d40\x11d41\x11d42\x11d43\x11d44",
+#endif
+};
+
+} // namespace
+
+TEST(UTFStringConversionsTest, ConvertUTF8AndWide) {
+ // we round-trip all the wide strings through UTF-8 to make sure everything
+ // agrees on the conversion. This uses the stream operators to test them
+ // simultaneously.
+ for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) {
+ std::ostringstream utf8;
+ utf8 << WideToUTF8(kConvertRoundtripCases[i]);
+ std::wostringstream wide;
+ wide << UTF8ToWide(utf8.str());
+
+ EXPECT_EQ(kConvertRoundtripCases[i], wide.str());
+ }
+}
+
+TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) {
+ // An empty std::wstring should be converted to an empty std::string,
+ // and vice versa.
+ std::wstring wempty;
+ std::string empty;
+ EXPECT_EQ(empty, WideToUTF8(wempty));
+ EXPECT_EQ(wempty, UTF8ToWide(empty));
+}
+
+TEST(UTFStringConversionsTest, ConvertUTF8ToWide) {
+ struct UTF8ToWideCase {
+ const char* utf8;
+ const wchar_t* wide;
+ bool success;
+ } convert_cases[] = {
+ // Regular UTF-8 input.
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
+ // Non-character is passed through.
+ {"\xef\xbf\xbfHello", L"\xffffHello", true},
+ // Truncated UTF-8 sequence.
+ {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false},
+ // Truncated off the end.
+ {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false},
+ // Non-shortest-form UTF-8.
+ {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false},
+ // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
+ {"\xed\xb0\x80", L"", false},
+ // Non-BMP characters. The second is a non-character regarded as valid.
+ // The result will either be in UTF-16 or UTF-32.
+#if defined(WCHAR_T_IS_UTF16)
+ {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+ {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true},
+#elif defined(WCHAR_T_IS_UTF32)
+ {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+ {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true},
+#endif
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {
+ std::wstring converted;
+ EXPECT_EQ(convert_cases[i].success,
+ UTF8ToWide(convert_cases[i].utf8,
+ strlen(convert_cases[i].utf8),
+ &converted));
+ std::wstring expected(convert_cases[i].wide);
+ EXPECT_EQ(expected, converted);
+ }
+
+ // Manually test an embedded NULL.
+ std::wstring converted;
+ EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted));
+ ASSERT_EQ(3U, converted.length());
+ EXPECT_EQ(static_cast<wchar_t>(0), converted[0]);
+ EXPECT_EQ('Z', converted[1]);
+ EXPECT_EQ('\t', converted[2]);
+
+ // Make sure that conversion replaces, not appends.
+ EXPECT_TRUE(UTF8ToWide("B", 1, &converted));
+ ASSERT_EQ(1U, converted.length());
+ EXPECT_EQ('B', converted[0]);
+}
+
+#if defined(WCHAR_T_IS_UTF16)
+// This test is only valid when wchar_t == UTF-16.
+TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) {
+ struct WideToUTF8Case {
+ const wchar_t* utf16;
+ const char* utf8;
+ bool success;
+ } convert_cases[] = {
+ // Regular UTF-16 input.
+ {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+ // Test a non-BMP character.
+ {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
+ // Non-characters are passed through.
+ {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+ {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
+ // The first character is a truncated UTF-16 character.
+ {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+ // Truncated at the end.
+ {L"\x597d\xd800", "\xe5\xa5\xbd", false},
+ };
+
+ for (int i = 0; i < arraysize(convert_cases); i++) {
+ std::string converted;
+ EXPECT_EQ(convert_cases[i].success,
+ WideToUTF8(convert_cases[i].utf16,
+ wcslen(convert_cases[i].utf16),
+ &converted));
+ std::string expected(convert_cases[i].utf8);
+ EXPECT_EQ(expected, converted);
+ }
+}
+
+#elif defined(WCHAR_T_IS_UTF32)
+// This test is only valid when wchar_t == UTF-32.
+TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) {
+ struct WideToUTF8Case {
+ const wchar_t* utf32;
+ const char* utf8;
+ bool success;
+ } convert_cases[] = {
+ // Regular 16-bit input.
+ {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+ // Test a non-BMP character.
+ {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+ // Non-characters are passed through.
+ {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+ {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
+ // Invalid Unicode code points.
+ {L"\xfffffffHello", "Hello", false},
+ // The first character is a truncated UTF-16 character.
+ {L"\xd800\x597d", "\xe5\xa5\xbd", false},
+ {L"\xdc01Hello", "Hello", false},
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) {
+ std::string converted;
+ EXPECT_EQ(convert_cases[i].success,
+ WideToUTF8(convert_cases[i].utf32,
+ wcslen(convert_cases[i].utf32),
+ &converted));
+ std::string expected(convert_cases[i].utf8);
+ EXPECT_EQ(expected, converted);
+ }
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+TEST(UTFStringConversionsTest, ConvertMultiString) {
+ static wchar_t wmulti[] = {
+ L'f', L'o', L'o', L'\0',
+ L'b', L'a', L'r', L'\0',
+ L'b', L'a', L'z', L'\0',
+ L'\0'
+ };
+ static char multi[] = {
+ 'f', 'o', 'o', '\0',
+ 'b', 'a', 'r', '\0',
+ 'b', 'a', 'z', '\0',
+ '\0'
+ };
+ std::wstring wmultistring;
+ memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti));
+ EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length());
+ std::string expected;
+ memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi));
+ EXPECT_EQ(arraysize(multi) - 1, expected.length());
+ const std::string& converted = WideToUTF8(wmultistring);
+ EXPECT_EQ(arraysize(multi) - 1, converted.length());
+ EXPECT_EQ(expected, converted);
+}
+
+TEST(UTFStringConversionsTest, AdjustOffset) {
+ // Under the hood, all the functions call the same converter function, so we
+ // don't need to exhaustively check every case.
+ struct WideToUTF8Case {
+ const wchar_t* wide;
+ size_t input_offset;
+ size_t output_offset;
+ } wide_to_utf8_cases[] = {
+ {L"", 0, std::string::npos},
+ {L"\x4f60\x597d", 0, 0},
+ {L"\x4f60\x597d", 1, 3},
+ {L"\x4f60\x597d", 2, std::string::npos},
+ {L"\x4f60\x597d", std::wstring::npos, std::string::npos},
+ {L"\xd800\x597dz", 1, 0},
+ {L"\xd800\x597dz", 2, 3},
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf8_cases); ++i) {
+ size_t offset = wide_to_utf8_cases[i].input_offset;
+ WideToUTF8AndAdjustOffset(wide_to_utf8_cases[i].wide, &offset);
+ EXPECT_EQ(wide_to_utf8_cases[i].output_offset, offset);
+ }
+
+ struct UTF8ToWideCase {
+ const char* utf8;
+ size_t input_offset;
+ size_t output_offset;
+ } utf8_to_wide_cases[] = {
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
+ {"\xed\xb0\x80z", 3, 0},
+ {"A\xF0\x90\x8C\x80z", 1, 1},
+ {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos},
+#if defined(WCHAR_T_IS_UTF16)
+ {"A\xF0\x90\x8C\x80z", 5, 3},
+#elif defined(WCHAR_T_IS_UTF32)
+ {"A\xF0\x90\x8C\x80z", 5, 2},
+#endif
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_wide_cases); ++i) {
+ size_t offset = utf8_to_wide_cases[i].input_offset;
+ UTF8ToWideAndAdjustOffset(utf8_to_wide_cases[i].utf8, &offset);
+ EXPECT_EQ(utf8_to_wide_cases[i].output_offset, offset);
+ }
+
+#if defined(WCHAR_T_IS_UTF32)
+ struct WideToUTF16Case {
+ const wchar_t* wide;
+ size_t input_offset;
+ size_t output_offset;
+ } wide_to_utf16_cases[] = {
+ {L"\x4F60\x597D", 1, 1},
+ {L"\x20000\x4E00", 1, 2},
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf16_cases); ++i) {
+ size_t offset = wide_to_utf16_cases[i].input_offset;
+ WideToUTF16AndAdjustOffset(wide_to_utf16_cases[i].wide, &offset);
+ EXPECT_EQ(wide_to_utf16_cases[i].output_offset, offset);
+ }
+
+ struct UTF16ToWideCase {
+ const wchar_t* wide;
+ size_t input_offset;
+ size_t output_offset;
+ } utf16_to_wide_cases[] = {
+ {L"\xD840\xDC00\x4E00", 0, 0},
+ {L"\xD840\xDC00\x4E00", 1, std::wstring::npos},
+ {L"\xD840\xDC00\x4E00", 2, 1},
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) {
+ size_t offset = utf16_to_wide_cases[i].input_offset;
+ UTF16ToWideAndAdjustOffset(BuildString16(utf16_to_wide_cases[i].wide),
+ &offset);
+ EXPECT_EQ(utf16_to_wide_cases[i].output_offset, offset);
+ }
+#endif
+}
+
+} // namaspace base