diff options
32 files changed, 1393 insertions, 747 deletions
diff --git a/app/gfx/text_elider.cc b/app/gfx/text_elider.cc index a1db1c6..dc9b199 100644 --- a/app/gfx/text_elider.cc +++ b/app/gfx/text_elider.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -33,8 +33,8 @@ std::wstring ElideUrl(const GURL& url, const std::wstring& languages) { // Get a formatted string and corresponding parsing of the url. url_parse::Parsed parsed; - std::wstring url_string = - net::FormatUrl(url, languages, true, UnescapeRule::SPACES, &parsed, NULL); + std::wstring url_string = net::FormatUrl(url, languages, true, + UnescapeRule::SPACES, &parsed, NULL, NULL); if (available_pixel_width <= 0) return url_string; @@ -334,12 +334,12 @@ std::wstring ElideText(const std::wstring& text, SortedDisplayURL::SortedDisplayURL(const GURL& url, const std::wstring& languages) { std::wstring host; - net::AppendFormattedHost(url, languages, &host, NULL); + net::AppendFormattedHost(url, languages, &host, NULL, NULL); sort_host_ = WideToUTF16Hack(host); string16 host_minus_www = WideToUTF16Hack(net::StripWWW(host)); url_parse::Parsed parsed; display_url_ = WideToUTF16Hack(net::FormatUrl(url, languages, - true, UnescapeRule::SPACES, &parsed, &prefix_end_)); + true, UnescapeRule::SPACES, &parsed, &prefix_end_, NULL)); if (sort_host_.length() > host_minus_www.length()) { prefix_end_ += sort_host_.length() - host_minus_www.length(); sort_host_.swap(host_minus_www); diff --git a/base/base.gyp b/base/base.gyp index 71ff640..f09e2e5 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -633,6 +633,7 @@ 'timer_unittest.cc', 'tracked_objects_unittest.cc', 'tuple_unittest.cc', + 'utf_string_conversions_unittest.cc', 'values_unittest.cc', 'version_unittest.cc', 'waitable_event_unittest.cc', diff --git a/base/i18n/icu_string_conversions.cc 
b/base/i18n/icu_string_conversions.cc index ba9f9ae..c93b103 100644 --- a/base/i18n/icu_string_conversions.cc +++ b/base/i18n/icu_string_conversions.cc @@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE"; // Codepage <-> Wide/UTF-16 --------------------------------------------------- +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. +bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); +} + +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + // Even in the worst case, the maximum length in 2-byte units of UTF-16 + // output would be at most the same as the number of bytes in input. There + // is no single-byte encoding in which a character is mapped to a + // non-BMP character requiring two 2-byte units. + // + // Moreover, non-BMP characters in legacy multibyte encodings + // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are + // BOCU and SCSU, but we don't care about them. 
+ size_t uchar_max_length = encoded.length() + 1; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + char16* byte_buffer = WriteInto(utf16, uchar_max_length); + int byte_buffer_length = static_cast<int>(uchar_max_length); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = string16::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, + data, static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size; + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = string16::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data, + length, &status); + ucnv_close(converter); + if (!U_SUCCESS(status)) { + utf16->clear(); // Make sure the output is empty on error. + return false; + } + + utf16->resize(actual_size); + return true; +} + // Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. 
bool WideToCodepage(const std::wstring& wide, @@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide, #endif // defined(WCHAR_T_IS_UTF32) } -// Convert a UTF-16 string into the specified codepage_name. If the codepage -// isn't found, return false. -bool UTF16ToCodepage(const string16& utf16, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::string* encoded) { - encoded->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - return ConvertFromUTF16(converter, utf16.c_str(), - static_cast<int>(utf16.length()), on_error, encoded); -} - // Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide) { +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment) { #if defined(WCHAR_T_IS_UTF16) - return CodepageToUTF16(encoded, codepage_name, on_error, wide); + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide, + offset_for_adjustment); #elif defined(WCHAR_T_IS_UTF32) wide->clear(); @@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded, // this can be 4 times larger than actually needed. size_t wchar_max_length = encoded.length() + 1; - // The byte buffer and its length to pass to ucnv_toAlgorithimic. 
- char* byte_buffer = reinterpret_cast<char*>( - WriteInto(wide, wchar_max_length)); - int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), - converter, - byte_buffer, - byte_buffer_length, - encoded.data(), - static_cast<int>(encoded.length()), - &status); + char* byte_buffer = + reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = std::wstring::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, + static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size / sizeof(wchar_t); + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. 
+ *offset_for_adjustment = std::wstring::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, length, &status); ucnv_close(converter); - if (!U_SUCCESS(status)) { wide->clear(); // Make sure the output is empty on error. return false; } // actual_size is # of bytes. - wide->resize(actual_size / 4); + wide->resize(actual_size / sizeof(wchar_t)); return true; #endif // defined(WCHAR_T_IS_UTF32) } -// Converts a string of the given codepage into UTF-16. -// If the codepage isn't found, return false. -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16) { - utf16->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - // Even in the worst case, the maximum length in 2-byte units of UTF-16 - // output would be at most the same as the number of bytes in input. There - // is no single-byte encoding in which a character is mapped to a - // non-BMP character requiring two 2-byte units. - // - // Moreover, non-BMP characters in legacy multibyte encodings - // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are - // BOCU and SCSU, but we don't care about them. - size_t uchar_max_length = encoded.length() + 1; - - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toUChars(converter, - WriteInto(utf16, uchar_max_length), - static_cast<int>(uchar_max_length), - encoded.data(), - static_cast<int>(encoded.length()), - &status); - ucnv_close(converter); - if (!U_SUCCESS(status)) { - utf16->clear(); // Make sure the output is empty on error. 
- return false; - } - - utf16->resize(actual_size); - return true; -} - } // namespace base diff --git a/base/i18n/icu_string_conversions.h b/base/i18n/icu_string_conversions.h index e7dac605..6f2cab7 100644 --- a/base/i18n/icu_string_conversions.h +++ b/base/i18n/icu_string_conversions.h @@ -40,6 +40,17 @@ extern const char kCodepageUTF8[]; extern const char kCodepageUTF16BE[]; extern const char kCodepageUTF16LE[]; +// Like CodepageToUTF16() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |utf16|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::string16::npos. +// |offset_for_adjustment| may be NULL. +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment); + // Converts between UTF-16 strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), // returns false. @@ -47,11 +58,24 @@ bool UTF16ToCodepage(const string16& utf16, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); +inline bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16) { + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, utf16, + NULL); +} -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16); +// Like CodepageToWide() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |wide|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::wstring::npos. 
+// |offset_for_adjustment| may be NULL. +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment); // Converts between wide strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), @@ -60,10 +84,13 @@ bool WideToCodepage(const std::wstring& wide, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide); +inline bool CodepageToWide(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide) { + return CodepageToWideAndAdjustOffset(encoded, codepage_name, on_error, wide, + NULL); +} } // namespace base diff --git a/base/i18n/icu_string_conversions_unittest.cc b/base/i18n/icu_string_conversions_unittest.cc index 969ddb7..0088a03 100644 --- a/base/i18n/icu_string_conversions_unittest.cc +++ b/base/i18n/icu_string_conversions_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -9,9 +9,9 @@ #include <sstream> #include "base/basictypes.h" +#include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/utf_string_conversions.h" -#include "base/i18n/icu_string_conversions.h" #include "testing/gtest/include/gtest/gtest.h" namespace base { @@ -39,7 +39,7 @@ string16 BuildString16(const wchar_t* s) { #endif } -static const wchar_t* const kConvertRoundtripCases[] = { +const wchar_t* const kConvertRoundtripCases[] = { L"Google Video", // "网页 图片 资讯更多 »" L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", @@ -68,7 +68,7 @@ static const wchar_t* const kConvertRoundtripCases[] = { } // namespace -TEST(StringUtilTest, ConvertCodepageUTF8) { +TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { // Make sure WideToCodepage works like WideToUTF8. for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { std::string expected(WideToUTF8(kConvertRoundtripCases[i])); @@ -156,7 +156,7 @@ static const struct { true, #if defined(WCHAR_T_IS_UTF16) L"\xD840\xDC00\x4E00", -#else +#elif defined(WCHAR_T_IS_UTF32) L"\x20000\x4E00", #endif L"\xD840\xDC00\x4E00"}, @@ -234,7 +234,7 @@ static const struct { NULL}, }; -TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { std::wstring wide; bool success = CodepageToWide(kConvertCodepageCases[i].encoded, @@ -296,7 +296,7 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { OnStringConversionError::SKIP, &encoded)); } -TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { string16 utf16; bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, @@ -325,4 +325,45 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { } } +static const struct { + const char* codepage_name; + const 
char* encoded; + size_t input_offset; + size_t u16_output_offset; + size_t wide_output_offset; +} kAdjustOffsetCases[] = { + {"gb2312", "", 0, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 0, 0, 0}, + {"gb2312", "\xC4\xE3\xBA\xC3", 2, 1, 1}, + {"gb2312", "\xC4\xE3\xBA\xC3", 4, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 1, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", std::string::npos, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 2, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 4, 2, 1}, +}; + +TEST(ICUStringConversionsTest, AdjustOffset) { + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kAdjustOffsetCases); ++i) { + string16 utf16; + size_t offset = kAdjustOffsetCases[i].input_offset; + EXPECT_TRUE(CodepageToUTF16AndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &utf16, &offset)); + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); + + std::wstring wide; + offset = kAdjustOffsetCases[i].input_offset; + CodepageToWideAndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &wide, &offset); +#if defined(WCHAR_T_IS_UTF16) + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); +#elif defined(WCHAR_T_IS_UTF32) + EXPECT_EQ(kAdjustOffsetCases[i].wide_output_offset, offset); +#endif + } +} + } // namespace base diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 0ccea91..d691003 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -229,203 +229,6 @@ TEST(StringUtilTest, IsStringUTF8) { EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); } -static const wchar_t* const kConvertRoundtripCases[] = { - L"Google Video", - // "网页 图片 资讯更多 »" - L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", - // "Παγκόσμιος Ιστός" - L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" - L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", - // "Поиск страниц на русском" - L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" - L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" - L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", - // "전체서비스" - L"\xc804\xccb4\xc11c\xbe44\xc2a4", - - // Test characters that take more than 16 bits. This will depend on whether - // wchar_t is 16 or 32 bits. -#if defined(WCHAR_T_IS_UTF16) - L"\xd800\xdf00", - // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) - L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", -#elif defined(WCHAR_T_IS_UTF32) - L"\x10300", - // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) - L"\x11d40\x11d41\x11d42\x11d43\x11d44", -#endif -}; - -TEST(StringUtilTest, ConvertUTF8AndWide) { - // we round-trip all the wide strings through UTF-8 to make sure everything - // agrees on the conversion. This uses the stream operators to test them - // simultaneously. - for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { - std::ostringstream utf8; - utf8 << WideToUTF8(kConvertRoundtripCases[i]); - std::wostringstream wide; - wide << UTF8ToWide(utf8.str()); - - EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); - } -} - -TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { - // An empty std::wstring should be converted to an empty std::string, - // and vice versa. 
- std::wstring wempty; - std::string empty; - EXPECT_EQ(empty, WideToUTF8(wempty)); - EXPECT_EQ(wempty, UTF8ToWide(empty)); -} - -TEST(StringUtilTest, ConvertUTF8ToWide) { - struct UTF8ToWideCase { - const char* utf8; - const wchar_t* wide; - bool success; - } convert_cases[] = { - // Regular UTF-8 input. - {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, - // Non-character is passed through. - {"\xef\xbf\xbfHello", L"\xffffHello", true}, - // Truncated UTF-8 sequence. - {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, - // Truncated off the end. - {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, - // Non-shortest-form UTF-8. - {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, - // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. - {"\xed\xb0\x80", L"", false}, - // Non-BMP characters. The second is a non-character regarded as valid. - // The result will either be in UTF-16 or UTF-32. -#if defined(WCHAR_T_IS_UTF16) - {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, -#elif defined(WCHAR_T_IS_UTF32) - {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, -#endif - }; - - for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { - std::wstring converted; - EXPECT_EQ(convert_cases[i].success, - UTF8ToWide(convert_cases[i].utf8, - strlen(convert_cases[i].utf8), - &converted)); - std::wstring expected(convert_cases[i].wide); - EXPECT_EQ(expected, converted); - } - - // Manually test an embedded NULL. - std::wstring converted; - EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); - ASSERT_EQ(3U, converted.length()); -#if defined(WCHAR_T_IS_UNSIGNED) - EXPECT_EQ(0U, converted[0]); -#else - EXPECT_EQ(0, converted[0]); -#endif - EXPECT_EQ('Z', converted[1]); - EXPECT_EQ('\t', converted[2]); - - // Make sure that conversion replaces, not appends. 
- EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); - ASSERT_EQ(1U, converted.length()); - EXPECT_EQ('B', converted[0]); -} - -#if defined(WCHAR_T_IS_UTF16) -// This test is only valid when wchar_t == UTF-16. -TEST(StringUtilTest, ConvertUTF16ToUTF8) { - struct UTF16ToUTF8Case { - const wchar_t* utf16; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular UTF-16 input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, - // Non-characters are passed through. - {L"\xffffHello", "\xEF\xBF\xBFHello", true}, - {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, - // The first character is a truncated UTF-16 character. - {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - // Truncated at the end. - {L"\x597d\xd800", "\xe5\xa5\xbd", false}, - }; - - for (int i = 0; i < arraysize(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf16, - wcslen(convert_cases[i].utf16), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} - -#elif defined(WCHAR_T_IS_UTF32) -// This test is only valid when wchar_t == UTF-32. -TEST(StringUtilTest, ConvertUTF32ToUTF8) { - struct WideToUTF8Case { - const wchar_t* utf32; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular 16-bit input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, - // Non-characters are passed through. - {L"\xffffHello", "\xEF\xBF\xBFHello", true}, - {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, - // Invalid Unicode code points. - {L"\xfffffffHello", "Hello", false}, - // The first character is a truncated UTF-16 character. 
- {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - {L"\xdc01Hello", "Hello", false}, - }; - - for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf32, - wcslen(convert_cases[i].utf32), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} -#endif // defined(WCHAR_T_IS_UTF32) - -TEST(StringUtilTest, ConvertMultiString) { - static wchar_t wmulti[] = { - L'f', L'o', L'o', L'\0', - L'b', L'a', L'r', L'\0', - L'b', L'a', L'z', L'\0', - L'\0' - }; - static char multi[] = { - 'f', 'o', 'o', '\0', - 'b', 'a', 'r', '\0', - 'b', 'a', 'z', '\0', - '\0' - }; - std::wstring wmultistring; - memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); - EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); - std::string expected; - memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); - EXPECT_EQ(arraysize(multi) - 1, expected.length()); - const std::string& converted = WideToUTF8(wmultistring); - EXPECT_EQ(arraysize(multi) - 1, converted.length()); - EXPECT_EQ(expected, converted); -} - TEST(StringUtilTest, ConvertASCII) { static const char* char_cases[] = { "Google Video", diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc index 6b25cd8..ffff50a 100644 --- a/base/utf_string_conversions.cc +++ b/base/utf_string_conversions.cc @@ -84,43 +84,50 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, // WriteUnicodeCharacter ------------------------------------------------------- -// Appends a UTF-8 character to the given 8-bit string. -void WriteUnicodeCharacter(uint32 code_point, std::string* output) { +// Appends a UTF-8 character to the given 8-bit string. Returns the number of +// bytes written. +size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) { if (code_point <= 0x7f) { // Fast path the common case of one byte. 
output->push_back(code_point); - return; + return 1; } - // U8_APPEND_UNSAFE can append up to 4 bytes. - int32 char_offset = static_cast<int32>(output->length()); + // CBU8_APPEND_UNSAFE can append up to 4 bytes. + size_t char_offset = output->length(); + size_t original_char_offset = char_offset; output->resize(char_offset + CBU8_MAX_LENGTH); CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); - // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so + // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so // it will represent the new length of the string. output->resize(char_offset); + return char_offset - original_char_offset; } -// Appends the given code point as a UTF-16 character to the STL string. -void WriteUnicodeCharacter(uint32 code_point, string16* output) { +// Appends the given code point as a UTF-16 character to the given 16-bit +// string. Returns the number of 16-bit values written. +size_t WriteUnicodeCharacter(uint32 code_point, string16* output) { if (CBU16_LENGTH(code_point) == 1) { // Thie code point is in the Basic Multilingual Plane (BMP). output->push_back(static_cast<char16>(code_point)); - } else { - // Non-BMP characters use a double-character encoding. - int32 char_offset = static_cast<int32>(output->length()); - output->resize(char_offset + CBU16_MAX_LENGTH); - CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return 1; } + // Non-BMP characters use a double-character encoding. + size_t char_offset = output->length(); + output->resize(char_offset + CBU16_MAX_LENGTH); + CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return CBU16_MAX_LENGTH; } #if defined(WCHAR_T_IS_UTF32) -// Appends the given UTF-32 character to the given 32-bit string. -inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { +// Appends the given UTF-32 character to the given 32-bit string. Returns the +// number of 32-bit values written. 
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { // This is the easy case, just append the character. output->push_back(code_point); + return 1; } #endif // defined(WCHAR_T_IS_UTF32) @@ -131,31 +138,57 @@ inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { // determine the source, and the given output STL string will be replaced by // the result. template<typename SRC_CHAR, typename DEST_STRING> -bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { - output->clear(); +bool ConvertUnicode(const SRC_CHAR* src, + size_t src_len, + DEST_STRING* output, + size_t* offset_for_adjustment) { + size_t output_offset = + (offset_for_adjustment && *offset_for_adjustment < src_len) ? + *offset_for_adjustment : DEST_STRING::npos; // ICU requires 32-bit numbers. bool success = true; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; + size_t original_i = i; + size_t chars_written = 0; if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { - WriteUnicodeCharacter(code_point, output); + chars_written = WriteUnicodeCharacter(code_point, output); } else { // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) // in place of an invalid codepoint. success = false; } + if ((output_offset != DEST_STRING::npos) && + (*offset_for_adjustment > original_i)) { + // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last + // character read, not after it (so that incrementing it in the loop + // increment will place it at the right location), so we need to account + // for that in determining the amount that was read. 
+ if (*offset_for_adjustment <= static_cast<size_t>(i)) + output_offset = DEST_STRING::npos; + else + output_offset += chars_written - (i - original_i + 1); + } } + + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; return success; } -// Guesses the length of the output in UTF-8 in bytes, and reserves that amount -// of space in the given string. We also assume that the input character types -// are unsigned, which will be true for UTF-16 and -32 on our systems. We assume -// the string length is greater than zero. +// Guesses the length of the output in UTF-8 in bytes, clears that output +// string, and reserves that amount of space. We assume that the input +// character types are unsigned, which will be true for UTF-16 and -32 on our +// systems. template<typename CHAR> -void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { +void PrepareForUTF8Output(const CHAR* src, + size_t src_len, + std::string* output) { + output->clear(); + if (src_len == 0) + return; if (src[0] < 0x80) { // Assume that the entire input will be ASCII. output->reserve(src_len); @@ -165,11 +198,15 @@ void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { } } -// Guesses the size of the output buffer (containing either UTF-16 or -32 data) -// given some UTF-8 input that will be converted to it. See ReserveUTF8Output. -// We assume the source length is > 0. +// Prepares an output buffer (containing either UTF-16 or -32 data) given some +// UTF-8 input that will be converted to it. See PrepareForUTF8Output(). template<typename STRING> -void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { +void PrepareForUTF16Or32Output(const char* src, + size_t src_len, + STRING* output) { + output->clear(); + if (src_len == 0) + return; if (static_cast<unsigned char>(src[0]) < 0x80) { // Assume the input is all ASCII, which means 1:1 correspondence. 
output->reserve(src_len); @@ -184,111 +221,121 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { // UTF-8 <-> Wide -------------------------------------------------------------- -std::string WideToUTF8(const std::wstring& wide) { - std::string ret; - if (wide.empty()) - return ret; +bool WideToUTF8AndAdjustOffset(const wchar_t* src, + size_t src_len, + std::string* output, + size_t* offset_for_adjustment) { + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode<wchar_t, std::string>(src, src_len, output, + offset_for_adjustment); +} +std::string WideToUTF8AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + std::string ret; // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. - WideToUTF8(wide.data(), wide.length(), &ret); + WideToUTF8AndAdjustOffset(wide.data(), wide.length(), &ret, + offset_for_adjustment); return ret; } -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF8Output(src, src_len, output); - return ConvertUnicode<wchar_t, std::string>(src, src_len, output); +bool UTF8ToWideAndAdjustOffset(const char* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode<char, std::wstring>(src, src_len, output, + offset_for_adjustment); } -std::wstring UTF8ToWide(const base::StringPiece& utf8) { +std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, + size_t* offset_for_adjustment) { std::wstring ret; - if (utf8.empty()) - return ret; - - UTF8ToWide(utf8.data(), utf8.length(), &ret); + UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret, + offset_for_adjustment); return ret; } -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - 
ReserveUTF16Or32Output(src, src_len, output); - return ConvertUnicode<char, std::wstring>(src, src_len, output); -} - // UTF-16 <-> Wide ------------------------------------------------------------- #if defined(WCHAR_T_IS_UTF16) // When wide == UTF-16, then conversions are a NOP. -string16 WideToUTF16(const std::wstring& wide) { - return wide; -} - -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { +bool WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment) { output->assign(src, src_len); + if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) + *offset_for_adjustment = string16::npos; return true; } -std::wstring UTF16ToWide(const string16& utf16) { - return utf16; +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + if (offset_for_adjustment && (*offset_for_adjustment >= wide.length())) + *offset_for_adjustment = string16::npos; + return wide; } -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { output->assign(src, src_len); + if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) + *offset_for_adjustment = std::wstring::npos; return true; } -#elif defined(WCHAR_T_IS_UTF32) - -string16 WideToUTF16(const std::wstring& wide) { - string16 ret; - if (wide.empty()) - return ret; - - WideToUTF16(wide.data(), wide.length(), &ret); - return ret; +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment) { + if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) + *offset_for_adjustment = std::wstring::npos; + return utf16; } -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { - if (src_len == 0) { - output->clear(); - return true; - } +#elif defined(WCHAR_T_IS_UTF32) +bool 
WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment) { + output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); - return ConvertUnicode<wchar_t, string16>(src, src_len, output); + return ConvertUnicode<wchar_t, string16>(src, src_len, output, + offset_for_adjustment); } -std::wstring UTF16ToWide(const string16& utf16) { - std::wstring ret; - if (utf16.empty()) - return ret; - - UTF16ToWide(utf16.data(), utf16.length(), &ret); +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + string16 ret; + WideToUTF16AndAdjustOffset(wide.data(), wide.length(), &ret, + offset_for_adjustment); return ret; } -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { + output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. 
output->reserve(src_len); - return ConvertUnicode<char16, std::wstring>(src, src_len, output); + return ConvertUnicode<char16, std::wstring>(src, src_len, output, + offset_for_adjustment); +} + +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment) { + std::wstring ret; + UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret, + offset_for_adjustment); + return ret; } #endif // defined(WCHAR_T_IS_UTF32) @@ -298,20 +345,12 @@ bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { #if defined(WCHAR_T_IS_UTF32) bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF16Or32Output(src, src_len, output); - return ConvertUnicode<char, string16>(src, src_len, output); + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode<char, string16>(src, src_len, output, NULL); } string16 UTF8ToUTF16(const std::string& utf8) { string16 ret; - if (utf8.empty()) - return ret; - // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. UTF8ToUTF16(utf8.data(), utf8.length(), &ret); @@ -319,20 +358,12 @@ string16 UTF8ToUTF16(const std::string& utf8) { } bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF8Output(src, src_len, output); - return ConvertUnicode<char16, std::string>(src, src_len, output); + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode<char16, std::string>(src, src_len, output, NULL); } std::string UTF16ToUTF8(const string16& utf16) { std::string ret; - if (utf16.empty()) - return ret; - // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. 
UTF16ToUTF8(utf16.data(), utf16.length(), &ret); diff --git a/base/utf_string_conversions.h b/base/utf_string_conversions.h index 89846ed..323233b 100644 --- a/base/utf_string_conversions.h +++ b/base/utf_string_conversions.h @@ -10,6 +10,37 @@ #include "base/string16.h" #include "base/string_piece.h" +// Like the conversions below, but also takes an offset into the source string, +// which will be adjusted to point at the same logical place in the result +// string. If this isn't possible because it points past the end of the source +// string or into the middle of a multibyte sequence, it will be set to +// std::wstring::npos. |offset_for_adjustment| may be NULL. +bool WideToUTF8AndAdjustOffset(const wchar_t* src, + size_t src_len, + std::string* output, + size_t* offset_for_adjustment); +std::string WideToUTF8AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment); +bool UTF8ToWideAndAdjustOffset(const char* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment); +std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, + size_t* offset_for_adjustment); + +bool WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment); +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment); +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment); +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment); + // These convert between UTF-8, -16, and -32 strings. They are potentially slow, // so avoid unnecessary conversions. The low-level versions return a boolean // indicating whether the conversion was 100% valid. In this case, it will still @@ -23,15 +54,34 @@ // the Unicode replacement character or adding |replacement_char| parameter. 
// Currently, it's skipped in the ouput, which could be problematic in // some situations. -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); -std::string WideToUTF8(const std::wstring& wide); -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); -std::wstring UTF8ToWide(const base::StringPiece& utf8); - -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output); -string16 WideToUTF16(const std::wstring& wide); -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output); -std::wstring UTF16ToWide(const string16& utf16); +inline bool WideToUTF8(const wchar_t* src, + size_t src_len, + std::string* output) { + return WideToUTF8AndAdjustOffset(src, src_len, output, NULL); +} +inline std::string WideToUTF8(const std::wstring& wide) { + return WideToUTF8AndAdjustOffset(wide, NULL); +} +inline bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + return UTF8ToWideAndAdjustOffset(src, src_len, output, NULL); +} +inline std::wstring UTF8ToWide(const base::StringPiece& utf8) { + return UTF8ToWideAndAdjustOffset(utf8, NULL); +} + +inline bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + return WideToUTF16AndAdjustOffset(src, src_len, output, NULL); +} +inline string16 WideToUTF16(const std::wstring& wide) { + return WideToUTF16AndAdjustOffset(wide, NULL); +} +inline bool UTF16ToWide(const char16* src, size_t src_len, + std::wstring* output) { + return UTF16ToWideAndAdjustOffset(src, src_len, output, NULL); +} +inline std::wstring UTF16ToWide(const string16& utf16) { + return UTF16ToWideAndAdjustOffset(utf16, NULL); +} bool UTF8ToUTF16(const char* src, size_t src_len, string16* output); string16 UTF8ToUTF16(const std::string& utf8); diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc new file mode 100644 index 0000000..67af7c3 --- /dev/null +++ b/base/utf_string_conversions_unittest.cc @@ -0,0 +1,306 @@ +// Copyright 
(c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/basictypes.h" +#include "base/string_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { + +namespace { + +// Given a null-terminated string of wchar_t with each wchar_t representing +// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. +// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) +// should be represented as a surrogate pair (two UTF-16 units) +// *even* where wchar_t is 32-bit (Linux and Mac). +// +// This is to help write tests for functions with string16 params until +// the C++ 0x UTF-16 literal is well-supported by compilers. +string16 BuildString16(const wchar_t* s) { +#if defined(WCHAR_T_IS_UTF16) + return string16(s); +#elif defined(WCHAR_T_IS_UTF32) + string16 u16; + while (*s != 0) { + DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); + u16.push_back(*s++); + } + return u16; +#endif +} + +const wchar_t* const kConvertRoundtripCases[] = { + L"Google Video", + // "网页 图片 资讯更多 »" + L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", + // "Παγκόσμιος Ιστός" + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", + // "Поиск страниц на русском" + L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" + L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" + L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", + // "전체서비스" + L"\xc804\xccb4\xc11c\xbe44\xc2a4", + + // Test characters that take more than 16 bits. This will depend on whether + // wchar_t is 16 or 32 bits. +#if defined(WCHAR_T_IS_UTF16) + L"\xd800\xdf00", + // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", +#elif defined(WCHAR_T_IS_UTF32) + L"\x10300", + // ????? 
(Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\x11d40\x11d41\x11d42\x11d43\x11d44", +#endif +}; + +} // namespace + +TEST(UTFStringConversionsTest, ConvertUTF8AndWide) { + // we round-trip all the wide strings through UTF-8 to make sure everything + // agrees on the conversion. This uses the stream operators to test them + // simultaneously. + for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { + std::ostringstream utf8; + utf8 << WideToUTF8(kConvertRoundtripCases[i]); + std::wostringstream wide; + wide << UTF8ToWide(utf8.str()); + + EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); + } +} + +TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) { + // An empty std::wstring should be converted to an empty std::string, + // and vice versa. + std::wstring wempty; + std::string empty; + EXPECT_EQ(empty, WideToUTF8(wempty)); + EXPECT_EQ(wempty, UTF8ToWide(empty)); +} + +TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"", false}, + // Non-BMP characters. The second is a non-character regarded as valid. + // The result will either be in UTF-16 or UTF-32. 
+#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::wstring converted; + EXPECT_EQ(convert_cases[i].success, + UTF8ToWide(convert_cases[i].utf8, + strlen(convert_cases[i].utf8), + &converted)); + std::wstring expected(convert_cases[i].wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. + std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3U, converted.length()); + EXPECT_EQ(static_cast<wchar_t>(0), converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1U, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#if defined(WCHAR_T_IS_UTF16) +// This test is only valid when wchar_t == UTF-16. +TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + // Truncated at the end. 
+ {L"\x597d\xd800", "\xe5\xa5\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} + +#elif defined(WCHAR_T_IS_UTF32) +// This test is only valid when wchar_t == UTF-32. +TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf32; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // Invalid Unicode code points. + {L"\xfffffffHello", "Hello", false}, + // The first character is a truncated UTF-16 character. 
+ {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + {L"\xdc01Hello", "Hello", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf32, + wcslen(convert_cases[i].utf32), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} +#endif // defined(WCHAR_T_IS_UTF32) + +TEST(UTFStringConversionsTest, ConvertMultiString) { + static wchar_t wmulti[] = { + L'f', L'o', L'o', L'\0', + L'b', L'a', L'r', L'\0', + L'b', L'a', L'z', L'\0', + L'\0' + }; + static char multi[] = { + 'f', 'o', 'o', '\0', + 'b', 'a', 'r', '\0', + 'b', 'a', 'z', '\0', + '\0' + }; + std::wstring wmultistring; + memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); + EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); + std::string expected; + memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); + EXPECT_EQ(arraysize(multi) - 1, expected.length()); + const std::string& converted = WideToUTF8(wmultistring); + EXPECT_EQ(arraysize(multi) - 1, converted.length()); + EXPECT_EQ(expected, converted); +} + +TEST(UTFStringConversionsTest, AdjustOffset) { + // Under the hood, all the functions call the same converter function, so we + // don't need to exhaustively check every case. 
+ struct WideToUTF8Case { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } wide_to_utf8_cases[] = { + {L"", 0, std::string::npos}, + {L"\x4f60\x597d", 0, 0}, + {L"\x4f60\x597d", 1, 3}, + {L"\x4f60\x597d", 2, std::string::npos}, + {L"\x4f60\x597d", std::wstring::npos, std::string::npos}, + {L"\xd800\x597dz", 1, 0}, + {L"\xd800\x597dz", 2, 3}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf8_cases); ++i) { + size_t offset = wide_to_utf8_cases[i].input_offset; + WideToUTF8AndAdjustOffset(wide_to_utf8_cases[i].wide, &offset); + EXPECT_EQ(wide_to_utf8_cases[i].output_offset, offset); + } + + struct UTF8ToWideCase { + const char* utf8; + size_t input_offset; + size_t output_offset; + } utf8_to_wide_cases[] = { + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, + {"\xed\xb0\x80z", 3, 0}, + {"A\xF0\x90\x8C\x80z", 1, 1}, + {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos}, +#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", 5, 3}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", 5, 2}, +#endif + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_wide_cases); ++i) { + size_t offset = utf8_to_wide_cases[i].input_offset; + UTF8ToWideAndAdjustOffset(utf8_to_wide_cases[i].utf8, &offset); + EXPECT_EQ(utf8_to_wide_cases[i].output_offset, offset); + } + +#if defined(WCHAR_T_IS_UTF32) + struct WideToUTF16Case { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } wide_to_utf16_cases[] = { + {L"\x4F60\x597D", 1, 1}, + {L"\x20000\x4E00", 1, 2}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf16_cases); ++i) { + size_t offset = wide_to_utf16_cases[i].input_offset; + WideToUTF16AndAdjustOffset(wide_to_utf16_cases[i].wide, &offset); + EXPECT_EQ(wide_to_utf16_cases[i].output_offset, offset); + } + + struct UTF16ToWideCase { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } utf16_to_wide_cases[] = { + {L"\xD840\xDC00\x4E00", 0, 0}, + 
{L"\xD840\xDC00\x4E00", 1, std::wstring::npos}, + {L"\xD840\xDC00\x4E00", 2, 1}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) { + size_t offset = utf16_to_wide_cases[i].input_offset; + UTF16ToWideAndAdjustOffset(BuildString16(utf16_to_wide_cases[i].wide), + &offset); + EXPECT_EQ(utf16_to_wide_cases[i].output_offset, offset); + } +#endif +} + +} // namespace base diff --git a/chrome/browser/autocomplete/autocomplete.cc b/chrome/browser/autocomplete/autocomplete.cc index f9223b4..1b0340d2 100644 --- a/chrome/browser/autocomplete/autocomplete.cc +++ b/chrome/browser/autocomplete/autocomplete.cc @@ -438,10 +438,6 @@ void AutocompleteMatch::ClassifyLocationInString( size_t overall_length, int style, ACMatchClassifications* classification) { - // Classifying an empty match makes no sense and will lead to validation - // errors later. - DCHECK(match_length > 0); - classification->clear(); // Don't classify anything about an empty string @@ -459,6 +455,9 @@ void AutocompleteMatch::ClassifyLocationInString( // No match, above classification will suffice for whole string. return; } + // Classifying an empty match makes no sense and will lead to validation + // errors later. + DCHECK(match_length > 0); classification->push_back(ACMatchClassification(match_location, (style | ACMatchClassification::MATCH) & ~ACMatchClassification::DIM)); diff --git a/chrome/browser/autocomplete/autocomplete.h b/chrome/browser/autocomplete/autocomplete.h index 0193b8c..f5d9ac0 100644 --- a/chrome/browser/autocomplete/autocomplete.h +++ b/chrome/browser/autocomplete/autocomplete.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -548,9 +548,9 @@ class AutocompleteProvider // profile's bookmark bar model. 
void UpdateStarredStateOfMatches(); - // A convenience function to call gfx::GetCleanStringFromUrl() with the - // current set of "Accept Languages" when check_accept_lang is true. - // Otherwise, it's called with an empty list. + // A convenience function to call net::FormatUrl() with the current set of + // "Accept Languages" when check_accept_lang is true. Otherwise, it's called + // with an empty list. std::wstring StringForURLDisplay(const GURL& url, bool check_accept_lang) const; diff --git a/chrome/browser/autocomplete/history_url_provider.cc b/chrome/browser/autocomplete/history_url_provider.cc index b44b6e7..a1d971a 100644 --- a/chrome/browser/autocomplete/history_url_provider.cc +++ b/chrome/browser/autocomplete/history_url_provider.cc @@ -68,9 +68,8 @@ void HistoryURLProvider::DeleteMatch(const AutocompleteMatch& match) { DCHECK(done_); // Delete the match from the history DB. - HistoryService* history_service = - profile_ ? profile_->GetHistoryService(Profile::EXPLICIT_ACCESS) : - history_service_; + HistoryService* const history_service = + profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); GURL selected_url(match.destination_url); if (!history_service || !selected_url.is_valid()) { NOTREACHED() << "Can't delete requested URL"; @@ -628,16 +627,17 @@ void HistoryURLProvider::RunAutocompletePasses( matches_.push_back(SuggestExactInput(input, trim_http)); // We'll need the history service to run both passes, so try to obtain it. - HistoryService* const history_service = profile_ ? - profile_->GetHistoryService(Profile::EXPLICIT_ACCESS) : history_service_; + HistoryService* const history_service = + profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); if (!history_service) return; // Create the data structure for the autocomplete passes. We'll save this off // onto the |params_| member for later deletion below if we need to run pass // 2. - const std::wstring& languages = profile_ ? 
- profile_->GetPrefs()->GetString(prefs::kAcceptLanguages) : std::wstring(); + std::wstring languages(languages_); + if (languages.empty() && profile_) + languages = profile_->GetPrefs()->GetString(prefs::kAcceptLanguages); scoped_ptr<HistoryURLProviderParams> params( new HistoryURLProviderParams(input, trim_http, languages)); @@ -826,28 +826,47 @@ AutocompleteMatch HistoryURLProvider::HistoryMatchToACMatch( !!info.visit_count(), AutocompleteMatch::HISTORY_URL); match.destination_url = info.url(); DCHECK(match.destination_url.is_valid()); + size_t inline_autocomplete_offset = + history_match.input_location + params->input.text().length(); match.fill_into_edit = net::FormatUrl(info.url(), - match_type == WHAT_YOU_TYPED ? std::wstring() : params->languages); - if (!params->input.prevent_inline_autocomplete()) { - match.inline_autocomplete_offset = - history_match.input_location + params->input.text().length(); - } + match_type == WHAT_YOU_TYPED ? std::wstring() : params->languages, true, + UnescapeRule::SPACES, NULL, NULL, &inline_autocomplete_offset); size_t offset = 0; if (params->trim_http && !history_match.match_in_scheme) { offset = TrimHttpPrefix(&match.fill_into_edit); - if (match.inline_autocomplete_offset != std::wstring::npos) { - DCHECK(match.inline_autocomplete_offset >= offset); - match.inline_autocomplete_offset -= offset; + if (inline_autocomplete_offset != std::wstring::npos) { + DCHECK(inline_autocomplete_offset >= offset); + inline_autocomplete_offset -= offset; } } + if (!params->input.prevent_inline_autocomplete()) + match.inline_autocomplete_offset = inline_autocomplete_offset; DCHECK((match.inline_autocomplete_offset == std::wstring::npos) || (match.inline_autocomplete_offset <= match.fill_into_edit.length())); - match.contents = match.fill_into_edit; - AutocompleteMatch::ClassifyLocationInString( - history_match.input_location - offset, params->input.text().length(), - match.contents.length(), ACMatchClassification::URL, - 
&match.contents_class); + size_t match_start = history_match.input_location; + match.contents = net::FormatUrl(info.url(), + match_type == WHAT_YOU_TYPED ? std::wstring() : params->languages, true, + UnescapeRule::SPACES, NULL, NULL, &match_start); + if (offset) { + TrimHttpPrefix(&match.contents); + if (match_start != std::wstring::npos) { + DCHECK(match_start >= offset); + match_start -= offset; + } + } + if ((match_start != std::wstring::npos) && + (inline_autocomplete_offset != std::wstring::npos) && + (inline_autocomplete_offset != match_start)) { + DCHECK(inline_autocomplete_offset > match_start); + AutocompleteMatch::ClassifyLocationInString(match_start, + inline_autocomplete_offset - match_start, match.contents.length(), + ACMatchClassification::URL, &match.contents_class); + } else { + AutocompleteMatch::ClassifyLocationInString(std::wstring::npos, 0, + match.contents.length(), ACMatchClassification::URL, + &match.contents_class); + } match.description = info.title(); AutocompleteMatch::ClassifyMatchInString(params->input.text(), info.title(), ACMatchClassification::NONE, diff --git a/chrome/browser/autocomplete/history_url_provider.h b/chrome/browser/autocomplete/history_url_provider.h index 50f6ba7..152a938 100644 --- a/chrome/browser/autocomplete/history_url_provider.h +++ b/chrome/browser/autocomplete/history_url_provider.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -135,18 +135,18 @@ class HistoryURLProvider : public AutocompleteProvider { public: HistoryURLProvider(ACProviderListener* listener, Profile* profile) : AutocompleteProvider(listener, profile, "HistoryURL"), - history_service_(NULL), prefixes_(GetPrefixes()), params_(NULL) { } #ifdef UNIT_TEST HistoryURLProvider(ACProviderListener* listener, - HistoryService* history_service) - : AutocompleteProvider(listener, NULL, "History"), - history_service_(history_service), + Profile* profile, + const std::wstring& languages) + : AutocompleteProvider(listener, profile, "History"), prefixes_(GetPrefixes()), - params_(NULL) { + params_(NULL), + languages_(languages) { } #endif // no destructor (see note above) @@ -379,10 +379,6 @@ class HistoryURLProvider : public AutocompleteProvider { MatchType match_type, size_t match_number); - // This is only non-null for testing, otherwise the HistoryService from the - // Profile is used. - HistoryService* history_service_; - // Prefixes to try appending to user input when looking for a match. const Prefixes prefixes_; @@ -391,6 +387,10 @@ class HistoryURLProvider : public AutocompleteProvider { // parameter itself is freed once it's no longer needed. The only reason we // keep this member is so we can set the cancel bit on it. HistoryURLProviderParams* params_; + + // Only used by unittests; if non-empty, overrides accept-languages in the + // profile's pref system. 
+ std::wstring languages_; }; #endif // CHROME_BROWSER_AUTOCOMPLETE_HISTORY_URL_PROVIDER_H_ diff --git a/chrome/browser/autocomplete/history_url_provider_unittest.cc b/chrome/browser/autocomplete/history_url_provider_unittest.cc index 408526a..45e1426 100644 --- a/chrome/browser/autocomplete/history_url_provider_unittest.cc +++ b/chrome/browser/autocomplete/history_url_provider_unittest.cc @@ -83,6 +83,11 @@ static TestURLInfo test_db[] = { {"http://go/", L"Intranet URL", 1, 1}, {"http://gooey/", L"Intranet URL 2", 5, 5}, + // URLs for testing offset adjustment + {"http://www.\xEA\xB5\x90\xEC\x9C\xA1.kr/", L"Korean", 2, 2}, + {"http://spaces.com/path%20with%20spaces/foo.html", L"Spaces", 2, 2}, + {"http://ms/c++%20style%20guide", L"Style guide", 2, 2}, + {"http://foo:bar@baz.com/", L"HTTP auth", 2, 2}, }; class HistoryURLProviderTest : public testing::Test, @@ -116,6 +121,8 @@ class HistoryURLProviderTest : public testing::Test, const std::string* expected_urls, size_t num_results); + void RunAdjustOffsetTest(const std::wstring text, size_t expected_offset); + MessageLoopForUI message_loop_; ChromeThread ui_thread_; ChromeThread file_thread_; @@ -144,7 +151,7 @@ void HistoryURLProviderTest::SetUpImpl(bool no_db) { profile_->CreateHistoryService(true, no_db); history_service_ = profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); - autocomplete_ = new HistoryURLProvider(this, profile_.get()); + autocomplete_ = new HistoryURLProvider(this, profile_.get(), L"en-US,en,ko"); FillData(); } @@ -189,6 +196,18 @@ void HistoryURLProviderTest::RunTest(const std::wstring text, EXPECT_EQ(expected_urls[i], matches_[i].destination_url.spec()); } +void HistoryURLProviderTest::RunAdjustOffsetTest(const std::wstring text, + size_t expected_offset) { + AutocompleteInput input(text, std::wstring(), false, false, false); + autocomplete_->Start(input, false); + if (!autocomplete_->done()) + MessageLoop::current()->Run(); + + matches_ = autocomplete_->matches(); + 
ASSERT_GE(matches_.size(), 1U) << "Input text: " << text; + EXPECT_EQ(expected_offset, matches_[0].inline_autocomplete_offset); +} + TEST_F(HistoryURLProviderTest, PromoteShorterURLs) { // Test that hosts get synthesized below popular pages. const std::string expected_nonsynth[] = { @@ -382,6 +401,14 @@ TEST_F(HistoryURLProviderTest, Fixup) { RunTest(L"17173", std::wstring(), false, fixup_5, arraysize(fixup_5)); } +TEST_F(HistoryURLProviderTest, AdjustOffset) { + RunAdjustOffsetTest(L"http://www.\uAD50\uC721", 13); + RunAdjustOffsetTest(L"http://spaces.com/path%20with%20spa", 31); + RunAdjustOffsetTest(L"http://ms/c++ s", 15); + RunAdjustOffsetTest(L"http://foo:ba", std::wstring::npos); + RunAdjustOffsetTest(L"http://foo:bar@ba", 9); +} + TEST_F(HistoryURLProviderTestNoDB, NavigateWithoutDB) { // Ensure that we will still produce matches for navigation when there is no // database. diff --git a/chrome/browser/bookmarks/bookmark_table_model.cc b/chrome/browser/bookmarks/bookmark_table_model.cc index 142090c..9b4fd82 100644 --- a/chrome/browser/bookmarks/bookmark_table_model.cc +++ b/chrome/browser/bookmarks/bookmark_table_model.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -324,9 +324,8 @@ std::wstring BookmarkTableModel::GetText(int row, int column_id) { std::wstring languages = model_ && model_->profile() ? 
model_->profile()->GetPrefs()->GetString(prefs::kAcceptLanguages) : std::wstring(); - std::wstring url_text = - net::FormatUrl(node->GetURL(), languages, false, UnescapeRule::SPACES, - NULL, NULL); + std::wstring url_text = net::FormatUrl(node->GetURL(), languages, false, + UnescapeRule::SPACES, NULL, NULL, NULL); if (l10n_util::GetTextDirection() == l10n_util::RIGHT_TO_LEFT) l10n_util::WrapStringWithLTRFormatting(&url_text); return url_text; diff --git a/chrome/browser/bookmarks/bookmark_utils.cc b/chrome/browser/bookmarks/bookmark_utils.cc index 54ea21cb..e62a30a 100644 --- a/chrome/browser/bookmarks/bookmark_utils.cc +++ b/chrome/browser/bookmarks/bookmark_utils.cc @@ -187,7 +187,7 @@ bool DoesBookmarkContainWords(const BookmarkNode* node, l10n_util::ToLower(node->GetTitle()), words) || DoesBookmarkTextContainWords(UTF8ToWide(node->GetURL().spec()), words) || DoesBookmarkTextContainWords(net::FormatUrl( - node->GetURL(), languages, false, true, NULL, NULL), words); + node->GetURL(), languages, false, true, NULL, NULL, NULL), words); } } // namespace diff --git a/chrome/browser/gtk/options/exceptions_page_gtk.cc b/chrome/browser/gtk/options/exceptions_page_gtk.cc index 164a821..10a8f2d 100644 --- a/chrome/browser/gtk/options/exceptions_page_gtk.cc +++ b/chrome/browser/gtk/options/exceptions_page_gtk.cc @@ -113,8 +113,7 @@ void ExceptionsPageGtk::SetExceptionList( for (size_t i = 0; i < result.size(); ++i) { exception_list_[i] = *result[i]; std::wstring formatted = net::FormatUrl(result[i]->origin, languages, - false, UnescapeRule::NONE, - NULL, NULL); + false, UnescapeRule::NONE, NULL, NULL, NULL); std::string site = WideToUTF8(formatted); GtkTreeIter iter; gtk_list_store_insert_with_values(exception_list_store_, &iter, (gint) i, diff --git a/chrome/browser/gtk/options/passwords_page_gtk.cc b/chrome/browser/gtk/options/passwords_page_gtk.cc index b2f6345..f4a2197 100644 --- a/chrome/browser/gtk/options/passwords_page_gtk.cc +++ 
b/chrome/browser/gtk/options/passwords_page_gtk.cc @@ -156,8 +156,7 @@ void PasswordsPageGtk::SetPasswordList( for (size_t i = 0; i < result.size(); ++i) { password_list_[i] = *result[i]; std::wstring formatted = net::FormatUrl(result[i]->origin, languages, - false, UnescapeRule::NONE, - NULL, NULL); + false, UnescapeRule::NONE, NULL, NULL, NULL); std::string site = WideToUTF8(formatted); std::string user = UTF16ToUTF8(result[i]->username_value); GtkTreeIter iter; diff --git a/chrome/browser/gtk/options/url_picker_dialog_gtk.cc b/chrome/browser/gtk/options/url_picker_dialog_gtk.cc index 6c4e38f..e646552 100644 --- a/chrome/browser/gtk/options/url_picker_dialog_gtk.cc +++ b/chrome/browser/gtk/options/url_picker_dialog_gtk.cc @@ -196,9 +196,8 @@ std::string UrlPickerDialogGtk::GetURLForPath(GtkTreePath* path) const { profile_->GetPrefs()->GetString(prefs::kAcceptLanguages); // Because the url_field_ is user-editable, we set the URL with // username:password and escaped path and query. - std::wstring formatted = net::FormatUrl( - url_table_model_->GetURL(row), languages, - false, UnescapeRule::NONE, NULL, NULL); + std::wstring formatted = net::FormatUrl(url_table_model_->GetURL(row), + languages, false, UnescapeRule::NONE, NULL, NULL, NULL); return WideToUTF8(formatted); } diff --git a/chrome/browser/net/browser_url_util.cc b/chrome/browser/net/browser_url_util.cc index 940d3b6..5f287795 100644 --- a/chrome/browser/net/browser_url_util.cc +++ b/chrome/browser/net/browser_url_util.cc @@ -21,9 +21,9 @@ void WriteURLToClipboard(const GURL& url, // Unescaping path and query is not a good idea because other applications // may not encode non-ASCII characters in UTF-8. See crbug.com/2820. string16 text = url.SchemeIs(chrome::kMailToScheme) ? 
- ASCIIToUTF16(url.path()) : - WideToUTF16(net::FormatUrl(url, languages, false, - UnescapeRule::NONE, NULL, NULL)); + ASCIIToUTF16(url.path()) : + WideToUTF16(net::FormatUrl(url, languages, false, UnescapeRule::NONE, + NULL, NULL, NULL)); ScopedClipboardWriter scw(clipboard); scw.WriteURL(text); diff --git a/chrome/browser/net/url_fixer_upper.cc b/chrome/browser/net/url_fixer_upper.cc index b465268..a68bc34 100644 --- a/chrome/browser/net/url_fixer_upper.cc +++ b/chrome/browser/net/url_fixer_upper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -146,11 +146,10 @@ static string FixupHomedir(const string& text) { #endif // Tries to create a file: URL from |text| if it looks like a filename, even if -// it doesn't resolve as a valid path or to an existing file. Returns true -// with a (possibly invalid) file: URL in |fixed_up_url| for input beginning -// with a drive specifier or "\\". Returns false in other cases (including -// file: URLs: these don't look like filenames), leaving fixed_up_url -// unchanged. +// it doesn't resolve as a valid path or to an existing file. Returns a +// (possibly invalid) file: URL in |fixed_up_url| for input beginning +// with a drive specifier or "\\". Returns the unchanged input in other cases +// (including file: URLs: these don't look like filenames). static string FixupPath(const string& text) { DCHECK(!text.empty()); @@ -173,7 +172,7 @@ static string FixupPath(const string& text) { GURL file_url = net::FilePathToFileURL(FilePath(filename)); if (file_url.is_valid()) { return WideToUTF8(net::FormatUrl(file_url, std::wstring(), true, - UnescapeRule::NORMAL, NULL, NULL)); + UnescapeRule::NORMAL, NULL, NULL, NULL)); } // Invalid file URL, just return the input. 
@@ -182,7 +181,6 @@ static string FixupPath(const string& text) { // Checks |domain| to see if a valid TLD is already present. If not, appends // |desired_tld| to the domain, and prepends "www." unless it's already present. -// Then modifies |fixed_up_url| to reflect the changes. static void AddDesiredTLD(const string& desired_tld, string* domain) { if (desired_tld.empty() || domain->empty()) @@ -268,30 +266,15 @@ static void FixupHost(const string& text, url->append(domain); } -// Looks for a port number, including initial colon, at port_start. If -// something invalid (which cannot be fixed up) is found, like ":foo" or -// ":7:7", returns false. Otherwise, removes any extra colons -// ("::1337" -> ":1337", ":/" -> "/") and returns true. static void FixupPort(const string& text, const url_parse::Component& part, string* url) { if (!part.is_valid()) return; - // Look for non-digit in port and strip if found. - string port(text, part.begin, part.len); - for (string::iterator i = port.begin(); i != port.end();) { - if (IsAsciiDigit(*i)) - ++i; - else - i = port.erase(i); - } - - if (port.empty()) - return; // Nothing to append. - + // We don't fix up the port at the moment. url->append(":"); - url->append(port); + url->append(text, part.begin, part.len); } static inline void FixupPath(const string& text, @@ -573,7 +556,7 @@ string URLFixerUpper::FixupRelativeFile(const FilePath& base_dir, GURL file_url = net::FilePathToFileURL(full_path); if (file_url.is_valid()) return WideToUTF8(net::FormatUrl(file_url, std::wstring(), - true, UnescapeRule::NORMAL, NULL, NULL)); + true, UnescapeRule::NORMAL, NULL, NULL, NULL)); // Invalid files fall through to regular processing. 
} diff --git a/chrome/browser/net/url_fixer_upper_unittest.cc b/chrome/browser/net/url_fixer_upper_unittest.cc index d7f8b93..5028cb2 100644 --- a/chrome/browser/net/url_fixer_upper_unittest.cc +++ b/chrome/browser/net/url_fixer_upper_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -210,8 +210,8 @@ struct fixup_case { {" foo.com/asdf bar", "", "http://foo.com/asdf bar"}, {"..www.google.com..", "", "http://www.google.com./"}, {"http://......", "", "http://....../"}, - {"http://host.com:ninety-two/", "", "http://host.com/"}, - {"http://host.com:ninety-two?foo", "", "http://host.com/?foo"}, + {"http://host.com:ninety-two/", "", "http://host.com:ninety-two/"}, + {"http://host.com:ninety-two?foo", "", "http://host.com:ninety-two/?foo"}, {"google.com:123", "", "http://google.com:123/"}, {"about:", "", "about:"}, {"about:version", "", "about:version"}, diff --git a/chrome/browser/tab_contents/tab_contents.cc b/chrome/browser/tab_contents/tab_contents.cc index 3a34459..4e45553 100644 --- a/chrome/browser/tab_contents/tab_contents.cc +++ b/chrome/browser/tab_contents/tab_contents.cc @@ -2517,9 +2517,9 @@ void TabContents::LoadStateChanged(const GURL& url, upload_size_ = upload_size; std::wstring languages = profile()->GetPrefs()->GetString(prefs::kAcceptLanguages); - load_state_host_.clear(); std::string host = url.host(); - net::IDNToUnicode(host.c_str(), host.size(), languages, &load_state_host_); + load_state_host_ = + net::IDNToUnicode(host.c_str(), host.size(), languages, NULL); if (load_state_ == net::LOAD_STATE_READING_RESPONSE) SetNotWaitingForResponse(); if (is_loading()) diff --git a/chrome/browser/toolbar_model.cc b/chrome/browser/toolbar_model.cc index 1169c42..42977f6 100644 --- a/chrome/browser/toolbar_model.cc +++ 
b/chrome/browser/toolbar_model.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -41,7 +41,8 @@ std::wstring ToolbarModel::GetText() { url = entry->virtual_url(); } } - return net::FormatUrl(url, languages, true, UnescapeRule::NORMAL, NULL, NULL); + return net::FormatUrl(url, languages, true, UnescapeRule::NORMAL, NULL, NULL, + NULL); } ToolbarModel::SecurityLevel ToolbarModel::GetSecurityLevel() { diff --git a/chrome/browser/views/bookmark_editor_view.cc b/chrome/browser/views/bookmark_editor_view.cc index 5443f81..f40e25f 100644 --- a/chrome/browser/views/bookmark_editor_view.cc +++ b/chrome/browser/views/bookmark_editor_view.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -278,9 +278,8 @@ void BookmarkEditorView::Init() { : std::wstring(); // The following URL is user-editable. We specify omit_username_password= // false and unescape=false to show the original URL except IDN. - url_text = - net::FormatUrl(details_.existing_node->GetURL(), languages, false, - UnescapeRule::NONE, NULL, NULL); + url_text = net::FormatUrl(details_.existing_node->GetURL(), languages, + false, UnescapeRule::NONE, NULL, NULL, NULL); } url_tf_.SetText(url_text); url_tf_.SetController(this); diff --git a/chrome/browser/views/url_picker.cc b/chrome/browser/views/url_picker.cc index 5232676..0133dbd 100644 --- a/chrome/browser/views/url_picker.cc +++ b/chrome/browser/views/url_picker.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -293,9 +293,8 @@ void UrlPicker::OnSelectionChanged() { profile_->GetPrefs()->GetString(prefs::kAcceptLanguages); // Because the url_field_ is user-editable, we set the URL with // username:password and escaped path and query. - std::wstring formatted = net::FormatUrl( - url_table_model_->GetURL(selection), languages, - false, UnescapeRule::NONE, NULL, NULL); + std::wstring formatted = net::FormatUrl(url_table_model_->GetURL(selection), + languages, false, UnescapeRule::NONE, NULL, NULL, NULL); url_field_->SetText(formatted); if (title_field_) title_field_->SetText(url_table_model_->GetTitle(selection)); diff --git a/net/base/escape.cc b/net/base/escape.cc index 3d2aca2..5196eb6 100644 --- a/net/base/escape.cc +++ b/net/base/escape.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -107,7 +107,14 @@ const char kUrlUnescape[128] = { }; std::string UnescapeURLImpl(const std::string& escaped_text, - UnescapeRule::Type rules) { + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { + size_t offset_temp = std::wstring::npos; + if (!offset_for_adjustment) + offset_for_adjustment = &offset_temp; + else if (*offset_for_adjustment >= escaped_text.length()) + *offset_for_adjustment = std::wstring::npos; + // Do not unescape anything, return the |escaped_text| text. if (rules == UnescapeRule::NONE) return escaped_text; @@ -136,8 +143,17 @@ std::string UnescapeURLImpl(const std::string& escaped_text, // Additionally allow control characters if requested. (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { // Use the unescaped version of the character. 
+ size_t length_before_append = result.length(); result.push_back(value); i += 2; + + // Adjust offset to match length change. + if (*offset_for_adjustment != std::string::npos) { + if (*offset_for_adjustment > (length_before_append + 2)) + *offset_for_adjustment -= 2; + else if (*offset_for_adjustment > length_before_append) + *offset_for_adjustment = std::string::npos; + } } else { // Keep escaped. Append a percent and we'll get the following two // digits on the next loops through. @@ -231,19 +247,27 @@ bool EscapeQueryParamValue(const std::wstring& text, const char* codepage, return true; } -std::wstring UnescapeAndDecodeURLComponent(const std::string& text, - const char* codepage, - UnescapeRule::Type rules) { +std::wstring UnescapeAndDecodeUTF8URLComponent(const std::string& text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { std::wstring result; - if (base::CodepageToWide(UnescapeURLImpl(text, rules), codepage, - base::OnStringConversionError::FAIL, &result)) + size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0; + if (base::CodepageToWideAndAdjustOffset( + UnescapeURLImpl(text, rules, offset_for_adjustment), + "UTF-8", base::OnStringConversionError::FAIL, &result, + offset_for_adjustment)) return result; // Character set looks like it's valid. - return UTF8ToWide(text); // Return the escaped version when it's not. + + // Not valid. Return the escaped version. Undo our changes to + // |offset_for_adjustment| since we haven't changed the string after all. 
+ if (offset_for_adjustment) + *offset_for_adjustment = original_offset; + return UTF8ToWideAndAdjustOffset(text, offset_for_adjustment); } std::string UnescapeURLComponent(const std::string& escaped_text, UnescapeRule::Type rules) { - return UnescapeURLImpl(escaped_text, rules); + return UnescapeURLImpl(escaped_text, rules, NULL); } template <class str> diff --git a/net/base/escape.h b/net/base/escape.h index 8761d4d..9ff17b6 100644 --- a/net/base/escape.h +++ b/net/base/escape.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -93,17 +93,17 @@ std::string UnescapeURLComponent(const std::string& escaped_text, UnescapeRule::Type rules); // Unescapes the given substring as a URL, and then tries to interpret the -// result as being encoded in the given code page. If the result is convertable -// into the code page, it will be returned as converted. If it is not, the -// original escaped string will be converted into a wide string and returned. -std::wstring UnescapeAndDecodeURLComponent(const std::string& text, - const char* codepage, - UnescapeRule::Type rules); -inline std::wstring UnescapeAndDecodeUTF8URLComponent( - const std::string& text, - UnescapeRule::Type rules) { - return UnescapeAndDecodeURLComponent(text, "UTF-8", rules); -} +// result as being encoded as UTF-8. If the result is convertable into UTF-8, it +// will be returned as converted. If it is not, the original escaped string will +// be converted into a wide string and returned. +// +// |offset_for_adjustment| may be NULL; if not, it is an offset into |text| that +// will be adjusted to point at the same logical place in the result string. 
If +// this isn't possible because it points into the middle of an escape sequence +// or past the end of the string, it will be set to std::wstring::npos. +std::wstring UnescapeAndDecodeUTF8URLComponent(const std::string& text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment); // Deprecated ------------------------------------------------------------------ diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc index 44bb9972..8e5e7dc 100644 --- a/net/base/escape_unittest.cc +++ b/net/base/escape_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -24,8 +24,7 @@ struct UnescapeURLCase { const char* output; }; -struct UnescapeAndDecodeURLCase { - const char* encoding; +struct UnescapeAndDecodeCase { const char* input; // The expected output when run through UnescapeURL. 
@@ -38,6 +37,12 @@ struct UnescapeAndDecodeURLCase { const wchar_t* decoded; }; +struct AdjustOffsetCase { + const char* input; + size_t input_offset; + size_t output_offset; +}; + struct EscapeForHTMLCase { const char* input; const char* expected_output; @@ -45,7 +50,7 @@ struct EscapeForHTMLCase { } // namespace -TEST(Escape, EscapeTextForFormSubmission) { +TEST(EscapeTest, EscapeTextForFormSubmission) { const EscapeCase escape_cases[] = { {L"foo", L"foo"}, {L"foo bar", L"foo+bar"}, @@ -93,7 +98,7 @@ TEST(Escape, EscapeTextForFormSubmission) { EXPECT_EQ(wide, EscapeQueryParamValueUTF8(test_str)); } -TEST(Escape, EscapePath) { +TEST(EscapeTest, EscapePath) { ASSERT_EQ( // Most of the character space we care about, un-escaped EscapePath( @@ -108,7 +113,7 @@ TEST(Escape, EscapePath) { "%7B%7C%7D~%7F%80%FF"); } -TEST(Escape, EscapeUrlEncodedData) { +TEST(EscapeTest, EscapeUrlEncodedData) { ASSERT_EQ( // Most of the character space we care about, un-escaped EscapeUrlEncodedData( @@ -123,7 +128,7 @@ TEST(Escape, EscapeUrlEncodedData) { "%7B%7C%7D~%7F%80%FF"); } -TEST(Escape, UnescapeURLComponent) { +TEST(EscapeTest, UnescapeURLComponent) { const UnescapeURLCase unescape_cases[] = { {"", UnescapeRule::NORMAL, ""}, {"%2", UnescapeRule::NORMAL, "%2"}, @@ -184,40 +189,48 @@ TEST(Escape, UnescapeURLComponent) { EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); } -TEST(Escape, UnescapeAndDecodeURLComponent) { - const UnescapeAndDecodeURLCase unescape_cases[] = { - {"UTF8", "%", "%", "%", L"%"}, - {"UTF8", "+", "+", " ", L"+"}, - {"UTF8", "%2+", "%2+", "%2 ", L"%2+"}, - {"UTF8", "+%%%+%%%", "+%%%+%%%", " %%% %%%", L"+%%%+%%%"}, - {"UTF8", "Don't escape anything", - "Don't escape anything", - "Don't escape anything", - L"Don't escape anything"}, - {"UTF8", "+Invalid %escape %2+", - "+Invalid %escape %2+", - " Invalid %escape %2 ", - L"+Invalid %escape %2+"}, - {"UTF8", "Some random text %25%3bOK", - "Some random text %25;OK", - "Some random text %25;OK", 
- L"Some random text %25;OK"}, - {"UTF8", "%01%02%03%04%05%06%07%08%09", - "%01%02%03%04%05%06%07%08%09", - "%01%02%03%04%05%06%07%08%09", - L"%01%02%03%04%05%06%07%08%09"}, - {"UTF8", "%E4%BD%A0+%E5%A5%BD", - "\xE4\xBD\xA0+\xE5\xA5\xBD", - "\xE4\xBD\xA0 \xE5\xA5\xBD", - L"\x4f60+\x597d"}, - {"BIG5", "%A7A%A6n", - "\xA7\x41\xA6n", - "\xA7\x41\xA6n", - L"\x4f60\x597d"}, - {"UTF8", "%ED%ED", // Invalid UTF-8. - "\xED\xED", - "\xED\xED", - L"%ED%ED"}, // Invalid UTF-8 -> kept unescaped. +TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponent) { + const UnescapeAndDecodeCase unescape_cases[] = { + { "%", + "%", + "%", + L"%"}, + { "+", + "+", + " ", + L"+"}, + { "%2+", + "%2+", + "%2 ", + L"%2+"}, + { "+%%%+%%%", + "+%%%+%%%", + " %%% %%%", + L"+%%%+%%%"}, + { "Don't escape anything", + "Don't escape anything", + "Don't escape anything", + L"Don't escape anything"}, + { "+Invalid %escape %2+", + "+Invalid %escape %2+", + " Invalid %escape %2 ", + L"+Invalid %escape %2+"}, + { "Some random text %25%3BOK", + "Some random text %25;OK", + "Some random text %25;OK", + L"Some random text %25;OK"}, + { "%01%02%03%04%05%06%07%08%09", + "%01%02%03%04%05%06%07%08%09", + "%01%02%03%04%05%06%07%08%09", + L"%01%02%03%04%05%06%07%08%09"}, + { "%E4%BD%A0+%E5%A5%BD", + "\xE4\xBD\xA0+\xE5\xA5\xBD", + "\xE4\xBD\xA0 \xE5\xA5\xBD", + L"\x4f60+\x597d"}, + { "%ED%ED", // Invalid UTF-8. + "\xED\xED", + "\xED\xED", + L"%ED%ED"}, // Invalid UTF-8 -> kept unescaped. }; for (size_t i = 0; i < arraysize(unescape_cases); i++) { @@ -230,14 +243,36 @@ TEST(Escape, UnescapeAndDecodeURLComponent) { EXPECT_EQ(std::string(unescape_cases[i].query_unescaped), unescaped); // TODO: Need to test unescape_spaces and unescape_percent. 
- std::wstring decoded = UnescapeAndDecodeURLComponent( - unescape_cases[i].input, unescape_cases[i].encoding, - UnescapeRule::NORMAL); + std::wstring decoded = UnescapeAndDecodeUTF8URLComponent( + unescape_cases[i].input, UnescapeRule::NORMAL, NULL); EXPECT_EQ(std::wstring(unescape_cases[i].decoded), decoded); } } -TEST(Escape, EscapeForHTML) { +TEST(EscapeTest, AdjustOffset) { + const AdjustOffsetCase adjust_cases[] = { + {"", 0, std::wstring::npos}, + {"test", 0, 0}, + {"test", 2, 2}, + {"test", 4, std::wstring::npos}, + {"test", std::wstring::npos, std::wstring::npos}, + {"%3Btest", 6, 4}, + {"%3Btest", 2, std::wstring::npos}, + {"test%3B", 2, 2}, + {"%E4%BD%A0+%E5%A5%BD", 9, 1}, + {"%E4%BD%A0+%E5%A5%BD", 6, std::wstring::npos}, + {"%ED%B0%80+%E5%A5%BD", 6, 6}, + }; + + for (size_t i = 0; i < arraysize(adjust_cases); i++) { + size_t offset = adjust_cases[i].input_offset; + UnescapeAndDecodeUTF8URLComponent(adjust_cases[i].input, + UnescapeRule::NORMAL, &offset); + EXPECT_EQ(adjust_cases[i].output_offset, offset); + } +} + +TEST(EscapeTest, EscapeForHTML) { const EscapeForHTMLCase tests[] = { { "hello", "hello" }, { "<hello>", "<hello>" }, diff --git a/net/base/net_util.cc b/net/base/net_util.cc index 85151e9..9171e54 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -650,60 +650,51 @@ bool IsIDNComponentSafe(const char16* str, } // Converts one component of a host (between dots) to IDN if safe. The result -// will be APPENDED to the given output string and will be the same as the -// input if it is not IDN or the IDN is unsafe to display. -void IDNToUnicodeOneComponent(const char16* comp, - int comp_len, +// will be APPENDED to the given output string and will be the same as the input +// if it is not IDN or the IDN is unsafe to display. Returns whether any +// conversion was performed. 
+bool IDNToUnicodeOneComponent(const char16* comp, + size_t comp_len, const std::wstring& languages, string16* out) { - DCHECK(comp_len >= 0); + DCHECK(out); if (comp_len == 0) - return; + return false; - // Expand the output string to make room for a possibly longer string - // (we'll expand if it's still not big enough below). - int extra_space = 64; - size_t host_begin_in_output = out->size(); - - // Just copy the input if it can't be an IDN component. - if (comp_len < 4 || - comp[0] != 'x' || comp[1] != 'n' || comp[2] != '-' || comp[3] != '-') { - out->resize(host_begin_in_output + comp_len); - for (int i = 0; i < comp_len; i++) - (*out)[host_begin_in_output + i] = comp[i]; - return; - } + // Only transform if the input can be an IDN component. + static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; + if ((comp_len > arraysize(kIdnPrefix)) && + !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) { + // Repeatedly expand the output string until it's big enough. It looks like + // ICU will return the required size of the buffer, but that's not + // documented, so we'll just grow by 2x. This should be rare and is not on a + // critical path. + size_t original_length = out->length(); + for (int extra_space = 64; ; extra_space *= 2) { + UErrorCode status = U_ZERO_ERROR; + out->resize(out->length() + extra_space); + int output_chars = uidna_IDNToUnicode(comp, + static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space, + UIDNA_DEFAULT, NULL, &status); + if (status == U_ZERO_ERROR) { + // Converted successfully. 
+ out->resize(original_length + output_chars); + if (IsIDNComponentSafe(out->data() + original_length, output_chars, + languages)) + return true; + } - while (true) { - UErrorCode status = U_ZERO_ERROR; - out->resize(out->size() + extra_space); - int output_chars = - uidna_IDNToUnicode(comp, comp_len, &(*out)[host_begin_in_output], - extra_space, UIDNA_DEFAULT, NULL, &status); - if (status == U_ZERO_ERROR) { - // Converted successfully. - out->resize(host_begin_in_output + output_chars); - if (!IsIDNComponentSafe(&out->data()[host_begin_in_output], - output_chars, - languages)) - break; // The error handling below will undo the IDN. - return; + if (status != U_BUFFER_OVERFLOW_ERROR) + break; } - if (status != U_BUFFER_OVERFLOW_ERROR) - break; - - // Need to loop again with a bigger buffer. It looks like ICU will - // return the required size of the buffer, but that's not documented, - // so we'll just grow by 2x. This should be rare and is not on a - // critical path. - extra_space *= 2; + // Failed, revert back to original string. + out->resize(original_length); } - // We get here on error, in which case we replace anything that was added - // with the literal input. - out->resize(host_begin_in_output + comp_len); - for (int i = 0; i < comp_len; i++) - (*out)[host_begin_in_output + i] = comp[i]; + // We get here with no IDN or on error, in which case we just append the + // literal input. + out->append(comp, comp_len); + return false; } // Helper for FormatUrl(). 
@@ -712,19 +703,23 @@ std::wstring FormatViewSourceUrl(const GURL& url, bool omit_username_password, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, - size_t* prefix_end) { + size_t* prefix_end, + size_t* offset_for_adjustment) { DCHECK(new_parsed); const wchar_t* const kWideViewSource = L"view-source:"; const size_t kViewSourceLengthPlus1 = 12; GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1)); + size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ? + std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1); + size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ? + NULL : &temp_offset; std::wstring result = net::FormatUrl(real_url, languages, - omit_username_password, unescape_rules, new_parsed, prefix_end); + omit_username_password, unescape_rules, new_parsed, prefix_end, + temp_offset_ptr); result.insert(0, kWideViewSource); // Adjust position values. - if (prefix_end) - *prefix_end += kViewSourceLengthPlus1; if (new_parsed->scheme.is_nonempty()) { // Assume "view-source:real-scheme" as a scheme. new_parsed->scheme.len += kViewSourceLengthPlus1; @@ -746,6 +741,12 @@ std::wstring FormatViewSourceUrl(const GURL& url, new_parsed->query.begin += kViewSourceLengthPlus1; if (new_parsed->ref.is_nonempty()) new_parsed->ref.begin += kViewSourceLengthPlus1; + if (prefix_end) + *prefix_end += kViewSourceLengthPlus1; + if (temp_offset_ptr) { + *offset_for_adjustment = (temp_offset == std::wstring::npos) ? + std::wstring::npos : (temp_offset + kViewSourceLengthPlus1); + } return result; } @@ -769,12 +770,20 @@ std::set<int> explicitly_allowed_ports; // Appends the substring |in_component| inside of the URL |spec| to |output|, // and the resulting range will be filled into |out_component|. |unescape_rules| -// defines how to clean the URL for human readability. +// defines how to clean the URL for human readability. 
|offset_for_adjustment| +// is an offset into |output| which will be adjusted based on how it maps to the +// component being converted; if it is less than output->length(), it will be +// untouched, and if it is greater than output->length() + in_component.len it +// will be shortened by the difference in lengths between the input and output +// components. Otherwise it points into the component being converted, and is +// adjusted to point to the same logical place in |output|. +// |offset_for_adjustment| may not be NULL. static void AppendFormattedComponent(const std::string& spec, const url_parse::Component& in_component, UnescapeRule::Type unescape_rules, std::wstring* output, - url_parse::Component* out_component); + url_parse::Component* out_component, + size_t* offset_for_adjustment); GURL FilePathToFileURL(const FilePath& path) { // Produce a URL like "file:///C:/foo" for a regular file, or @@ -849,58 +858,56 @@ std::string GetHeaderParamValue(const std::string& field, // // We may want to skip this step in the case of file URLs to allow unicode // UNC hostnames regardless of encodings. -void IDNToUnicode(const char* host, - int host_len, - const std::wstring& languages, - std::wstring* out) { +std::wstring IDNToUnicode(const char* host, + size_t host_len, + const std::wstring& languages, + size_t* offset_for_adjustment) { // Convert the ASCII input to a wide string for ICU. string16 input16; input16.reserve(host_len); - for (int i = 0; i < host_len; i++) - input16.push_back(host[i]); + std::copy(host, host + host_len, std::back_inserter(input16)); string16 out16; - // The output string is appended to, so convert what's already there if - // needed. -#if defined(WCHAR_T_IS_UTF32) - WideToUTF16(out->data(), out->length(), &out16); - out->clear(); // for equivalence with the swap below -#elif defined(WCHAR_T_IS_UTF16) - out->swap(out16); -#endif + size_t output_offset = offset_for_adjustment ? 
+ *offset_for_adjustment : std::wstring::npos; // Do each component of the host separately, since we enforce script matching // on a per-component basis. - size_t cur_begin = 0; // Beginning of the current component (inclusive). - while (cur_begin < input16.size()) { - // Find the next dot or the end of the string. - size_t next_dot = input16.find_first_of('.', cur_begin); - if (next_dot == std::wstring::npos) - next_dot = input16.size(); // For getting the last component. - - if (next_dot > cur_begin) { + for (size_t component_start = 0, component_end; + component_start < input16.length(); + component_start = component_end + 1) { + // Find the end of the component. + component_end = input16.find('.', component_start); + if (component_end == string16::npos) + component_end = input16.length(); // For getting the last component. + size_t component_length = component_end - component_start; + + size_t output_component_start = out16.length(); + bool converted_idn = false; + if (component_end > component_start) { // Add the substring that we just found. - IDNToUnicodeOneComponent(&input16[cur_begin], - static_cast<int>(next_dot - cur_begin), - languages, - &out16); + converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start, + component_length, languages, &out16); + } + size_t output_component_length = out16.length() - output_component_start; + + if ((output_offset != std::wstring::npos) && + (*offset_for_adjustment > component_start)) { + if ((*offset_for_adjustment < component_end) && converted_idn) + output_offset = std::wstring::npos; + else + output_offset += output_component_length - component_length; } - // Need to add the dot we just found (if we found one). This needs to be - // done before we break out below in case the URL ends in a dot. - if (next_dot < input16.size()) + // Need to add the dot we just found (if we found one). + if (component_end < input16.length()) out16.push_back('.'); - else - break; // No more components left. 
- - cur_begin = next_dot + 1; } -#if defined(WCHAR_T_IS_UTF32) - UTF16ToWide(out16.data(), out16.length(), out); -#elif defined(WCHAR_T_IS_UTF16) - out->swap(out16); -#endif + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; + + return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment); } std::string CanonicalizeHost(const std::string& host, @@ -1262,31 +1269,48 @@ void GetIdentityFromURL(const GURL& url, std::wstring* username, std::wstring* password) { UnescapeRule::Type flags = UnescapeRule::SPACES; - *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags); - *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags); + *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL); + *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL); } void AppendFormattedHost(const GURL& url, const std::wstring& languages, std::wstring* output, - url_parse::Parsed* new_parsed) { + url_parse::Parsed* new_parsed, + size_t* offset_for_adjustment) { + DCHECK(output); const url_parse::Component& host = url.parsed_for_possibly_invalid_spec().host; if (host.is_nonempty()) { // Handle possible IDN in the host name. + int new_host_begin = static_cast<int>(output->length()); if (new_parsed) - new_parsed->host.begin = static_cast<int>(output->length()); + new_parsed->host.begin = new_host_begin; + size_t offset_past_current_output = + (!offset_for_adjustment || + (*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < output->length())) ? + std::wstring::npos : (*offset_for_adjustment - output->length()); + size_t* offset_into_host = + (offset_past_current_output >= static_cast<size_t>(host.len)) ? 
+ NULL : &offset_past_current_output; const std::string& spec = url.possibly_invalid_spec(); DCHECK(host.begin >= 0 && ((spec.length() == 0 && host.begin == 0) || host.begin < static_cast<int>(spec.length()))); - net::IDNToUnicode(&spec[host.begin], host.len, languages, output); + output->append(net::IDNToUnicode(&spec[host.begin], + static_cast<size_t>(host.len), languages, offset_into_host)); - if (new_parsed) { - new_parsed->host.len = - static_cast<int>(output->length()) - new_parsed->host.begin; + int new_host_len = static_cast<int>(output->length()) - new_host_begin; + if (new_parsed) + new_parsed->host.len = new_host_len; + if (offset_into_host) { + *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ? + std::wstring::npos : (new_host_begin + *offset_into_host); + } else if (offset_past_current_output != std::wstring::npos) { + *offset_for_adjustment += new_host_len - host.len; } } else if (new_parsed) { new_parsed->host.reset(); @@ -1298,19 +1322,36 @@ void AppendFormattedComponent(const std::string& spec, const url_parse::Component& in_component, UnescapeRule::Type unescape_rules, std::wstring* output, - url_parse::Component* out_component) { + url_parse::Component* out_component, + size_t* offset_for_adjustment) { + DCHECK(output); + DCHECK(offset_for_adjustment); if (in_component.is_nonempty()) { out_component->begin = static_cast<int>(output->length()); + size_t offset_past_current_output = + ((*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < output->length())) ? + std::wstring::npos : (*offset_for_adjustment - output->length()); + size_t* offset_into_component = + (offset_past_current_output >= static_cast<size_t>(in_component.len)) ? 
+ NULL : &offset_past_current_output; if (unescape_rules == UnescapeRule::NONE) { - output->append(UTF8ToWide(spec.substr( - in_component.begin, in_component.len))); + output->append(UTF8ToWideAndAdjustOffset( + spec.substr(in_component.begin, in_component.len), + offset_into_component)); } else { output->append(UnescapeAndDecodeUTF8URLComponent( - spec.substr(in_component.begin, in_component.len), - unescape_rules)); + spec.substr(in_component.begin, in_component.len), unescape_rules, + offset_into_component)); } out_component->len = static_cast<int>(output->length()) - out_component->begin; + if (offset_into_component) { + *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ? + std::wstring::npos : (out_component->begin + *offset_into_component); + } else if (offset_past_current_output != std::wstring::npos) { + *offset_for_adjustment += out_component->len - in_component.len; + } } else { out_component->reset(); } @@ -1321,10 +1362,14 @@ std::wstring FormatUrl(const GURL& url, bool omit_username_password, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, - size_t* prefix_end) { + size_t* prefix_end, + size_t* offset_for_adjustment) { url_parse::Parsed parsed_temp; if (!new_parsed) new_parsed = &parsed_temp; + size_t offset_temp = std::wstring::npos; + if (!offset_for_adjustment) + offset_for_adjustment = &offset_temp; std::wstring url_string; @@ -1332,6 +1377,7 @@ std::wstring FormatUrl(const GURL& url, if (url.is_empty()) { if (prefix_end) *prefix_end = 0; + *offset_for_adjustment = std::wstring::npos; return url_string; } @@ -1343,19 +1389,22 @@ std::wstring FormatUrl(const GURL& url, if (url.SchemeIs(kViewSource) && !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) { return FormatViewSourceUrl(url, languages, omit_username_password, - unescape_rules, new_parsed, prefix_end); + unescape_rules, new_parsed, prefix_end, offset_for_adjustment); } // We handle both valid and invalid URLs (this will give us 
the spec // regardless of validity). const std::string& spec = url.possibly_invalid_spec(); const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec(); + if (*offset_for_adjustment >= spec.length()) + *offset_for_adjustment = std::wstring::npos; // Copy everything before the username (the scheme and the separators.) // These are ASCII. - int pre_end = parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, true); - for (int i = 0; i < pre_end; ++i) - url_string.push_back(spec[i]); + std::copy(spec.begin(), + spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, + true), + std::back_inserter(url_string)); new_parsed->scheme = parsed.scheme; if (omit_username_password) { @@ -1364,16 +1413,41 @@ std::wstring FormatUrl(const GURL& url, // e.g. "http://google.com:search@evil.ru/" new_parsed->username.reset(); new_parsed->password.reset(); + if ((*offset_for_adjustment != std::wstring::npos) && + (parsed.username.is_nonempty() || parsed.password.is_nonempty())) { + if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) { + // The seeming off-by-one and off-by-two in these first two lines are to + // account for the ':' after the username and '@' after the password. + if (*offset_for_adjustment > + static_cast<size_t>(parsed.password.end())) { + *offset_for_adjustment -= + (parsed.username.len + parsed.password.len + 2); + } else if (*offset_for_adjustment > + static_cast<size_t>(parsed.username.begin)) { + *offset_for_adjustment = std::wstring::npos; + } + } else { + const url_parse::Component* nonempty_component = + parsed.username.is_nonempty() ? &parsed.username : &parsed.password; + // The seeming off-by-one in these first two lines is to account for the + // '@' after the username/password. 
+ if (*offset_for_adjustment > + static_cast<size_t>(nonempty_component->end())) { + *offset_for_adjustment -= (nonempty_component->len + 1); + } else if (*offset_for_adjustment > + static_cast<size_t>(nonempty_component->begin)) { + *offset_for_adjustment = std::wstring::npos; + } + } + } } else { - AppendFormattedComponent( - spec, parsed.username, unescape_rules, - &url_string, &new_parsed->username); + AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string, + &new_parsed->username, offset_for_adjustment); if (parsed.password.is_valid()) { url_string.push_back(':'); } - AppendFormattedComponent( - spec, parsed.password, unescape_rules, - &url_string, &new_parsed->password); + AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string, + &new_parsed->password, offset_for_adjustment); if (parsed.username.is_valid() || parsed.password.is_valid()) { url_string.push_back('@'); } @@ -1381,39 +1455,56 @@ std::wstring FormatUrl(const GURL& url, if (prefix_end) *prefix_end = static_cast<size_t>(url_string.length()); - AppendFormattedHost(url, languages, &url_string, new_parsed); + AppendFormattedHost(url, languages, &url_string, new_parsed, + offset_for_adjustment); // Port. if (parsed.port.is_nonempty()) { url_string.push_back(':'); - int begin = url_string.length(); - for (int i = parsed.port.begin; i < parsed.port.end(); ++i) - url_string.push_back(spec[i]); - new_parsed->port.begin = begin; - new_parsed->port.len = url_string.length() - begin; + new_parsed->port.begin = url_string.length(); + std::copy(spec.begin() + parsed.port.begin, + spec.begin() + parsed.port.end(), std::back_inserter(url_string)); + new_parsed->port.len = url_string.length() - new_parsed->port.begin; } else { new_parsed->port.reset(); } // Path and query both get the same general unescape & convert treatment. 
- AppendFormattedComponent( - spec, parsed.path, unescape_rules, &url_string, - &new_parsed->path); + AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string, + &new_parsed->path, offset_for_adjustment); if (parsed.query.is_valid()) url_string.push_back('?'); - AppendFormattedComponent( - spec, parsed.query, unescape_rules, &url_string, - &new_parsed->query); + AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string, + &new_parsed->query, offset_for_adjustment); // Reference is stored in valid, unescaped UTF-8, so we can just convert. if (parsed.ref.is_valid()) { url_string.push_back('#'); - int begin = url_string.length(); - if (parsed.ref.len > 0) - url_string.append(UTF8ToWide(std::string(&spec[parsed.ref.begin], - parsed.ref.len))); - new_parsed->ref.begin = begin; - new_parsed->ref.len = url_string.length() - begin; + new_parsed->ref.begin = url_string.length(); + size_t offset_past_current_output = + ((*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < url_string.length())) ? + std::wstring::npos : (*offset_for_adjustment - url_string.length()); + size_t* offset_into_ref = + (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ? + NULL : &offset_past_current_output; + if (parsed.ref.len > 0) { + url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin, + parsed.ref.len), + offset_into_ref)); + } + new_parsed->ref.len = url_string.length() - new_parsed->ref.begin; + if (offset_into_ref) { + *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ? + std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref); + } else if (offset_past_current_output != std::wstring::npos) { + // We clamped the offset near the beginning of this function to ensure it + // was within the input URL. If we reach here, the input was something + // invalid and non-parseable such that the offset was past any component + // we could figure out. 
In this case it won't be represented in the + // output string, so reset it. + *offset_for_adjustment = std::wstring::npos; + } } return url_string; diff --git a/net/base/net_util.h b/net/base/net_util.h index 1f1516f..d9affe6 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -129,10 +129,9 @@ std::string GetHeaderParamValue(const std::string& field, std::string GetFileNameFromCD(const std::string& header, const std::string& referrer_charset); -// Converts the given host name to unicode characters, APPENDING them to the -// the given output string. This can be called for any host name, if the -// input is not IDN or is invalid in some way, we'll just append the ASCII -// source to the output so it is still usable. +// Converts the given host name to unicode characters. This can be called for +// any host name, if the input is not IDN or is invalid in some way, we'll just +// return the ASCII source so it is still usable. // // The input should be the canonicalized ASCII host name from GURL. This // function does NOT accept UTF-8! Its length must also be given (this is @@ -146,10 +145,16 @@ std::string GetFileNameFromCD(const std::string& header, // Latin letters in the ASCII range can be mixed with a limited set of // script-language pairs (currently Han, Kana and Hangul for zh,ja and ko). // When |languages| is empty, even that mixing is not allowed. -void IDNToUnicode(const char* host, - int host_len, - const std::wstring& languages, - std::wstring* out); +// +// |offset_for_adjustment| is an offset into |host|, which will be adjusted to +// point at the same logical place in the output string. If this isn't possible +// because it points past the end of |host| or into the middle of a punycode +// sequence, it will be set to std::wstring::npos. |offset_for_adjustment| may +// be NULL. 
+std::wstring IDNToUnicode(const char* host, + size_t host_len, + const std::wstring& languages, + size_t* offset_for_adjustment); // Canonicalizes |host| and returns it. Also fills |host_info| with // IP address information. |host_info| must not be NULL. @@ -228,31 +233,47 @@ int SetNonBlocking(int fd); // the user. The given parsed structure will be updated. The host name formatter // also takes the same accept languages component as ElideURL. |new_parsed| may // be null. -void AppendFormattedHost(const GURL& url, const std::wstring& languages, - std::wstring* output, url_parse::Parsed* new_parsed); - -// Creates a string representation of |url|. The IDN host name may -// be in Unicode if |languages| accepts the Unicode representation. -// If |omit_username_password| is true, the username and the password are -// omitted. |unescape_rules| defines how to clean the URL for human readability. +void AppendFormattedHost(const GURL& url, + const std::wstring& languages, + std::wstring* output, + url_parse::Parsed* new_parsed, + size_t* offset_for_adjustment); + +// Creates a string representation of |url|. The IDN host name may be in Unicode +// if |languages| accepts the Unicode representation. If +// |omit_username_password| is true, any username and password are removed. +// |unescape_rules| defines how to clean the URL for human readability. // You will generally want |UnescapeRule::SPACES| for display to the user if you // can handle spaces, or |UnescapeRule::NORMAL| if not. If the path part and the // query part seem to be encoded in %-encoded UTF-8, decodes %-encoding and -// UTF-8. |new_parsed| will have parsing parameters of the resultant URL. +// UTF-8. +// +// The last three parameters may be NULL. +// |new_parsed| will be set to the parsing parameters of the resultant URL. // |prefix_end| will be the length before the hostname of the resultant URL. -// |new_parsed| and |prefix_end| may be NULL. 
+// |offset_for_adjustment| is an offset into the original |url|'s spec(), which +// will be modified to reflect changes this function makes to the output string; +// for example, if |url| is "http://a:b@c.com/", |omit_username_password| is +// true, and |offset_for_adjustment| is 12 (the offset of '.'), then on return +// the output string will be "http://c.com/" and |offset_for_adjustment| will be +// 8. If the offset cannot be successfully adjusted (e.g. because it points +// into the middle of a component that was entirely removed, past the end of the +// string, or into the middle of an encoding sequence), it will be set to +// std::wstring::npos. std::wstring FormatUrl(const GURL& url, const std::wstring& languages, bool omit_username_password, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, - size_t* prefix_end); + size_t* prefix_end, + size_t* offset_for_adjustment); // Creates a string representation of |url| for display to the user. // This is a shorthand of the above function with omit_username_password=true, // unescape=SPACES, new_parsed=NULL, and prefix_end=NULL. inline std::wstring FormatUrl(const GURL& url, const std::wstring& languages) { - return FormatUrl(url, languages, true, UnescapeRule::SPACES, NULL, NULL); + return FormatUrl(url, languages, true, UnescapeRule::SPACES, NULL, NULL, + NULL); } // Strip the portions of |url| that aren't core to the network request. diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index 07ec17c..308ef80 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -345,6 +345,11 @@ const IDNTestCase idn_cases[] = { #endif }; +struct AdjustOffsetCase { + size_t input_offset; + size_t output_offset; +}; + struct CompliantHostCase { const char* host; bool expected_output; @@ -782,14 +787,10 @@ TEST(NetUtilTest, IDNToUnicodeFast) { // ja || zh-TW,en || ko,ja -> IDNToUnicodeSlow if (j == 3 || j == 17 || j == 18) continue; - std::wstring output; - net::IDNToUnicode(idn_cases[i].input, - static_cast<int>(strlen(idn_cases[i].input)), - kLanguages[j], - &output); + std::wstring output(net::IDNToUnicode(idn_cases[i].input, + strlen(idn_cases[i].input), kLanguages[j], NULL)); std::wstring expected(idn_cases[i].unicode_allowed[j] ? - idn_cases[i].unicode_output : - ASCIIToWide(idn_cases[i].input)); + idn_cases[i].unicode_output : ASCIIToWide(idn_cases[i].input)); AppendLanguagesToOutputs(kLanguages[j], &expected, &output); EXPECT_EQ(expected, output); } @@ -802,20 +803,43 @@ TEST(NetUtilTest, IDNToUnicodeSlow) { // !(ja || zh-TW,en || ko,ja) -> IDNToUnicodeFast if (!(j == 3 || j == 17 || j == 18)) continue; - std::wstring output; - net::IDNToUnicode(idn_cases[i].input, - static_cast<int>(strlen(idn_cases[i].input)), - kLanguages[j], - &output); + std::wstring output(net::IDNToUnicode(idn_cases[i].input, + strlen(idn_cases[i].input), kLanguages[j], NULL)); std::wstring expected(idn_cases[i].unicode_allowed[j] ? 
- idn_cases[i].unicode_output : - ASCIIToWide(idn_cases[i].input)); + idn_cases[i].unicode_output : ASCIIToWide(idn_cases[i].input)); AppendLanguagesToOutputs(kLanguages[j], &expected, &output); EXPECT_EQ(expected, output); } } } +TEST(NetUtilTest, IDNToUnicodeAdjustOffset) { + const AdjustOffsetCase adjust_cases[] = { + {0, 0}, + {2, 2}, + {4, 4}, + {5, 5}, + {6, std::wstring::npos}, + {16, std::wstring::npos}, + {17, 7}, + {18, 8}, + {19, std::wstring::npos}, + {25, std::wstring::npos}, + {34, 12}, + {35, 13}, + {38, 16}, + {39, std::wstring::npos}, + {std::wstring::npos, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(adjust_cases); ++i) { + size_t offset = adjust_cases[i].input_offset; + // "test.\x89c6\x9891.\x5317\x4eac\x5927\x5b78.test" + net::IDNToUnicode("test.xn--cy2a840a.xn--1lq90ic7f1rc.test", 39, L"zh-CN", + &offset); + EXPECT_EQ(adjust_cases[i].output_offset, offset); + } +} + TEST(NetUtilTest, CompliantHost) { const CompliantHostCase compliant_host_cases[] = { {"", false}, @@ -1328,7 +1352,7 @@ TEST(NetUtilTest, FormatUrl) { size_t prefix_len; std::wstring formatted = net::FormatUrl( GURL(tests[i].input), tests[i].languages, tests[i].omit, - tests[i].escape_rules, NULL, &prefix_len); + tests[i].escape_rules, NULL, &prefix_len, NULL); EXPECT_EQ(tests[i].output, formatted) << tests[i].description; EXPECT_EQ(tests[i].prefix_len, prefix_len) << tests[i].description; } @@ -1340,7 +1364,7 @@ TEST(NetUtilTest, FormatUrlParsed) { std::wstring formatted = net::FormatUrl( GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), - L"ja", false, UnescapeRule::NONE, &parsed, NULL); + L"ja", false, UnescapeRule::NONE, &parsed, NULL, NULL); EXPECT_EQ(L"http://%E3%82%B0:%E3%83%BC@\x30B0\x30FC\x30B0\x30EB.jp:8080" L"/%E3%82%B0/?q=%E3%82%B0#\x30B0", formatted); EXPECT_EQ(L"%E3%82%B0", @@ -1360,7 +1384,7 @@ TEST(NetUtilTest, FormatUrlParsed) { formatted = net::FormatUrl( 
GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), - L"ja", false, UnescapeRule::NORMAL, &parsed, NULL); + L"ja", false, UnescapeRule::NORMAL, &parsed, NULL, NULL); EXPECT_EQ(L"http://\x30B0:\x30FC@\x30B0\x30FC\x30B0\x30EB.jp:8080" L"/\x30B0/?q=\x30B0#\x30B0", formatted); EXPECT_EQ(L"\x30B0", @@ -1379,7 +1403,7 @@ TEST(NetUtilTest, FormatUrlParsed) { formatted = net::FormatUrl( GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), - L"ja", true, UnescapeRule::NORMAL, &parsed, NULL); + L"ja", true, UnescapeRule::NORMAL, &parsed, NULL, NULL); EXPECT_EQ(L"http://\x30B0\x30FC\x30B0\x30EB.jp:8080" L"/\x30B0/?q=\x30B0#\x30B0", formatted); EXPECT_FALSE(parsed.username.is_valid()); @@ -1395,7 +1419,7 @@ TEST(NetUtilTest, FormatUrlParsed) { // View-source case. formatted = net::FormatUrl( GURL("view-source:http://user:passwd@host:81/path?query#ref"), - L"", true, UnescapeRule::NORMAL, &parsed, NULL); + L"", true, UnescapeRule::NORMAL, &parsed, NULL, NULL); EXPECT_EQ(L"view-source:http://host:81/path?query#ref", formatted); EXPECT_EQ(L"view-source:http", formatted.substr(parsed.scheme.begin, parsed.scheme.len)); @@ -1408,6 +1432,124 @@ TEST(NetUtilTest, FormatUrlParsed) { EXPECT_EQ(L"ref", formatted.substr(parsed.ref.begin, parsed.ref.len)); } +TEST(NetUtilTest, FormatUrlAdjustOffset) { + const AdjustOffsetCase basic_cases[] = { + {0, 0}, + {3, 3}, + {5, 5}, + {6, 6}, + {13, 13}, + {21, 21}, + {22, 22}, + {23, 23}, + {25, 25}, + {26, std::wstring::npos}, + {500000, std::wstring::npos}, + {std::wstring::npos, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(basic_cases); ++i) { + size_t offset = basic_cases[i].input_offset; + net::FormatUrl(GURL("http://www.google.com/foo/"), L"en", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(basic_cases[i].output_offset, offset); + } + + const struct { + const char* input_url; + size_t input_offset; + 
size_t output_offset; + } omit_auth_cases[] = { + {"http://foo:bar@www.google.com/", 6, 6}, + {"http://foo:bar@www.google.com/", 7, 7}, + {"http://foo:bar@www.google.com/", 8, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 10, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 11, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 14, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 15, 7}, + {"http://foo:bar@www.google.com/", 25, 17}, + {"http://foo@www.google.com/", 9, std::wstring::npos}, + {"http://foo@www.google.com/", 11, 7}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(omit_auth_cases); ++i) { + size_t offset = omit_auth_cases[i].input_offset; + net::FormatUrl(GURL(omit_auth_cases[i].input_url), L"en", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(omit_auth_cases[i].output_offset, offset); + } + + const AdjustOffsetCase view_source_cases[] = { + {0, 0}, + {3, 3}, + {11, 11}, + {12, 12}, + {13, 13}, + {19, 19}, + {20, std::wstring::npos}, + {23, 19}, + {26, 22}, + {std::wstring::npos, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(view_source_cases); ++i) { + size_t offset = view_source_cases[i].input_offset; + net::FormatUrl(GURL("view-source:http://foo@www.google.com/"), L"en", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(view_source_cases[i].output_offset, offset); + } + + const AdjustOffsetCase idn_hostname_cases[] = { + {8, std::wstring::npos}, + {16, std::wstring::npos}, + {24, std::wstring::npos}, + {25, 12}, + {30, 17}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(idn_hostname_cases); ++i) { + size_t offset = idn_hostname_cases[i].input_offset; + // "http://\x671d\x65e5\x3042\x3055\x3072.jp/foo/" + net::FormatUrl(GURL("http://xn--l8jvb1ey91xtjb.jp/foo/"), L"ja", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(idn_hostname_cases[i].output_offset, offset); + } + + const AdjustOffsetCase unescape_cases[] = { + {25, 25}, + {26, 
std::wstring::npos}, + {27, std::wstring::npos}, + {28, 26}, + {35, std::wstring::npos}, + {41, 31}, + {59, 33}, + {60, std::wstring::npos}, + {67, std::wstring::npos}, + {68, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(unescape_cases); ++i) { + size_t offset = unescape_cases[i].input_offset; + // "http://www.google.com/foo bar/\x30B0\x30FC\x30B0\x30EB" + net::FormatUrl(GURL( + "http://www.google.com/foo%20bar/%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB"), + L"en", true, UnescapeRule::SPACES, NULL, NULL, &offset); + EXPECT_EQ(unescape_cases[i].output_offset, offset); + } + + const AdjustOffsetCase ref_cases[] = { + {30, 30}, + {31, 31}, + {32, std::wstring::npos}, + {34, 32}, + {37, 33}, + {38, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(ref_cases); ++i) { + size_t offset = ref_cases[i].input_offset; + // "http://www.google.com/foo.html#\x30B0\x30B0z" + net::FormatUrl(GURL( + "http://www.google.com/foo.html#\xE3\x82\xB0\xE3\x82\xB0z"), L"en", + true, UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(ref_cases[i].output_offset, offset); + } +} + TEST(NetUtilTest, SimplifyUrlForRequest) { struct { const char* input_url; @@ -1466,4 +1608,3 @@ TEST(NetUtilTest, SetExplicitlyAllowedPortsTest) { EXPECT_EQ(i, net::explicitly_allowed_ports.size()); } } - |