diff options
author | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
---|---|---|
committer | pkasting@chromium.org <pkasting@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-11-07 01:34:53 +0000 |
commit | ce85f60cd9d399109dab39fe5a9613879ab9a8f7 (patch) | |
tree | 0e9e0072d2e5eadfeec08eef0f06a43c56dc1751 | |
parent | d90684d0cf0aa16389c9202153c97d373829b7f3 (diff) | |
download | chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.zip chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.gz chromium_src-ce85f60cd9d399109dab39fe5a9613879ab9a8f7.tar.bz2 |
Fix various problems with inline autocomplete and URLs that change length during fixup:
* URLs with HTTP auth info, which gets stripped
* URLs with IDN hosts
* URLs with escaped values that get unescaped
In cases like these, we'd inline autocomplete from the wrong locations, highlight the wrong portions of the URL as matches, and sometimes DCHECK() in debug mode.
The fix is to track how fixup affects the offsets into the URL we care about. Plumbing this required an enormous number of additions :(
There is also a fix here to the URL Fixer Upper, which was obviously modified at some point in the past to use the Parsed components, but without updating the comments or some of the functionality to match. Since the fixer isn't supposed to "fix up" things that aren't simple typos, I removed some code that tried to "fix" bogus ports, which was causing bizarre effects when typing HTTP auth URLs ("http://foo:bar" would be fixed to "http://foo" and then matched for inline autocompletion, which was clearly wrong). This is tested incidentally by one of the new History URL Provider tests (which is how I discovered it).
BUG=4010
TEST=Covered by unittests
Review URL: http://codereview.chromium.org/372017
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31352 0039d316-1c4b-4281-b951-d872f2087c98
32 files changed, 1393 insertions, 747 deletions
diff --git a/app/gfx/text_elider.cc b/app/gfx/text_elider.cc index a1db1c6..dc9b199 100644 --- a/app/gfx/text_elider.cc +++ b/app/gfx/text_elider.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -33,8 +33,8 @@ std::wstring ElideUrl(const GURL& url, const std::wstring& languages) { // Get a formatted string and corresponding parsing of the url. url_parse::Parsed parsed; - std::wstring url_string = - net::FormatUrl(url, languages, true, UnescapeRule::SPACES, &parsed, NULL); + std::wstring url_string = net::FormatUrl(url, languages, true, + UnescapeRule::SPACES, &parsed, NULL, NULL); if (available_pixel_width <= 0) return url_string; @@ -334,12 +334,12 @@ std::wstring ElideText(const std::wstring& text, SortedDisplayURL::SortedDisplayURL(const GURL& url, const std::wstring& languages) { std::wstring host; - net::AppendFormattedHost(url, languages, &host, NULL); + net::AppendFormattedHost(url, languages, &host, NULL, NULL); sort_host_ = WideToUTF16Hack(host); string16 host_minus_www = WideToUTF16Hack(net::StripWWW(host)); url_parse::Parsed parsed; display_url_ = WideToUTF16Hack(net::FormatUrl(url, languages, - true, UnescapeRule::SPACES, &parsed, &prefix_end_)); + true, UnescapeRule::SPACES, &parsed, &prefix_end_, NULL)); if (sort_host_.length() > host_minus_www.length()) { prefix_end_ += sort_host_.length() - host_minus_www.length(); sort_host_.swap(host_minus_www); diff --git a/base/base.gyp b/base/base.gyp index 71ff640..f09e2e5 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -633,6 +633,7 @@ 'timer_unittest.cc', 'tracked_objects_unittest.cc', 'tuple_unittest.cc', + 'utf_string_conversions_unittest.cc', 'values_unittest.cc', 'version_unittest.cc', 'waitable_event_unittest.cc', diff --git a/base/i18n/icu_string_conversions.cc 
b/base/i18n/icu_string_conversions.cc index ba9f9ae..c93b103 100644 --- a/base/i18n/icu_string_conversions.cc +++ b/base/i18n/icu_string_conversions.cc @@ -157,6 +157,90 @@ const char kCodepageUTF16LE[] = "UTF-16LE"; // Codepage <-> Wide/UTF-16 --------------------------------------------------- +// Convert a UTF-16 string into the specified codepage_name. If the codepage +// isn't found, return false. +bool UTF16ToCodepage(const string16& utf16, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::string* encoded) { + encoded->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + return ConvertFromUTF16(converter, utf16.c_str(), + static_cast<int>(utf16.length()), on_error, encoded); +} + +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment) { + utf16->clear(); + + UErrorCode status = U_ZERO_ERROR; + UConverter* converter = ucnv_open(codepage_name, &status); + if (!U_SUCCESS(status)) + return false; + + // Even in the worst case, the maximum length in 2-byte units of UTF-16 + // output would be at most the same as the number of bytes in input. There + // is no single-byte encoding in which a character is mapped to a + // non-BMP character requiring two 2-byte units. + // + // Moreover, non-BMP characters in legacy multibyte encodings + // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are + // BOCU and SCSU, but we don't care about them. 
+ size_t uchar_max_length = encoded.length() + 1; + + SetUpErrorHandlerForToUChars(on_error, converter, &status); + char16* byte_buffer = WriteInto(utf16, uchar_max_length); + int byte_buffer_length = static_cast<int>(uchar_max_length); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = string16::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, + data, static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size; + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. + *offset_for_adjustment = string16::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toUChars(converter, byte_buffer, byte_buffer_length, data, + length, &status); + ucnv_close(converter); + if (!U_SUCCESS(status)) { + utf16->clear(); // Make sure the output is empty on error. + return false; + } + + utf16->resize(actual_size); + return true; +} + // Convert a wstring into the specified codepage_name. If the codepage // isn't found, return false. 
bool WideToCodepage(const std::wstring& wide, @@ -188,31 +272,16 @@ bool WideToCodepage(const std::wstring& wide, #endif // defined(WCHAR_T_IS_UTF32) } -// Convert a UTF-16 string into the specified codepage_name. If the codepage -// isn't found, return false. -bool UTF16ToCodepage(const string16& utf16, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::string* encoded) { - encoded->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - return ConvertFromUTF16(converter, utf16.c_str(), - static_cast<int>(utf16.length()), on_error, encoded); -} - // Converts a string of the given codepage into wstring. // If the codepage isn't found, return false. -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide) { +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment) { #if defined(WCHAR_T_IS_UTF16) - return CodepageToUTF16(encoded, codepage_name, on_error, wide); + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, wide, + offset_for_adjustment); #elif defined(WCHAR_T_IS_UTF32) wide->clear(); @@ -227,70 +296,53 @@ bool CodepageToWide(const std::string& encoded, // this can be 4 times larger than actually needed. size_t wchar_max_length = encoded.length() + 1; - // The byte buffer and its length to pass to ucnv_toAlgorithimic. 
- char* byte_buffer = reinterpret_cast<char*>( - WriteInto(wide, wchar_max_length)); - int byte_buffer_length = static_cast<int>(wchar_max_length) * 4; - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toAlgorithmic(utf32_platform_endian(), - converter, - byte_buffer, - byte_buffer_length, - encoded.data(), - static_cast<int>(encoded.length()), - &status); + char* byte_buffer = + reinterpret_cast<char*>(WriteInto(wide, wchar_max_length)); + int byte_buffer_length = static_cast<int>(wchar_max_length) * sizeof(wchar_t); + const char* data = encoded.data(); + int length = static_cast<int>(encoded.length()); + int actual_size = 0; + if (offset_for_adjustment) { + if (*offset_for_adjustment >= encoded.length()) { + *offset_for_adjustment = std::wstring::npos; + } else if (*offset_for_adjustment != 0) { + // Try to adjust the offset by converting the string in two pieces and + // using the length of the first piece as the adjusted offset. + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, + static_cast<int>(*offset_for_adjustment), &status); + if (U_SUCCESS(status)) { + // Conversion succeeded, so update the offset and then fall through to + // appending the second half of the string. + data += *offset_for_adjustment; + length -= *offset_for_adjustment; + *offset_for_adjustment = actual_size / sizeof(wchar_t); + byte_buffer += actual_size; + byte_buffer_length -= actual_size; + } else { + // The offset may have been in the middle of an encoding sequence; mark + // it as having failed to adjust and then try to convert the entire + // string. 
+ *offset_for_adjustment = std::wstring::npos; + actual_size = 0; + ucnv_reset(converter); + status = U_ZERO_ERROR; + } + } + } + actual_size += ucnv_toAlgorithmic(utf32_platform_endian(), converter, + byte_buffer, byte_buffer_length, data, length, &status); ucnv_close(converter); - if (!U_SUCCESS(status)) { wide->clear(); // Make sure the output is empty on error. return false; } // actual_size is # of bytes. - wide->resize(actual_size / 4); + wide->resize(actual_size / sizeof(wchar_t)); return true; #endif // defined(WCHAR_T_IS_UTF32) } -// Converts a string of the given codepage into UTF-16. -// If the codepage isn't found, return false. -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16) { - utf16->clear(); - - UErrorCode status = U_ZERO_ERROR; - UConverter* converter = ucnv_open(codepage_name, &status); - if (!U_SUCCESS(status)) - return false; - - // Even in the worst case, the maximum length in 2-byte units of UTF-16 - // output would be at most the same as the number of bytes in input. There - // is no single-byte encoding in which a character is mapped to a - // non-BMP character requiring two 2-byte units. - // - // Moreover, non-BMP characters in legacy multibyte encodings - // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are - // BOCU and SCSU, but we don't care about them. - size_t uchar_max_length = encoded.length() + 1; - - SetUpErrorHandlerForToUChars(on_error, converter, &status); - int actual_size = ucnv_toUChars(converter, - WriteInto(utf16, uchar_max_length), - static_cast<int>(uchar_max_length), - encoded.data(), - static_cast<int>(encoded.length()), - &status); - ucnv_close(converter); - if (!U_SUCCESS(status)) { - utf16->clear(); // Make sure the output is empty on error. 
- return false; - } - - utf16->resize(actual_size); - return true; -} - } // namespace base diff --git a/base/i18n/icu_string_conversions.h b/base/i18n/icu_string_conversions.h index e7dac605..6f2cab7 100644 --- a/base/i18n/icu_string_conversions.h +++ b/base/i18n/icu_string_conversions.h @@ -40,6 +40,17 @@ extern const char kCodepageUTF8[]; extern const char kCodepageUTF16BE[]; extern const char kCodepageUTF16LE[]; +// Like CodepageToUTF16() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |utf16|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::string16::npos. +// |offset_for_adjustment| may be NULL. +bool CodepageToUTF16AndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16, + size_t* offset_for_adjustment); + // Converts between UTF-16 strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), // returns false. @@ -47,11 +58,24 @@ bool UTF16ToCodepage(const string16& utf16, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); +inline bool CodepageToUTF16(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + string16* utf16) { + return CodepageToUTF16AndAdjustOffset(encoded, codepage_name, on_error, utf16, + NULL); +} -bool CodepageToUTF16(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - string16* utf16); +// Like CodepageToWide() (see below), but also takes an offset into |encoded|, +// which will be adjusted to point at the same logical place in |wide|. If +// this isn't possible because it points past the end of |encoded| or into the +// middle of a multibyte sequence, it will be set to std::wstring::npos. 
+// |offset_for_adjustment| may be NULL. +bool CodepageToWideAndAdjustOffset(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide, + size_t* offset_for_adjustment); // Converts between wide strings and the encoding specified. If the // encoding doesn't exist or the encoding fails (when on_error is FAIL), @@ -60,10 +84,13 @@ bool WideToCodepage(const std::wstring& wide, const char* codepage_name, OnStringConversionError::Type on_error, std::string* encoded); -bool CodepageToWide(const std::string& encoded, - const char* codepage_name, - OnStringConversionError::Type on_error, - std::wstring* wide); +inline bool CodepageToWide(const std::string& encoded, + const char* codepage_name, + OnStringConversionError::Type on_error, + std::wstring* wide) { + return CodepageToWideAndAdjustOffset(encoded, codepage_name, on_error, wide, + NULL); +} } // namespace base diff --git a/base/i18n/icu_string_conversions_unittest.cc b/base/i18n/icu_string_conversions_unittest.cc index 969ddb7..0088a03 100644 --- a/base/i18n/icu_string_conversions_unittest.cc +++ b/base/i18n/icu_string_conversions_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -9,9 +9,9 @@ #include <sstream> #include "base/basictypes.h" +#include "base/i18n/icu_string_conversions.h" #include "base/logging.h" #include "base/utf_string_conversions.h" -#include "base/i18n/icu_string_conversions.h" #include "testing/gtest/include/gtest/gtest.h" namespace base { @@ -39,7 +39,7 @@ string16 BuildString16(const wchar_t* s) { #endif } -static const wchar_t* const kConvertRoundtripCases[] = { +const wchar_t* const kConvertRoundtripCases[] = { L"Google Video", // "网页 图片 资讯更多 »" L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", @@ -68,7 +68,7 @@ static const wchar_t* const kConvertRoundtripCases[] = { } // namespace -TEST(StringUtilTest, ConvertCodepageUTF8) { +TEST(ICUStringConversionsTest, ConvertCodepageUTF8) { // Make sure WideToCodepage works like WideToUTF8. for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { std::string expected(WideToUTF8(kConvertRoundtripCases[i])); @@ -156,7 +156,7 @@ static const struct { true, #if defined(WCHAR_T_IS_UTF16) L"\xD840\xDC00\x4E00", -#else +#elif defined(WCHAR_T_IS_UTF32) L"\x20000\x4E00", #endif L"\xD840\xDC00\x4E00"}, @@ -234,7 +234,7 @@ static const struct { NULL}, }; -TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndWide) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { std::wstring wide; bool success = CodepageToWide(kConvertCodepageCases[i].encoded, @@ -296,7 +296,7 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndWide) { OnStringConversionError::SKIP, &encoded)); } -TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { +TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) { for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kConvertCodepageCases); ++i) { string16 utf16; bool success = CodepageToUTF16(kConvertCodepageCases[i].encoded, @@ -325,4 +325,45 @@ TEST(StringUtilTest, ConvertBetweenCodepageAndUTF16) { } } +static const struct { + const char* codepage_name; + const 
char* encoded; + size_t input_offset; + size_t u16_output_offset; + size_t wide_output_offset; +} kAdjustOffsetCases[] = { + {"gb2312", "", 0, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 0, 0, 0}, + {"gb2312", "\xC4\xE3\xBA\xC3", 2, 1, 1}, + {"gb2312", "\xC4\xE3\xBA\xC3", 4, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", 1, string16::npos, std::wstring::npos}, + {"gb2312", "\xC4\xE3\xBA\xC3", std::string::npos, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 2, string16::npos, + std::wstring::npos}, + {"gb18030", "\x95\x32\x82\x36\xD2\xBB", 4, 2, 1}, +}; + +TEST(ICUStringConversionsTest, AdjustOffset) { + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kAdjustOffsetCases); ++i) { + string16 utf16; + size_t offset = kAdjustOffsetCases[i].input_offset; + EXPECT_TRUE(CodepageToUTF16AndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &utf16, &offset)); + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); + + std::wstring wide; + offset = kAdjustOffsetCases[i].input_offset; + CodepageToWideAndAdjustOffset(kAdjustOffsetCases[i].encoded, + kAdjustOffsetCases[i].codepage_name, + OnStringConversionError::FAIL, &wide, &offset); +#if defined(WCHAR_T_IS_UTF16) + EXPECT_EQ(kAdjustOffsetCases[i].u16_output_offset, offset); +#elif defined(WCHAR_T_IS_UTF32) + EXPECT_EQ(kAdjustOffsetCases[i].wide_output_offset, offset); +#endif + } +} + } // namespace base diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 0ccea91..d691003 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -229,203 +229,6 @@ TEST(StringUtilTest, IsStringUTF8) { EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); } -static const wchar_t* const kConvertRoundtripCases[] = { - L"Google Video", - // "网页 图片 资讯更多 »" - L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", - // "Παγκόσμιος Ιστός" - L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" - L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", - // "Поиск страниц на русском" - L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" - L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" - L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", - // "전체서비스" - L"\xc804\xccb4\xc11c\xbe44\xc2a4", - - // Test characters that take more than 16 bits. This will depend on whether - // wchar_t is 16 or 32 bits. -#if defined(WCHAR_T_IS_UTF16) - L"\xd800\xdf00", - // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) - L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", -#elif defined(WCHAR_T_IS_UTF32) - L"\x10300", - // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) - L"\x11d40\x11d41\x11d42\x11d43\x11d44", -#endif -}; - -TEST(StringUtilTest, ConvertUTF8AndWide) { - // we round-trip all the wide strings through UTF-8 to make sure everything - // agrees on the conversion. This uses the stream operators to test them - // simultaneously. - for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { - std::ostringstream utf8; - utf8 << WideToUTF8(kConvertRoundtripCases[i]); - std::wostringstream wide; - wide << UTF8ToWide(utf8.str()); - - EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); - } -} - -TEST(StringUtilTest, ConvertUTF8AndWideEmptyString) { - // An empty std::wstring should be converted to an empty std::string, - // and vice versa. 
- std::wstring wempty; - std::string empty; - EXPECT_EQ(empty, WideToUTF8(wempty)); - EXPECT_EQ(wempty, UTF8ToWide(empty)); -} - -TEST(StringUtilTest, ConvertUTF8ToWide) { - struct UTF8ToWideCase { - const char* utf8; - const wchar_t* wide; - bool success; - } convert_cases[] = { - // Regular UTF-8 input. - {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, - // Non-character is passed through. - {"\xef\xbf\xbfHello", L"\xffffHello", true}, - // Truncated UTF-8 sequence. - {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, - // Truncated off the end. - {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, - // Non-shortest-form UTF-8. - {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, - // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. - {"\xed\xb0\x80", L"", false}, - // Non-BMP characters. The second is a non-character regarded as valid. - // The result will either be in UTF-16 or UTF-32. -#if defined(WCHAR_T_IS_UTF16) - {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, -#elif defined(WCHAR_T_IS_UTF32) - {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, - {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, -#endif - }; - - for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { - std::wstring converted; - EXPECT_EQ(convert_cases[i].success, - UTF8ToWide(convert_cases[i].utf8, - strlen(convert_cases[i].utf8), - &converted)); - std::wstring expected(convert_cases[i].wide); - EXPECT_EQ(expected, converted); - } - - // Manually test an embedded NULL. - std::wstring converted; - EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); - ASSERT_EQ(3U, converted.length()); -#if defined(WCHAR_T_IS_UNSIGNED) - EXPECT_EQ(0U, converted[0]); -#else - EXPECT_EQ(0, converted[0]); -#endif - EXPECT_EQ('Z', converted[1]); - EXPECT_EQ('\t', converted[2]); - - // Make sure that conversion replaces, not appends. 
- EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); - ASSERT_EQ(1U, converted.length()); - EXPECT_EQ('B', converted[0]); -} - -#if defined(WCHAR_T_IS_UTF16) -// This test is only valid when wchar_t == UTF-16. -TEST(StringUtilTest, ConvertUTF16ToUTF8) { - struct UTF16ToUTF8Case { - const wchar_t* utf16; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular UTF-16 input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, - // Non-characters are passed through. - {L"\xffffHello", "\xEF\xBF\xBFHello", true}, - {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, - // The first character is a truncated UTF-16 character. - {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - // Truncated at the end. - {L"\x597d\xd800", "\xe5\xa5\xbd", false}, - }; - - for (int i = 0; i < arraysize(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf16, - wcslen(convert_cases[i].utf16), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} - -#elif defined(WCHAR_T_IS_UTF32) -// This test is only valid when wchar_t == UTF-32. -TEST(StringUtilTest, ConvertUTF32ToUTF8) { - struct WideToUTF8Case { - const wchar_t* utf32; - const char* utf8; - bool success; - } convert_cases[] = { - // Regular 16-bit input. - {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, - // Test a non-BMP character. - {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, - // Non-characters are passed through. - {L"\xffffHello", "\xEF\xBF\xBFHello", true}, - {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, - // Invalid Unicode code points. - {L"\xfffffffHello", "Hello", false}, - // The first character is a truncated UTF-16 character. 
- {L"\xd800\x597d", "\xe5\xa5\xbd", false}, - {L"\xdc01Hello", "Hello", false}, - }; - - for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { - std::string converted; - EXPECT_EQ(convert_cases[i].success, - WideToUTF8(convert_cases[i].utf32, - wcslen(convert_cases[i].utf32), - &converted)); - std::string expected(convert_cases[i].utf8); - EXPECT_EQ(expected, converted); - } -} -#endif // defined(WCHAR_T_IS_UTF32) - -TEST(StringUtilTest, ConvertMultiString) { - static wchar_t wmulti[] = { - L'f', L'o', L'o', L'\0', - L'b', L'a', L'r', L'\0', - L'b', L'a', L'z', L'\0', - L'\0' - }; - static char multi[] = { - 'f', 'o', 'o', '\0', - 'b', 'a', 'r', '\0', - 'b', 'a', 'z', '\0', - '\0' - }; - std::wstring wmultistring; - memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); - EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); - std::string expected; - memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); - EXPECT_EQ(arraysize(multi) - 1, expected.length()); - const std::string& converted = WideToUTF8(wmultistring); - EXPECT_EQ(arraysize(multi) - 1, converted.length()); - EXPECT_EQ(expected, converted); -} - TEST(StringUtilTest, ConvertASCII) { static const char* char_cases[] = { "Google Video", diff --git a/base/utf_string_conversions.cc b/base/utf_string_conversions.cc index 6b25cd8..ffff50a 100644 --- a/base/utf_string_conversions.cc +++ b/base/utf_string_conversions.cc @@ -84,43 +84,50 @@ bool ReadUnicodeCharacter(const wchar_t* src, int32 src_len, // WriteUnicodeCharacter ------------------------------------------------------- -// Appends a UTF-8 character to the given 8-bit string. -void WriteUnicodeCharacter(uint32 code_point, std::string* output) { +// Appends a UTF-8 character to the given 8-bit string. Returns the number of +// bytes written. +size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) { if (code_point <= 0x7f) { // Fast path the common case of one byte. 
output->push_back(code_point); - return; + return 1; } - // U8_APPEND_UNSAFE can append up to 4 bytes. - int32 char_offset = static_cast<int32>(output->length()); + // CBU8_APPEND_UNSAFE can append up to 4 bytes. + size_t char_offset = output->length(); + size_t original_char_offset = char_offset; output->resize(char_offset + CBU8_MAX_LENGTH); CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); - // U8_APPEND_UNSAFE will advance our pointer past the inserted character, so + // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so // it will represent the new length of the string. output->resize(char_offset); + return char_offset - original_char_offset; } -// Appends the given code point as a UTF-16 character to the STL string. -void WriteUnicodeCharacter(uint32 code_point, string16* output) { +// Appends the given code point as a UTF-16 character to the given 16-bit +// string. Returns the number of 16-bit values written. +size_t WriteUnicodeCharacter(uint32 code_point, string16* output) { if (CBU16_LENGTH(code_point) == 1) { // Thie code point is in the Basic Multilingual Plane (BMP). output->push_back(static_cast<char16>(code_point)); - } else { - // Non-BMP characters use a double-character encoding. - int32 char_offset = static_cast<int32>(output->length()); - output->resize(char_offset + CBU16_MAX_LENGTH); - CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return 1; } + // Non-BMP characters use a double-character encoding. + size_t char_offset = output->length(); + output->resize(char_offset + CBU16_MAX_LENGTH); + CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point); + return CBU16_MAX_LENGTH; } #if defined(WCHAR_T_IS_UTF32) -// Appends the given UTF-32 character to the given 32-bit string. -inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { +// Appends the given UTF-32 character to the given 32-bit string. Returns the +// number of 32-bit values written. 
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { // This is the easy case, just append the character. output->push_back(code_point); + return 1; } #endif // defined(WCHAR_T_IS_UTF32) @@ -131,31 +138,57 @@ inline void WriteUnicodeCharacter(uint32 code_point, std::wstring* output) { // determine the source, and the given output STL string will be replaced by // the result. template<typename SRC_CHAR, typename DEST_STRING> -bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) { - output->clear(); +bool ConvertUnicode(const SRC_CHAR* src, + size_t src_len, + DEST_STRING* output, + size_t* offset_for_adjustment) { + size_t output_offset = + (offset_for_adjustment && *offset_for_adjustment < src_len) ? + *offset_for_adjustment : DEST_STRING::npos; // ICU requires 32-bit numbers. bool success = true; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; + size_t original_i = i; + size_t chars_written = 0; if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { - WriteUnicodeCharacter(code_point, output); + chars_written = WriteUnicodeCharacter(code_point, output); } else { // TODO(jungshik): consider adding 'Replacement character' (U+FFFD) // in place of an invalid codepoint. success = false; } + if ((output_offset != DEST_STRING::npos) && + (*offset_for_adjustment > original_i)) { + // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last + // character read, not after it (so that incrementing it in the loop + // increment will place it at the right location), so we need to account + // for that in determining the amount that was read. 
+ if (*offset_for_adjustment <= static_cast<size_t>(i)) + output_offset = DEST_STRING::npos; + else + output_offset += chars_written - (i - original_i + 1); + } } + + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; return success; } -// Guesses the length of the output in UTF-8 in bytes, and reserves that amount -// of space in the given string. We also assume that the input character types -// are unsigned, which will be true for UTF-16 and -32 on our systems. We assume -// the string length is greater than zero. +// Guesses the length of the output in UTF-8 in bytes, clears that output +// string, and reserves that amount of space. We assume that the input +// character types are unsigned, which will be true for UTF-16 and -32 on our +// systems. template<typename CHAR> -void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { +void PrepareForUTF8Output(const CHAR* src, + size_t src_len, + std::string* output) { + output->clear(); + if (src_len == 0) + return; if (src[0] < 0x80) { // Assume that the entire input will be ASCII. output->reserve(src_len); @@ -165,11 +198,15 @@ void ReserveUTF8Output(const CHAR* src, size_t src_len, std::string* output) { } } -// Guesses the size of the output buffer (containing either UTF-16 or -32 data) -// given some UTF-8 input that will be converted to it. See ReserveUTF8Output. -// We assume the source length is > 0. +// Prepares an output buffer (containing either UTF-16 or -32 data) given some +// UTF-8 input that will be converted to it. See PrepareForUTF8Output(). template<typename STRING> -void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { +void PrepareForUTF16Or32Output(const char* src, + size_t src_len, + STRING* output) { + output->clear(); + if (src_len == 0) + return; if (static_cast<unsigned char>(src[0]) < 0x80) { // Assume the input is all ASCII, which means 1:1 correspondence. 
output->reserve(src_len); @@ -184,111 +221,121 @@ void ReserveUTF16Or32Output(const char* src, size_t src_len, STRING* output) { // UTF-8 <-> Wide -------------------------------------------------------------- -std::string WideToUTF8(const std::wstring& wide) { - std::string ret; - if (wide.empty()) - return ret; +bool WideToUTF8AndAdjustOffset(const wchar_t* src, + size_t src_len, + std::string* output, + size_t* offset_for_adjustment) { + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode<wchar_t, std::string>(src, src_len, output, + offset_for_adjustment); +} +std::string WideToUTF8AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + std::string ret; // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. - WideToUTF8(wide.data(), wide.length(), &ret); + WideToUTF8AndAdjustOffset(wide.data(), wide.length(), &ret, + offset_for_adjustment); return ret; } -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF8Output(src, src_len, output); - return ConvertUnicode<wchar_t, std::string>(src, src_len, output); +bool UTF8ToWideAndAdjustOffset(const char* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode<char, std::wstring>(src, src_len, output, + offset_for_adjustment); } -std::wstring UTF8ToWide(const base::StringPiece& utf8) { +std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, + size_t* offset_for_adjustment) { std::wstring ret; - if (utf8.empty()) - return ret; - - UTF8ToWide(utf8.data(), utf8.length(), &ret); + UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret, + offset_for_adjustment); return ret; } -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - 
ReserveUTF16Or32Output(src, src_len, output); - return ConvertUnicode<char, std::wstring>(src, src_len, output); -} - // UTF-16 <-> Wide ------------------------------------------------------------- #if defined(WCHAR_T_IS_UTF16) // When wide == UTF-16, then conversions are a NOP. -string16 WideToUTF16(const std::wstring& wide) { - return wide; -} - -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { +bool WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment) { output->assign(src, src_len); + if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) + *offset_for_adjustment = string16::npos; return true; } -std::wstring UTF16ToWide(const string16& utf16) { - return utf16; +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + if (offset_for_adjustment && (*offset_for_adjustment >= wide.length())) + *offset_for_adjustment = string16::npos; + return wide; } -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { output->assign(src, src_len); + if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) + *offset_for_adjustment = std::wstring::npos; return true; } -#elif defined(WCHAR_T_IS_UTF32) - -string16 WideToUTF16(const std::wstring& wide) { - string16 ret; - if (wide.empty()) - return ret; - - WideToUTF16(wide.data(), wide.length(), &ret); - return ret; +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment) { + if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) + *offset_for_adjustment = std::wstring::npos; + return utf16; } -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { - if (src_len == 0) { - output->clear(); - return true; - } +#elif defined(WCHAR_T_IS_UTF32) +bool 
WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment) { + output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); - return ConvertUnicode<wchar_t, string16>(src, src_len, output); + return ConvertUnicode<wchar_t, string16>(src, src_len, output, + offset_for_adjustment); } -std::wstring UTF16ToWide(const string16& utf16) { - std::wstring ret; - if (utf16.empty()) - return ret; - - UTF16ToWide(utf16.data(), utf16.length(), &ret); +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment) { + string16 ret; + WideToUTF16AndAdjustOffset(wide.data(), wide.length(), &ret, + offset_for_adjustment); return ret; } -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { - if (src_len == 0) { - output->clear(); - return true; - } - +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment) { + output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. 
output->reserve(src_len); - return ConvertUnicode<char16, std::wstring>(src, src_len, output); + return ConvertUnicode<char16, std::wstring>(src, src_len, output, + offset_for_adjustment); +} + +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment) { + std::wstring ret; + UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret, + offset_for_adjustment); + return ret; } #endif // defined(WCHAR_T_IS_UTF32) @@ -298,20 +345,12 @@ bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) { #if defined(WCHAR_T_IS_UTF32) bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF16Or32Output(src, src_len, output); - return ConvertUnicode<char, string16>(src, src_len, output); + PrepareForUTF16Or32Output(src, src_len, output); + return ConvertUnicode<char, string16>(src, src_len, output, NULL); } string16 UTF8ToUTF16(const std::string& utf8) { string16 ret; - if (utf8.empty()) - return ret; - // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. UTF8ToUTF16(utf8.data(), utf8.length(), &ret); @@ -319,20 +358,12 @@ string16 UTF8ToUTF16(const std::string& utf8) { } bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) { - if (src_len == 0) { - output->clear(); - return true; - } - - ReserveUTF8Output(src, src_len, output); - return ConvertUnicode<char16, std::string>(src, src_len, output); + PrepareForUTF8Output(src, src_len, output); + return ConvertUnicode<char16, std::string>(src, src_len, output, NULL); } std::string UTF16ToUTF8(const string16& utf16) { std::string ret; - if (utf16.empty()) - return ret; - // Ignore the success flag of this call, it will do the best it can for // invalid input, which is what we want here. 
UTF16ToUTF8(utf16.data(), utf16.length(), &ret); diff --git a/base/utf_string_conversions.h b/base/utf_string_conversions.h index 89846ed..323233b 100644 --- a/base/utf_string_conversions.h +++ b/base/utf_string_conversions.h @@ -10,6 +10,37 @@ #include "base/string16.h" #include "base/string_piece.h" +// Like the conversions below, but also takes an offset into the source string, +// which will be adjusted to point at the same logical place in the result +// string. If this isn't possible because it points past the end of the source +// string or into the middle of a multibyte sequence, it will be set to +// std::wstring::npos. |offset_for_adjustment| may be NULL. +bool WideToUTF8AndAdjustOffset(const wchar_t* src, + size_t src_len, + std::string* output, + size_t* offset_for_adjustment); +std::string WideToUTF8AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment); +bool UTF8ToWideAndAdjustOffset(const char* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment); +std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, + size_t* offset_for_adjustment); + +bool WideToUTF16AndAdjustOffset(const wchar_t* src, + size_t src_len, + string16* output, + size_t* offset_for_adjustment); +string16 WideToUTF16AndAdjustOffset(const std::wstring& wide, + size_t* offset_for_adjustment); +bool UTF16ToWideAndAdjustOffset(const char16* src, + size_t src_len, + std::wstring* output, + size_t* offset_for_adjustment); +std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, + size_t* offset_for_adjustment); + // These convert between UTF-8, -16, and -32 strings. They are potentially slow, // so avoid unnecessary conversions. The low-level versions return a boolean // indicating whether the conversion was 100% valid. In this case, it will still @@ -23,15 +54,34 @@ // the Unicode replacement character or adding |replacement_char| parameter. 
// Currently, it's skipped in the output, which could be problematic in // some situations. -bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output); -std::string WideToUTF8(const std::wstring& wide); -bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output); -std::wstring UTF8ToWide(const base::StringPiece& utf8); - -bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output); -string16 WideToUTF16(const std::wstring& wide); -bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output); -std::wstring UTF16ToWide(const string16& utf16); +inline bool WideToUTF8(const wchar_t* src, + size_t src_len, + std::string* output) { + return WideToUTF8AndAdjustOffset(src, src_len, output, NULL); +} +inline std::string WideToUTF8(const std::wstring& wide) { + return WideToUTF8AndAdjustOffset(wide, NULL); +} +inline bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) { + return UTF8ToWideAndAdjustOffset(src, src_len, output, NULL); +} +inline std::wstring UTF8ToWide(const base::StringPiece& utf8) { + return UTF8ToWideAndAdjustOffset(utf8, NULL); +} + +inline bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) { + return WideToUTF16AndAdjustOffset(src, src_len, output, NULL); +} +inline string16 WideToUTF16(const std::wstring& wide) { + return WideToUTF16AndAdjustOffset(wide, NULL); +} +inline bool UTF16ToWide(const char16* src, size_t src_len, + std::wstring* output) { + return UTF16ToWideAndAdjustOffset(src, src_len, output, NULL); +} +inline std::wstring UTF16ToWide(const string16& utf16) { + return UTF16ToWideAndAdjustOffset(utf16, NULL); +} bool UTF8ToUTF16(const char* src, size_t src_len, string16* output); string16 UTF8ToUTF16(const std::string& utf8); diff --git a/base/utf_string_conversions_unittest.cc b/base/utf_string_conversions_unittest.cc new file mode 100644 index 0000000..67af7c3 --- /dev/null +++ b/base/utf_string_conversions_unittest.cc @@ -0,0 +1,306 @@ +// 
Copyright 
(c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/basictypes.h" +#include "base/string_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { + +namespace { + +// Given a null-terminated string of wchar_t with each wchar_t representing +// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. +// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) +// should be represented as a surrogate pair (two UTF-16 units) +// *even* where wchar_t is 32-bit (Linux and Mac). +// +// This is to help write tests for functions with string16 params until +// the C++ 0x UTF-16 literal is well-supported by compilers. +string16 BuildString16(const wchar_t* s) { +#if defined(WCHAR_T_IS_UTF16) + return string16(s); +#elif defined(WCHAR_T_IS_UTF32) + string16 u16; + while (*s != 0) { + DCHECK(static_cast<unsigned int>(*s) <= 0xFFFFu); + u16.push_back(*s++); + } + return u16; +#endif +} + +const wchar_t* const kConvertRoundtripCases[] = { + L"Google Video", + // "网页 图片 资讯更多 »" + L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", + // "Παγκόσμιος Ιστός" + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", + // "Поиск страниц на русском" + L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" + L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" + L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c", + // "전체서비스" + L"\xc804\xccb4\xc11c\xbe44\xc2a4", + + // Test characters that take more than 16 bits. This will depend on whether + // wchar_t is 16 or 32 bits. +#if defined(WCHAR_T_IS_UTF16) + L"\xd800\xdf00", + // ????? (Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44", +#elif defined(WCHAR_T_IS_UTF32) + L"\x10300", + // ????? 
(Mathematical Alphanumeric Symbols (U+011d40 - U+011d44 : A,B,C,D,E) + L"\x11d40\x11d41\x11d42\x11d43\x11d44", +#endif +}; + +} // namespace + +TEST(UTFStringConversionsTest, ConvertUTF8AndWide) { + // we round-trip all the wide strings through UTF-8 to make sure everything + // agrees on the conversion. This uses the stream operators to test them + // simultaneously. + for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { + std::ostringstream utf8; + utf8 << WideToUTF8(kConvertRoundtripCases[i]); + std::wostringstream wide; + wide << UTF8ToWide(utf8.str()); + + EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); + } +} + +TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) { + // An empty std::wstring should be converted to an empty std::string, + // and vice versa. + std::wstring wempty; + std::string empty; + EXPECT_EQ(empty, WideToUTF8(wempty)); + EXPECT_EQ(wempty, UTF8ToWide(empty)); +} + +TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { + struct UTF8ToWideCase { + const char* utf8; + const wchar_t* wide; + bool success; + } convert_cases[] = { + // Regular UTF-8 input. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, + // Non-character is passed through. + {"\xef\xbf\xbfHello", L"\xffffHello", true}, + // Truncated UTF-8 sequence. + {"\xe4\xa0\xe5\xa5\xbd", L"\x597d", false}, + // Truncated off the end. + {"\xe5\xa5\xbd\xe4\xa0", L"\x597d", false}, + // Non-shortest-form UTF-8. + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\x597d", false}, + // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. + {"\xed\xb0\x80", L"", false}, + // Non-BMP characters. The second is a non-character regarded as valid. + // The result will either be in UTF-16 or UTF-32. 
+#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, + {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::wstring converted; + EXPECT_EQ(convert_cases[i].success, + UTF8ToWide(convert_cases[i].utf8, + strlen(convert_cases[i].utf8), + &converted)); + std::wstring expected(convert_cases[i].wide); + EXPECT_EQ(expected, converted); + } + + // Manually test an embedded NULL. + std::wstring converted; + EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); + ASSERT_EQ(3U, converted.length()); + EXPECT_EQ(static_cast<wchar_t>(0), converted[0]); + EXPECT_EQ('Z', converted[1]); + EXPECT_EQ('\t', converted[2]); + + // Make sure that conversion replaces, not appends. + EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); + ASSERT_EQ(1U, converted.length()); + EXPECT_EQ('B', converted[0]); +} + +#if defined(WCHAR_T_IS_UTF16) +// This test is only valid when wchar_t == UTF-16. +TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf16; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular UTF-16 input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // The first character is a truncated UTF-16 character. + {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + // Truncated at the end. 
+ {L"\x597d\xd800", "\xe5\xa5\xbd", false}, + }; + + for (int i = 0; i < arraysize(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf16, + wcslen(convert_cases[i].utf16), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} + +#elif defined(WCHAR_T_IS_UTF32) +// This test is only valid when wchar_t == UTF-32. +TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { + struct WideToUTF8Case { + const wchar_t* utf32; + const char* utf8; + bool success; + } convert_cases[] = { + // Regular 16-bit input. + {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, + // Test a non-BMP character. + {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, + // Non-characters are passed through. + {L"\xffffHello", "\xEF\xBF\xBFHello", true}, + {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, + // Invalid Unicode code points. + {L"\xfffffffHello", "Hello", false}, + // The first character is a truncated UTF-16 character. 
+ {L"\xd800\x597d", "\xe5\xa5\xbd", false}, + {L"\xdc01Hello", "Hello", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(convert_cases); i++) { + std::string converted; + EXPECT_EQ(convert_cases[i].success, + WideToUTF8(convert_cases[i].utf32, + wcslen(convert_cases[i].utf32), + &converted)); + std::string expected(convert_cases[i].utf8); + EXPECT_EQ(expected, converted); + } +} +#endif // defined(WCHAR_T_IS_UTF32) + +TEST(UTFStringConversionsTest, ConvertMultiString) { + static wchar_t wmulti[] = { + L'f', L'o', L'o', L'\0', + L'b', L'a', L'r', L'\0', + L'b', L'a', L'z', L'\0', + L'\0' + }; + static char multi[] = { + 'f', 'o', 'o', '\0', + 'b', 'a', 'r', '\0', + 'b', 'a', 'z', '\0', + '\0' + }; + std::wstring wmultistring; + memcpy(WriteInto(&wmultistring, arraysize(wmulti)), wmulti, sizeof(wmulti)); + EXPECT_EQ(arraysize(wmulti) - 1, wmultistring.length()); + std::string expected; + memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); + EXPECT_EQ(arraysize(multi) - 1, expected.length()); + const std::string& converted = WideToUTF8(wmultistring); + EXPECT_EQ(arraysize(multi) - 1, converted.length()); + EXPECT_EQ(expected, converted); +} + +TEST(UTFStringConversionsTest, AdjustOffset) { + // Under the hood, all the functions call the same converter function, so we + // don't need to exhaustively check every case. 
+ struct WideToUTF8Case { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } wide_to_utf8_cases[] = { + {L"", 0, std::string::npos}, + {L"\x4f60\x597d", 0, 0}, + {L"\x4f60\x597d", 1, 3}, + {L"\x4f60\x597d", 2, std::string::npos}, + {L"\x4f60\x597d", std::wstring::npos, std::string::npos}, + {L"\xd800\x597dz", 1, 0}, + {L"\xd800\x597dz", 2, 3}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf8_cases); ++i) { + size_t offset = wide_to_utf8_cases[i].input_offset; + WideToUTF8AndAdjustOffset(wide_to_utf8_cases[i].wide, &offset); + EXPECT_EQ(wide_to_utf8_cases[i].output_offset, offset); + } + + struct UTF8ToWideCase { + const char* utf8; + size_t input_offset; + size_t output_offset; + } utf8_to_wide_cases[] = { + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, + {"\xed\xb0\x80z", 3, 0}, + {"A\xF0\x90\x8C\x80z", 1, 1}, + {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos}, +#if defined(WCHAR_T_IS_UTF16) + {"A\xF0\x90\x8C\x80z", 5, 3}, +#elif defined(WCHAR_T_IS_UTF32) + {"A\xF0\x90\x8C\x80z", 5, 2}, +#endif + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_wide_cases); ++i) { + size_t offset = utf8_to_wide_cases[i].input_offset; + UTF8ToWideAndAdjustOffset(utf8_to_wide_cases[i].utf8, &offset); + EXPECT_EQ(utf8_to_wide_cases[i].output_offset, offset); + } + +#if defined(WCHAR_T_IS_UTF32) + struct WideToUTF16Case { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } wide_to_utf16_cases[] = { + {L"\x4F60\x597D", 1, 1}, + {L"\x20000\x4E00", 1, 2}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(wide_to_utf16_cases); ++i) { + size_t offset = wide_to_utf16_cases[i].input_offset; + WideToUTF16AndAdjustOffset(wide_to_utf16_cases[i].wide, &offset); + EXPECT_EQ(wide_to_utf16_cases[i].output_offset, offset); + } + + struct UTF16ToWideCase { + const wchar_t* wide; + size_t input_offset; + size_t output_offset; + } utf16_to_wide_cases[] = { + {L"\xD840\xDC00\x4E00", 0, 0}, + 
{L"\xD840\xDC00\x4E00", 1, std::wstring::npos}, + {L"\xD840\xDC00\x4E00", 2, 1}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) { + size_t offset = utf16_to_wide_cases[i].input_offset; + UTF16ToWideAndAdjustOffset(BuildString16(utf16_to_wide_cases[i].wide), + &offset); + EXPECT_EQ(utf16_to_wide_cases[i].output_offset, offset); + } +#endif +} + +} // namespace base diff --git a/chrome/browser/autocomplete/autocomplete.cc b/chrome/browser/autocomplete/autocomplete.cc index f9223b4..1b0340d2 100644 --- a/chrome/browser/autocomplete/autocomplete.cc +++ b/chrome/browser/autocomplete/autocomplete.cc @@ -438,10 +438,6 @@ void AutocompleteMatch::ClassifyLocationInString( size_t overall_length, int style, ACMatchClassifications* classification) { - // Classifying an empty match makes no sense and will lead to validation - // errors later. - DCHECK(match_length > 0); - classification->clear(); // Don't classify anything about an empty string @@ -459,6 +455,9 @@ void AutocompleteMatch::ClassifyLocationInString( // No match, above classification will suffice for whole string. return; } + // Classifying an empty match makes no sense and will lead to validation + // errors later. + DCHECK(match_length > 0); classification->push_back(ACMatchClassification(match_location, (style | ACMatchClassification::MATCH) & ~ACMatchClassification::DIM)); diff --git a/chrome/browser/autocomplete/autocomplete.h b/chrome/browser/autocomplete/autocomplete.h index 0193b8c..f5d9ac0 100644 --- a/chrome/browser/autocomplete/autocomplete.h +++ b/chrome/browser/autocomplete/autocomplete.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -548,9 +548,9 @@ class AutocompleteProvider // profile's bookmark bar model. 
void UpdateStarredStateOfMatches(); - // A convenience function to call gfx::GetCleanStringFromUrl() with the - // current set of "Accept Languages" when check_accept_lang is true. - // Otherwise, it's called with an empty list. + // A convenience function to call net::FormatUrl() with the current set of + // "Accept Languages" when check_accept_lang is true. Otherwise, it's called + // with an empty list. std::wstring StringForURLDisplay(const GURL& url, bool check_accept_lang) const; diff --git a/chrome/browser/autocomplete/history_url_provider.cc b/chrome/browser/autocomplete/history_url_provider.cc index b44b6e7..a1d971a 100644 --- a/chrome/browser/autocomplete/history_url_provider.cc +++ b/chrome/browser/autocomplete/history_url_provider.cc @@ -68,9 +68,8 @@ void HistoryURLProvider::DeleteMatch(const AutocompleteMatch& match) { DCHECK(done_); // Delete the match from the history DB. - HistoryService* history_service = - profile_ ? profile_->GetHistoryService(Profile::EXPLICIT_ACCESS) : - history_service_; + HistoryService* const history_service = + profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); GURL selected_url(match.destination_url); if (!history_service || !selected_url.is_valid()) { NOTREACHED() << "Can't delete requested URL"; @@ -628,16 +627,17 @@ void HistoryURLProvider::RunAutocompletePasses( matches_.push_back(SuggestExactInput(input, trim_http)); // We'll need the history service to run both passes, so try to obtain it. - HistoryService* const history_service = profile_ ? - profile_->GetHistoryService(Profile::EXPLICIT_ACCESS) : history_service_; + HistoryService* const history_service = + profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); if (!history_service) return; // Create the data structure for the autocomplete passes. We'll save this off // onto the |params_| member for later deletion below if we need to run pass // 2. - const std::wstring& languages = profile_ ? 
- profile_->GetPrefs()->GetString(prefs::kAcceptLanguages) : std::wstring(); + std::wstring languages(languages_); + if (languages.empty() && profile_) + languages = profile_->GetPrefs()->GetString(prefs::kAcceptLanguages); scoped_ptr<HistoryURLProviderParams> params( new HistoryURLProviderParams(input, trim_http, languages)); @@ -826,28 +826,47 @@ AutocompleteMatch HistoryURLProvider::HistoryMatchToACMatch( !!info.visit_count(), AutocompleteMatch::HISTORY_URL); match.destination_url = info.url(); DCHECK(match.destination_url.is_valid()); + size_t inline_autocomplete_offset = + history_match.input_location + params->input.text().length(); match.fill_into_edit = net::FormatUrl(info.url(), - match_type == WHAT_YOU_TYPED ? std::wstring() : params->languages); - if (!params->input.prevent_inline_autocomplete()) { - match.inline_autocomplete_offset = - history_match.input_location + params->input.text().length(); - } + match_type == WHAT_YOU_TYPED ? std::wstring() : params->languages, true, + UnescapeRule::SPACES, NULL, NULL, &inline_autocomplete_offset); size_t offset = 0; if (params->trim_http && !history_match.match_in_scheme) { offset = TrimHttpPrefix(&match.fill_into_edit); - if (match.inline_autocomplete_offset != std::wstring::npos) { - DCHECK(match.inline_autocomplete_offset >= offset); - match.inline_autocomplete_offset -= offset; + if (inline_autocomplete_offset != std::wstring::npos) { + DCHECK(inline_autocomplete_offset >= offset); + inline_autocomplete_offset -= offset; } } + if (!params->input.prevent_inline_autocomplete()) + match.inline_autocomplete_offset = inline_autocomplete_offset; DCHECK((match.inline_autocomplete_offset == std::wstring::npos) || (match.inline_autocomplete_offset <= match.fill_into_edit.length())); - match.contents = match.fill_into_edit; - AutocompleteMatch::ClassifyLocationInString( - history_match.input_location - offset, params->input.text().length(), - match.contents.length(), ACMatchClassification::URL, - 
&match.contents_class); + size_t match_start = history_match.input_location; + match.contents = net::FormatUrl(info.url(), + match_type == WHAT_YOU_TYPED ? std::wstring() : params->languages, true, + UnescapeRule::SPACES, NULL, NULL, &match_start); + if (offset) { + TrimHttpPrefix(&match.contents); + if (match_start != std::wstring::npos) { + DCHECK(match_start >= offset); + match_start -= offset; + } + } + if ((match_start != std::wstring::npos) && + (inline_autocomplete_offset != std::wstring::npos) && + (inline_autocomplete_offset != match_start)) { + DCHECK(inline_autocomplete_offset > match_start); + AutocompleteMatch::ClassifyLocationInString(match_start, + inline_autocomplete_offset - match_start, match.contents.length(), + ACMatchClassification::URL, &match.contents_class); + } else { + AutocompleteMatch::ClassifyLocationInString(std::wstring::npos, 0, + match.contents.length(), ACMatchClassification::URL, + &match.contents_class); + } match.description = info.title(); AutocompleteMatch::ClassifyMatchInString(params->input.text(), info.title(), ACMatchClassification::NONE, diff --git a/chrome/browser/autocomplete/history_url_provider.h b/chrome/browser/autocomplete/history_url_provider.h index 50f6ba7..152a938 100644 --- a/chrome/browser/autocomplete/history_url_provider.h +++ b/chrome/browser/autocomplete/history_url_provider.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -135,18 +135,18 @@ class HistoryURLProvider : public AutocompleteProvider { public: HistoryURLProvider(ACProviderListener* listener, Profile* profile) : AutocompleteProvider(listener, profile, "HistoryURL"), - history_service_(NULL), prefixes_(GetPrefixes()), params_(NULL) { } #ifdef UNIT_TEST HistoryURLProvider(ACProviderListener* listener, - HistoryService* history_service) - : AutocompleteProvider(listener, NULL, "History"), - history_service_(history_service), + Profile* profile, + const std::wstring& languages) + : AutocompleteProvider(listener, profile, "History"), prefixes_(GetPrefixes()), - params_(NULL) { + params_(NULL), + languages_(languages) { } #endif // no destructor (see note above) @@ -379,10 +379,6 @@ class HistoryURLProvider : public AutocompleteProvider { MatchType match_type, size_t match_number); - // This is only non-null for testing, otherwise the HistoryService from the - // Profile is used. - HistoryService* history_service_; - // Prefixes to try appending to user input when looking for a match. const Prefixes prefixes_; @@ -391,6 +387,10 @@ class HistoryURLProvider : public AutocompleteProvider { // parameter itself is freed once it's no longer needed. The only reason we // keep this member is so we can set the cancel bit on it. HistoryURLProviderParams* params_; + + // Only used by unittests; if non-empty, overrides accept-languages in the + // profile's pref system. 
+ std::wstring languages_; }; #endif // CHROME_BROWSER_AUTOCOMPLETE_HISTORY_URL_PROVIDER_H_ diff --git a/chrome/browser/autocomplete/history_url_provider_unittest.cc b/chrome/browser/autocomplete/history_url_provider_unittest.cc index 408526a..45e1426 100644 --- a/chrome/browser/autocomplete/history_url_provider_unittest.cc +++ b/chrome/browser/autocomplete/history_url_provider_unittest.cc @@ -83,6 +83,11 @@ static TestURLInfo test_db[] = { {"http://go/", L"Intranet URL", 1, 1}, {"http://gooey/", L"Intranet URL 2", 5, 5}, + // URLs for testing offset adjustment + {"http://www.\xEA\xB5\x90\xEC\x9C\xA1.kr/", L"Korean", 2, 2}, + {"http://spaces.com/path%20with%20spaces/foo.html", L"Spaces", 2, 2}, + {"http://ms/c++%20style%20guide", L"Style guide", 2, 2}, + {"http://foo:bar@baz.com/", L"HTTP auth", 2, 2}, }; class HistoryURLProviderTest : public testing::Test, @@ -116,6 +121,8 @@ class HistoryURLProviderTest : public testing::Test, const std::string* expected_urls, size_t num_results); + void RunAdjustOffsetTest(const std::wstring text, size_t expected_offset); + MessageLoopForUI message_loop_; ChromeThread ui_thread_; ChromeThread file_thread_; @@ -144,7 +151,7 @@ void HistoryURLProviderTest::SetUpImpl(bool no_db) { profile_->CreateHistoryService(true, no_db); history_service_ = profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); - autocomplete_ = new HistoryURLProvider(this, profile_.get()); + autocomplete_ = new HistoryURLProvider(this, profile_.get(), L"en-US,en,ko"); FillData(); } @@ -189,6 +196,18 @@ void HistoryURLProviderTest::RunTest(const std::wstring text, EXPECT_EQ(expected_urls[i], matches_[i].destination_url.spec()); } +void HistoryURLProviderTest::RunAdjustOffsetTest(const std::wstring text, + size_t expected_offset) { + AutocompleteInput input(text, std::wstring(), false, false, false); + autocomplete_->Start(input, false); + if (!autocomplete_->done()) + MessageLoop::current()->Run(); + + matches_ = autocomplete_->matches(); + 
ASSERT_GE(matches_.size(), 1U) << "Input text: " << text; + EXPECT_EQ(expected_offset, matches_[0].inline_autocomplete_offset); +} + TEST_F(HistoryURLProviderTest, PromoteShorterURLs) { // Test that hosts get synthesized below popular pages. const std::string expected_nonsynth[] = { @@ -382,6 +401,14 @@ TEST_F(HistoryURLProviderTest, Fixup) { RunTest(L"17173", std::wstring(), false, fixup_5, arraysize(fixup_5)); } +TEST_F(HistoryURLProviderTest, AdjustOffset) { + RunAdjustOffsetTest(L"http://www.\uAD50\uC721", 13); + RunAdjustOffsetTest(L"http://spaces.com/path%20with%20spa", 31); + RunAdjustOffsetTest(L"http://ms/c++ s", 15); + RunAdjustOffsetTest(L"http://foo:ba", std::wstring::npos); + RunAdjustOffsetTest(L"http://foo:bar@ba", 9); +} + TEST_F(HistoryURLProviderTestNoDB, NavigateWithoutDB) { // Ensure that we will still produce matches for navigation when there is no // database. diff --git a/chrome/browser/bookmarks/bookmark_table_model.cc b/chrome/browser/bookmarks/bookmark_table_model.cc index 142090c..9b4fd82 100644 --- a/chrome/browser/bookmarks/bookmark_table_model.cc +++ b/chrome/browser/bookmarks/bookmark_table_model.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -324,9 +324,8 @@ std::wstring BookmarkTableModel::GetText(int row, int column_id) { std::wstring languages = model_ && model_->profile() ? 
model_->profile()->GetPrefs()->GetString(prefs::kAcceptLanguages) : std::wstring(); - std::wstring url_text = - net::FormatUrl(node->GetURL(), languages, false, UnescapeRule::SPACES, - NULL, NULL); + std::wstring url_text = net::FormatUrl(node->GetURL(), languages, false, + UnescapeRule::SPACES, NULL, NULL, NULL); if (l10n_util::GetTextDirection() == l10n_util::RIGHT_TO_LEFT) l10n_util::WrapStringWithLTRFormatting(&url_text); return url_text; diff --git a/chrome/browser/bookmarks/bookmark_utils.cc b/chrome/browser/bookmarks/bookmark_utils.cc index 54ea21cb..e62a30a 100644 --- a/chrome/browser/bookmarks/bookmark_utils.cc +++ b/chrome/browser/bookmarks/bookmark_utils.cc @@ -187,7 +187,7 @@ bool DoesBookmarkContainWords(const BookmarkNode* node, l10n_util::ToLower(node->GetTitle()), words) || DoesBookmarkTextContainWords(UTF8ToWide(node->GetURL().spec()), words) || DoesBookmarkTextContainWords(net::FormatUrl( - node->GetURL(), languages, false, true, NULL, NULL), words); + node->GetURL(), languages, false, true, NULL, NULL, NULL), words); } } // namespace diff --git a/chrome/browser/gtk/options/exceptions_page_gtk.cc b/chrome/browser/gtk/options/exceptions_page_gtk.cc index 164a821..10a8f2d 100644 --- a/chrome/browser/gtk/options/exceptions_page_gtk.cc +++ b/chrome/browser/gtk/options/exceptions_page_gtk.cc @@ -113,8 +113,7 @@ void ExceptionsPageGtk::SetExceptionList( for (size_t i = 0; i < result.size(); ++i) { exception_list_[i] = *result[i]; std::wstring formatted = net::FormatUrl(result[i]->origin, languages, - false, UnescapeRule::NONE, - NULL, NULL); + false, UnescapeRule::NONE, NULL, NULL, NULL); std::string site = WideToUTF8(formatted); GtkTreeIter iter; gtk_list_store_insert_with_values(exception_list_store_, &iter, (gint) i, diff --git a/chrome/browser/gtk/options/passwords_page_gtk.cc b/chrome/browser/gtk/options/passwords_page_gtk.cc index b2f6345..f4a2197 100644 --- a/chrome/browser/gtk/options/passwords_page_gtk.cc +++ 
b/chrome/browser/gtk/options/passwords_page_gtk.cc @@ -156,8 +156,7 @@ void PasswordsPageGtk::SetPasswordList( for (size_t i = 0; i < result.size(); ++i) { password_list_[i] = *result[i]; std::wstring formatted = net::FormatUrl(result[i]->origin, languages, - false, UnescapeRule::NONE, - NULL, NULL); + false, UnescapeRule::NONE, NULL, NULL, NULL); std::string site = WideToUTF8(formatted); std::string user = UTF16ToUTF8(result[i]->username_value); GtkTreeIter iter; diff --git a/chrome/browser/gtk/options/url_picker_dialog_gtk.cc b/chrome/browser/gtk/options/url_picker_dialog_gtk.cc index 6c4e38f..e646552 100644 --- a/chrome/browser/gtk/options/url_picker_dialog_gtk.cc +++ b/chrome/browser/gtk/options/url_picker_dialog_gtk.cc @@ -196,9 +196,8 @@ std::string UrlPickerDialogGtk::GetURLForPath(GtkTreePath* path) const { profile_->GetPrefs()->GetString(prefs::kAcceptLanguages); // Because the url_field_ is user-editable, we set the URL with // username:password and escaped path and query. - std::wstring formatted = net::FormatUrl( - url_table_model_->GetURL(row), languages, - false, UnescapeRule::NONE, NULL, NULL); + std::wstring formatted = net::FormatUrl(url_table_model_->GetURL(row), + languages, false, UnescapeRule::NONE, NULL, NULL, NULL); return WideToUTF8(formatted); } diff --git a/chrome/browser/net/browser_url_util.cc b/chrome/browser/net/browser_url_util.cc index 940d3b6..5f287795 100644 --- a/chrome/browser/net/browser_url_util.cc +++ b/chrome/browser/net/browser_url_util.cc @@ -21,9 +21,9 @@ void WriteURLToClipboard(const GURL& url, // Unescaping path and query is not a good idea because other applications // may not encode non-ASCII characters in UTF-8. See crbug.com/2820. string16 text = url.SchemeIs(chrome::kMailToScheme) ? 
- ASCIIToUTF16(url.path()) : - WideToUTF16(net::FormatUrl(url, languages, false, - UnescapeRule::NONE, NULL, NULL)); + ASCIIToUTF16(url.path()) : + WideToUTF16(net::FormatUrl(url, languages, false, UnescapeRule::NONE, + NULL, NULL, NULL)); ScopedClipboardWriter scw(clipboard); scw.WriteURL(text); diff --git a/chrome/browser/net/url_fixer_upper.cc b/chrome/browser/net/url_fixer_upper.cc index b465268..a68bc34 100644 --- a/chrome/browser/net/url_fixer_upper.cc +++ b/chrome/browser/net/url_fixer_upper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -146,11 +146,10 @@ static string FixupHomedir(const string& text) { #endif // Tries to create a file: URL from |text| if it looks like a filename, even if -// it doesn't resolve as a valid path or to an existing file. Returns true -// with a (possibly invalid) file: URL in |fixed_up_url| for input beginning -// with a drive specifier or "\\". Returns false in other cases (including -// file: URLs: these don't look like filenames), leaving fixed_up_url -// unchanged. +// it doesn't resolve as a valid path or to an existing file. Returns a +// (possibly invalid) file: URL in |fixed_up_url| for input beginning +// with a drive specifier or "\\". Returns the unchanged input in other cases +// (including file: URLs: these don't look like filenames). static string FixupPath(const string& text) { DCHECK(!text.empty()); @@ -173,7 +172,7 @@ static string FixupPath(const string& text) { GURL file_url = net::FilePathToFileURL(FilePath(filename)); if (file_url.is_valid()) { return WideToUTF8(net::FormatUrl(file_url, std::wstring(), true, - UnescapeRule::NORMAL, NULL, NULL)); + UnescapeRule::NORMAL, NULL, NULL, NULL)); } // Invalid file URL, just return the input. 
@@ -182,7 +181,6 @@ static string FixupPath(const string& text) { // Checks |domain| to see if a valid TLD is already present. If not, appends // |desired_tld| to the domain, and prepends "www." unless it's already present. -// Then modifies |fixed_up_url| to reflect the changes. static void AddDesiredTLD(const string& desired_tld, string* domain) { if (desired_tld.empty() || domain->empty()) @@ -268,30 +266,15 @@ static void FixupHost(const string& text, url->append(domain); } -// Looks for a port number, including initial colon, at port_start. If -// something invalid (which cannot be fixed up) is found, like ":foo" or -// ":7:7", returns false. Otherwise, removes any extra colons -// ("::1337" -> ":1337", ":/" -> "/") and returns true. static void FixupPort(const string& text, const url_parse::Component& part, string* url) { if (!part.is_valid()) return; - // Look for non-digit in port and strip if found. - string port(text, part.begin, part.len); - for (string::iterator i = port.begin(); i != port.end();) { - if (IsAsciiDigit(*i)) - ++i; - else - i = port.erase(i); - } - - if (port.empty()) - return; // Nothing to append. - + // We don't fix up the port at the moment. url->append(":"); - url->append(port); + url->append(text, part.begin, part.len); } static inline void FixupPath(const string& text, @@ -573,7 +556,7 @@ string URLFixerUpper::FixupRelativeFile(const FilePath& base_dir, GURL file_url = net::FilePathToFileURL(full_path); if (file_url.is_valid()) return WideToUTF8(net::FormatUrl(file_url, std::wstring(), - true, UnescapeRule::NORMAL, NULL, NULL)); + true, UnescapeRule::NORMAL, NULL, NULL, NULL)); // Invalid files fall through to regular processing. 
} diff --git a/chrome/browser/net/url_fixer_upper_unittest.cc b/chrome/browser/net/url_fixer_upper_unittest.cc index d7f8b93..5028cb2 100644 --- a/chrome/browser/net/url_fixer_upper_unittest.cc +++ b/chrome/browser/net/url_fixer_upper_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -210,8 +210,8 @@ struct fixup_case { {" foo.com/asdf bar", "", "http://foo.com/asdf bar"}, {"..www.google.com..", "", "http://www.google.com./"}, {"http://......", "", "http://....../"}, - {"http://host.com:ninety-two/", "", "http://host.com/"}, - {"http://host.com:ninety-two?foo", "", "http://host.com/?foo"}, + {"http://host.com:ninety-two/", "", "http://host.com:ninety-two/"}, + {"http://host.com:ninety-two?foo", "", "http://host.com:ninety-two/?foo"}, {"google.com:123", "", "http://google.com:123/"}, {"about:", "", "about:"}, {"about:version", "", "about:version"}, diff --git a/chrome/browser/tab_contents/tab_contents.cc b/chrome/browser/tab_contents/tab_contents.cc index 3a34459..4e45553 100644 --- a/chrome/browser/tab_contents/tab_contents.cc +++ b/chrome/browser/tab_contents/tab_contents.cc @@ -2517,9 +2517,9 @@ void TabContents::LoadStateChanged(const GURL& url, upload_size_ = upload_size; std::wstring languages = profile()->GetPrefs()->GetString(prefs::kAcceptLanguages); - load_state_host_.clear(); std::string host = url.host(); - net::IDNToUnicode(host.c_str(), host.size(), languages, &load_state_host_); + load_state_host_ = + net::IDNToUnicode(host.c_str(), host.size(), languages, NULL); if (load_state_ == net::LOAD_STATE_READING_RESPONSE) SetNotWaitingForResponse(); if (is_loading()) diff --git a/chrome/browser/toolbar_model.cc b/chrome/browser/toolbar_model.cc index 1169c42..42977f6 100644 --- a/chrome/browser/toolbar_model.cc +++ 
b/chrome/browser/toolbar_model.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -41,7 +41,8 @@ std::wstring ToolbarModel::GetText() { url = entry->virtual_url(); } } - return net::FormatUrl(url, languages, true, UnescapeRule::NORMAL, NULL, NULL); + return net::FormatUrl(url, languages, true, UnescapeRule::NORMAL, NULL, NULL, + NULL); } ToolbarModel::SecurityLevel ToolbarModel::GetSecurityLevel() { diff --git a/chrome/browser/views/bookmark_editor_view.cc b/chrome/browser/views/bookmark_editor_view.cc index 5443f81..f40e25f 100644 --- a/chrome/browser/views/bookmark_editor_view.cc +++ b/chrome/browser/views/bookmark_editor_view.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -278,9 +278,8 @@ void BookmarkEditorView::Init() { : std::wstring(); // The following URL is user-editable. We specify omit_username_password= // false and unescape=false to show the original URL except IDN. - url_text = - net::FormatUrl(details_.existing_node->GetURL(), languages, false, - UnescapeRule::NONE, NULL, NULL); + url_text = net::FormatUrl(details_.existing_node->GetURL(), languages, + false, UnescapeRule::NONE, NULL, NULL, NULL); } url_tf_.SetText(url_text); url_tf_.SetController(this); diff --git a/chrome/browser/views/url_picker.cc b/chrome/browser/views/url_picker.cc index 5232676..0133dbd 100644 --- a/chrome/browser/views/url_picker.cc +++ b/chrome/browser/views/url_picker.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -293,9 +293,8 @@ void UrlPicker::OnSelectionChanged() { profile_->GetPrefs()->GetString(prefs::kAcceptLanguages); // Because the url_field_ is user-editable, we set the URL with // username:password and escaped path and query. - std::wstring formatted = net::FormatUrl( - url_table_model_->GetURL(selection), languages, - false, UnescapeRule::NONE, NULL, NULL); + std::wstring formatted = net::FormatUrl(url_table_model_->GetURL(selection), + languages, false, UnescapeRule::NONE, NULL, NULL, NULL); url_field_->SetText(formatted); if (title_field_) title_field_->SetText(url_table_model_->GetTitle(selection)); diff --git a/net/base/escape.cc b/net/base/escape.cc index 3d2aca2..5196eb6 100644 --- a/net/base/escape.cc +++ b/net/base/escape.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -107,7 +107,14 @@ const char kUrlUnescape[128] = { }; std::string UnescapeURLImpl(const std::string& escaped_text, - UnescapeRule::Type rules) { + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { + size_t offset_temp = std::wstring::npos; + if (!offset_for_adjustment) + offset_for_adjustment = &offset_temp; + else if (*offset_for_adjustment >= escaped_text.length()) + *offset_for_adjustment = std::wstring::npos; + // Do not unescape anything, return the |escaped_text| text. if (rules == UnescapeRule::NONE) return escaped_text; @@ -136,8 +143,17 @@ std::string UnescapeURLImpl(const std::string& escaped_text, // Additionally allow control characters if requested. (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { // Use the unescaped version of the character. 
+ size_t length_before_append = result.length(); result.push_back(value); i += 2; + + // Adjust offset to match length change. + if (*offset_for_adjustment != std::string::npos) { + if (*offset_for_adjustment > (length_before_append + 2)) + *offset_for_adjustment -= 2; + else if (*offset_for_adjustment > length_before_append) + *offset_for_adjustment = std::string::npos; + } } else { // Keep escaped. Append a percent and we'll get the following two // digits on the next loops through. @@ -231,19 +247,27 @@ bool EscapeQueryParamValue(const std::wstring& text, const char* codepage, return true; } -std::wstring UnescapeAndDecodeURLComponent(const std::string& text, - const char* codepage, - UnescapeRule::Type rules) { +std::wstring UnescapeAndDecodeUTF8URLComponent(const std::string& text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { std::wstring result; - if (base::CodepageToWide(UnescapeURLImpl(text, rules), codepage, - base::OnStringConversionError::FAIL, &result)) + size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0; + if (base::CodepageToWideAndAdjustOffset( + UnescapeURLImpl(text, rules, offset_for_adjustment), + "UTF-8", base::OnStringConversionError::FAIL, &result, + offset_for_adjustment)) return result; // Character set looks like it's valid. - return UTF8ToWide(text); // Return the escaped version when it's not. + + // Not valid. Return the escaped version. Undo our changes to + // |offset_for_adjustment| since we haven't changed the string after all. 
+ if (offset_for_adjustment) + *offset_for_adjustment = original_offset; + return UTF8ToWideAndAdjustOffset(text, offset_for_adjustment); } std::string UnescapeURLComponent(const std::string& escaped_text, UnescapeRule::Type rules) { - return UnescapeURLImpl(escaped_text, rules); + return UnescapeURLImpl(escaped_text, rules, NULL); } template <class str> diff --git a/net/base/escape.h b/net/base/escape.h index 8761d4d..9ff17b6 100644 --- a/net/base/escape.h +++ b/net/base/escape.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -93,17 +93,17 @@ std::string UnescapeURLComponent(const std::string& escaped_text, UnescapeRule::Type rules); // Unescapes the given substring as a URL, and then tries to interpret the -// result as being encoded in the given code page. If the result is convertable -// into the code page, it will be returned as converted. If it is not, the -// original escaped string will be converted into a wide string and returned. -std::wstring UnescapeAndDecodeURLComponent(const std::string& text, - const char* codepage, - UnescapeRule::Type rules); -inline std::wstring UnescapeAndDecodeUTF8URLComponent( - const std::string& text, - UnescapeRule::Type rules) { - return UnescapeAndDecodeURLComponent(text, "UTF-8", rules); -} +// result as being encoded as UTF-8. If the result is convertable into UTF-8, it +// will be returned as converted. If it is not, the original escaped string will +// be converted into a wide string and returned. +// +// |offset_for_adjustment| may be NULL; if not, it is an offset into |text| that +// will be adjusted to point at the same logical place in the result string. 
If +// this isn't possible because it points into the middle of an escape sequence +// or past the end of the string, it will be set to std::wstring::npos. +std::wstring UnescapeAndDecodeUTF8URLComponent(const std::string& text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment); // Deprecated ------------------------------------------------------------------ diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc index 44bb9972..8e5e7dc 100644 --- a/net/base/escape_unittest.cc +++ b/net/base/escape_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -24,8 +24,7 @@ struct UnescapeURLCase { const char* output; }; -struct UnescapeAndDecodeURLCase { - const char* encoding; +struct UnescapeAndDecodeCase { const char* input; // The expected output when run through UnescapeURL. 
@@ -38,6 +37,12 @@ struct UnescapeAndDecodeURLCase { const wchar_t* decoded; }; +struct AdjustOffsetCase { + const char* input; + size_t input_offset; + size_t output_offset; +}; + struct EscapeForHTMLCase { const char* input; const char* expected_output; @@ -45,7 +50,7 @@ struct EscapeForHTMLCase { } // namespace -TEST(Escape, EscapeTextForFormSubmission) { +TEST(EscapeTest, EscapeTextForFormSubmission) { const EscapeCase escape_cases[] = { {L"foo", L"foo"}, {L"foo bar", L"foo+bar"}, @@ -93,7 +98,7 @@ TEST(Escape, EscapeTextForFormSubmission) { EXPECT_EQ(wide, EscapeQueryParamValueUTF8(test_str)); } -TEST(Escape, EscapePath) { +TEST(EscapeTest, EscapePath) { ASSERT_EQ( // Most of the character space we care about, un-escaped EscapePath( @@ -108,7 +113,7 @@ TEST(Escape, EscapePath) { "%7B%7C%7D~%7F%80%FF"); } -TEST(Escape, EscapeUrlEncodedData) { +TEST(EscapeTest, EscapeUrlEncodedData) { ASSERT_EQ( // Most of the character space we care about, un-escaped EscapeUrlEncodedData( @@ -123,7 +128,7 @@ TEST(Escape, EscapeUrlEncodedData) { "%7B%7C%7D~%7F%80%FF"); } -TEST(Escape, UnescapeURLComponent) { +TEST(EscapeTest, UnescapeURLComponent) { const UnescapeURLCase unescape_cases[] = { {"", UnescapeRule::NORMAL, ""}, {"%2", UnescapeRule::NORMAL, "%2"}, @@ -184,40 +189,48 @@ TEST(Escape, UnescapeURLComponent) { EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); } -TEST(Escape, UnescapeAndDecodeURLComponent) { - const UnescapeAndDecodeURLCase unescape_cases[] = { - {"UTF8", "%", "%", "%", L"%"}, - {"UTF8", "+", "+", " ", L"+"}, - {"UTF8", "%2+", "%2+", "%2 ", L"%2+"}, - {"UTF8", "+%%%+%%%", "+%%%+%%%", " %%% %%%", L"+%%%+%%%"}, - {"UTF8", "Don't escape anything", - "Don't escape anything", - "Don't escape anything", - L"Don't escape anything"}, - {"UTF8", "+Invalid %escape %2+", - "+Invalid %escape %2+", - " Invalid %escape %2 ", - L"+Invalid %escape %2+"}, - {"UTF8", "Some random text %25%3bOK", - "Some random text %25;OK", - "Some random text %25;OK", 
- L"Some random text %25;OK"}, - {"UTF8", "%01%02%03%04%05%06%07%08%09", - "%01%02%03%04%05%06%07%08%09", - "%01%02%03%04%05%06%07%08%09", - L"%01%02%03%04%05%06%07%08%09"}, - {"UTF8", "%E4%BD%A0+%E5%A5%BD", - "\xE4\xBD\xA0+\xE5\xA5\xBD", - "\xE4\xBD\xA0 \xE5\xA5\xBD", - L"\x4f60+\x597d"}, - {"BIG5", "%A7A%A6n", - "\xA7\x41\xA6n", - "\xA7\x41\xA6n", - L"\x4f60\x597d"}, - {"UTF8", "%ED%ED", // Invalid UTF-8. - "\xED\xED", - "\xED\xED", - L"%ED%ED"}, // Invalid UTF-8 -> kept unescaped. +TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponent) { + const UnescapeAndDecodeCase unescape_cases[] = { + { "%", + "%", + "%", + L"%"}, + { "+", + "+", + " ", + L"+"}, + { "%2+", + "%2+", + "%2 ", + L"%2+"}, + { "+%%%+%%%", + "+%%%+%%%", + " %%% %%%", + L"+%%%+%%%"}, + { "Don't escape anything", + "Don't escape anything", + "Don't escape anything", + L"Don't escape anything"}, + { "+Invalid %escape %2+", + "+Invalid %escape %2+", + " Invalid %escape %2 ", + L"+Invalid %escape %2+"}, + { "Some random text %25%3BOK", + "Some random text %25;OK", + "Some random text %25;OK", + L"Some random text %25;OK"}, + { "%01%02%03%04%05%06%07%08%09", + "%01%02%03%04%05%06%07%08%09", + "%01%02%03%04%05%06%07%08%09", + L"%01%02%03%04%05%06%07%08%09"}, + { "%E4%BD%A0+%E5%A5%BD", + "\xE4\xBD\xA0+\xE5\xA5\xBD", + "\xE4\xBD\xA0 \xE5\xA5\xBD", + L"\x4f60+\x597d"}, + { "%ED%ED", // Invalid UTF-8. + "\xED\xED", + "\xED\xED", + L"%ED%ED"}, // Invalid UTF-8 -> kept unescaped. }; for (size_t i = 0; i < arraysize(unescape_cases); i++) { @@ -230,14 +243,36 @@ TEST(Escape, UnescapeAndDecodeURLComponent) { EXPECT_EQ(std::string(unescape_cases[i].query_unescaped), unescaped); // TODO: Need to test unescape_spaces and unescape_percent. 
- std::wstring decoded = UnescapeAndDecodeURLComponent( - unescape_cases[i].input, unescape_cases[i].encoding, - UnescapeRule::NORMAL); + std::wstring decoded = UnescapeAndDecodeUTF8URLComponent( + unescape_cases[i].input, UnescapeRule::NORMAL, NULL); EXPECT_EQ(std::wstring(unescape_cases[i].decoded), decoded); } } -TEST(Escape, EscapeForHTML) { +TEST(EscapeTest, AdjustOffset) { + const AdjustOffsetCase adjust_cases[] = { + {"", 0, std::wstring::npos}, + {"test", 0, 0}, + {"test", 2, 2}, + {"test", 4, std::wstring::npos}, + {"test", std::wstring::npos, std::wstring::npos}, + {"%3Btest", 6, 4}, + {"%3Btest", 2, std::wstring::npos}, + {"test%3B", 2, 2}, + {"%E4%BD%A0+%E5%A5%BD", 9, 1}, + {"%E4%BD%A0+%E5%A5%BD", 6, std::wstring::npos}, + {"%ED%B0%80+%E5%A5%BD", 6, 6}, + }; + + for (size_t i = 0; i < arraysize(adjust_cases); i++) { + size_t offset = adjust_cases[i].input_offset; + UnescapeAndDecodeUTF8URLComponent(adjust_cases[i].input, + UnescapeRule::NORMAL, &offset); + EXPECT_EQ(adjust_cases[i].output_offset, offset); + } +} + +TEST(EscapeTest, EscapeForHTML) { const EscapeForHTMLCase tests[] = { { "hello", "hello" }, { "<hello>", "<hello>" }, diff --git a/net/base/net_util.cc b/net/base/net_util.cc index 85151e9..9171e54 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -650,60 +650,51 @@ bool IsIDNComponentSafe(const char16* str, } // Converts one component of a host (between dots) to IDN if safe. The result -// will be APPENDED to the given output string and will be the same as the -// input if it is not IDN or the IDN is unsafe to display. -void IDNToUnicodeOneComponent(const char16* comp, - int comp_len, +// will be APPENDED to the given output string and will be the same as the input +// if it is not IDN or the IDN is unsafe to display. Returns whether any +// conversion was performed. 
+bool IDNToUnicodeOneComponent(const char16* comp, + size_t comp_len, const std::wstring& languages, string16* out) { - DCHECK(comp_len >= 0); + DCHECK(out); if (comp_len == 0) - return; + return false; - // Expand the output string to make room for a possibly longer string - // (we'll expand if it's still not big enough below). - int extra_space = 64; - size_t host_begin_in_output = out->size(); - - // Just copy the input if it can't be an IDN component. - if (comp_len < 4 || - comp[0] != 'x' || comp[1] != 'n' || comp[2] != '-' || comp[3] != '-') { - out->resize(host_begin_in_output + comp_len); - for (int i = 0; i < comp_len; i++) - (*out)[host_begin_in_output + i] = comp[i]; - return; - } + // Only transform if the input can be an IDN component. + static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; + if ((comp_len > arraysize(kIdnPrefix)) && + !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) { + // Repeatedly expand the output string until it's big enough. It looks like + // ICU will return the required size of the buffer, but that's not + // documented, so we'll just grow by 2x. This should be rare and is not on a + // critical path. + size_t original_length = out->length(); + for (int extra_space = 64; ; extra_space *= 2) { + UErrorCode status = U_ZERO_ERROR; + out->resize(out->length() + extra_space); + int output_chars = uidna_IDNToUnicode(comp, + static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space, + UIDNA_DEFAULT, NULL, &status); + if (status == U_ZERO_ERROR) { + // Converted successfully. 
+ out->resize(original_length + output_chars); + if (IsIDNComponentSafe(out->data() + original_length, output_chars, + languages)) + return true; + } - while (true) { - UErrorCode status = U_ZERO_ERROR; - out->resize(out->size() + extra_space); - int output_chars = - uidna_IDNToUnicode(comp, comp_len, &(*out)[host_begin_in_output], - extra_space, UIDNA_DEFAULT, NULL, &status); - if (status == U_ZERO_ERROR) { - // Converted successfully. - out->resize(host_begin_in_output + output_chars); - if (!IsIDNComponentSafe(&out->data()[host_begin_in_output], - output_chars, - languages)) - break; // The error handling below will undo the IDN. - return; + if (status != U_BUFFER_OVERFLOW_ERROR) + break; } - if (status != U_BUFFER_OVERFLOW_ERROR) - break; - - // Need to loop again with a bigger buffer. It looks like ICU will - // return the required size of the buffer, but that's not documented, - // so we'll just grow by 2x. This should be rare and is not on a - // critical path. - extra_space *= 2; + // Failed, revert back to original string. + out->resize(original_length); } - // We get here on error, in which case we replace anything that was added - // with the literal input. - out->resize(host_begin_in_output + comp_len); - for (int i = 0; i < comp_len; i++) - (*out)[host_begin_in_output + i] = comp[i]; + // We get here with no IDN or on error, in which case we just append the + // literal input. + out->append(comp, comp_len); + return false; } // Helper for FormatUrl(). 
@@ -712,19 +703,23 @@ std::wstring FormatViewSourceUrl(const GURL& url, bool omit_username_password, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, - size_t* prefix_end) { + size_t* prefix_end, + size_t* offset_for_adjustment) { DCHECK(new_parsed); const wchar_t* const kWideViewSource = L"view-source:"; const size_t kViewSourceLengthPlus1 = 12; GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1)); + size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ? + std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1); + size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ? + NULL : &temp_offset; std::wstring result = net::FormatUrl(real_url, languages, - omit_username_password, unescape_rules, new_parsed, prefix_end); + omit_username_password, unescape_rules, new_parsed, prefix_end, + temp_offset_ptr); result.insert(0, kWideViewSource); // Adjust position values. - if (prefix_end) - *prefix_end += kViewSourceLengthPlus1; if (new_parsed->scheme.is_nonempty()) { // Assume "view-source:real-scheme" as a scheme. new_parsed->scheme.len += kViewSourceLengthPlus1; @@ -746,6 +741,12 @@ std::wstring FormatViewSourceUrl(const GURL& url, new_parsed->query.begin += kViewSourceLengthPlus1; if (new_parsed->ref.is_nonempty()) new_parsed->ref.begin += kViewSourceLengthPlus1; + if (prefix_end) + *prefix_end += kViewSourceLengthPlus1; + if (temp_offset_ptr) { + *offset_for_adjustment = (temp_offset == std::wstring::npos) ? + std::wstring::npos : (temp_offset + kViewSourceLengthPlus1); + } return result; } @@ -769,12 +770,20 @@ std::set<int> explicitly_allowed_ports; // Appends the substring |in_component| inside of the URL |spec| to |output|, // and the resulting range will be filled into |out_component|. |unescape_rules| -// defines how to clean the URL for human readability. +// defines how to clean the URL for human readability. 
|offset_for_adjustment| +// is an offset into |output| which will be adjusted based on how it maps to the +// component being converted; if it is less than output->length(), it will be +// untouched, and if it is greater than output->length() + in_component.len it +// will be shortened by the difference in lengths between the input and output +// components. Otherwise it points into the component being converted, and is +// adjusted to point to the same logical place in |output|. +// |offset_for_adjustment| may not be NULL. static void AppendFormattedComponent(const std::string& spec, const url_parse::Component& in_component, UnescapeRule::Type unescape_rules, std::wstring* output, - url_parse::Component* out_component); + url_parse::Component* out_component, + size_t* offset_for_adjustment); GURL FilePathToFileURL(const FilePath& path) { // Produce a URL like "file:///C:/foo" for a regular file, or @@ -849,58 +858,56 @@ std::string GetHeaderParamValue(const std::string& field, // // We may want to skip this step in the case of file URLs to allow unicode // UNC hostnames regardless of encodings. -void IDNToUnicode(const char* host, - int host_len, - const std::wstring& languages, - std::wstring* out) { +std::wstring IDNToUnicode(const char* host, + size_t host_len, + const std::wstring& languages, + size_t* offset_for_adjustment) { // Convert the ASCII input to a wide string for ICU. string16 input16; input16.reserve(host_len); - for (int i = 0; i < host_len; i++) - input16.push_back(host[i]); + std::copy(host, host + host_len, std::back_inserter(input16)); string16 out16; - // The output string is appended to, so convert what's already there if - // needed. -#if defined(WCHAR_T_IS_UTF32) - WideToUTF16(out->data(), out->length(), &out16); - out->clear(); // for equivalence with the swap below -#elif defined(WCHAR_T_IS_UTF16) - out->swap(out16); -#endif + size_t output_offset = offset_for_adjustment ? 
+ *offset_for_adjustment : std::wstring::npos; // Do each component of the host separately, since we enforce script matching // on a per-component basis. - size_t cur_begin = 0; // Beginning of the current component (inclusive). - while (cur_begin < input16.size()) { - // Find the next dot or the end of the string. - size_t next_dot = input16.find_first_of('.', cur_begin); - if (next_dot == std::wstring::npos) - next_dot = input16.size(); // For getting the last component. - - if (next_dot > cur_begin) { + for (size_t component_start = 0, component_end; + component_start < input16.length(); + component_start = component_end + 1) { + // Find the end of the component. + component_end = input16.find('.', component_start); + if (component_end == string16::npos) + component_end = input16.length(); // For getting the last component. + size_t component_length = component_end - component_start; + + size_t output_component_start = out16.length(); + bool converted_idn = false; + if (component_end > component_start) { // Add the substring that we just found. - IDNToUnicodeOneComponent(&input16[cur_begin], - static_cast<int>(next_dot - cur_begin), - languages, - &out16); + converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start, + component_length, languages, &out16); + } + size_t output_component_length = out16.length() - output_component_start; + + if ((output_offset != std::wstring::npos) && + (*offset_for_adjustment > component_start)) { + if ((*offset_for_adjustment < component_end) && converted_idn) + output_offset = std::wstring::npos; + else + output_offset += output_component_length - component_length; } - // Need to add the dot we just found (if we found one). This needs to be - // done before we break out below in case the URL ends in a dot. - if (next_dot < input16.size()) + // Need to add the dot we just found (if we found one). + if (component_end < input16.length()) out16.push_back('.'); - else - break; // No more components left. 
- - cur_begin = next_dot + 1; } -#if defined(WCHAR_T_IS_UTF32) - UTF16ToWide(out16.data(), out16.length(), out); -#elif defined(WCHAR_T_IS_UTF16) - out->swap(out16); -#endif + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; + + return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment); } std::string CanonicalizeHost(const std::string& host, @@ -1262,31 +1269,48 @@ void GetIdentityFromURL(const GURL& url, std::wstring* username, std::wstring* password) { UnescapeRule::Type flags = UnescapeRule::SPACES; - *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags); - *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags); + *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL); + *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL); } void AppendFormattedHost(const GURL& url, const std::wstring& languages, std::wstring* output, - url_parse::Parsed* new_parsed) { + url_parse::Parsed* new_parsed, + size_t* offset_for_adjustment) { + DCHECK(output); const url_parse::Component& host = url.parsed_for_possibly_invalid_spec().host; if (host.is_nonempty()) { // Handle possible IDN in the host name. + int new_host_begin = static_cast<int>(output->length()); if (new_parsed) - new_parsed->host.begin = static_cast<int>(output->length()); + new_parsed->host.begin = new_host_begin; + size_t offset_past_current_output = + (!offset_for_adjustment || + (*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < output->length())) ? + std::wstring::npos : (*offset_for_adjustment - output->length()); + size_t* offset_into_host = + (offset_past_current_output >= static_cast<size_t>(host.len)) ? 
+ NULL : &offset_past_current_output; const std::string& spec = url.possibly_invalid_spec(); DCHECK(host.begin >= 0 && ((spec.length() == 0 && host.begin == 0) || host.begin < static_cast<int>(spec.length()))); - net::IDNToUnicode(&spec[host.begin], host.len, languages, output); + output->append(net::IDNToUnicode(&spec[host.begin], + static_cast<size_t>(host.len), languages, offset_into_host)); - if (new_parsed) { - new_parsed->host.len = - static_cast<int>(output->length()) - new_parsed->host.begin; + int new_host_len = static_cast<int>(output->length()) - new_host_begin; + if (new_parsed) + new_parsed->host.len = new_host_len; + if (offset_into_host) { + *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ? + std::wstring::npos : (new_host_begin + *offset_into_host); + } else if (offset_past_current_output != std::wstring::npos) { + *offset_for_adjustment += new_host_len - host.len; } } else if (new_parsed) { new_parsed->host.reset(); @@ -1298,19 +1322,36 @@ void AppendFormattedComponent(const std::string& spec, const url_parse::Component& in_component, UnescapeRule::Type unescape_rules, std::wstring* output, - url_parse::Component* out_component) { + url_parse::Component* out_component, + size_t* offset_for_adjustment) { + DCHECK(output); + DCHECK(offset_for_adjustment); if (in_component.is_nonempty()) { out_component->begin = static_cast<int>(output->length()); + size_t offset_past_current_output = + ((*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < output->length())) ? + std::wstring::npos : (*offset_for_adjustment - output->length()); + size_t* offset_into_component = + (offset_past_current_output >= static_cast<size_t>(in_component.len)) ? 
+ NULL : &offset_past_current_output; if (unescape_rules == UnescapeRule::NONE) { - output->append(UTF8ToWide(spec.substr( - in_component.begin, in_component.len))); + output->append(UTF8ToWideAndAdjustOffset( + spec.substr(in_component.begin, in_component.len), + offset_into_component)); } else { output->append(UnescapeAndDecodeUTF8URLComponent( - spec.substr(in_component.begin, in_component.len), - unescape_rules)); + spec.substr(in_component.begin, in_component.len), unescape_rules, + offset_into_component)); } out_component->len = static_cast<int>(output->length()) - out_component->begin; + if (offset_into_component) { + *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ? + std::wstring::npos : (out_component->begin + *offset_into_component); + } else if (offset_past_current_output != std::wstring::npos) { + *offset_for_adjustment += out_component->len - in_component.len; + } } else { out_component->reset(); } @@ -1321,10 +1362,14 @@ std::wstring FormatUrl(const GURL& url, bool omit_username_password, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, - size_t* prefix_end) { + size_t* prefix_end, + size_t* offset_for_adjustment) { url_parse::Parsed parsed_temp; if (!new_parsed) new_parsed = &parsed_temp; + size_t offset_temp = std::wstring::npos; + if (!offset_for_adjustment) + offset_for_adjustment = &offset_temp; std::wstring url_string; @@ -1332,6 +1377,7 @@ std::wstring FormatUrl(const GURL& url, if (url.is_empty()) { if (prefix_end) *prefix_end = 0; + *offset_for_adjustment = std::wstring::npos; return url_string; } @@ -1343,19 +1389,22 @@ std::wstring FormatUrl(const GURL& url, if (url.SchemeIs(kViewSource) && !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) { return FormatViewSourceUrl(url, languages, omit_username_password, - unescape_rules, new_parsed, prefix_end); + unescape_rules, new_parsed, prefix_end, offset_for_adjustment); } // We handle both valid and invalid URLs (this will give us 
the spec // regardless of validity). const std::string& spec = url.possibly_invalid_spec(); const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec(); + if (*offset_for_adjustment >= spec.length()) + *offset_for_adjustment = std::wstring::npos; // Copy everything before the username (the scheme and the separators.) // These are ASCII. - int pre_end = parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, true); - for (int i = 0; i < pre_end; ++i) - url_string.push_back(spec[i]); + std::copy(spec.begin(), + spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, + true), + std::back_inserter(url_string)); new_parsed->scheme = parsed.scheme; if (omit_username_password) { @@ -1364,16 +1413,41 @@ std::wstring FormatUrl(const GURL& url, // e.g. "http://google.com:search@evil.ru/" new_parsed->username.reset(); new_parsed->password.reset(); + if ((*offset_for_adjustment != std::wstring::npos) && + (parsed.username.is_nonempty() || parsed.password.is_nonempty())) { + if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) { + // The seeming off-by-one and off-by-two in these first two lines are to + // account for the ':' after the username and '@' after the password. + if (*offset_for_adjustment > + static_cast<size_t>(parsed.password.end())) { + *offset_for_adjustment -= + (parsed.username.len + parsed.password.len + 2); + } else if (*offset_for_adjustment > + static_cast<size_t>(parsed.username.begin)) { + *offset_for_adjustment = std::wstring::npos; + } + } else { + const url_parse::Component* nonempty_component = + parsed.username.is_nonempty() ? &parsed.username : &parsed.password; + // The seeming off-by-one in these first two lines is to account for the + // '@' after the username/password. 
+ if (*offset_for_adjustment > + static_cast<size_t>(nonempty_component->end())) { + *offset_for_adjustment -= (nonempty_component->len + 1); + } else if (*offset_for_adjustment > + static_cast<size_t>(nonempty_component->begin)) { + *offset_for_adjustment = std::wstring::npos; + } + } + } } else { - AppendFormattedComponent( - spec, parsed.username, unescape_rules, - &url_string, &new_parsed->username); + AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string, + &new_parsed->username, offset_for_adjustment); if (parsed.password.is_valid()) { url_string.push_back(':'); } - AppendFormattedComponent( - spec, parsed.password, unescape_rules, - &url_string, &new_parsed->password); + AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string, + &new_parsed->password, offset_for_adjustment); if (parsed.username.is_valid() || parsed.password.is_valid()) { url_string.push_back('@'); } @@ -1381,39 +1455,56 @@ std::wstring FormatUrl(const GURL& url, if (prefix_end) *prefix_end = static_cast<size_t>(url_string.length()); - AppendFormattedHost(url, languages, &url_string, new_parsed); + AppendFormattedHost(url, languages, &url_string, new_parsed, + offset_for_adjustment); // Port. if (parsed.port.is_nonempty()) { url_string.push_back(':'); - int begin = url_string.length(); - for (int i = parsed.port.begin; i < parsed.port.end(); ++i) - url_string.push_back(spec[i]); - new_parsed->port.begin = begin; - new_parsed->port.len = url_string.length() - begin; + new_parsed->port.begin = url_string.length(); + std::copy(spec.begin() + parsed.port.begin, + spec.begin() + parsed.port.end(), std::back_inserter(url_string)); + new_parsed->port.len = url_string.length() - new_parsed->port.begin; } else { new_parsed->port.reset(); } // Path and query both get the same general unescape & convert treatment. 
- AppendFormattedComponent( - spec, parsed.path, unescape_rules, &url_string, - &new_parsed->path); + AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string, + &new_parsed->path, offset_for_adjustment); if (parsed.query.is_valid()) url_string.push_back('?'); - AppendFormattedComponent( - spec, parsed.query, unescape_rules, &url_string, - &new_parsed->query); + AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string, + &new_parsed->query, offset_for_adjustment); // Reference is stored in valid, unescaped UTF-8, so we can just convert. if (parsed.ref.is_valid()) { url_string.push_back('#'); - int begin = url_string.length(); - if (parsed.ref.len > 0) - url_string.append(UTF8ToWide(std::string(&spec[parsed.ref.begin], - parsed.ref.len))); - new_parsed->ref.begin = begin; - new_parsed->ref.len = url_string.length() - begin; + new_parsed->ref.begin = url_string.length(); + size_t offset_past_current_output = + ((*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < url_string.length())) ? + std::wstring::npos : (*offset_for_adjustment - url_string.length()); + size_t* offset_into_ref = + (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ? + NULL : &offset_past_current_output; + if (parsed.ref.len > 0) { + url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin, + parsed.ref.len), + offset_into_ref)); + } + new_parsed->ref.len = url_string.length() - new_parsed->ref.begin; + if (offset_into_ref) { + *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ? + std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref); + } else if (offset_past_current_output != std::wstring::npos) { + // We clamped the offset near the beginning of this function to ensure it + // was within the input URL. If we reach here, the input was something + // invalid and non-parseable such that the offset was past any component + // we could figure out. 
In this case it won't be represented in the + // output string, so reset it. + *offset_for_adjustment = std::wstring::npos; + } } return url_string; diff --git a/net/base/net_util.h b/net/base/net_util.h index 1f1516f..d9affe6 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -129,10 +129,9 @@ std::string GetHeaderParamValue(const std::string& field, std::string GetFileNameFromCD(const std::string& header, const std::string& referrer_charset); -// Converts the given host name to unicode characters, APPENDING them to the -// the given output string. This can be called for any host name, if the -// input is not IDN or is invalid in some way, we'll just append the ASCII -// source to the output so it is still usable. +// Converts the given host name to unicode characters. This can be called for +// any host name, if the input is not IDN or is invalid in some way, we'll just +// return the ASCII source so it is still usable. // // The input should be the canonicalized ASCII host name from GURL. This // function does NOT accept UTF-8! Its length must also be given (this is @@ -146,10 +145,16 @@ std::string GetFileNameFromCD(const std::string& header, // Latin letters in the ASCII range can be mixed with a limited set of // script-language pairs (currently Han, Kana and Hangul for zh,ja and ko). // When |languages| is empty, even that mixing is not allowed. -void IDNToUnicode(const char* host, - int host_len, - const std::wstring& languages, - std::wstring* out); +// +// |offset_for_adjustment| is an offset into |host|, which will be adjusted to +// point at the same logical place in the output string. If this isn't possible +// because it points past the end of |host| or into the middle of a punycode +// sequence, it will be set to std::wstring::npos. |offset_for_adjustment| may +// be NULL. 
+std::wstring IDNToUnicode(const char* host, + size_t host_len, + const std::wstring& languages, + size_t* offset_for_adjustment); // Canonicalizes |host| and returns it. Also fills |host_info| with // IP address information. |host_info| must not be NULL. @@ -228,31 +233,47 @@ int SetNonBlocking(int fd); // the user. The given parsed structure will be updated. The host name formatter // also takes the same accept languages component as ElideURL. |new_parsed| may // be null. -void AppendFormattedHost(const GURL& url, const std::wstring& languages, - std::wstring* output, url_parse::Parsed* new_parsed); - -// Creates a string representation of |url|. The IDN host name may -// be in Unicode if |languages| accepts the Unicode representation. -// If |omit_username_password| is true, the username and the password are -// omitted. |unescape_rules| defines how to clean the URL for human readability. +void AppendFormattedHost(const GURL& url, + const std::wstring& languages, + std::wstring* output, + url_parse::Parsed* new_parsed, + size_t* offset_for_adjustment); + +// Creates a string representation of |url|. The IDN host name may be in Unicode +// if |languages| accepts the Unicode representation. If +// |omit_username_password| is true, any username and password are removed. +// |unescape_rules| defines how to clean the URL for human readability. // You will generally want |UnescapeRule::SPACES| for display to the user if you // can handle spaces, or |UnescapeRule::NORMAL| if not. If the path part and the // query part seem to be encoded in %-encoded UTF-8, decodes %-encoding and -// UTF-8. |new_parsed| will have parsing parameters of the resultant URL. +// UTF-8. +// +// The last three parameters may be NULL. +// |new_parsed| will be set to the parsing parameters of the resultant URL. // |prefix_end| will be the length before the hostname of the resultant URL. -// |new_parsed| and |prefix_end| may be NULL. 
+// |offset_for_adjustment| is an offset into the original |url|'s spec(), which +// will be modified to reflect changes this function makes to the output string; +// for example, if |url| is "http://a:b@c.com/", |omit_username_password| is +// true, and |offset_for_adjustment| is 12 (the offset of '.'), then on return +// the output string will be "http://c.com/" and |offset_for_adjustment| will be +// 8. If the offset cannot be successfully adjusted (e.g. because it points +// into the middle of a component that was entirely removed, past the end of the +// string, or into the middle of an encoding sequence), it will be set to +// std::wstring::npos. std::wstring FormatUrl(const GURL& url, const std::wstring& languages, bool omit_username_password, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, - size_t* prefix_end); + size_t* prefix_end, + size_t* offset_for_adjustment); // Creates a string representation of |url| for display to the user. // This is a shorthand of the above function with omit_username_password=true, // unescape=SPACES, new_parsed=NULL, and prefix_end=NULL. inline std::wstring FormatUrl(const GURL& url, const std::wstring& languages) { - return FormatUrl(url, languages, true, UnescapeRule::SPACES, NULL, NULL); + return FormatUrl(url, languages, true, UnescapeRule::SPACES, NULL, NULL, + NULL); } // Strip the portions of |url| that aren't core to the network request. diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index 07ec17c..308ef80 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -345,6 +345,11 @@ const IDNTestCase idn_cases[] = { #endif }; +struct AdjustOffsetCase { + size_t input_offset; + size_t output_offset; +}; + struct CompliantHostCase { const char* host; bool expected_output; @@ -782,14 +787,10 @@ TEST(NetUtilTest, IDNToUnicodeFast) { // ja || zh-TW,en || ko,ja -> IDNToUnicodeSlow if (j == 3 || j == 17 || j == 18) continue; - std::wstring output; - net::IDNToUnicode(idn_cases[i].input, - static_cast<int>(strlen(idn_cases[i].input)), - kLanguages[j], - &output); + std::wstring output(net::IDNToUnicode(idn_cases[i].input, + strlen(idn_cases[i].input), kLanguages[j], NULL)); std::wstring expected(idn_cases[i].unicode_allowed[j] ? - idn_cases[i].unicode_output : - ASCIIToWide(idn_cases[i].input)); + idn_cases[i].unicode_output : ASCIIToWide(idn_cases[i].input)); AppendLanguagesToOutputs(kLanguages[j], &expected, &output); EXPECT_EQ(expected, output); } @@ -802,20 +803,43 @@ TEST(NetUtilTest, IDNToUnicodeSlow) { // !(ja || zh-TW,en || ko,ja) -> IDNToUnicodeFast if (!(j == 3 || j == 17 || j == 18)) continue; - std::wstring output; - net::IDNToUnicode(idn_cases[i].input, - static_cast<int>(strlen(idn_cases[i].input)), - kLanguages[j], - &output); + std::wstring output(net::IDNToUnicode(idn_cases[i].input, + strlen(idn_cases[i].input), kLanguages[j], NULL)); std::wstring expected(idn_cases[i].unicode_allowed[j] ? 
- idn_cases[i].unicode_output : - ASCIIToWide(idn_cases[i].input)); + idn_cases[i].unicode_output : ASCIIToWide(idn_cases[i].input)); AppendLanguagesToOutputs(kLanguages[j], &expected, &output); EXPECT_EQ(expected, output); } } } +TEST(NetUtilTest, IDNToUnicodeAdjustOffset) { + const AdjustOffsetCase adjust_cases[] = { + {0, 0}, + {2, 2}, + {4, 4}, + {5, 5}, + {6, std::wstring::npos}, + {16, std::wstring::npos}, + {17, 7}, + {18, 8}, + {19, std::wstring::npos}, + {25, std::wstring::npos}, + {34, 12}, + {35, 13}, + {38, 16}, + {39, std::wstring::npos}, + {std::wstring::npos, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(adjust_cases); ++i) { + size_t offset = adjust_cases[i].input_offset; + // "test.\x89c6\x9891.\x5317\x4eac\x5927\x5b78.test" + net::IDNToUnicode("test.xn--cy2a840a.xn--1lq90ic7f1rc.test", 39, L"zh-CN", + &offset); + EXPECT_EQ(adjust_cases[i].output_offset, offset); + } +} + TEST(NetUtilTest, CompliantHost) { const CompliantHostCase compliant_host_cases[] = { {"", false}, @@ -1328,7 +1352,7 @@ TEST(NetUtilTest, FormatUrl) { size_t prefix_len; std::wstring formatted = net::FormatUrl( GURL(tests[i].input), tests[i].languages, tests[i].omit, - tests[i].escape_rules, NULL, &prefix_len); + tests[i].escape_rules, NULL, &prefix_len, NULL); EXPECT_EQ(tests[i].output, formatted) << tests[i].description; EXPECT_EQ(tests[i].prefix_len, prefix_len) << tests[i].description; } @@ -1340,7 +1364,7 @@ TEST(NetUtilTest, FormatUrlParsed) { std::wstring formatted = net::FormatUrl( GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), - L"ja", false, UnescapeRule::NONE, &parsed, NULL); + L"ja", false, UnescapeRule::NONE, &parsed, NULL, NULL); EXPECT_EQ(L"http://%E3%82%B0:%E3%83%BC@\x30B0\x30FC\x30B0\x30EB.jp:8080" L"/%E3%82%B0/?q=%E3%82%B0#\x30B0", formatted); EXPECT_EQ(L"%E3%82%B0", @@ -1360,7 +1384,7 @@ TEST(NetUtilTest, FormatUrlParsed) { formatted = net::FormatUrl( 
GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), - L"ja", false, UnescapeRule::NORMAL, &parsed, NULL); + L"ja", false, UnescapeRule::NORMAL, &parsed, NULL, NULL); EXPECT_EQ(L"http://\x30B0:\x30FC@\x30B0\x30FC\x30B0\x30EB.jp:8080" L"/\x30B0/?q=\x30B0#\x30B0", formatted); EXPECT_EQ(L"\x30B0", @@ -1379,7 +1403,7 @@ TEST(NetUtilTest, FormatUrlParsed) { formatted = net::FormatUrl( GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), - L"ja", true, UnescapeRule::NORMAL, &parsed, NULL); + L"ja", true, UnescapeRule::NORMAL, &parsed, NULL, NULL); EXPECT_EQ(L"http://\x30B0\x30FC\x30B0\x30EB.jp:8080" L"/\x30B0/?q=\x30B0#\x30B0", formatted); EXPECT_FALSE(parsed.username.is_valid()); @@ -1395,7 +1419,7 @@ TEST(NetUtilTest, FormatUrlParsed) { // View-source case. formatted = net::FormatUrl( GURL("view-source:http://user:passwd@host:81/path?query#ref"), - L"", true, UnescapeRule::NORMAL, &parsed, NULL); + L"", true, UnescapeRule::NORMAL, &parsed, NULL, NULL); EXPECT_EQ(L"view-source:http://host:81/path?query#ref", formatted); EXPECT_EQ(L"view-source:http", formatted.substr(parsed.scheme.begin, parsed.scheme.len)); @@ -1408,6 +1432,124 @@ TEST(NetUtilTest, FormatUrlParsed) { EXPECT_EQ(L"ref", formatted.substr(parsed.ref.begin, parsed.ref.len)); } +TEST(NetUtilTest, FormatUrlAdjustOffset) { + const AdjustOffsetCase basic_cases[] = { + {0, 0}, + {3, 3}, + {5, 5}, + {6, 6}, + {13, 13}, + {21, 21}, + {22, 22}, + {23, 23}, + {25, 25}, + {26, std::wstring::npos}, + {500000, std::wstring::npos}, + {std::wstring::npos, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(basic_cases); ++i) { + size_t offset = basic_cases[i].input_offset; + net::FormatUrl(GURL("http://www.google.com/foo/"), L"en", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(basic_cases[i].output_offset, offset); + } + + const struct { + const char* input_url; + size_t input_offset; + 
size_t output_offset; + } omit_auth_cases[] = { + {"http://foo:bar@www.google.com/", 6, 6}, + {"http://foo:bar@www.google.com/", 7, 7}, + {"http://foo:bar@www.google.com/", 8, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 10, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 11, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 14, std::wstring::npos}, + {"http://foo:bar@www.google.com/", 15, 7}, + {"http://foo:bar@www.google.com/", 25, 17}, + {"http://foo@www.google.com/", 9, std::wstring::npos}, + {"http://foo@www.google.com/", 11, 7}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(omit_auth_cases); ++i) { + size_t offset = omit_auth_cases[i].input_offset; + net::FormatUrl(GURL(omit_auth_cases[i].input_url), L"en", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(omit_auth_cases[i].output_offset, offset); + } + + const AdjustOffsetCase view_source_cases[] = { + {0, 0}, + {3, 3}, + {11, 11}, + {12, 12}, + {13, 13}, + {19, 19}, + {20, std::wstring::npos}, + {23, 19}, + {26, 22}, + {std::wstring::npos, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(view_source_cases); ++i) { + size_t offset = view_source_cases[i].input_offset; + net::FormatUrl(GURL("view-source:http://foo@www.google.com/"), L"en", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(view_source_cases[i].output_offset, offset); + } + + const AdjustOffsetCase idn_hostname_cases[] = { + {8, std::wstring::npos}, + {16, std::wstring::npos}, + {24, std::wstring::npos}, + {25, 12}, + {30, 17}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(idn_hostname_cases); ++i) { + size_t offset = idn_hostname_cases[i].input_offset; + // "http://\x671d\x65e5\x3042\x3055\x3072.jp/foo/" + net::FormatUrl(GURL("http://xn--l8jvb1ey91xtjb.jp/foo/"), L"ja", true, + UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(idn_hostname_cases[i].output_offset, offset); + } + + const AdjustOffsetCase unescape_cases[] = { + {25, 25}, + {26, 
std::wstring::npos}, + {27, std::wstring::npos}, + {28, 26}, + {35, std::wstring::npos}, + {41, 31}, + {59, 33}, + {60, std::wstring::npos}, + {67, std::wstring::npos}, + {68, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(unescape_cases); ++i) { + size_t offset = unescape_cases[i].input_offset; + // "http://www.google.com/foo bar/\x30B0\x30FC\x30B0\x30EB" + net::FormatUrl(GURL( + "http://www.google.com/foo%20bar/%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB"), + L"en", true, UnescapeRule::SPACES, NULL, NULL, &offset); + EXPECT_EQ(unescape_cases[i].output_offset, offset); + } + + const AdjustOffsetCase ref_cases[] = { + {30, 30}, + {31, 31}, + {32, std::wstring::npos}, + {34, 32}, + {37, 33}, + {38, std::wstring::npos}, + }; + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(ref_cases); ++i) { + size_t offset = ref_cases[i].input_offset; + // "http://www.google.com/foo.html#\x30B0\x30B0z" + net::FormatUrl(GURL( + "http://www.google.com/foo.html#\xE3\x82\xB0\xE3\x82\xB0z"), L"en", + true, UnescapeRule::NORMAL, NULL, NULL, &offset); + EXPECT_EQ(ref_cases[i].output_offset, offset); + } +} + TEST(NetUtilTest, SimplifyUrlForRequest) { struct { const char* input_url; @@ -1466,4 +1608,3 @@ TEST(NetUtilTest, SetExplicitlyAllowedPortsTest) { EXPECT_EQ(i, net::explicitly_allowed_ports.size()); } } - |