diff options
-rw-r--r-- | base/utf_offset_string_conversions.cc | 179 | ||||
-rw-r--r-- | base/utf_offset_string_conversions.h | 66 | ||||
-rw-r--r-- | base/utf_offset_string_conversions_unittest.cc | 94 | ||||
-rw-r--r-- | net/base/escape.cc | 117 | ||||
-rw-r--r-- | net/base/escape.h | 31 | ||||
-rw-r--r-- | net/base/escape_unittest.cc | 39 | ||||
-rw-r--r-- | net/base/net_util.cc | 426 | ||||
-rw-r--r-- | net/base/net_util.h | 59 | ||||
-rw-r--r-- | net/base/net_util_unittest.cc | 166 |
9 files changed, 235 insertions, 942 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc index f091cb4..4c47ef8 100644 --- a/base/utf_offset_string_conversions.cc +++ b/base/utf_offset_string_conversions.cc @@ -1,12 +1,9 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/utf_offset_string_conversions.h" -#include <algorithm> - -#include "base/scoped_ptr.h" #include "base/string_piece.h" #include "base/utf_string_conversion_utils.h" @@ -24,16 +21,13 @@ template<typename SRC_CHAR> bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(src_len)); - } + size_t* offset_for_adjustment) { + size_t output_offset = + (offset_for_adjustment && *offset_for_adjustment < src_len) ? + *offset_for_adjustment : std::wstring::npos; // ICU requires 32-bit numbers. bool success = true; - AdjustOffset::Adjustments adjustments; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; @@ -45,23 +39,21 @@ bool ConvertUnicode(const SRC_CHAR* src, chars_written = WriteUnicodeCharacter(0xFFFD, output); success = false; } - if (offsets_for_adjustment) { + if ((output_offset != std::wstring::npos) && + (*offset_for_adjustment > original_i)) { // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last // character read, not after it (so that incrementing it in the loop // increment will place it at the right location), so we need to account // for that in determining the amount that was read. - adjustments.push_back(AdjustOffset::Adjustment( - original_i, i - original_i + 1, chars_written)); + if (*offset_for_adjustment <= static_cast<size_t>(i)) + output_offset = std::wstring::npos; + else + output_offset += chars_written - (i - original_i + 1); } } - // Make offset adjustment. - if (offsets_for_adjustment && !adjustments.empty()) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AdjustOffset(adjustments)); - } - + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; return success; } @@ -71,44 +63,16 @@ bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - PrepareForUTF16Or32Output(src, src_len, output); - bool ret = ConvertUnicode(src, src_len, output, &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return ret; -} - -bool UTF8ToWideAndAdjustOffsets(const char* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { PrepareForUTF16Or32Output(src, src_len, output); - return ConvertUnicode(src, src_len, output, offsets_for_adjustment); + return ConvertUnicode(src, src_len, output, offset_for_adjustment); } std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - std::wstring result; - UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, - &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; -} - -std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, - std::vector<size_t>* - offsets_for_adjustment) { - std::wstring result; - UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, - offsets_for_adjustment); - return result; + std::wstring ret; + UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret, + offset_for_adjustment); + return ret; } // UTF-16 <-> Wide ------------------------------------------------------------- @@ -126,19 +90,6 @@ bool UTF16ToWideAndAdjustOffset(const char16* src, return true; } -bool UTF16ToWideAndAdjustOffsets(const char16* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { - output->assign(src, src_len); - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(src_len)); - } - return true; -} - std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) @@ -146,109 +97,25 @@ std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, return utf16; } -std::wstring UTF16ToWideAndAdjustOffsets( - const string16& utf16, - std::vector<size_t>* offsets_for_adjustment) { - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(utf16.length())); - } - return utf16; -} - #elif defined(WCHAR_T_IS_UTF32) bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - output->clear(); - // Assume that normally we won't have any non-BMP characters so the counts - // will be the same. - output->reserve(src_len); - bool ret = ConvertUnicode(src, src_len, output, &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return ret; -} - -bool UTF16ToWideAndAdjustOffsets(const char16* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); - return ConvertUnicode(src, src_len, output, offsets_for_adjustment); + return ConvertUnicode(src, src_len, output, offset_for_adjustment); } std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - std::wstring result; - UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, - &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; -} - -std::wstring UTF16ToWideAndAdjustOffsets( - const string16& utf16, - std::vector<size_t>* offsets_for_adjustment) { - std::wstring result; - UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, - offsets_for_adjustment); - return result; + std::wstring ret; + UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret, + offset_for_adjustment); + return ret; } #endif // defined(WCHAR_T_IS_UTF32) - -template <typename T> -LimitOffset<T>::LimitOffset(size_t limit) - : limit_(limit) {} - -template <typename T> -void LimitOffset<T>::operator()(size_t& offset) { - if (offset >= limit_) - offset = T::npos; -} - -AdjustOffset::Adjustment::Adjustment(size_t location, - size_t old_length, - size_t new_length) - : location(location), - old_length(old_length), - new_length(new_length) {} - -AdjustOffset::AdjustOffset(const Adjustments& adjustments) - : adjustments_(adjustments) {} - -void AdjustOffset::operator()(size_t& offset) { - if (offset == std::wstring::npos) - return; - size_t adjustment = 0; - for (Adjustments::const_iterator i = adjustments_.begin(); - i != adjustments_.end(); ++i) { - size_t location = i->location; - if (offset == location && i->new_length == 0) { - offset = std::wstring::npos; - return; - } - if (offset <= location) - break; - if (offset < (location + i->old_length)) { - offset = std::wstring::npos; - return; - } - adjustment += (i->old_length - i->new_length); - } - offset -= adjustment; -} diff --git a/base/utf_offset_string_conversions.h b/base/utf_offset_string_conversions.h index 19b312a..13df1b4 100644 --- a/base/utf_offset_string_conversions.h +++ b/base/utf_offset_string_conversions.h @@ -7,7 +7,6 @@ #pragma once #include <string> -#include <vector> #include "base/base_api.h" #include "base/string16.h" @@ -16,78 +15,23 @@ namespace base { class StringPiece; } -// Like the conversions in utf_string_conversions.h, but also takes one or more -// offsets (|offset[s]_for_adjustment|) into the source strings, each offset -// will be adjusted to point at the same logical place in the result strings. -// If this isn't possible because an offset points past the end of the source -// strings or into the middle of a multibyte sequence, the offending offset will -// be set to std::wstring::npos. |offset[s]_for_adjustment| may be NULL. +// Like the conversions in utf_string_conversions.h, but also take offsets into +// the source strings, which will be adjusted to point at the same logical place +// in the result strings. If this isn't possible because the offsets point past +// the end of the source strings or into the middle of multibyte sequences, they +// will be set to std::wstring::npos. |offset_for_adjustment| may be NULL. BASE_API bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment); -BASE_API bool UTF8ToWideAndAdjustOffsets( - const char* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment); - BASE_API std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment); -BASE_API std::wstring UTF8ToWideAndAdjustOffsets( - const base::StringPiece& utf8, - std::vector<size_t>* offsets_for_adjustment); BASE_API bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment); -BASE_API bool UTF16ToWideAndAdjustOffsets( - const char16* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment); - BASE_API std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment); -BASE_API std::wstring UTF16ToWideAndAdjustOffsets( - const string16& utf16, - std::vector<size_t>* offsets_for_adjustment); - -// Limiting function callable by std::for_each which will replace any value -// which is equal to or greater than |limit| with npos. -template <typename T> -struct LimitOffset { - explicit LimitOffset(size_t limit); - void operator()(size_t& offset); - - size_t limit_; -}; - -// Adjustment function called by std::transform which will adjust any offset -// that occurs after one or more modified substrings. To use, create any -// number of AdjustOffset::Adjustments, drop them into a vector, then call -// std::transform with the transform function being something similar to -// AdjustOffset(adjustments). Each Adjustment gives the original |location| -// of the encoded section and the |old_length| and |new_length| of the section -// before and after decoding. -struct AdjustOffset { - // Helper structure which indicates where an encoded character occurred - // and how long that encoding was. - struct Adjustment { - Adjustment(size_t location, size_t old_length, size_t new_length); - - size_t location; - size_t old_length; - size_t new_length; - }; - - typedef std::vector<Adjustment> Adjustments; - - explicit AdjustOffset(const Adjustments& adjustments); - void operator()(size_t& offset); - - const Adjustments& adjustments_; -}; #endif // BASE_UTF_OFFSET_STRING_CONVERSIONS_H_ diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc index b731b9e..4f13ab3 100644 --- a/base/utf_offset_string_conversions_unittest.cc +++ b/base/utf_offset_string_conversions_unittest.cc @@ -1,9 +1,7 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include <algorithm> - #include "base/logging.h" #include "base/string_piece.h" #include "base/utf_offset_string_conversions.h" @@ -13,8 +11,6 @@ namespace base { namespace { -static const size_t kNpos = std::wstring::npos; - // Given a null-terminated string of wchar_t with each wchar_t representing // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. // Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) @@ -44,12 +40,12 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { size_t input_offset; size_t output_offset; } utf8_to_wide_cases[] = { - {"", 0, kNpos}, - {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos}, + {"", 0, std::wstring::npos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos}, {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, {"\xed\xb0\x80z", 3, 1}, {"A\xF0\x90\x8C\x80z", 1, 1}, - {"A\xF0\x90\x8C\x80z", 2, kNpos}, + {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos}, #if defined(WCHAR_T_IS_UTF16) {"A\xF0\x90\x8C\x80z", 5, 3}, #elif defined(WCHAR_T_IS_UTF32) @@ -69,7 +65,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { size_t output_offset; } utf16_to_wide_cases[] = { {L"\xD840\xDC00\x4E00", 0, 0}, - {L"\xD840\xDC00\x4E00", 1, kNpos}, + {L"\xD840\xDC00\x4E00", 1, std::wstring::npos}, {L"\xD840\xDC00\x4E00", 2, 1}, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) { @@ -81,84 +77,4 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { #endif } -TEST(UTFOffsetStringConversionsTest, LimitOffsets) { - const size_t kLimit = 10; - const size_t kItems = 20; - std::vector<size_t> size_ts; - for (size_t t = 0; t < kItems; ++t) - size_ts.push_back(t); - std::for_each(size_ts.begin(), size_ts.end(), - LimitOffset<std::wstring>(kLimit)); - size_t unlimited_count = 0; - for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end(); - ++ti) { - if (*ti < kLimit && *ti != kNpos) - ++unlimited_count; - } - EXPECT_EQ(10U, unlimited_count); - - // Reverse the values in the vector and try again. - size_ts.clear(); - for (size_t t = kItems; t > 0; --t) - size_ts.push_back(t - 1); - std::for_each(size_ts.begin(), size_ts.end(), - LimitOffset<std::wstring>(kLimit)); - unlimited_count = 0; - for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end(); - ++ti) { - if (*ti < kLimit && *ti != kNpos) - ++unlimited_count; - } - EXPECT_EQ(10U, unlimited_count); -} - -TEST(UTFOffsetStringConversionsTest, AdjustOffsets) { - // Imagine we have strings as shown in the following cases where the - // X's represent encoded characters. - // 1: abcXXXdef ==> abcXdef - std::vector<size_t> offsets; - for (size_t t = 0; t < 9; ++t) - offsets.push_back(t); - AdjustOffset::Adjustments adjustments; - adjustments.push_back(AdjustOffset::Adjustment(3, 3, 1)); - std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); - size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6}; - EXPECT_EQ(offsets.size(), arraysize(expected_1)); - for (size_t i = 0; i < arraysize(expected_1); ++i) - EXPECT_EQ(expected_1[i], offsets[i]); - - // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX - offsets.clear(); - for (size_t t = 0; t < 23; ++t) - offsets.push_back(t); - adjustments.clear(); - adjustments.push_back(AdjustOffset::Adjustment(0, 3, 1)); - adjustments.push_back(AdjustOffset::Adjustment(4, 4, 2)); - adjustments.push_back(AdjustOffset::Adjustment(10, 7, 4)); - adjustments.push_back(AdjustOffset::Adjustment(20, 3, 1)); - std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); - size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12, - 13, kNpos, kNpos}; - EXPECT_EQ(offsets.size(), arraysize(expected_2)); - for (size_t i = 0; i < arraysize(expected_2); ++i) - EXPECT_EQ(expected_2[i], offsets[i]); - - // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe - offsets.clear(); - for (size_t t = 0; t < 17; ++t) - offsets.push_back(t); - adjustments.clear(); - adjustments.push_back(AdjustOffset::Adjustment(0, 3, 0)); - adjustments.push_back(AdjustOffset::Adjustment(4, 4, 4)); - adjustments.push_back(AdjustOffset::Adjustment(11, 3, 3)); - adjustments.push_back(AdjustOffset::Adjustment(15, 2, 0)); - std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); - size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, - 7, 8, kNpos, kNpos, 11, kNpos, kNpos}; - EXPECT_EQ(offsets.size(), arraysize(expected_3)); - for (size_t i = 0; i < arraysize(expected_3); ++i) - EXPECT_EQ(expected_3[i], offsets[i]); -} - } // namaspace base diff --git a/net/base/escape.cc b/net/base/escape.cc index 61c3e81..64bd107 100644 --- a/net/base/escape.cc +++ b/net/base/escape.cc @@ -2,12 +2,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "net/base/escape.h" - #include <algorithm> +#include "net/base/escape.h" + #include "base/logging.h" -#include "base/scoped_ptr.h" #include "base/string_piece.h" #include "base/string_util.h" #include "base/utf_string_conversions.h" @@ -99,14 +98,15 @@ const char kUrlUnescape[128] = { }; template<typename STR> -STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, - UnescapeRule::Type rules, - std::vector<size_t>* offsets_for_adjustment) { - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(escaped_text.length())); - } +STR UnescapeURLImpl(const STR& escaped_text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { + size_t offset_temp = string16::npos; + if (!offset_for_adjustment) + offset_for_adjustment = &offset_temp; + else if (*offset_for_adjustment >= escaped_text.length()) + *offset_for_adjustment = string16::npos; + // Do not unescape anything, return the |escaped_text| text. if (rules == UnescapeRule::NONE) return escaped_text; @@ -117,7 +117,6 @@ STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, STR result; result.reserve(escaped_text.length()); - AdjustEncodingOffset::Adjustments adjustments; // Locations of adjusted text. for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { if (static_cast<unsigned char>(escaped_text[i]) >= 128) { // Non ASCII character, append as is. @@ -145,9 +144,17 @@ STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, // Additionally allow control characters if requested. (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { // Use the unescaped version of the character. - adjustments.push_back(i); + size_t length_before_append = result.length(); result.push_back(value); i += 2; + + // Adjust offset to match length change. + if (*offset_for_adjustment != std::string::npos) { + if (*offset_for_adjustment > (length_before_append + 2)) + *offset_for_adjustment -= 2; + else if (*offset_for_adjustment > length_before_append) + *offset_for_adjustment = std::string::npos; + } } else { // Keep escaped. Append a percent and we'll get the following two // digits on the next loops through. @@ -167,26 +174,6 @@ STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, } } - // Make offset adjustment. - if (offsets_for_adjustment && !adjustments.empty()) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AdjustEncodingOffset(adjustments)); - } - - return result; -} - -template<typename STR> -STR UnescapeURLImpl(const STR& escaped_text, - UnescapeRule::Type rules, - size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - STR result = UnescapeURLWithOffsetsImpl(escaped_text, rules, &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; return result; } @@ -247,49 +234,33 @@ std::string EscapeExternalHandlerValue(const std::string& text) { return Escape(text, kExternalHandlerCharmap, false); } -string16 UnescapeAndDecodeUTF8URLComponentWithOffsets( - const std::string& text, - UnescapeRule::Type rules, - std::vector<size_t>* offsets_for_adjustment) { +string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { std::wstring result; - std::vector<size_t> original_offsets; - if (offsets_for_adjustment) - original_offsets = *offsets_for_adjustment; + size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0; std::string unescaped_url( - UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment)); - if (UTF8ToWideAndAdjustOffsets(unescaped_url.data(), unescaped_url.length(), - &result, offsets_for_adjustment)) + UnescapeURLImpl(text, rules, offset_for_adjustment)); + if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(), + &result, offset_for_adjustment)) return WideToUTF16Hack(result); // Character set looks like it's valid. // Not valid. Return the escaped version. Undo our changes to // |offset_for_adjustment| since we haven't changed the string after all. - if (offsets_for_adjustment) - *offsets_for_adjustment = original_offsets; - return WideToUTF16Hack(UTF8ToWideAndAdjustOffsets( - text, offsets_for_adjustment)); -} - -string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, - UnescapeRule::Type rules, - size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - string16 result = - UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets); if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; + *offset_for_adjustment = original_offset; + return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text, + offset_for_adjustment)); } std::string UnescapeURLComponent(const std::string& escaped_text, UnescapeRule::Type rules) { - return UnescapeURLWithOffsetsImpl<std::string>(escaped_text, rules, NULL); + return UnescapeURLImpl(escaped_text, rules, NULL); } string16 UnescapeURLComponent(const string16& escaped_text, UnescapeRule::Type rules) { - return UnescapeURLWithOffsetsImpl<string16>(escaped_text, rules, NULL); + return UnescapeURLImpl(escaped_text, rules, NULL); } @@ -379,27 +350,3 @@ string16 UnescapeForHTML(const string16& input) { } return text; } - -AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments) - : adjustments(adjustments) {} - -void AdjustEncodingOffset::operator()(size_t& offset) { - // For each encoded character occurring before an offset subtract 2. - if (offset == string16::npos) - return; - size_t adjusted_offset = offset; - for (Adjustments::const_iterator i = adjustments.begin(); - i != adjustments.end(); ++i) { - size_t location = *i; - if (offset <= location) { - offset = adjusted_offset; - return; - } - if (offset <= (location + 2)) { - offset = string16::npos; - return; - } - adjusted_offset -= 2; - } - offset = adjusted_offset; -} diff --git a/net/base/escape.h b/net/base/escape.h index f4c99a3..faa7bd3 100644 --- a/net/base/escape.h +++ b/net/base/escape.h @@ -7,7 +7,6 @@ #pragma once #include <string> -#include <vector> #include "base/basictypes.h" #include "base/string16.h" @@ -100,20 +99,15 @@ string16 UnescapeURLComponent(const string16& escaped_text, // Unescapes the given substring as a URL, and then tries to interpret the // result as being encoded as UTF-8. If the result is convertable into UTF-8, it // will be returned as converted. If it is not, the original escaped string will -// be converted into a string16 and returned. (|offset[s]_for_adjustment|) -// specifies one or more offsets into the source strings; each offset will be -// adjusted to point at the same logical place in the result strings during -// decoding. If this isn't possible because an offset points past the end of -// the source strings or into the middle of a multibyte sequence, the offending -// offset will be set to std::wstring::npos. |offset[s]_for_adjustment| may be -// NULL. +// be converted into a string16 and returned. +// +// |offset_for_adjustment| may be NULL; if not, it is an offset into |text| that +// will be adjusted to point at the same logical place in the result string. If +// this isn't possible because it points into the middle of an escape sequence +// or past the end of the string, it will be set to string16::npos. string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, UnescapeRule::Type rules, size_t* offset_for_adjustment); -string16 UnescapeAndDecodeUTF8URLComponentWithOffsets( - const std::string& text, - UnescapeRule::Type rules, - std::vector<size_t>* offsets_for_adjustment); // Unescape the following ampersand character codes from |text|: // < > & " ' @@ -135,17 +129,4 @@ bool EscapeQueryParamValue(const string16& text, const char* codepage, // assumes the codepage is UTF8. This is provided as a convenience. string16 EscapeQueryParamValueUTF8(const string16& text, bool use_plus); -// Private Functions (Exposed for Unit Testing) -------------------------------- - -// A function called by std::for_each that will adjust any offset which occurs -// after one or more encoded characters. -struct AdjustEncodingOffset { - typedef std::vector<size_t> Adjustments; - - explicit AdjustEncodingOffset(const Adjustments& adjustments); - void operator()(size_t& offset); - - const Adjustments& adjustments; -}; - #endif // NET_BASE_ESCAPE_H_ diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc index 3a8d895..60d4ae3 100644 --- a/net/base/escape_unittest.cc +++ b/net/base/escape_unittest.cc @@ -2,7 +2,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include <algorithm> #include <string> #include "net/base/escape.h" @@ -16,8 +15,6 @@ namespace { -static const size_t kNpos = string16::npos; - struct EscapeCase { const wchar_t* input; const wchar_t* output; @@ -399,39 +396,3 @@ TEST(EscapeTest, UnescapeForHTML) { EXPECT_EQ(ASCIIToUTF16(tests[i].expected_output), result); } } - -TEST(EscapeTest, AdjustEncodingOffset) { - // Imagine we have strings as shown in the following cases where the - // %XX's represent encoded characters - - // 1: abc%ECdef ==> abcXdef - std::vector<size_t> offsets; - for (size_t t = 0; t < 9; ++t) - offsets.push_back(t); - AdjustEncodingOffset::Adjustments adjustments; - adjustments.push_back(3); - std::for_each(offsets.begin(), offsets.end(), - AdjustEncodingOffset(adjustments)); - size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6}; - EXPECT_EQ(offsets.size(), arraysize(expected_1)); - for (size_t i = 0; i < arraysize(expected_1); ++i) - EXPECT_EQ(expected_1[i], offsets[i]); - - - // 2: %ECabc%EC%ECdef%EC ==> XabcXXdefX - offsets.clear(); - for (size_t t = 0; t < 18; ++t) - offsets.push_back(t); - adjustments.clear(); - adjustments.push_back(0); - adjustments.push_back(6); - adjustments.push_back(9); - adjustments.push_back(15); - std::for_each(offsets.begin(), offsets.end(), - AdjustEncodingOffset(adjustments)); - size_t expected_2[] = {0, kNpos, kNpos, 1, 2, 3, 4, kNpos, kNpos, 5, kNpos, - kNpos, 6, 7, 8, 9, kNpos, kNpos}; - EXPECT_EQ(offsets.size(), arraysize(expected_2)); - for (size_t i = 0; i < arraysize(expected_2); ++i) - EXPECT_EQ(expected_2[i], offsets[i]); -} diff --git a/net/base/net_util.cc b/net/base/net_util.cc index 378ac7b..1aaa98b 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -67,6 +67,7 @@ #endif #include "unicode/datefmt.h" + using base::Time; namespace net { @@ -733,80 +734,6 @@ bool IDNToUnicodeOneComponent(const char16* comp, return false; } -struct SubtractFromOffset { - explicit SubtractFromOffset(size_t amount) - : amount(amount) {} - void operator()(size_t& offset) { - if (offset != std::wstring::npos) - if (offset >= amount) - offset -= amount; - else - offset = std::wstring::npos; - } - - size_t amount; -}; - -struct AddToOffset { - explicit AddToOffset(size_t amount) - : amount(amount) {} - void operator()(size_t& offset) { - if (offset != std::wstring::npos) - offset += amount; - } - - size_t amount; -}; - -std::vector<size_t> OffsetsIntoSection( - std::vector<size_t>* offsets_for_adjustment, - size_t section_begin) { - std::vector<size_t> offsets_into_section; - if (offsets_for_adjustment) { - std::transform(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - std::back_inserter(offsets_into_section), - ClampComponentOffset(section_begin)); - std::for_each(offsets_into_section.begin(), offsets_into_section.end(), - SubtractFromOffset(section_begin)); - } - return offsets_into_section; -} - -void ApplySectionAdjustments(const std::vector<size_t>& offsets_into_section, - std::vector<size_t>* offsets_for_adjustment, - size_t old_section_len, - size_t new_section_len, - size_t section_begin) { - if (offsets_for_adjustment) { - DCHECK_EQ(offsets_for_adjustment->size(), offsets_into_section.size()); - std::vector<size_t>::const_iterator host_offsets_iter = - offsets_into_section.begin(); - for (std::vector<size_t>::iterator offsets_iter = - offsets_for_adjustment->begin(); - offsets_iter != offsets_for_adjustment->end(); - ++offsets_iter, ++host_offsets_iter) { - size_t offset = *offsets_iter; - if (offset == std::wstring::npos || offset < section_begin) { - // The offset is before the host section so leave it as is. - continue; - } - if (offset >= section_begin + old_section_len) { - // The offset is after the host section so adjust by host length delta. - offset += new_section_len - old_section_len; - } else if (*host_offsets_iter != std::wstring::npos) { - // The offset is within the host and valid so adjust by the host - // reformatting offsets results. - offset = section_begin + *host_offsets_iter; - } else { - // The offset is invalid. - offset = std::wstring::npos; - } - *offsets_iter = offset; - } - } -} - // If |component| is valid, its begin is incremented by |delta|. void AdjustComponent(int delta, url_parse::Component* component) { if (!component->is_valid()) @@ -833,7 +760,7 @@ std::wstring FormatUrlInternal(const GURL& url, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, size_t* prefix_end, - std::vector<size_t>* offsets_for_adjustment); + size_t* offset_for_adjustment); // Helper for FormatUrl()/FormatUrlInternal(). std::wstring FormatViewSourceUrl(const GURL& url, @@ -842,20 +769,18 @@ std::wstring FormatViewSourceUrl(const GURL& url, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, size_t* prefix_end, - std::vector<size_t>* offsets_for_adjustment) { + size_t* offset_for_adjustment) { DCHECK(new_parsed); - DCHECK(offsets_for_adjustment); const wchar_t* const kWideViewSource = L"view-source:"; const size_t kViewSourceLengthPlus1 = 12; - std::vector<size_t> saved_offsets(*offsets_for_adjustment); GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1)); - // Clamp the offsets to the source area. - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - SubtractFromOffset(kViewSourceLengthPlus1)); + size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ? + std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1); + size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ? + NULL : &temp_offset; std::wstring result = FormatUrlInternal(real_url, languages, format_types, - unescape_rules, new_parsed, prefix_end, offsets_for_adjustment); + unescape_rules, new_parsed, prefix_end, temp_offset_ptr); result.insert(0, kWideViewSource); // Adjust position values. @@ -869,61 +794,57 @@ std::wstring FormatViewSourceUrl(const GURL& url, AdjustComponents(kViewSourceLengthPlus1, new_parsed); if (prefix_end) *prefix_end += kViewSourceLengthPlus1; - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AddToOffset(kViewSourceLengthPlus1)); - // Restore all offsets which were not affected by FormatUrlInternal. - DCHECK_EQ(saved_offsets.size(), offsets_for_adjustment->size()); - for (size_t i = 0; i < saved_offsets.size(); ++i) { - if (saved_offsets[i] < kViewSourceLengthPlus1) - (*offsets_for_adjustment)[i] = saved_offsets[i]; + if (temp_offset_ptr) { + *offset_for_adjustment = (temp_offset == std::wstring::npos) ? + std::wstring::npos : (temp_offset + kViewSourceLengthPlus1); } return result; } // Appends the substring |in_component| inside of the URL |spec| to |output|, // and the resulting range will be filled into |out_component|. |unescape_rules| -// defines how to clean the URL for human readability. |offsets_for_adjustment| -// is an array of offsets into |output| each of which will be adjusted based on -// how it maps to the component being converted; if it is less than -// output->length(), it will be untouched, and if it is greater than -// output->length() + in_component.len it will be adjusted by the difference in -// lengths between the input and output components. Otherwise it points into -// the component being converted, and is adjusted to point to the same logical -// place in |output|. |offsets_for_adjustment| may not be NULL. +// defines how to clean the URL for human readability. |offset_for_adjustment| +// is an offset into |output| which will be adjusted based on how it maps to the +// component being converted; if it is less than output->length(), it will be +// untouched, and if it is greater than output->length() + in_component.len it +// will be shortened by the difference in lengths between the input and output +// components. Otherwise it points into the component being converted, and is +// adjusted to point to the same logical place in |output|. +// |offset_for_adjustment| may not be NULL. void AppendFormattedComponent(const std::string& spec, const url_parse::Component& in_component, UnescapeRule::Type unescape_rules, std::wstring* output, url_parse::Component* out_component, - std::vector<size_t>* offsets_for_adjustment) { + size_t* offset_for_adjustment) { DCHECK(output); - DCHECK(offsets_for_adjustment); + DCHECK(offset_for_adjustment); if (in_component.is_nonempty()) { - size_t component_begin = output->length(); - out_component->begin = static_cast<int>(component_begin); - - // Compose a list of offsets within the component area. - std::vector<size_t> offsets_into_component = - OffsetsIntoSection(offsets_for_adjustment, component_begin); - + out_component->begin = static_cast<int>(output->length()); + size_t offset_past_current_output = + ((*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < output->length())) ? + std::wstring::npos : (*offset_for_adjustment - output->length()); + size_t* offset_into_component = + (offset_past_current_output >= static_cast<size_t>(in_component.len)) ? + NULL : &offset_past_current_output; if (unescape_rules == UnescapeRule::NONE) { - output->append(UTF8ToWideAndAdjustOffsets( + output->append(UTF8ToWideAndAdjustOffset( spec.substr(in_component.begin, in_component.len), - &offsets_into_component)); + offset_into_component)); } else { - output->append(UTF16ToWideHack( - UnescapeAndDecodeUTF8URLComponentWithOffsets( - spec.substr(in_component.begin, in_component.len), unescape_rules, - &offsets_into_component))); + output->append(UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent( + spec.substr(in_component.begin, in_component.len), unescape_rules, + offset_into_component))); + } + out_component->len = + static_cast<int>(output->length()) - out_component->begin; + if (offset_into_component) { + *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ? + std::wstring::npos : (out_component->begin + *offset_into_component); + } else if (offset_past_current_output != std::wstring::npos) { + *offset_for_adjustment += out_component->len - in_component.len; } - size_t new_component_len = output->length() - component_begin; - out_component->len = static_cast<int>(new_component_len); - - // Apply offset adjustments. - size_t old_component_len = static_cast<size_t>(in_component.len); - ApplySectionAdjustments(offsets_into_component, offsets_for_adjustment, - old_component_len, new_component_len, component_begin); } else { out_component->reset(); } @@ -937,16 +858,15 @@ std::wstring FormatUrlInternal(const GURL& url, UnescapeRule::Type unescape_rules, url_parse::Parsed* new_parsed, size_t* prefix_end, - std::vector<size_t>* offsets_for_adjustment) { + size_t* offset_for_adjustment) { url_parse::Parsed parsed_temp; if (!new_parsed) new_parsed = &parsed_temp; else *new_parsed = url_parse::Parsed(); - - std::vector<size_t> offsets_temp; - if (!offsets_for_adjustment) - offsets_for_adjustment = &offsets_temp; + size_t offset_temp = std::wstring::npos; + if (!offset_for_adjustment) + offset_for_adjustment = &offset_temp; std::wstring url_string; @@ -954,9 +874,7 @@ std::wstring FormatUrlInternal(const GURL& url, if (url.is_empty()) { if (prefix_end) *prefix_end = 0; - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(0)); + *offset_for_adjustment = std::wstring::npos; return url_string; } @@ -968,17 +886,15 @@ std::wstring FormatUrlInternal(const GURL& url, if (url.SchemeIs(kViewSource) && !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) { return FormatViewSourceUrl(url, languages, format_types, - unescape_rules, new_parsed, prefix_end, offsets_for_adjustment); + unescape_rules, new_parsed, prefix_end, offset_for_adjustment); } // We handle both valid and invalid URLs (this will give us the spec // regardless of validity). const std::string& spec = url.possibly_invalid_spec(); const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec(); - size_t spec_length = spec.length(); - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(spec_length)); + if (*offset_for_adjustment >= spec.length()) + *offset_for_adjustment = std::wstring::npos; // Copy everything before the username (the scheme and the separators.) // These are ASCII. @@ -1006,47 +922,48 @@ std::wstring FormatUrlInternal(const GURL& url, // e.g. "http://google.com:search@evil.ru/" new_parsed->username.reset(); new_parsed->password.reset(); - // Update the offsets based on removed username and/or password. - if (!offsets_for_adjustment->empty() && + if ((*offset_for_adjustment != std::wstring::npos) && (parsed.username.is_nonempty() || parsed.password.is_nonempty())) { - AdjustOffset::Adjustments adjustments; if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) { // The seeming off-by-one and off-by-two in these first two lines are to // account for the ':' after the username and '@' after the password. - adjustments.push_back(AdjustOffset::Adjustment( - static_cast<size_t>(parsed.username.begin), - static_cast<size_t>(parsed.username.len + parsed.password.len + - 2), 0)); + if (*offset_for_adjustment > + static_cast<size_t>(parsed.password.end())) { + *offset_for_adjustment -= + (parsed.username.len + parsed.password.len + 2); + } else if (*offset_for_adjustment > + static_cast<size_t>(parsed.username.begin)) { + *offset_for_adjustment = std::wstring::npos; + } } else { const url_parse::Component* nonempty_component = parsed.username.is_nonempty() ? &parsed.username : &parsed.password; - // The seeming off-by-one in below is to account for the '@' after the - // username/password. - adjustments.push_back(AdjustOffset::Adjustment( - static_cast<size_t>(nonempty_component->begin), - static_cast<size_t>(nonempty_component->len + 1), 0)); + // The seeming off-by-one in these first two lines is to account for the + // '@' after the username/password. + if (*offset_for_adjustment > + static_cast<size_t>(nonempty_component->end())) { + *offset_for_adjustment -= (nonempty_component->len + 1); + } else if (*offset_for_adjustment > + static_cast<size_t>(nonempty_component->begin)) { + *offset_for_adjustment = std::wstring::npos; + } } - - // Make offset adjustment. - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AdjustOffset(adjustments)); } } else { AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string, - &new_parsed->username, offsets_for_adjustment); + &new_parsed->username, offset_for_adjustment); if (parsed.password.is_valid()) url_string.push_back(':'); AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string, - &new_parsed->password, offsets_for_adjustment); + &new_parsed->password, offset_for_adjustment); if (parsed.username.is_valid() || parsed.password.is_valid()) url_string.push_back('@'); } if (prefix_end) *prefix_end = static_cast<size_t>(url_string.length()); - AppendFormattedHostWithOffsets(url, languages, &url_string, new_parsed, - offsets_for_adjustment); + AppendFormattedHost(url, languages, &url_string, new_parsed, + offset_for_adjustment); // Port. if (parsed.port.is_nonempty()) { @@ -1064,35 +981,41 @@ std::wstring FormatUrlInternal(const GURL& url, if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) || !CanStripTrailingSlash(url)) { AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string, - &new_parsed->path, offsets_for_adjustment); + &new_parsed->path, offset_for_adjustment); } if (parsed.query.is_valid()) url_string.push_back('?'); AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string, - &new_parsed->query, offsets_for_adjustment); + &new_parsed->query, offset_for_adjustment); // Reference is stored in valid, unescaped UTF-8, so we can just convert. if (parsed.ref.is_valid()) { url_string.push_back('#'); - size_t ref_begin = url_string.length(); - new_parsed->ref.begin = static_cast<int>(ref_begin); - - // Compose a list of offsets within the section. - std::vector<size_t> offsets_into_ref = - OffsetsIntoSection(offsets_for_adjustment, ref_begin); - + new_parsed->ref.begin = url_string.length(); + size_t offset_past_current_output = + ((*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < url_string.length())) ? + std::wstring::npos : (*offset_for_adjustment - url_string.length()); + size_t* offset_into_ref = + (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ? + NULL : &offset_past_current_output; if (parsed.ref.len > 0) { - url_string.append(UTF8ToWideAndAdjustOffsets(spec.substr(parsed.ref.begin, - parsed.ref.len), - &offsets_into_ref)); + url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin, + parsed.ref.len), + offset_into_ref)); + } + new_parsed->ref.len = url_string.length() - new_parsed->ref.begin; + if (offset_into_ref) { + *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ? + std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref); + } else if (offset_past_current_output != std::wstring::npos) { + // We clamped the offset near the beginning of this function to ensure it + // was within the input URL. If we reach here, the input was something + // invalid and non-parseable such that the offset was past any component + // we could figure out. In this case it won't be represented in the + // output string, so reset it. + *offset_for_adjustment = std::wstring::npos; } - size_t old_ref_len = static_cast<size_t>(parsed.ref.len); - size_t new_ref_len = url_string.length() - new_parsed->ref.begin; - new_parsed->ref.len = static_cast<int>(new_ref_len); - - // Apply offset adjustments. - ApplySectionAdjustments(offsets_into_ref, offsets_for_adjustment, - old_ref_len, new_ref_len, ref_begin); } // If we need to strip out http do it after the fact. This way we don't need @@ -1100,11 +1023,12 @@ std::wstring FormatUrlInternal(const GURL& url, const size_t kHTTPSize = arraysize(kHTTP) - 1; if (omit_http && !url_string.compare(0, kHTTPSize, kHTTP)) { url_string = url_string.substr(kHTTPSize); - AdjustOffset::Adjustments adjustments; - adjustments.push_back(AdjustOffset::Adjustment(0, kHTTPSize, 0)); - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AdjustOffset(adjustments)); + if (*offset_for_adjustment != std::wstring::npos) { + if (*offset_for_adjustment < kHTTPSize) + *offset_for_adjustment = std::wstring::npos; + else + *offset_for_adjustment -= kHTTPSize; + } if (prefix_end) *prefix_end -= kHTTPSize; @@ -1262,20 +1186,21 @@ std::string GetHeaderParamValue(const std::string& field, // // We may want to skip this step in the case of file URLs to allow unicode // UNC hostnames regardless of encodings. -std::wstring IDNToUnicodeWithOffsets( - const char* host, - size_t host_len, - const std::wstring& languages, - std::vector<size_t>* offsets_for_adjustment) { +std::wstring IDNToUnicode(const char* host, + size_t host_len, + const std::wstring& languages, + size_t* offset_for_adjustment) { // Convert the ASCII input to a wide string for ICU. string16 input16; input16.reserve(host_len); input16.insert(input16.end(), host, host + host_len); + string16 out16; + size_t output_offset = offset_for_adjustment ? + *offset_for_adjustment : std::wstring::npos; + // Do each component of the host separately, since we enforce script matching // on a per-component basis. - AdjustOffset::Adjustments adjustments; - string16 out16; for (size_t component_start = 0, component_end; component_start < input16.length(); component_start = component_end + 1) { @@ -1284,18 +1209,22 @@ std::wstring IDNToUnicodeWithOffsets( if (component_end == string16::npos) component_end = input16.length(); // For getting the last component. size_t component_length = component_end - component_start; - size_t new_component_start = out16.length(); + + size_t output_component_start = out16.length(); bool converted_idn = false; if (component_end > component_start) { // Add the substring that we just found. converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start, component_length, languages, &out16); } - size_t new_component_length = out16.length() - new_component_start; + size_t output_component_length = out16.length() - output_component_start; - if (converted_idn && offsets_for_adjustment) { - adjustments.push_back(AdjustOffset::Adjustment( - component_start, component_length, new_component_length)); + if ((output_offset != std::wstring::npos) && + (*offset_for_adjustment > component_start)) { + if ((*offset_for_adjustment < component_end) && converted_idn) + output_offset = std::wstring::npos; + else + output_offset += output_component_length - component_length; } // Need to add the dot we just found (if we found one). @@ -1303,28 +1232,10 @@ std::wstring IDNToUnicodeWithOffsets( out16.push_back('.'); } - // Make offset adjustment. - if (offsets_for_adjustment && !adjustments.empty()) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AdjustOffset(adjustments)); - } - - return UTF16ToWideAndAdjustOffsets(out16, offsets_for_adjustment); -} - -std::wstring IDNToUnicode(const char* host, - size_t host_len, - const std::wstring& languages, - size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - std::wstring result = - IDNToUnicodeWithOffsets(host, host_len, languages, &offsets); if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; + *offset_for_adjustment = output_offset; + + return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment); } std::string CanonicalizeHost(const std::string& host, @@ -1737,73 +1648,51 @@ std::string GetHostOrSpecFromURL(const GURL& url) { return url.has_host() ? TrimEndingDot(url.host()) : url.spec(); } -void AppendFormattedHostWithOffsets( - const GURL& url, - const std::wstring& languages, - std::wstring* output, - url_parse::Parsed* new_parsed, - std::vector<size_t>* offsets_for_adjustment) { +void AppendFormattedHost(const GURL& url, + const std::wstring& languages, + std::wstring* output, + url_parse::Parsed* new_parsed, + size_t* offset_for_adjustment) { DCHECK(output); const url_parse::Component& host = url.parsed_for_possibly_invalid_spec().host; if (host.is_nonempty()) { // Handle possible IDN in the host name. - size_t host_begin = output->length(); + int new_host_begin = static_cast<int>(output->length()); if (new_parsed) - new_parsed->host.begin = static_cast<int>(host_begin); - size_t old_host_len = static_cast<size_t>(host.len); - - // Compose a list of offsets within the host area. - std::vector<size_t> offsets_into_host = - OffsetsIntoSection(offsets_for_adjustment, host_begin); + new_parsed->host.begin = new_host_begin; + size_t offset_past_current_output = + (!offset_for_adjustment || + (*offset_for_adjustment == std::wstring::npos) || + (*offset_for_adjustment < output->length())) ? + std::wstring::npos : (*offset_for_adjustment - output->length()); + size_t* offset_into_host = + (offset_past_current_output >= static_cast<size_t>(host.len)) ? + NULL : &offset_past_current_output; const std::string& spec = url.possibly_invalid_spec(); DCHECK(host.begin >= 0 && ((spec.length() == 0 && host.begin == 0) || host.begin < static_cast<int>(spec.length()))); - output->append(IDNToUnicodeWithOffsets(&spec[host.begin], old_host_len, - languages, &offsets_into_host)); + output->append(IDNToUnicode(&spec[host.begin], + static_cast<size_t>(host.len), languages, offset_into_host)); - size_t new_host_len = output->length() - host_begin; + int new_host_len = static_cast<int>(output->length()) - new_host_begin; if (new_parsed) - new_parsed->host.len = static_cast<int>(new_host_len); - - // Apply offset adjustments. - ApplySectionAdjustments(offsets_into_host, offsets_for_adjustment, - old_host_len, new_host_len, host_begin); + new_parsed->host.len = new_host_len; + if (offset_into_host) { + *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ? + std::wstring::npos : (new_host_begin + *offset_into_host); + } else if (offset_past_current_output != std::wstring::npos) { + *offset_for_adjustment += new_host_len - host.len; + } } else if (new_parsed) { new_parsed->host.reset(); } } -void AppendFormattedHost(const GURL& url, - const std::wstring& languages, - std::wstring* output, - url_parse::Parsed* new_parsed, - size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - AppendFormattedHostWithOffsets(url, languages, output, new_parsed, &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; -} - // TODO(viettrungluu): convert the wstring |FormatUrlInternal()|. -string16 FormatUrlWithOffsets(const GURL& url, - const std::string& languages, - FormatUrlTypes format_types, - UnescapeRule::Type unescape_rules, - url_parse::Parsed* new_parsed, - size_t* prefix_end, - std::vector<size_t>* offsets_for_adjustment) { - return WideToUTF16Hack( - FormatUrlInternal(url, ASCIIToWide(languages), format_types, - unescape_rules, new_parsed, prefix_end, - offsets_for_adjustment)); -} - string16 FormatUrl(const GURL& url, const std::string& languages, FormatUrlTypes format_types, @@ -1811,15 +1700,10 @@ string16 FormatUrl(const GURL& url, url_parse::Parsed* new_parsed, size_t* prefix_end, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - string16 result = WideToUTF16Hack( + return WideToUTF16Hack( FormatUrlInternal(url, ASCIIToWide(languages), format_types, - unescape_rules, new_parsed, prefix_end, &offsets)); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; + unescape_rules, new_parsed, prefix_end, + offset_for_adjustment)); } bool CanStripTrailingSlash(const GURL& url) { @@ -2272,12 +2156,4 @@ NetworkInterface::NetworkInterface(const std::string& name, NetworkInterface::~NetworkInterface() { } -ClampComponentOffset::ClampComponentOffset(size_t component_start) - : component_start(component_start) {} - -size_t ClampComponentOffset::operator()(size_t offset) { - return (offset >= component_start) ? - offset : std::wstring::npos; -} - } // namespace net diff --git a/net/base/net_util.h b/net/base/net_util.h index 0ff3369..bae27c3 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -198,21 +198,15 @@ std::string GetFileNameFromCD(const std::string& header, // script-language pairs (currently Han, Kana and Hangul for zh,ja and ko). // When |languages| is empty, even that mixing is not allowed. // -// (|offset[s]_for_adjustment|) specifies one or more offsets into the original -// |url|'s spec(); each offset will be adjusted to point at the same logical -// place in the result strings during decoding. If this isn't possible because -// an offset points past the end of |host| or into the middle of a punycode -// sequence, the offending offset will be set to std::wstring::npos. -// |offset[s]_for_adjustment| may be NULL. +// |offset_for_adjustment| is an offset into |host|, which will be adjusted to +// point at the same logical place in the output string. If this isn't possible +// because it points past the end of |host| or into the middle of a punycode +// sequence, it will be set to std::wstring::npos. |offset_for_adjustment| may +// be NULL. std::wstring IDNToUnicode(const char* host, size_t host_len, const std::wstring& languages, size_t* offset_for_adjustment); -std::wstring IDNToUnicodeWithOffsets( - const char* host, - size_t host_len, - const std::wstring& languages, - std::vector<size_t>* offsets_for_adjustment); // Canonicalizes |host| and returns it. Also fills |host_info| with // IP address information. |host_info| must not be NULL. @@ -298,24 +292,11 @@ int SetNonBlocking(int fd); // the user. The given parsed structure will be updated. The host name formatter // also takes the same accept languages component as ElideURL. |new_parsed| may // be null. -// -// (|offset[s]_for_adjustment|) specifies one or more offsets into the original -// |url|'s spec(); each offset will be adjusted to point at the same logical -// place in the result strings after reformatting of the host. If this isn't -// possible because an offset points past the end of the host or into the middle -// of a multi-character sequence, the offending offset will be set to -// std::wstring::npos. |offset[s]_for_adjustment| may be NULL. void AppendFormattedHost(const GURL& url, const std::wstring& languages, std::wstring* output, url_parse::Parsed* new_parsed, size_t* offset_for_adjustment); -void AppendFormattedHostWithOffsets( - const GURL& url, - const std::wstring& languages, - std::wstring* output, - url_parse::Parsed* new_parsed, - std::vector<size_t>* offsets_for_adjustment); // Creates a string representation of |url|. The IDN host name may be in Unicode // if |languages| accepts the Unicode representation. |format_type| is a bitmask @@ -328,13 +309,12 @@ void AppendFormattedHostWithOffsets( // The last three parameters may be NULL. // |new_parsed| will be set to the parsing parameters of the resultant URL. // |prefix_end| will be the length before the hostname of the resultant URL. -// -// (|offset[s]_for_adjustment|) specifies one or more offsets into the original -// |url|'s spec(); each offset will be modified to reflect changes this function -// makes to the output string. For example, if |url| is "http://a:b@c.com/", -// |omit_username_password| is true, and an offset is 12 (the offset of '.'), -// then on return the output string will be "http://c.com/" and the offset will -// be 8. If an offset cannot be successfully adjusted (e.g. because it points +// |offset_for_adjustment| is an offset into the original |url|'s spec(), which +// will be modified to reflect changes this function makes to the output string; +// for example, if |url| is "http://a:b@c.com/", |omit_username_password| is +// true, and |offset_for_adjustment| is 12 (the offset of '.'), then on return +// the output string will be "http://c.com/" and |offset_for_adjustment| will be +// 8. If the offset cannot be successfully adjusted (e.g. because it points // into the middle of a component that was entirely removed, past the end of the // string, or into the middle of an encoding sequence), it will be set to // string16::npos. @@ -345,13 +325,6 @@ string16 FormatUrl(const GURL& url, url_parse::Parsed* new_parsed, size_t* prefix_end, size_t* offset_for_adjustment); -string16 FormatUrlWithOffsets(const GURL& url, - const std::string& languages, - FormatUrlTypes format_types, - UnescapeRule::Type unescape_rules, - url_parse::Parsed* new_parsed, - size_t* prefix_end, - std::vector<size_t>* offsets_for_adjustment); // This is a convenience function for FormatUrl() with // format_types = kFormatUrlOmitAll and unescape = SPACES. This is the typical @@ -481,16 +454,6 @@ typedef std::list<NetworkInterface> NetworkInterfaceList; // Can be called only on a thread that allows IO. bool GetNetworkList(NetworkInterfaceList* networks); -// Private adjustment function called by std::transform which sets the offset -// to npos if the offset occurs at or before |component_start|, otherwise don't -// alter the offset. Exposed here for unit testing. -struct ClampComponentOffset { - explicit ClampComponentOffset(size_t component_start); - size_t operator()(size_t offset); - - const size_t component_start; -}; - } // namespace net #endif // NET_BASE_NET_UTIL_H_ diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc index 4265866..b547f83 100644 --- a/net/base/net_util_unittest.cc +++ b/net/base/net_util_unittest.cc @@ -4,8 +4,6 @@ #include "net/base/net_util.h" -#include <algorithm> - #include "base/file_path.h" #include "base/format_macros.h" #include "base/string_number_conversions.h" @@ -23,8 +21,6 @@ namespace net { namespace { -static const size_t kNpos = string16::npos; - struct FileCase { const wchar_t* file; const char* url; @@ -992,20 +988,6 @@ TEST(NetUtilTest, IDNToUnicodeAdjustOffset) { &offset); EXPECT_EQ(adjust_cases[i].output_offset, offset); } - - std::vector<size_t> offsets; - for (size_t i = 0; i < 40; ++i) - offsets.push_back(i); - IDNToUnicodeWithOffsets("test.xn--cy2a840a.xn--1lq90ic7f1rc.test", 39, - L"zh-CN", &offsets); - size_t expected[] = {0, 1, 2, 3, 4, 5, kNpos, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 7, 8, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 12, 13, 14, 15, - 16, kNpos}; - ASSERT_EQ(40U, arraysize(expected)); - for (size_t i = 0; i < 40; ++i) - EXPECT_EQ(expected[i], offsets[i]); } TEST(NetUtilTest, CompliantHost) { @@ -1817,24 +1799,13 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(basic_cases[i].output_offset, offset); } - size_t url_size = 26; - std::vector<size_t> offsets; - for (size_t i = 0; i < url_size + 1; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("http://www.google.com/foo/"), "en", - kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, - NULL, NULL, &offsets); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(i, offsets[i]); - EXPECT_EQ(kNpos, offsets[url_size]); - const struct { const char* input_url; size_t input_offset; size_t output_offset; } omit_auth_cases[] = { {"http://foo:bar@www.google.com/", 6, 6}, - {"http://foo:bar@www.google.com/", 7, string16::npos}, + {"http://foo:bar@www.google.com/", 7, 7}, {"http://foo:bar@www.google.com/", 8, string16::npos}, {"http://foo:bar@www.google.com/", 10, string16::npos}, {"http://foo:bar@www.google.com/", 11, string16::npos}, @@ -1852,28 +1823,13 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(omit_auth_cases[i].output_offset, offset); } - url_size = 30; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("http://foo:bar@www.google.com/"), "en", - kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, - NULL, NULL, &offsets); - for (size_t i = 0; i < 7; ++i) - EXPECT_EQ(i, offsets[i]); - for (size_t i = 7; i < 15; ++i) - EXPECT_EQ(kNpos, offsets[i]); - for (size_t i = 16; i < url_size; ++i) - EXPECT_EQ(i - 8 , offsets[i]); - const AdjustOffsetCase view_source_cases[] = { {0, 0}, {3, 3}, {11, 11}, {12, 12}, {13, 13}, - {18, 18}, - {19, string16::npos}, + {19, 19}, {20, string16::npos}, {23, 19}, {26, 22}, @@ -1887,20 +1843,6 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(view_source_cases[i].output_offset, offset); } - url_size = 38; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("view-source:http://foo@www.google.com/"), "en", - kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, - NULL, NULL, &offsets); - size_t expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, kNpos, kNpos, kNpos, kNpos, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}; - ASSERT_EQ(url_size, arraysize(expected)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected[i], offsets[i]); - const AdjustOffsetCase idn_hostname_cases[] = { {8, string16::npos}, {16, string16::npos}, @@ -1917,21 +1859,6 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(idn_hostname_cases[i].output_offset, offset); } - url_size = 33; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("http://xn--l8jvb1ey91xtjb.jp/foo/"), "ja", - kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, - NULL, NULL, &offsets); - size_t expected_1[] = {0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, 12, 13, 14, 15, 16, - 17, 18, 19}; - ASSERT_EQ(url_size, arraysize(expected_1)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected_1[i], offsets[i]); - const AdjustOffsetCase unescape_cases[] = { {25, 25}, {26, string16::npos}, @@ -1954,31 +1881,11 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(unescape_cases[i].output_offset, offset); } - url_size = 68; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL( - "http://www.google.com/foo%20bar/%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB"), - "en", kFormatUrlOmitUsernamePassword, UnescapeRule::SPACES, NULL, NULL, - &offsets); - size_t expected_2[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, kNpos, kNpos, - 26, 27, 28, 29, 30, kNpos, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, 31, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, 32, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, 33, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos}; - ASSERT_EQ(url_size, arraysize(expected_2)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected_2[i], offsets[i]); - const AdjustOffsetCase ref_cases[] = { {30, 30}, {31, 31}, {32, string16::npos}, {34, 32}, - {35, string16::npos}, {37, 33}, {38, string16::npos}, }; @@ -1992,22 +1899,6 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(ref_cases[i].output_offset, offset); } - url_size = 38; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - // "http://www.google.com/foo.html#\x30B0\x30B0z" - FormatUrlWithOffsets(GURL( - "http://www.google.com/foo.html#\xE3\x82\xB0\xE3\x82\xB0z"), "en", - kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL, NULL, - &offsets); - size_t expected_3[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, kNpos, kNpos, 32, kNpos, kNpos, 33}; - ASSERT_EQ(url_size, arraysize(expected_3)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected_3[i], offsets[i]); - const AdjustOffsetCase omit_http_cases[] = { {0, string16::npos}, {3, string16::npos}, @@ -2021,18 +1912,6 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(omit_http_cases[i].output_offset, offset); } - url_size = 23; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("http://www.google.com"), "en", - kFormatUrlOmitHTTP, UnescapeRule::NORMAL, NULL, NULL, &offsets); - size_t expected_4[] = {kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 0, 1, - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, kNpos}; - ASSERT_EQ(url_size, arraysize(expected_4)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected_4[i], offsets[i]); - const AdjustOffsetCase omit_http_start_with_ftp[] = { {0, 0}, {3, 3}, @@ -2045,18 +1924,6 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { EXPECT_EQ(omit_http_start_with_ftp[i].output_offset, offset); } - url_size = 23; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("http://ftp.google.com"), "en", - kFormatUrlOmitHTTP, UnescapeRule::NORMAL, NULL, NULL, &offsets); - size_t expected_5[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, kNpos}; - ASSERT_EQ(url_size, arraysize(expected_5)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected_5[i], offsets[i]); - const AdjustOffsetCase omit_all_cases[] = { {12, 0}, {13, 1}, @@ -2069,19 +1936,6 @@ TEST(NetUtilTest, FormatUrlAdjustOffset) { UnescapeRule::NORMAL, NULL, NULL, &offset); EXPECT_EQ(omit_all_cases[i].output_offset, offset); } - - url_size = 21; - offsets.clear(); - for (size_t i = 0; i < url_size; ++i) - offsets.push_back(i); - FormatUrlWithOffsets(GURL("http://user@foo.com/"), "en", kFormatUrlOmitAll, - UnescapeRule::NORMAL, NULL, NULL, &offsets); - size_t expected_6[] = {kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, - kNpos, kNpos, kNpos, kNpos, 0, 1, 2, 3, 4, 5, 6, 7, - kNpos}; - ASSERT_EQ(url_size, arraysize(expected_6)); - for (size_t i = 0; i < url_size; ++i) - EXPECT_EQ(expected_6[i], offsets[i]); } TEST(NetUtilTest, SimplifyUrlForRequest) { @@ -2367,20 +2221,4 @@ TEST(NetUtilTest, GetNetworkList) { } } -TEST(NetUtilTest, AdjustComponentOffset) { - std::vector<size_t> old_offsets; - for (size_t i = 0; i < 10; ++i) - old_offsets.push_back(i); - std::vector<size_t> new_offsets; - std::transform(old_offsets.begin(), - old_offsets.end(), - std::back_inserter(new_offsets), - ClampComponentOffset(5)); - size_t expected_1[] = {kNpos, kNpos, kNpos, kNpos, kNpos, 5, 6, 7, 8, 9}; - EXPECT_EQ(new_offsets.size(), arraysize(expected_1)); - EXPECT_EQ(new_offsets.size(), old_offsets.size()); - for (size_t i = 0; i < arraysize(expected_1); ++i) - EXPECT_EQ(expected_1[i], new_offsets[i]); -} - } // namespace net |