diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-04-13 00:45:39 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-04-13 00:45:39 +0000 |
commit | a47f8eadd67f75d3b663fdcc898caabb335bad0b (patch) | |
tree | fdf872770d4cd58ee753f219475850490a008f6d /base | |
parent | 2e0e8253a232fa499d22e47753c5bbadaebd69e7 (diff) | |
download | chromium_src-a47f8eadd67f75d3b663fdcc898caabb335bad0b.zip chromium_src-a47f8eadd67f75d3b663fdcc898caabb335bad0b.tar.gz chromium_src-a47f8eadd67f75d3b663fdcc898caabb335bad0b.tar.bz2 |
Add multiple-offset versions of the various URL reformatting functions. Fix a couple of erroneous unit tests of offsets into the username/password.
Note: This does not complete the work required for bug 78153 -- it is only the first two-thirds.
BUG=78153
TEST=Many unit tests updated and added.
Review URL: http://codereview.chromium.org/6822038
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@81343 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/utf_offset_string_conversions.cc | 179 | ||||
-rw-r--r-- | base/utf_offset_string_conversions.h | 66 | ||||
-rw-r--r-- | base/utf_offset_string_conversions_unittest.cc | 94 |
3 files changed, 306 insertions, 33 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc index 4c47ef8..f091cb4 100644 --- a/base/utf_offset_string_conversions.cc +++ b/base/utf_offset_string_conversions.cc @@ -1,9 +1,12 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/utf_offset_string_conversions.h" +#include <algorithm> + +#include "base/scoped_ptr.h" #include "base/string_piece.h" #include "base/utf_string_conversion_utils.h" @@ -21,13 +24,16 @@ template<typename SRC_CHAR> bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, std::wstring* output, - size_t* offset_for_adjustment) { - size_t output_offset = - (offset_for_adjustment && *offset_for_adjustment < src_len) ? - *offset_for_adjustment : std::wstring::npos; + std::vector<size_t>* offsets_for_adjustment) { + if (offsets_for_adjustment) { + std::for_each(offsets_for_adjustment->begin(), + offsets_for_adjustment->end(), + LimitOffset<std::wstring>(src_len)); + } // ICU requires 32-bit numbers. bool success = true; + AdjustOffset::Adjustments adjustments; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; @@ -39,21 +45,23 @@ bool ConvertUnicode(const SRC_CHAR* src, chars_written = WriteUnicodeCharacter(0xFFFD, output); success = false; } - if ((output_offset != std::wstring::npos) && - (*offset_for_adjustment > original_i)) { + if (offsets_for_adjustment) { // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last // character read, not after it (so that incrementing it in the loop // increment will place it at the right location), so we need to account // for that in determining the amount that was read. 
- if (*offset_for_adjustment <= static_cast<size_t>(i)) - output_offset = std::wstring::npos; - else - output_offset += chars_written - (i - original_i + 1); + adjustments.push_back(AdjustOffset::Adjustment( + original_i, i - original_i + 1, chars_written)); } } - if (offset_for_adjustment) - *offset_for_adjustment = output_offset; + // Make offset adjustment. + if (offsets_for_adjustment && !adjustments.empty()) { + std::for_each(offsets_for_adjustment->begin(), + offsets_for_adjustment->end(), + AdjustOffset(adjustments)); + } + return success; } @@ -63,16 +71,44 @@ bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { + std::vector<size_t> offsets; + if (offset_for_adjustment) + offsets.push_back(*offset_for_adjustment); + PrepareForUTF16Or32Output(src, src_len, output); + bool ret = ConvertUnicode(src, src_len, output, &offsets); + if (offset_for_adjustment) + *offset_for_adjustment = offsets[0]; + return ret; +} + +bool UTF8ToWideAndAdjustOffsets(const char* src, + size_t src_len, + std::wstring* output, + std::vector<size_t>* offsets_for_adjustment) { PrepareForUTF16Or32Output(src, src_len, output); - return ConvertUnicode(src, src_len, output, offset_for_adjustment); + return ConvertUnicode(src, src_len, output, offsets_for_adjustment); } std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment) { - std::wstring ret; - UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret, - offset_for_adjustment); - return ret; + std::vector<size_t> offsets; + if (offset_for_adjustment) + offsets.push_back(*offset_for_adjustment); + std::wstring result; + UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, + &offsets); + if (offset_for_adjustment) + *offset_for_adjustment = offsets[0]; + return result; +} + +std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, + std::vector<size_t>* + offsets_for_adjustment) { + std::wstring result; 
+ UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, + offsets_for_adjustment); + return result; } // UTF-16 <-> Wide ------------------------------------------------------------- @@ -90,6 +126,19 @@ bool UTF16ToWideAndAdjustOffset(const char16* src, return true; } +bool UTF16ToWideAndAdjustOffsets(const char16* src, + size_t src_len, + std::wstring* output, + std::vector<size_t>* offsets_for_adjustment) { + output->assign(src, src_len); + if (offsets_for_adjustment) { + std::for_each(offsets_for_adjustment->begin(), + offsets_for_adjustment->end(), + LimitOffset<std::wstring>(src_len)); + } + return true; +} + std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) @@ -97,25 +146,109 @@ std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, return utf16; } +std::wstring UTF16ToWideAndAdjustOffsets( + const string16& utf16, + std::vector<size_t>* offsets_for_adjustment) { + if (offsets_for_adjustment) { + std::for_each(offsets_for_adjustment->begin(), + offsets_for_adjustment->end(), + LimitOffset<std::wstring>(utf16.length())); + } + return utf16; +} + #elif defined(WCHAR_T_IS_UTF32) bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { + std::vector<size_t> offsets; + if (offset_for_adjustment) + offsets.push_back(*offset_for_adjustment); + output->clear(); + // Assume that normally we won't have any non-BMP characters so the counts + // will be the same. 
+ output->reserve(src_len); + bool ret = ConvertUnicode(src, src_len, output, &offsets); + if (offset_for_adjustment) + *offset_for_adjustment = offsets[0]; + return ret; +} + +bool UTF16ToWideAndAdjustOffsets(const char16* src, + size_t src_len, + std::wstring* output, + std::vector<size_t>* offsets_for_adjustment) { output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); - return ConvertUnicode(src, src_len, output, offset_for_adjustment); + return ConvertUnicode(src, src_len, output, offsets_for_adjustment); } std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { - std::wstring ret; - UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret, - offset_for_adjustment); - return ret; + std::vector<size_t> offsets; + if (offset_for_adjustment) + offsets.push_back(*offset_for_adjustment); + std::wstring result; + UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, + &offsets); + if (offset_for_adjustment) + *offset_for_adjustment = offsets[0]; + return result; +} + +std::wstring UTF16ToWideAndAdjustOffsets( + const string16& utf16, + std::vector<size_t>* offsets_for_adjustment) { + std::wstring result; + UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, + offsets_for_adjustment); + return result; } #endif // defined(WCHAR_T_IS_UTF32) + +template <typename T> +LimitOffset<T>::LimitOffset(size_t limit) + : limit_(limit) {} + +template <typename T> +void LimitOffset<T>::operator()(size_t& offset) { + if (offset >= limit_) + offset = T::npos; +} + +AdjustOffset::Adjustment::Adjustment(size_t location, + size_t old_length, + size_t new_length) + : location(location), + old_length(old_length), + new_length(new_length) {} + +AdjustOffset::AdjustOffset(const Adjustments& adjustments) + : adjustments_(adjustments) {} + +void AdjustOffset::operator()(size_t& offset) { + if (offset == std::wstring::npos) + return; + 
size_t adjustment = 0; + for (Adjustments::const_iterator i = adjustments_.begin(); + i != adjustments_.end(); ++i) { + size_t location = i->location; + if (offset == location && i->new_length == 0) { + offset = std::wstring::npos; + return; + } + if (offset <= location) + break; + if (offset < (location + i->old_length)) { + offset = std::wstring::npos; + return; + } + adjustment += (i->old_length - i->new_length); + } + offset -= adjustment; +} diff --git a/base/utf_offset_string_conversions.h b/base/utf_offset_string_conversions.h index 13df1b4..19b312a 100644 --- a/base/utf_offset_string_conversions.h +++ b/base/utf_offset_string_conversions.h @@ -7,6 +7,7 @@ #pragma once #include <string> +#include <vector> #include "base/base_api.h" #include "base/string16.h" @@ -15,23 +16,78 @@ namespace base { class StringPiece; } -// Like the conversions in utf_string_conversions.h, but also take offsets into -// the source strings, which will be adjusted to point at the same logical place -// in the result strings. If this isn't possible because the offsets point past -// the end of the source strings or into the middle of multibyte sequences, they -// will be set to std::wstring::npos. |offset_for_adjustment| may be NULL. +// Like the conversions in utf_string_conversions.h, but also takes one or more +// offsets (|offset[s]_for_adjustment|) into the source strings, each offset +// will be adjusted to point at the same logical place in the result strings. +// If this isn't possible because an offset points past the end of the source +// strings or into the middle of a multibyte sequence, the offending offset will +// be set to std::wstring::npos. |offset[s]_for_adjustment| may be NULL. 
BASE_API bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment); +BASE_API bool UTF8ToWideAndAdjustOffsets( + const char* src, + size_t src_len, + std::wstring* output, + std::vector<size_t>* offsets_for_adjustment); + BASE_API std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment); +BASE_API std::wstring UTF8ToWideAndAdjustOffsets( + const base::StringPiece& utf8, + std::vector<size_t>* offsets_for_adjustment); BASE_API bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment); +BASE_API bool UTF16ToWideAndAdjustOffsets( + const char16* src, + size_t src_len, + std::wstring* output, + std::vector<size_t>* offsets_for_adjustment); + BASE_API std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment); +BASE_API std::wstring UTF16ToWideAndAdjustOffsets( + const string16& utf16, + std::vector<size_t>* offsets_for_adjustment); + +// Limiting function callable by std::for_each which will replace any value +// which is equal to or greater than |limit| with npos. +template <typename T> +struct LimitOffset { + explicit LimitOffset(size_t limit); + void operator()(size_t& offset); + + size_t limit_; +}; + +// Adjustment function called by std::transform which will adjust any offset +// that occurs after one or more modified substrings. To use, create any +// number of AdjustOffset::Adjustments, drop them into a vector, then call +// std::transform with the transform function being something similar to +// AdjustOffset(adjustments). Each Adjustment gives the original |location| +// of the encoded section and the |old_length| and |new_length| of the section +// before and after decoding. +struct AdjustOffset { + // Helper structure which indicates where an encoded character occurred + // and how long that encoding was. 
+ struct Adjustment { + Adjustment(size_t location, size_t old_length, size_t new_length); + + size_t location; + size_t old_length; + size_t new_length; + }; + + typedef std::vector<Adjustment> Adjustments; + + explicit AdjustOffset(const Adjustments& adjustments); + void operator()(size_t& offset); + + const Adjustments& adjustments_; +}; #endif // BASE_UTF_OFFSET_STRING_CONVERSIONS_H_ diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc index 4f13ab3..b731b9e 100644 --- a/base/utf_offset_string_conversions_unittest.cc +++ b/base/utf_offset_string_conversions_unittest.cc @@ -1,7 +1,9 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#include <algorithm> + #include "base/logging.h" #include "base/string_piece.h" #include "base/utf_offset_string_conversions.h" @@ -11,6 +13,8 @@ namespace base { namespace { +static const size_t kNpos = std::wstring::npos; + // Given a null-terminated string of wchar_t with each wchar_t representing // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. 
// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) @@ -40,12 +44,12 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { size_t input_offset; size_t output_offset; } utf8_to_wide_cases[] = { - {"", 0, std::wstring::npos}, - {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos}, + {"", 0, kNpos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos}, {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, {"\xed\xb0\x80z", 3, 1}, {"A\xF0\x90\x8C\x80z", 1, 1}, - {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos}, + {"A\xF0\x90\x8C\x80z", 2, kNpos}, #if defined(WCHAR_T_IS_UTF16) {"A\xF0\x90\x8C\x80z", 5, 3}, #elif defined(WCHAR_T_IS_UTF32) @@ -65,7 +69,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { size_t output_offset; } utf16_to_wide_cases[] = { {L"\xD840\xDC00\x4E00", 0, 0}, - {L"\xD840\xDC00\x4E00", 1, std::wstring::npos}, + {L"\xD840\xDC00\x4E00", 1, kNpos}, {L"\xD840\xDC00\x4E00", 2, 1}, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) { @@ -77,4 +81,84 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { #endif } +TEST(UTFOffsetStringConversionsTest, LimitOffsets) { + const size_t kLimit = 10; + const size_t kItems = 20; + std::vector<size_t> size_ts; + for (size_t t = 0; t < kItems; ++t) + size_ts.push_back(t); + std::for_each(size_ts.begin(), size_ts.end(), + LimitOffset<std::wstring>(kLimit)); + size_t unlimited_count = 0; + for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end(); + ++ti) { + if (*ti < kLimit && *ti != kNpos) + ++unlimited_count; + } + EXPECT_EQ(10U, unlimited_count); + + // Reverse the values in the vector and try again. 
+ size_ts.clear(); + for (size_t t = kItems; t > 0; --t) + size_ts.push_back(t - 1); + std::for_each(size_ts.begin(), size_ts.end(), + LimitOffset<std::wstring>(kLimit)); + unlimited_count = 0; + for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end(); + ++ti) { + if (*ti < kLimit && *ti != kNpos) + ++unlimited_count; + } + EXPECT_EQ(10U, unlimited_count); +} + +TEST(UTFOffsetStringConversionsTest, AdjustOffsets) { + // Imagine we have strings as shown in the following cases where the + // X's represent encoded characters. + // 1: abcXXXdef ==> abcXdef + std::vector<size_t> offsets; + for (size_t t = 0; t < 9; ++t) + offsets.push_back(t); + AdjustOffset::Adjustments adjustments; + adjustments.push_back(AdjustOffset::Adjustment(3, 3, 1)); + std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); + size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6}; + EXPECT_EQ(offsets.size(), arraysize(expected_1)); + for (size_t i = 0; i < arraysize(expected_1); ++i) + EXPECT_EQ(expected_1[i], offsets[i]); + + // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX + offsets.clear(); + for (size_t t = 0; t < 23; ++t) + offsets.push_back(t); + adjustments.clear(); + adjustments.push_back(AdjustOffset::Adjustment(0, 3, 1)); + adjustments.push_back(AdjustOffset::Adjustment(4, 4, 2)); + adjustments.push_back(AdjustOffset::Adjustment(10, 7, 4)); + adjustments.push_back(AdjustOffset::Adjustment(20, 3, 1)); + std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); + size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, + kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12, + 13, kNpos, kNpos}; + EXPECT_EQ(offsets.size(), arraysize(expected_2)); + for (size_t i = 0; i < arraysize(expected_2); ++i) + EXPECT_EQ(expected_2[i], offsets[i]); + + // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe + offsets.clear(); + for (size_t t = 0; t < 17; ++t) + offsets.push_back(t); + adjustments.clear(); + 
adjustments.push_back(AdjustOffset::Adjustment(0, 3, 0)); + adjustments.push_back(AdjustOffset::Adjustment(4, 4, 4)); + adjustments.push_back(AdjustOffset::Adjustment(11, 3, 3)); + adjustments.push_back(AdjustOffset::Adjustment(15, 2, 0)); + std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); + size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, + 7, 8, kNpos, kNpos, 11, kNpos, kNpos}; + EXPECT_EQ(offsets.size(), arraysize(expected_3)); + for (size_t i = 0; i < arraysize(expected_3); ++i) + EXPECT_EQ(expected_3[i], offsets[i]); +} + } // namaspace base |