summary refs log tree commit diff stats
path: root/base
diff options
context:
space:
mode:
author mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-13 00:45:39 +0000
committer mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-13 00:45:39 +0000
commit a47f8eadd67f75d3b663fdcc898caabb335bad0b (patch)
tree fdf872770d4cd58ee753f219475850490a008f6d /base
parent 2e0e8253a232fa499d22e47753c5bbadaebd69e7 (diff)
download chromium_src-a47f8eadd67f75d3b663fdcc898caabb335bad0b.zip
chromium_src-a47f8eadd67f75d3b663fdcc898caabb335bad0b.tar.gz
chromium_src-a47f8eadd67f75d3b663fdcc898caabb335bad0b.tar.bz2
Add multiple-offset versions of the various URL reformatting functions. Fixed a couple of erroneous unit tests of offsets into username/password.
Note: This does not complete the work required for 78153 -- tis but the first 2/3rds.
BUG=78153
TEST=Many unit tests updated and added.
Review URL: http://codereview.chromium.org/6822038
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@81343 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- base/utf_offset_string_conversions.cc 179
-rw-r--r-- base/utf_offset_string_conversions.h 66
-rw-r--r-- base/utf_offset_string_conversions_unittest.cc 94
3 files changed, 306 insertions, 33 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc
index 4c47ef8..f091cb4 100644
--- a/base/utf_offset_string_conversions.cc
+++ b/base/utf_offset_string_conversions.cc
@@ -1,9 +1,12 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/utf_offset_string_conversions.h"
+#include <algorithm>
+
+#include "base/scoped_ptr.h"
#include "base/string_piece.h"
#include "base/utf_string_conversion_utils.h"
@@ -21,13 +24,16 @@ template<typename SRC_CHAR>
bool ConvertUnicode(const SRC_CHAR* src,
size_t src_len,
std::wstring* output,
- size_t* offset_for_adjustment) {
- size_t output_offset =
- (offset_for_adjustment && *offset_for_adjustment < src_len) ?
- *offset_for_adjustment : std::wstring::npos;
+ std::vector<size_t>* offsets_for_adjustment) {
+ if (offsets_for_adjustment) {
+ std::for_each(offsets_for_adjustment->begin(),
+ offsets_for_adjustment->end(),
+ LimitOffset<std::wstring>(src_len));
+ }
// ICU requires 32-bit numbers.
bool success = true;
+ AdjustOffset::Adjustments adjustments;
int32 src_len32 = static_cast<int32>(src_len);
for (int32 i = 0; i < src_len32; i++) {
uint32 code_point;
@@ -39,21 +45,23 @@ bool ConvertUnicode(const SRC_CHAR* src,
chars_written = WriteUnicodeCharacter(0xFFFD, output);
success = false;
}
- if ((output_offset != std::wstring::npos) &&
- (*offset_for_adjustment > original_i)) {
+ if (offsets_for_adjustment) {
// NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
// character read, not after it (so that incrementing it in the loop
// increment will place it at the right location), so we need to account
// for that in determining the amount that was read.
- if (*offset_for_adjustment <= static_cast<size_t>(i))
- output_offset = std::wstring::npos;
- else
- output_offset += chars_written - (i - original_i + 1);
+ adjustments.push_back(AdjustOffset::Adjustment(
+ original_i, i - original_i + 1, chars_written));
}
}
- if (offset_for_adjustment)
- *offset_for_adjustment = output_offset;
+ // Make offset adjustment.
+ if (offsets_for_adjustment && !adjustments.empty()) {
+ std::for_each(offsets_for_adjustment->begin(),
+ offsets_for_adjustment->end(),
+ AdjustOffset(adjustments));
+ }
+
return success;
}
@@ -63,16 +71,44 @@ bool UTF8ToWideAndAdjustOffset(const char* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment) {
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ PrepareForUTF16Or32Output(src, src_len, output);
+ bool ret = ConvertUnicode(src, src_len, output, &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return ret;
+}
+
+bool UTF8ToWideAndAdjustOffsets(const char* src,
+ size_t src_len,
+ std::wstring* output,
+ std::vector<size_t>* offsets_for_adjustment) {
PrepareForUTF16Or32Output(src, src_len, output);
- return ConvertUnicode(src, src_len, output, offset_for_adjustment);
+ return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
}
std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
size_t* offset_for_adjustment) {
- std::wstring ret;
- UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret,
- offset_for_adjustment);
- return ret;
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ std::wstring result;
+ UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result,
+ &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return result;
+}
+
+std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8,
+ std::vector<size_t>*
+ offsets_for_adjustment) {
+ std::wstring result;
+ UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result,
+ offsets_for_adjustment);
+ return result;
}
// UTF-16 <-> Wide -------------------------------------------------------------
@@ -90,6 +126,19 @@ bool UTF16ToWideAndAdjustOffset(const char16* src,
return true;
}
+bool UTF16ToWideAndAdjustOffsets(const char16* src,
+ size_t src_len,
+ std::wstring* output,
+ std::vector<size_t>* offsets_for_adjustment) {
+ output->assign(src, src_len);
+ if (offsets_for_adjustment) {
+ std::for_each(offsets_for_adjustment->begin(),
+ offsets_for_adjustment->end(),
+ LimitOffset<std::wstring>(src_len));
+ }
+ return true;
+}
+
std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
size_t* offset_for_adjustment) {
if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length()))
@@ -97,25 +146,109 @@ std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
return utf16;
}
+std::wstring UTF16ToWideAndAdjustOffsets(
+ const string16& utf16,
+ std::vector<size_t>* offsets_for_adjustment) {
+ if (offsets_for_adjustment) {
+ std::for_each(offsets_for_adjustment->begin(),
+ offsets_for_adjustment->end(),
+ LimitOffset<std::wstring>(utf16.length()));
+ }
+ return utf16;
+}
+
#elif defined(WCHAR_T_IS_UTF32)
bool UTF16ToWideAndAdjustOffset(const char16* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment) {
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ output->clear();
+ // Assume that normally we won't have any non-BMP characters so the counts
+ // will be the same.
+ output->reserve(src_len);
+ bool ret = ConvertUnicode(src, src_len, output, &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return ret;
+}
+
+bool UTF16ToWideAndAdjustOffsets(const char16* src,
+ size_t src_len,
+ std::wstring* output,
+ std::vector<size_t>* offsets_for_adjustment) {
output->clear();
// Assume that normally we won't have any non-BMP characters so the counts
// will be the same.
output->reserve(src_len);
- return ConvertUnicode(src, src_len, output, offset_for_adjustment);
+ return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
}
std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
size_t* offset_for_adjustment) {
- std::wstring ret;
- UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret,
- offset_for_adjustment);
- return ret;
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ std::wstring result;
+ UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result,
+ &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return result;
+}
+
+std::wstring UTF16ToWideAndAdjustOffsets(
+ const string16& utf16,
+ std::vector<size_t>* offsets_for_adjustment) {
+ std::wstring result;
+ UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result,
+ offsets_for_adjustment);
+ return result;
}
#endif // defined(WCHAR_T_IS_UTF32)
+
+template <typename T>
+LimitOffset<T>::LimitOffset(size_t limit)
+ : limit_(limit) {}
+
+template <typename T>
+void LimitOffset<T>::operator()(size_t& offset) {
+ if (offset >= limit_)
+ offset = T::npos;
+}
+
+AdjustOffset::Adjustment::Adjustment(size_t location,
+ size_t old_length,
+ size_t new_length)
+ : location(location),
+ old_length(old_length),
+ new_length(new_length) {}
+
+AdjustOffset::AdjustOffset(const Adjustments& adjustments)
+ : adjustments_(adjustments) {}
+
+void AdjustOffset::operator()(size_t& offset) {
+ if (offset == std::wstring::npos)
+ return;
+ size_t adjustment = 0;
+ for (Adjustments::const_iterator i = adjustments_.begin();
+ i != adjustments_.end(); ++i) {
+ size_t location = i->location;
+ if (offset == location && i->new_length == 0) {
+ offset = std::wstring::npos;
+ return;
+ }
+ if (offset <= location)
+ break;
+ if (offset < (location + i->old_length)) {
+ offset = std::wstring::npos;
+ return;
+ }
+ adjustment += (i->old_length - i->new_length);
+ }
+ offset -= adjustment;
+}
diff --git a/base/utf_offset_string_conversions.h b/base/utf_offset_string_conversions.h
index 13df1b4..19b312a 100644
--- a/base/utf_offset_string_conversions.h
+++ b/base/utf_offset_string_conversions.h
@@ -7,6 +7,7 @@
#pragma once
#include <string>
+#include <vector>
#include "base/base_api.h"
#include "base/string16.h"
@@ -15,23 +16,78 @@ namespace base {
class StringPiece;
}
-// Like the conversions in utf_string_conversions.h, but also take offsets into
-// the source strings, which will be adjusted to point at the same logical place
-// in the result strings. If this isn't possible because the offsets point past
-// the end of the source strings or into the middle of multibyte sequences, they
-// will be set to std::wstring::npos. |offset_for_adjustment| may be NULL.
+// Like the conversions in utf_string_conversions.h, but these also take one or
+// more offsets (|offset[s]_for_adjustment|) into the source string. Each
+// offset is adjusted to point at the same logical place in the result string.
+// If this isn't possible because an offset points at or past the end of the
+// source string or into the middle of a multibyte sequence, that offset is set
+// to std::wstring::npos. |offset[s]_for_adjustment| may be NULL.
BASE_API bool UTF8ToWideAndAdjustOffset(const char* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment);
+BASE_API bool UTF8ToWideAndAdjustOffsets(
+ const char* src,
+ size_t src_len,
+ std::wstring* output,
+ std::vector<size_t>* offsets_for_adjustment);
+
BASE_API std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
size_t* offset_for_adjustment);
+BASE_API std::wstring UTF8ToWideAndAdjustOffsets(
+ const base::StringPiece& utf8,
+ std::vector<size_t>* offsets_for_adjustment);
BASE_API bool UTF16ToWideAndAdjustOffset(const char16* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment);
+BASE_API bool UTF16ToWideAndAdjustOffsets(
+ const char16* src,
+ size_t src_len,
+ std::wstring* output,
+ std::vector<size_t>* offsets_for_adjustment);
+
BASE_API std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
size_t* offset_for_adjustment);
+BASE_API std::wstring UTF16ToWideAndAdjustOffsets(
+ const string16& utf16,
+ std::vector<size_t>* offsets_for_adjustment);
+
+// Limiting function callable by std::for_each which will replace any value
+// which is equal to or greater than |limit| with npos.
+template <typename T>
+struct LimitOffset {
+ explicit LimitOffset(size_t limit);
+ void operator()(size_t& offset);
+
+ size_t limit_;
+};
+
+// Adjustment function callable by std::for_each which will adjust any offset
+// that occurs after one or more modified substrings. To use, create any
+// number of AdjustOffset::Adjustment objects, drop them into a vector, then
+// call std::for_each with the function being something similar to
+// AdjustOffset(adjustments). Each Adjustment gives the original |location|
+// of the encoded section and the |old_length| and |new_length| of the section
+// before and after decoding.
+struct AdjustOffset {
+ // Helper structure which indicates where an encoded character occurred
+ // and how long that encoding was.
+ struct Adjustment {
+ Adjustment(size_t location, size_t old_length, size_t new_length);
+
+ size_t location;
+ size_t old_length;
+ size_t new_length;
+ };
+
+ typedef std::vector<Adjustment> Adjustments;
+
+ explicit AdjustOffset(const Adjustments& adjustments);
+ void operator()(size_t& offset);
+
+ const Adjustments& adjustments_;
+};
#endif // BASE_UTF_OFFSET_STRING_CONVERSIONS_H_
diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc
index 4f13ab3..b731b9e 100644
--- a/base/utf_offset_string_conversions_unittest.cc
+++ b/base/utf_offset_string_conversions_unittest.cc
@@ -1,7 +1,9 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <algorithm>
+
#include "base/logging.h"
#include "base/string_piece.h"
#include "base/utf_offset_string_conversions.h"
@@ -11,6 +13,8 @@ namespace base {
namespace {
+static const size_t kNpos = std::wstring::npos;
+
// Given a null-terminated string of wchar_t with each wchar_t representing
// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
@@ -40,12 +44,12 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
size_t input_offset;
size_t output_offset;
} utf8_to_wide_cases[] = {
- {"", 0, std::wstring::npos},
- {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos},
+ {"", 0, kNpos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
{"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
{"\xed\xb0\x80z", 3, 1},
{"A\xF0\x90\x8C\x80z", 1, 1},
- {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos},
+ {"A\xF0\x90\x8C\x80z", 2, kNpos},
#if defined(WCHAR_T_IS_UTF16)
{"A\xF0\x90\x8C\x80z", 5, 3},
#elif defined(WCHAR_T_IS_UTF32)
@@ -65,7 +69,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
size_t output_offset;
} utf16_to_wide_cases[] = {
{L"\xD840\xDC00\x4E00", 0, 0},
- {L"\xD840\xDC00\x4E00", 1, std::wstring::npos},
+ {L"\xD840\xDC00\x4E00", 1, kNpos},
{L"\xD840\xDC00\x4E00", 2, 1},
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) {
@@ -77,4 +81,84 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
#endif
}
+TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
+ const size_t kLimit = 10;
+ const size_t kItems = 20;
+ std::vector<size_t> size_ts;
+ for (size_t t = 0; t < kItems; ++t)
+ size_ts.push_back(t);
+ std::for_each(size_ts.begin(), size_ts.end(),
+ LimitOffset<std::wstring>(kLimit));
+ size_t unlimited_count = 0;
+ for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
+ ++ti) {
+ if (*ti < kLimit && *ti != kNpos)
+ ++unlimited_count;
+ }
+ EXPECT_EQ(10U, unlimited_count);
+
+ // Reverse the values in the vector and try again.
+ size_ts.clear();
+ for (size_t t = kItems; t > 0; --t)
+ size_ts.push_back(t - 1);
+ std::for_each(size_ts.begin(), size_ts.end(),
+ LimitOffset<std::wstring>(kLimit));
+ unlimited_count = 0;
+ for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
+ ++ti) {
+ if (*ti < kLimit && *ti != kNpos)
+ ++unlimited_count;
+ }
+ EXPECT_EQ(10U, unlimited_count);
+}
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
+ // Imagine we have strings as shown in the following cases where the
+ // X's represent encoded characters.
+ // 1: abcXXXdef ==> abcXdef
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t < 9; ++t)
+ offsets.push_back(t);
+ AdjustOffset::Adjustments adjustments;
+ adjustments.push_back(AdjustOffset::Adjustment(3, 3, 1));
+ std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments));
+ size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6};
+ EXPECT_EQ(offsets.size(), arraysize(expected_1));
+ for (size_t i = 0; i < arraysize(expected_1); ++i)
+ EXPECT_EQ(expected_1[i], offsets[i]);
+
+ // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
+ offsets.clear();
+ for (size_t t = 0; t < 23; ++t)
+ offsets.push_back(t);
+ adjustments.clear();
+ adjustments.push_back(AdjustOffset::Adjustment(0, 3, 1));
+ adjustments.push_back(AdjustOffset::Adjustment(4, 4, 2));
+ adjustments.push_back(AdjustOffset::Adjustment(10, 7, 4));
+ adjustments.push_back(AdjustOffset::Adjustment(20, 3, 1));
+ std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments));
+ size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6,
+ kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12,
+ 13, kNpos, kNpos};
+ EXPECT_EQ(offsets.size(), arraysize(expected_2));
+ for (size_t i = 0; i < arraysize(expected_2); ++i)
+ EXPECT_EQ(expected_2[i], offsets[i]);
+
+ // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
+ offsets.clear();
+ for (size_t t = 0; t < 17; ++t)
+ offsets.push_back(t);
+ adjustments.clear();
+ adjustments.push_back(AdjustOffset::Adjustment(0, 3, 0));
+ adjustments.push_back(AdjustOffset::Adjustment(4, 4, 4));
+ adjustments.push_back(AdjustOffset::Adjustment(11, 3, 3));
+ adjustments.push_back(AdjustOffset::Adjustment(15, 2, 0));
+ std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments));
+ size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6,
+ 7, 8, kNpos, kNpos, 11, kNpos, kNpos};
+ EXPECT_EQ(offsets.size(), arraysize(expected_3));
+ for (size_t i = 0; i < arraysize(expected_3); ++i)
+ EXPECT_EQ(expected_3[i], offsets[i]);
+}
+
} // namespace base