diff options
Diffstat (limited to 'base')
-rw-r--r-- | base/utf_offset_string_conversions.cc | 179 | ||||
-rw-r--r-- | base/utf_offset_string_conversions.h | 66 | ||||
-rw-r--r-- | base/utf_offset_string_conversions_unittest.cc | 94 |
3 files changed, 33 insertions, 306 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc index f091cb4..4c47ef8 100644 --- a/base/utf_offset_string_conversions.cc +++ b/base/utf_offset_string_conversions.cc @@ -1,12 +1,9 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/utf_offset_string_conversions.h" -#include <algorithm> - -#include "base/scoped_ptr.h" #include "base/string_piece.h" #include "base/utf_string_conversion_utils.h" @@ -24,16 +21,13 @@ template<typename SRC_CHAR> bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(src_len)); - } + size_t* offset_for_adjustment) { + size_t output_offset = + (offset_for_adjustment && *offset_for_adjustment < src_len) ? + *offset_for_adjustment : std::wstring::npos; // ICU requires 32-bit numbers. bool success = true; - AdjustOffset::Adjustments adjustments; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; @@ -45,23 +39,21 @@ bool ConvertUnicode(const SRC_CHAR* src, chars_written = WriteUnicodeCharacter(0xFFFD, output); success = false; } - if (offsets_for_adjustment) { + if ((output_offset != std::wstring::npos) && + (*offset_for_adjustment > original_i)) { // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last // character read, not after it (so that incrementing it in the loop // increment will place it at the right location), so we need to account // for that in determining the amount that was read. - adjustments.push_back(AdjustOffset::Adjustment( - original_i, i - original_i + 1, chars_written)); + if (*offset_for_adjustment <= static_cast<size_t>(i)) + output_offset = std::wstring::npos; + else + output_offset += chars_written - (i - original_i + 1); } } - // Make offset adjustment. - if (offsets_for_adjustment && !adjustments.empty()) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - AdjustOffset(adjustments)); - } - + if (offset_for_adjustment) + *offset_for_adjustment = output_offset; return success; } @@ -71,44 +63,16 @@ bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - PrepareForUTF16Or32Output(src, src_len, output); - bool ret = ConvertUnicode(src, src_len, output, &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return ret; -} - -bool UTF8ToWideAndAdjustOffsets(const char* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { PrepareForUTF16Or32Output(src, src_len, output); - return ConvertUnicode(src, src_len, output, offsets_for_adjustment); + return ConvertUnicode(src, src_len, output, offset_for_adjustment); } std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - std::wstring result; - UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, - &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; -} - -std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, - std::vector<size_t>* - offsets_for_adjustment) { - std::wstring result; - UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, - offsets_for_adjustment); - return result; + std::wstring ret; + UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret, + offset_for_adjustment); + return ret; } // UTF-16 <-> Wide ------------------------------------------------------------- @@ -126,19 +90,6 @@ bool UTF16ToWideAndAdjustOffset(const char16* src, return true; } -bool UTF16ToWideAndAdjustOffsets(const char16* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { - output->assign(src, src_len); - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(src_len)); - } - return true; -} - std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) @@ -146,109 +97,25 @@ std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, return utf16; } -std::wstring UTF16ToWideAndAdjustOffsets( - const string16& utf16, - std::vector<size_t>* offsets_for_adjustment) { - if (offsets_for_adjustment) { - std::for_each(offsets_for_adjustment->begin(), - offsets_for_adjustment->end(), - LimitOffset<std::wstring>(utf16.length())); - } - return utf16; -} - #elif defined(WCHAR_T_IS_UTF32) bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - output->clear(); - // Assume that normally we won't have any non-BMP characters so the counts - // will be the same. - output->reserve(src_len); - bool ret = ConvertUnicode(src, src_len, output, &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return ret; -} - -bool UTF16ToWideAndAdjustOffsets(const char16* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment) { output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); - return ConvertUnicode(src, src_len, output, offsets_for_adjustment); + return ConvertUnicode(src, src_len, output, offset_for_adjustment); } std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { - std::vector<size_t> offsets; - if (offset_for_adjustment) - offsets.push_back(*offset_for_adjustment); - std::wstring result; - UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, - &offsets); - if (offset_for_adjustment) - *offset_for_adjustment = offsets[0]; - return result; -} - -std::wstring UTF16ToWideAndAdjustOffsets( - const string16& utf16, - std::vector<size_t>* offsets_for_adjustment) { - std::wstring result; - UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, - offsets_for_adjustment); - return result; + std::wstring ret; + UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret, + offset_for_adjustment); + return ret; } #endif // defined(WCHAR_T_IS_UTF32) - -template <typename T> -LimitOffset<T>::LimitOffset(size_t limit) - : limit_(limit) {} - -template <typename T> -void LimitOffset<T>::operator()(size_t& offset) { - if (offset >= limit_) - offset = T::npos; -} - -AdjustOffset::Adjustment::Adjustment(size_t location, - size_t old_length, - size_t new_length) - : location(location), - old_length(old_length), - new_length(new_length) {} - -AdjustOffset::AdjustOffset(const Adjustments& adjustments) - : adjustments_(adjustments) {} - -void AdjustOffset::operator()(size_t& offset) { - if (offset == std::wstring::npos) - return; - size_t adjustment = 0; - for (Adjustments::const_iterator i = adjustments_.begin(); - i != adjustments_.end(); ++i) { - size_t location = i->location; - if (offset == location && i->new_length == 0) { - offset = std::wstring::npos; - return; - } - if (offset <= location) - break; - if (offset < (location + i->old_length)) { - offset = std::wstring::npos; - return; - } - adjustment += (i->old_length - i->new_length); - } - offset -= adjustment; -} diff --git a/base/utf_offset_string_conversions.h b/base/utf_offset_string_conversions.h index 19b312a..13df1b4 100644 --- a/base/utf_offset_string_conversions.h +++ b/base/utf_offset_string_conversions.h @@ -7,7 +7,6 @@ #pragma once #include <string> -#include <vector> #include "base/base_api.h" #include "base/string16.h" @@ -16,78 +15,23 @@ namespace base { class StringPiece; } -// Like the conversions in utf_string_conversions.h, but also takes one or more -// offsets (|offset[s]_for_adjustment|) into the source strings, each offset -// will be adjusted to point at the same logical place in the result strings. -// If this isn't possible because an offset points past the end of the source -// strings or into the middle of a multibyte sequence, the offending offset will -// be set to std::wstring::npos. |offset[s]_for_adjustment| may be NULL. +// Like the conversions in utf_string_conversions.h, but also take offsets into +// the source strings, which will be adjusted to point at the same logical place +// in the result strings. If this isn't possible because the offsets point past +// the end of the source strings or into the middle of multibyte sequences, they +// will be set to std::wstring::npos. |offset_for_adjustment| may be NULL. BASE_API bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment); -BASE_API bool UTF8ToWideAndAdjustOffsets( - const char* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment); - BASE_API std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment); -BASE_API std::wstring UTF8ToWideAndAdjustOffsets( - const base::StringPiece& utf8, - std::vector<size_t>* offsets_for_adjustment); BASE_API bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment); -BASE_API bool UTF16ToWideAndAdjustOffsets( - const char16* src, - size_t src_len, - std::wstring* output, - std::vector<size_t>* offsets_for_adjustment); - BASE_API std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment); -BASE_API std::wstring UTF16ToWideAndAdjustOffsets( - const string16& utf16, - std::vector<size_t>* offsets_for_adjustment); - -// Limiting function callable by std::for_each which will replace any value -// which is equal to or greater than |limit| with npos. -template <typename T> -struct LimitOffset { - explicit LimitOffset(size_t limit); - void operator()(size_t& offset); - - size_t limit_; -}; - -// Adjustment function called by std::transform which will adjust any offset -// that occurs after one or more modified substrings. To use, create any -// number of AdjustOffset::Adjustments, drop them into a vector, then call -// std::transform with the transform function being something similar to -// AdjustOffset(adjustments). Each Adjustment gives the original |location| -// of the encoded section and the |old_length| and |new_length| of the section -// before and after decoding. -struct AdjustOffset { - // Helper structure which indicates where an encoded character occurred - // and how long that encoding was. - struct Adjustment { - Adjustment(size_t location, size_t old_length, size_t new_length); - - size_t location; - size_t old_length; - size_t new_length; - }; - - typedef std::vector<Adjustment> Adjustments; - - explicit AdjustOffset(const Adjustments& adjustments); - void operator()(size_t& offset); - - const Adjustments& adjustments_; -}; #endif // BASE_UTF_OFFSET_STRING_CONVERSIONS_H_ diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc index b731b9e..4f13ab3 100644 --- a/base/utf_offset_string_conversions_unittest.cc +++ b/base/utf_offset_string_conversions_unittest.cc @@ -1,9 +1,7 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include <algorithm> - #include "base/logging.h" #include "base/string_piece.h" #include "base/utf_offset_string_conversions.h" @@ -13,8 +11,6 @@ namespace base { namespace { -static const size_t kNpos = std::wstring::npos; - // Given a null-terminated string of wchar_t with each wchar_t representing // a UTF-16 code unit, returns a string16 made up of wchar_t's in the input. // Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF) @@ -44,12 +40,12 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { size_t input_offset; size_t output_offset; } utf8_to_wide_cases[] = { - {"", 0, kNpos}, - {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos}, + {"", 0, std::wstring::npos}, + {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos}, {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1}, {"\xed\xb0\x80z", 3, 1}, {"A\xF0\x90\x8C\x80z", 1, 1}, - {"A\xF0\x90\x8C\x80z", 2, kNpos}, + {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos}, #if defined(WCHAR_T_IS_UTF16) {"A\xF0\x90\x8C\x80z", 5, 3}, #elif defined(WCHAR_T_IS_UTF32) @@ -69,7 +65,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { size_t output_offset; } utf16_to_wide_cases[] = { {L"\xD840\xDC00\x4E00", 0, 0}, - {L"\xD840\xDC00\x4E00", 1, kNpos}, + {L"\xD840\xDC00\x4E00", 1, std::wstring::npos}, {L"\xD840\xDC00\x4E00", 2, 1}, }; for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) { @@ -81,84 +77,4 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) { #endif } -TEST(UTFOffsetStringConversionsTest, LimitOffsets) { - const size_t kLimit = 10; - const size_t kItems = 20; - std::vector<size_t> size_ts; - for (size_t t = 0; t < kItems; ++t) - size_ts.push_back(t); - std::for_each(size_ts.begin(), size_ts.end(), - LimitOffset<std::wstring>(kLimit)); - size_t unlimited_count = 0; - for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end(); - ++ti) { - if (*ti < kLimit && *ti != kNpos) - ++unlimited_count; - } - EXPECT_EQ(10U, unlimited_count); - - // Reverse the values in the vector and try again. - size_ts.clear(); - for (size_t t = kItems; t > 0; --t) - size_ts.push_back(t - 1); - std::for_each(size_ts.begin(), size_ts.end(), - LimitOffset<std::wstring>(kLimit)); - unlimited_count = 0; - for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end(); - ++ti) { - if (*ti < kLimit && *ti != kNpos) - ++unlimited_count; - } - EXPECT_EQ(10U, unlimited_count); -} - -TEST(UTFOffsetStringConversionsTest, AdjustOffsets) { - // Imagine we have strings as shown in the following cases where the - // X's represent encoded characters. - // 1: abcXXXdef ==> abcXdef - std::vector<size_t> offsets; - for (size_t t = 0; t < 9; ++t) - offsets.push_back(t); - AdjustOffset::Adjustments adjustments; - adjustments.push_back(AdjustOffset::Adjustment(3, 3, 1)); - std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); - size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6}; - EXPECT_EQ(offsets.size(), arraysize(expected_1)); - for (size_t i = 0; i < arraysize(expected_1); ++i) - EXPECT_EQ(expected_1[i], offsets[i]); - - // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX - offsets.clear(); - for (size_t t = 0; t < 23; ++t) - offsets.push_back(t); - adjustments.clear(); - adjustments.push_back(AdjustOffset::Adjustment(0, 3, 1)); - adjustments.push_back(AdjustOffset::Adjustment(4, 4, 2)); - adjustments.push_back(AdjustOffset::Adjustment(10, 7, 4)); - adjustments.push_back(AdjustOffset::Adjustment(20, 3, 1)); - std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); - size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, - kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12, - 13, kNpos, kNpos}; - EXPECT_EQ(offsets.size(), arraysize(expected_2)); - for (size_t i = 0; i < arraysize(expected_2); ++i) - EXPECT_EQ(expected_2[i], offsets[i]); - - // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe - offsets.clear(); - for (size_t t = 0; t < 17; ++t) - offsets.push_back(t); - adjustments.clear(); - adjustments.push_back(AdjustOffset::Adjustment(0, 3, 0)); - adjustments.push_back(AdjustOffset::Adjustment(4, 4, 4)); - adjustments.push_back(AdjustOffset::Adjustment(11, 3, 3)); - adjustments.push_back(AdjustOffset::Adjustment(15, 2, 0)); - std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments)); - size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, - 7, 8, kNpos, kNpos, 11, kNpos, kNpos}; - EXPECT_EQ(offsets.size(), arraysize(expected_3)); - for (size_t i = 0; i < arraysize(expected_3); ++i) - EXPECT_EQ(expected_3[i], offsets[i]); -} - } // namaspace base |