summary refs log tree commit diff stats
path: root/base
diff options
context:
space:
mode:
Diffstat (limited to 'base')
-rw-r--r-- base/utf_offset_string_conversions.cc 179
-rw-r--r-- base/utf_offset_string_conversions.h 66
-rw-r--r-- base/utf_offset_string_conversions_unittest.cc 94
3 files changed, 33 insertions, 306 deletions
diff --git a/base/utf_offset_string_conversions.cc b/base/utf_offset_string_conversions.cc
index f091cb4..4c47ef8 100644
--- a/base/utf_offset_string_conversions.cc
+++ b/base/utf_offset_string_conversions.cc
@@ -1,12 +1,9 @@
-// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/utf_offset_string_conversions.h"
-#include <algorithm>
-
-#include "base/scoped_ptr.h"
#include "base/string_piece.h"
#include "base/utf_string_conversion_utils.h"
@@ -24,16 +21,13 @@ template<typename SRC_CHAR>
bool ConvertUnicode(const SRC_CHAR* src,
size_t src_len,
std::wstring* output,
- std::vector<size_t>* offsets_for_adjustment) {
- if (offsets_for_adjustment) {
- std::for_each(offsets_for_adjustment->begin(),
- offsets_for_adjustment->end(),
- LimitOffset<std::wstring>(src_len));
- }
+ size_t* offset_for_adjustment) {
+ size_t output_offset =
+ (offset_for_adjustment && *offset_for_adjustment < src_len) ?
+ *offset_for_adjustment : std::wstring::npos;
// ICU requires 32-bit numbers.
bool success = true;
- AdjustOffset::Adjustments adjustments;
int32 src_len32 = static_cast<int32>(src_len);
for (int32 i = 0; i < src_len32; i++) {
uint32 code_point;
@@ -45,23 +39,21 @@ bool ConvertUnicode(const SRC_CHAR* src,
chars_written = WriteUnicodeCharacter(0xFFFD, output);
success = false;
}
- if (offsets_for_adjustment) {
+ if ((output_offset != std::wstring::npos) &&
+ (*offset_for_adjustment > original_i)) {
// NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
// character read, not after it (so that incrementing it in the loop
// increment will place it at the right location), so we need to account
// for that in determining the amount that was read.
- adjustments.push_back(AdjustOffset::Adjustment(
- original_i, i - original_i + 1, chars_written));
+ if (*offset_for_adjustment <= static_cast<size_t>(i))
+ output_offset = std::wstring::npos;
+ else
+ output_offset += chars_written - (i - original_i + 1);
}
}
- // Make offset adjustment.
- if (offsets_for_adjustment && !adjustments.empty()) {
- std::for_each(offsets_for_adjustment->begin(),
- offsets_for_adjustment->end(),
- AdjustOffset(adjustments));
- }
-
+ if (offset_for_adjustment)
+ *offset_for_adjustment = output_offset;
return success;
}
@@ -71,44 +63,16 @@ bool UTF8ToWideAndAdjustOffset(const char* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment) {
- std::vector<size_t> offsets;
- if (offset_for_adjustment)
- offsets.push_back(*offset_for_adjustment);
- PrepareForUTF16Or32Output(src, src_len, output);
- bool ret = ConvertUnicode(src, src_len, output, &offsets);
- if (offset_for_adjustment)
- *offset_for_adjustment = offsets[0];
- return ret;
-}
-
-bool UTF8ToWideAndAdjustOffsets(const char* src,
- size_t src_len,
- std::wstring* output,
- std::vector<size_t>* offsets_for_adjustment) {
PrepareForUTF16Or32Output(src, src_len, output);
- return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
+ return ConvertUnicode(src, src_len, output, offset_for_adjustment);
}
std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
size_t* offset_for_adjustment) {
- std::vector<size_t> offsets;
- if (offset_for_adjustment)
- offsets.push_back(*offset_for_adjustment);
- std::wstring result;
- UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result,
- &offsets);
- if (offset_for_adjustment)
- *offset_for_adjustment = offsets[0];
- return result;
-}
-
-std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8,
- std::vector<size_t>*
- offsets_for_adjustment) {
- std::wstring result;
- UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result,
- offsets_for_adjustment);
- return result;
+ std::wstring ret;
+ UTF8ToWideAndAdjustOffset(utf8.data(), utf8.length(), &ret,
+ offset_for_adjustment);
+ return ret;
}
// UTF-16 <-> Wide -------------------------------------------------------------
@@ -126,19 +90,6 @@ bool UTF16ToWideAndAdjustOffset(const char16* src,
return true;
}
-bool UTF16ToWideAndAdjustOffsets(const char16* src,
- size_t src_len,
- std::wstring* output,
- std::vector<size_t>* offsets_for_adjustment) {
- output->assign(src, src_len);
- if (offsets_for_adjustment) {
- std::for_each(offsets_for_adjustment->begin(),
- offsets_for_adjustment->end(),
- LimitOffset<std::wstring>(src_len));
- }
- return true;
-}
-
std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
size_t* offset_for_adjustment) {
if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length()))
@@ -146,109 +97,25 @@ std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
return utf16;
}
-std::wstring UTF16ToWideAndAdjustOffsets(
- const string16& utf16,
- std::vector<size_t>* offsets_for_adjustment) {
- if (offsets_for_adjustment) {
- std::for_each(offsets_for_adjustment->begin(),
- offsets_for_adjustment->end(),
- LimitOffset<std::wstring>(utf16.length()));
- }
- return utf16;
-}
-
#elif defined(WCHAR_T_IS_UTF32)
bool UTF16ToWideAndAdjustOffset(const char16* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment) {
- std::vector<size_t> offsets;
- if (offset_for_adjustment)
- offsets.push_back(*offset_for_adjustment);
- output->clear();
- // Assume that normally we won't have any non-BMP characters so the counts
- // will be the same.
- output->reserve(src_len);
- bool ret = ConvertUnicode(src, src_len, output, &offsets);
- if (offset_for_adjustment)
- *offset_for_adjustment = offsets[0];
- return ret;
-}
-
-bool UTF16ToWideAndAdjustOffsets(const char16* src,
- size_t src_len,
- std::wstring* output,
- std::vector<size_t>* offsets_for_adjustment) {
output->clear();
// Assume that normally we won't have any non-BMP characters so the counts
// will be the same.
output->reserve(src_len);
- return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
+ return ConvertUnicode(src, src_len, output, offset_for_adjustment);
}
std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
size_t* offset_for_adjustment) {
- std::vector<size_t> offsets;
- if (offset_for_adjustment)
- offsets.push_back(*offset_for_adjustment);
- std::wstring result;
- UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result,
- &offsets);
- if (offset_for_adjustment)
- *offset_for_adjustment = offsets[0];
- return result;
-}
-
-std::wstring UTF16ToWideAndAdjustOffsets(
- const string16& utf16,
- std::vector<size_t>* offsets_for_adjustment) {
- std::wstring result;
- UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result,
- offsets_for_adjustment);
- return result;
+ std::wstring ret;
+ UTF16ToWideAndAdjustOffset(utf16.data(), utf16.length(), &ret,
+ offset_for_adjustment);
+ return ret;
}
#endif // defined(WCHAR_T_IS_UTF32)
-
-template <typename T>
-LimitOffset<T>::LimitOffset(size_t limit)
- : limit_(limit) {}
-
-template <typename T>
-void LimitOffset<T>::operator()(size_t& offset) {
- if (offset >= limit_)
- offset = T::npos;
-}
-
-AdjustOffset::Adjustment::Adjustment(size_t location,
- size_t old_length,
- size_t new_length)
- : location(location),
- old_length(old_length),
- new_length(new_length) {}
-
-AdjustOffset::AdjustOffset(const Adjustments& adjustments)
- : adjustments_(adjustments) {}
-
-void AdjustOffset::operator()(size_t& offset) {
- if (offset == std::wstring::npos)
- return;
- size_t adjustment = 0;
- for (Adjustments::const_iterator i = adjustments_.begin();
- i != adjustments_.end(); ++i) {
- size_t location = i->location;
- if (offset == location && i->new_length == 0) {
- offset = std::wstring::npos;
- return;
- }
- if (offset <= location)
- break;
- if (offset < (location + i->old_length)) {
- offset = std::wstring::npos;
- return;
- }
- adjustment += (i->old_length - i->new_length);
- }
- offset -= adjustment;
-}
diff --git a/base/utf_offset_string_conversions.h b/base/utf_offset_string_conversions.h
index 19b312a..13df1b4 100644
--- a/base/utf_offset_string_conversions.h
+++ b/base/utf_offset_string_conversions.h
@@ -7,7 +7,6 @@
#pragma once
#include <string>
-#include <vector>
#include "base/base_api.h"
#include "base/string16.h"
@@ -16,78 +15,23 @@ namespace base {
class StringPiece;
}
-// Like the conversions in utf_string_conversions.h, but also takes one or more
-// offsets (|offset[s]_for_adjustment|) into the source strings, each offset
-// will be adjusted to point at the same logical place in the result strings.
-// If this isn't possible because an offset points past the end of the source
-// strings or into the middle of a multibyte sequence, the offending offset will
-// be set to std::wstring::npos. |offset[s]_for_adjustment| may be NULL.
+// Like the conversions in utf_string_conversions.h, but also take offsets into
+// the source strings, which will be adjusted to point at the same logical place
+// in the result strings. If this isn't possible because the offsets point past
+// the end of the source strings or into the middle of multibyte sequences, they
+// will be set to std::wstring::npos. |offset_for_adjustment| may be NULL.
BASE_API bool UTF8ToWideAndAdjustOffset(const char* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment);
-BASE_API bool UTF8ToWideAndAdjustOffsets(
- const char* src,
- size_t src_len,
- std::wstring* output,
- std::vector<size_t>* offsets_for_adjustment);
-
BASE_API std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
size_t* offset_for_adjustment);
-BASE_API std::wstring UTF8ToWideAndAdjustOffsets(
- const base::StringPiece& utf8,
- std::vector<size_t>* offsets_for_adjustment);
BASE_API bool UTF16ToWideAndAdjustOffset(const char16* src,
size_t src_len,
std::wstring* output,
size_t* offset_for_adjustment);
-BASE_API bool UTF16ToWideAndAdjustOffsets(
- const char16* src,
- size_t src_len,
- std::wstring* output,
- std::vector<size_t>* offsets_for_adjustment);
-
BASE_API std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
size_t* offset_for_adjustment);
-BASE_API std::wstring UTF16ToWideAndAdjustOffsets(
- const string16& utf16,
- std::vector<size_t>* offsets_for_adjustment);
-
-// Limiting function callable by std::for_each which will replace any value
-// which is equal to or greater than |limit| with npos.
-template <typename T>
-struct LimitOffset {
- explicit LimitOffset(size_t limit);
- void operator()(size_t& offset);
-
- size_t limit_;
-};
-
-// Adjustment function called by std::transform which will adjust any offset
-// that occurs after one or more modified substrings. To use, create any
-// number of AdjustOffset::Adjustments, drop them into a vector, then call
-// std::transform with the transform function being something similar to
-// AdjustOffset(adjustments). Each Adjustment gives the original |location|
-// of the encoded section and the |old_length| and |new_length| of the section
-// before and after decoding.
-struct AdjustOffset {
- // Helper structure which indicates where an encoded character occurred
- // and how long that encoding was.
- struct Adjustment {
- Adjustment(size_t location, size_t old_length, size_t new_length);
-
- size_t location;
- size_t old_length;
- size_t new_length;
- };
-
- typedef std::vector<Adjustment> Adjustments;
-
- explicit AdjustOffset(const Adjustments& adjustments);
- void operator()(size_t& offset);
-
- const Adjustments& adjustments_;
-};
#endif // BASE_UTF_OFFSET_STRING_CONVERSIONS_H_
diff --git a/base/utf_offset_string_conversions_unittest.cc b/base/utf_offset_string_conversions_unittest.cc
index b731b9e..4f13ab3 100644
--- a/base/utf_offset_string_conversions_unittest.cc
+++ b/base/utf_offset_string_conversions_unittest.cc
@@ -1,9 +1,7 @@
-// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include <algorithm>
-
#include "base/logging.h"
#include "base/string_piece.h"
#include "base/utf_offset_string_conversions.h"
@@ -13,8 +11,6 @@ namespace base {
namespace {
-static const size_t kNpos = std::wstring::npos;
-
// Given a null-terminated string of wchar_t with each wchar_t representing
// a UTF-16 code unit, returns a string16 made up of wchar_t's in the input.
// Each wchar_t should be <= 0xFFFF and a non-BMP character (> U+FFFF)
@@ -44,12 +40,12 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
size_t input_offset;
size_t output_offset;
} utf8_to_wide_cases[] = {
- {"", 0, kNpos},
- {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
+ {"", 0, std::wstring::npos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, std::wstring::npos},
{"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
{"\xed\xb0\x80z", 3, 1},
{"A\xF0\x90\x8C\x80z", 1, 1},
- {"A\xF0\x90\x8C\x80z", 2, kNpos},
+ {"A\xF0\x90\x8C\x80z", 2, std::wstring::npos},
#if defined(WCHAR_T_IS_UTF16)
{"A\xF0\x90\x8C\x80z", 5, 3},
#elif defined(WCHAR_T_IS_UTF32)
@@ -69,7 +65,7 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
size_t output_offset;
} utf16_to_wide_cases[] = {
{L"\xD840\xDC00\x4E00", 0, 0},
- {L"\xD840\xDC00\x4E00", 1, kNpos},
+ {L"\xD840\xDC00\x4E00", 1, std::wstring::npos},
{L"\xD840\xDC00\x4E00", 2, 1},
};
for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_wide_cases); ++i) {
@@ -81,84 +77,4 @@ TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
#endif
}
-TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
- const size_t kLimit = 10;
- const size_t kItems = 20;
- std::vector<size_t> size_ts;
- for (size_t t = 0; t < kItems; ++t)
- size_ts.push_back(t);
- std::for_each(size_ts.begin(), size_ts.end(),
- LimitOffset<std::wstring>(kLimit));
- size_t unlimited_count = 0;
- for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
- ++ti) {
- if (*ti < kLimit && *ti != kNpos)
- ++unlimited_count;
- }
- EXPECT_EQ(10U, unlimited_count);
-
- // Reverse the values in the vector and try again.
- size_ts.clear();
- for (size_t t = kItems; t > 0; --t)
- size_ts.push_back(t - 1);
- std::for_each(size_ts.begin(), size_ts.end(),
- LimitOffset<std::wstring>(kLimit));
- unlimited_count = 0;
- for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
- ++ti) {
- if (*ti < kLimit && *ti != kNpos)
- ++unlimited_count;
- }
- EXPECT_EQ(10U, unlimited_count);
-}
-
-TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
- // Imagine we have strings as shown in the following cases where the
- // X's represent encoded characters.
- // 1: abcXXXdef ==> abcXdef
- std::vector<size_t> offsets;
- for (size_t t = 0; t < 9; ++t)
- offsets.push_back(t);
- AdjustOffset::Adjustments adjustments;
- adjustments.push_back(AdjustOffset::Adjustment(3, 3, 1));
- std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments));
- size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6};
- EXPECT_EQ(offsets.size(), arraysize(expected_1));
- for (size_t i = 0; i < arraysize(expected_1); ++i)
- EXPECT_EQ(expected_1[i], offsets[i]);
-
- // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
- offsets.clear();
- for (size_t t = 0; t < 23; ++t)
- offsets.push_back(t);
- adjustments.clear();
- adjustments.push_back(AdjustOffset::Adjustment(0, 3, 1));
- adjustments.push_back(AdjustOffset::Adjustment(4, 4, 2));
- adjustments.push_back(AdjustOffset::Adjustment(10, 7, 4));
- adjustments.push_back(AdjustOffset::Adjustment(20, 3, 1));
- std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments));
- size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6,
- kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12,
- 13, kNpos, kNpos};
- EXPECT_EQ(offsets.size(), arraysize(expected_2));
- for (size_t i = 0; i < arraysize(expected_2); ++i)
- EXPECT_EQ(expected_2[i], offsets[i]);
-
- // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
- offsets.clear();
- for (size_t t = 0; t < 17; ++t)
- offsets.push_back(t);
- adjustments.clear();
- adjustments.push_back(AdjustOffset::Adjustment(0, 3, 0));
- adjustments.push_back(AdjustOffset::Adjustment(4, 4, 4));
- adjustments.push_back(AdjustOffset::Adjustment(11, 3, 3));
- adjustments.push_back(AdjustOffset::Adjustment(15, 2, 0));
- std::for_each(offsets.begin(), offsets.end(), AdjustOffset(adjustments));
- size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6,
- 7, 8, kNpos, kNpos, 11, kNpos, kNpos};
- EXPECT_EQ(offsets.size(), arraysize(expected_3));
- for (size_t i = 0; i < arraysize(expected_3); ++i)
- EXPECT_EQ(expected_3[i], offsets[i]);
-}
-
} // namespace base