summaryrefslogtreecommitdiffstats
path: root/base/strings
diff options
context:
space:
mode:
authorbrettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-02-07 03:59:06 +0000
committerbrettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-02-07 03:59:06 +0000
commita3f72189102fc85a7f9cfe60c61e124a2677f0ea (patch)
treefa5699e92f87a628740fd9b88960b6449541055d /base/strings
parent3d350326fd4def19e343890c55c5388e370c7c2f (diff)
downloadchromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.zip
chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.tar.gz
chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.tar.bz2
Move utf_offset_string_conversions and utf_string_conversion_utils to strings.
Review URL: https://codereview.chromium.org/12087115 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@181183 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/strings')
-rw-r--r--base/strings/utf_offset_string_conversions.cc166
-rw-r--r--base/strings/utf_offset_string_conversions.h93
-rw-r--r--base/strings/utf_offset_string_conversions_unittest.cc154
-rw-r--r--base/strings/utf_string_conversion_utils.cc148
-rw-r--r--base/strings/utf_string_conversion_utils.h97
5 files changed, 658 insertions, 0 deletions
diff --git a/base/strings/utf_offset_string_conversions.cc b/base/strings/utf_offset_string_conversions.cc
new file mode 100644
index 0000000..5a6f0c0
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions.cc
@@ -0,0 +1,166 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_offset_string_conversions.h"
+
+#include <algorithm>
+
+#include "base/memory/scoped_ptr.h"
+#include "base/string_piece.h"
+#include "base/strings/utf_string_conversion_utils.h"
+
+namespace base {
+
+// Converts the given source Unicode character type to the given destination
+// Unicode character type as a STL string. The given input buffer and size
+// determine the source, and the result is appended to the given output STL
+// string. Offsets in |offsets_for_adjustment| (if any) are adjusted to match.
+template<typename SrcChar, typename DestStdString>
+bool ConvertUnicode(const SrcChar* src,
+ size_t src_len,
+ DestStdString* output,
+ std::vector<size_t>* offsets_for_adjustment) {
+ if (offsets_for_adjustment) {
+ std::for_each(offsets_for_adjustment->begin(),
+ offsets_for_adjustment->end(),
+ LimitOffset<DestStdString>(src_len));
+ }
+
+ // ICU requires 32-bit numbers.
+ bool success = true;
+ OffsetAdjuster offset_adjuster(offsets_for_adjustment);
+ int32 src_len32 = static_cast<int32>(src_len);
+ for (int32 i = 0; i < src_len32; i++) {
+ uint32 code_point;
+ size_t original_i = i;
+ size_t chars_written = 0;
+ if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
+ chars_written = WriteUnicodeCharacter(code_point, output);
+ } else {
+ chars_written = WriteUnicodeCharacter(0xFFFD, output);
+ success = false;
+ }
+ if (offsets_for_adjustment) {
+ // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
+ // character read, not after it (so that incrementing it in the loop
+ // increment will place it at the right location), so we need to account
+ // for that in determining the amount that was read.
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i,
+ i - original_i + 1, chars_written));
+ }
+ }
+ return success;
+}
+
+bool UTF8ToUTF16AndAdjustOffset(const char* src,
+ size_t src_len,
+ string16* output,
+ size_t* offset_for_adjustment) {
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ PrepareForUTF16Or32Output(src, src_len, output);
+ bool ret = ConvertUnicode(src, src_len, output, &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return ret;
+}
+
+bool UTF8ToUTF16AndAdjustOffsets(const char* src,
+ size_t src_len,
+ string16* output,
+ std::vector<size_t>* offsets_for_adjustment) {
+ PrepareForUTF16Or32Output(src, src_len, output);
+ return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
+}
+
+string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8,
+ size_t* offset_for_adjustment) {
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ string16 result;
+ UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result,
+ &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return result;
+}
+
+string16 UTF8ToUTF16AndAdjustOffsets(
+ const base::StringPiece& utf8,
+ std::vector<size_t>* offsets_for_adjustment) {
+ string16 result;
+ UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result,
+ offsets_for_adjustment);
+ return result;
+}
+
+std::string UTF16ToUTF8AndAdjustOffset(
+ const base::StringPiece16& utf16,
+ size_t* offset_for_adjustment) {
+ std::vector<size_t> offsets;
+ if (offset_for_adjustment)
+ offsets.push_back(*offset_for_adjustment);
+ std::string result = UTF16ToUTF8AndAdjustOffsets(utf16, &offsets);
+ if (offset_for_adjustment)
+ *offset_for_adjustment = offsets[0];
+ return result;
+}
+
+std::string UTF16ToUTF8AndAdjustOffsets(
+ const base::StringPiece16& utf16,
+ std::vector<size_t>* offsets_for_adjustment) {
+ std::string result;
+ PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
+ ConvertUnicode(utf16.data(), utf16.length(), &result, offsets_for_adjustment);
+ return result;
+}
+
+OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
+ size_t original_length,
+ size_t output_length)
+ : original_offset(original_offset),
+ original_length(original_length),
+ output_length(output_length) {
+}
+
+OffsetAdjuster::OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment)
+ : offsets_for_adjustment_(offsets_for_adjustment) {
+}
+
+OffsetAdjuster::~OffsetAdjuster() {
+ if (!offsets_for_adjustment_ || adjustments_.empty())
+ return;
+ for (std::vector<size_t>::iterator i(offsets_for_adjustment_->begin());
+ i != offsets_for_adjustment_->end(); ++i)
+ AdjustOffset(i);
+}
+
+void OffsetAdjuster::Add(const Adjustment& adjustment) {
+ adjustments_.push_back(adjustment);
+}
+
+void OffsetAdjuster::AdjustOffset(std::vector<size_t>::iterator offset) {
+ if (*offset == string16::npos)
+ return;
+ size_t adjustment = 0;
+ for (std::vector<Adjustment>::const_iterator i = adjustments_.begin();
+ i != adjustments_.end(); ++i) {
+ if (*offset == i->original_offset && i->output_length == 0) {
+ *offset = string16::npos;
+ return;
+ }
+ if (*offset <= i->original_offset)
+ break;
+ if (*offset < (i->original_offset + i->original_length)) {
+ *offset = string16::npos;
+ return;
+ }
+ adjustment += (i->original_length - i->output_length);
+ }
+ *offset -= adjustment;
+}
+
+} // namespace base
diff --git a/base/strings/utf_offset_string_conversions.h b/base/strings/utf_offset_string_conversions.h
new file mode 100644
index 0000000..98f29b9
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
+#define BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
+
+#include <string>
+#include <vector>
+
+#include "base/base_export.h"
+#include "base/string16.h"
+#include "base/string_piece.h"
+
+namespace base {
+
+// Like the conversions in utf_string_conversions.h, but also takes one or more
+// offsets (|offset[s]_for_adjustment|) into the source strings. Each offset
+// will be adjusted to point at the same logical place in the result strings.
+// If this isn't possible because an offset points past the end of the source
+// strings or into the middle of a multibyte sequence, the offending offset will
+// be set to string16::npos. |offset[s]_for_adjustment| may be NULL.
+BASE_EXPORT bool UTF8ToUTF16AndAdjustOffset(const char* src,
+ size_t src_len,
+ string16* output,
+ size_t* offset_for_adjustment);
+BASE_EXPORT bool UTF8ToUTF16AndAdjustOffsets(
+ const char* src,
+ size_t src_len,
+ string16* output,
+ std::vector<size_t>* offsets_for_adjustment);
+
+BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8,
+ size_t* offset_for_adjustment);
+BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffsets(
+ const base::StringPiece& utf8,
+ std::vector<size_t>* offsets_for_adjustment);
+
+BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffset(
+ const base::StringPiece16& utf16,
+ size_t* offset_for_adjustment);
+BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffsets(
+ const base::StringPiece16& utf16,
+ std::vector<size_t>* offsets_for_adjustment);
+
+// Limiting function callable by std::for_each which will replace any value
+// which is equal to or greater than |limit| with npos.
+template <typename T>
+struct LimitOffset {
+ explicit LimitOffset(size_t limit)
+ : limit_(limit) {}
+
+ void operator()(size_t& offset) {
+ if (offset >= limit_)
+ offset = T::npos;
+ }
+
+ size_t limit_;
+};
+
+// Stack object which, on destruction, will update a vector of offsets based on
+// any supplied adjustments. To use, declare one of these, providing the
+// address of the offset vector to adjust. Then Add() any number of Adjustments
+// (each Adjustment gives the |original_offset| of a substring and the lengths
+// of the substring before and after transforming). When the OffsetAdjuster
+// goes out of scope, all the offsets in the provided vector will be updated.
+class BASE_EXPORT OffsetAdjuster {
+ public:
+ struct BASE_EXPORT Adjustment {
+ Adjustment(size_t original_offset,
+ size_t original_length,
+ size_t output_length);
+
+ size_t original_offset;
+ size_t original_length;
+ size_t output_length;
+ };
+
+ explicit OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment);
+ ~OffsetAdjuster();
+
+ void Add(const Adjustment& adjustment);
+
+ private:
+ void AdjustOffset(std::vector<size_t>::iterator offset);
+
+ std::vector<size_t>* offsets_for_adjustment_;
+ std::vector<Adjustment> adjustments_;
+};
+
+} // namespace base
+
+#endif // BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
diff --git a/base/strings/utf_offset_string_conversions_unittest.cc b/base/strings/utf_offset_string_conversions_unittest.cc
new file mode 100644
index 0000000..885357b
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions_unittest.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <algorithm>
+
+#include "base/logging.h"
+#include "base/string_piece.h"
+#include "base/strings/utf_offset_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace base {
+
+namespace {
+
+static const size_t kNpos = string16::npos;
+
+} // namespace
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
+ struct UTF8ToUTF16Case {
+ const char* utf8;
+ size_t input_offset;
+ size_t output_offset;
+ } utf8_to_utf16_cases[] = {
+ {"", 0, kNpos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
+ {"\xed\xb0\x80z", 3, 1},
+ {"A\xF0\x90\x8C\x80z", 1, 1},
+ {"A\xF0\x90\x8C\x80z", 2, kNpos},
+ {"A\xF0\x90\x8C\x80z", 5, 3},
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_utf16_cases); ++i) {
+ size_t offset = utf8_to_utf16_cases[i].input_offset;
+ UTF8ToUTF16AndAdjustOffset(utf8_to_utf16_cases[i].utf8, &offset);
+ EXPECT_EQ(utf8_to_utf16_cases[i].output_offset, offset);
+ }
+
+ struct UTF16ToUTF8Case {
+ char16 utf16[10];
+ size_t input_offset;
+ size_t output_offset;
+ } utf16_to_utf8_cases[] = {
+ {{}, 0, kNpos},
+ // Converted to 3-byte utf-8 sequences
+ {{0x5909, 0x63DB}, 2, kNpos},
+ {{0x5909, 0x63DB}, 1, 3},
+ // Converted to 2-byte utf-8 sequences
+ {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
+ {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
+ {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
+ // Surrogate pair
+ {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
+ {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
+ {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
+ };
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_utf8_cases); ++i) {
+ size_t offset = utf16_to_utf8_cases[i].input_offset;
+ UTF16ToUTF8AndAdjustOffset(utf16_to_utf8_cases[i].utf16, &offset);
+ EXPECT_EQ(utf16_to_utf8_cases[i].output_offset, offset);
+ }
+}
+
+TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
+ const size_t kLimit = 10;
+ const size_t kItems = 20;
+ std::vector<size_t> size_ts;
+ for (size_t t = 0; t < kItems; ++t)
+ size_ts.push_back(t);
+ std::for_each(size_ts.begin(), size_ts.end(),
+ LimitOffset<string16>(kLimit));
+ size_t unlimited_count = 0;
+ for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
+ ++ti) {
+ if (*ti < kLimit && *ti != kNpos)
+ ++unlimited_count;
+ }
+ EXPECT_EQ(10U, unlimited_count);
+
+ // Reverse the values in the vector and try again.
+ size_ts.clear();
+ for (size_t t = kItems; t > 0; --t)
+ size_ts.push_back(t - 1);
+ std::for_each(size_ts.begin(), size_ts.end(),
+ LimitOffset<string16>(kLimit));
+ unlimited_count = 0;
+ for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
+ ++ti) {
+ if (*ti < kLimit && *ti != kNpos)
+ ++unlimited_count;
+ }
+ EXPECT_EQ(10U, unlimited_count);
+}
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
+ // Imagine we have strings as shown in the following cases where the
+ // X's represent encoded characters.
+ // 1: abcXXXdef ==> abcXdef
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t < 9; ++t)
+ offsets.push_back(t);
+ {
+ OffsetAdjuster offset_adjuster(&offsets);
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(3, 3, 1));
+ }
+ size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6};
+ EXPECT_EQ(offsets.size(), arraysize(expected_1));
+ for (size_t i = 0; i < arraysize(expected_1); ++i)
+ EXPECT_EQ(expected_1[i], offsets[i]);
+ }
+
+ // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t < 23; ++t)
+ offsets.push_back(t);
+ {
+ OffsetAdjuster offset_adjuster(&offsets);
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(0, 3, 1));
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(4, 4, 2));
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(10, 7, 4));
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(20, 3, 1));
+ }
+ size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6,
+ kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12,
+ 13, kNpos, kNpos};
+ EXPECT_EQ(offsets.size(), arraysize(expected_2));
+ for (size_t i = 0; i < arraysize(expected_2); ++i)
+ EXPECT_EQ(expected_2[i], offsets[i]);
+ }
+
+ // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t < 17; ++t)
+ offsets.push_back(t);
+ {
+ OffsetAdjuster offset_adjuster(&offsets);
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(0, 3, 0));
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(4, 4, 4));
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(11, 3, 3));
+ offset_adjuster.Add(OffsetAdjuster::Adjustment(15, 2, 0));
+ }
+ size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6,
+ 7, 8, kNpos, kNpos, 11, kNpos, kNpos};
+ EXPECT_EQ(offsets.size(), arraysize(expected_3));
+ for (size_t i = 0; i < arraysize(expected_3); ++i)
+ EXPECT_EQ(expected_3[i], offsets[i]);
+ }
+}
+
+} // namespace base
diff --git a/base/strings/utf_string_conversion_utils.cc b/base/strings/utf_string_conversion_utils.cc
new file mode 100644
index 0000000..09a003d
--- /dev/null
+++ b/base/strings/utf_string_conversion_utils.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_string_conversion_utils.h"
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace base {
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+bool ReadUnicodeCharacter(const char* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point_out) {
+ // CBU8_NEXT expects to be able to use -1 to signal an error, so we must
+ // use a signed type for code_point. But this function returns false
+ // on error anyway, so code_point_out is unsigned.
+ int32 code_point;
+ CBU8_NEXT(src, *char_index, src_len, code_point);
+ *code_point_out = static_cast<uint32>(code_point);
+
+ // The ICU macro above moves to the next char; we want to point to the last
+ // char consumed.
+ (*char_index)--;
+
+ // Validate the decoded value.
+ return IsValidCodepoint(code_point);
+}
+
+bool ReadUnicodeCharacter(const char16* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point) {
+ if (CBU16_IS_SURROGATE(src[*char_index])) {
+ if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
+ *char_index + 1 >= src_len ||
+ !CBU16_IS_TRAIL(src[*char_index + 1])) {
+ // Invalid surrogate pair.
+ return false;
+ }
+
+ // Valid surrogate pair.
+ *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index],
+ src[*char_index + 1]);
+ (*char_index)++;
+ } else {
+ // Not a surrogate, just one 16-bit word.
+ *code_point = src[*char_index];
+ }
+
+ return IsValidCodepoint(*code_point);
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+bool ReadUnicodeCharacter(const wchar_t* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point) {
+ // Conversion is easy since the source is 32-bit.
+ *code_point = src[*char_index];
+
+ // Validate the value.
+ return IsValidCodepoint(*code_point);
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
+ if (code_point <= 0x7f) {
+ // Fast path the common case of one byte.
+ output->push_back(code_point);
+ return 1;
+ }
+
+
+ // CBU8_APPEND_UNSAFE can append up to 4 bytes.
+ size_t char_offset = output->length();
+ size_t original_char_offset = char_offset;
+ output->resize(char_offset + CBU8_MAX_LENGTH);
+
+ CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+
+ // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
+ // it will represent the new length of the string.
+ output->resize(char_offset);
+ return char_offset - original_char_offset;
+}
+
+size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
+ if (CBU16_LENGTH(code_point) == 1) {
+ // This code point is in the Basic Multilingual Plane (BMP).
+ output->push_back(static_cast<char16>(code_point));
+ return 1;
+ }
+ // Non-BMP characters use a double-character encoding.
+ size_t char_offset = output->length();
+ output->resize(char_offset + CBU16_MAX_LENGTH);
+ CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+ return CBU16_MAX_LENGTH;
+}
+
+// Generalized Unicode converter -----------------------------------------------
+
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src,
+ size_t src_len,
+ std::string* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
+ if (src[0] < 0x80) {
+ // Assume that the entire input will be ASCII.
+ output->reserve(src_len);
+ } else {
+ // Assume that the entire input is non-ASCII and will have 3 bytes per char.
+ output->reserve(src_len * 3);
+ }
+}
+
+// Instantiate versions we know callers will need.
+template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*);
+template void PrepareForUTF8Output(const char16*, size_t, std::string*);
+
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src,
+ size_t src_len,
+ STRING* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
+ if (static_cast<unsigned char>(src[0]) < 0x80) {
+ // Assume the input is all ASCII, which means 1:1 correspondence.
+ output->reserve(src_len);
+ } else {
+ // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
+ // character.
+ output->reserve(src_len / 2);
+ }
+}
+
+// Instantiate versions we know callers will need.
+template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*);
+template void PrepareForUTF16Or32Output(const char*, size_t, string16*);
+
+} // namespace base
diff --git a/base/strings/utf_string_conversion_utils.h b/base/strings/utf_string_conversion_utils.h
new file mode 100644
index 0000000..8832369
--- /dev/null
+++ b/base/strings/utf_string_conversion_utils.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+
+// This should only be used by the various UTF string conversion files.
+
+#include "base/base_export.h"
+#include "base/string16.h"
+
+namespace base {
+
+inline bool IsValidCodepoint(uint32 code_point) {
+ // Excludes the surrogate code points ([0xD800, 0xDFFF]) and
+ // codepoints larger than 0x10FFFF (the highest codepoint allowed).
+ // Non-characters and unassigned codepoints are allowed.
+ return code_point < 0xD800u ||
+ (code_point >= 0xE000u && code_point <= 0x10FFFFu);
+}
+
+inline bool IsValidCharacter(uint32 code_point) {
+ // Excludes non-characters (U+FDD0..U+FDEF, and all codepoints ending in
+ // 0xFFFE or 0xFFFF) from the set of valid code points.
+ return code_point < 0xD800u || (code_point >= 0xE000u &&
+ code_point < 0xFDD0u) || (code_point > 0xFDEFu &&
+ code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu);
+}
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Reads a UTF-8 stream, placing the next code point into the given output
+// |*code_point|. |src| represents the entire string to read, and |*char_index|
+// is the character offset within the string to start reading at. |*char_index|
+// will be updated to index the last character read, such that incrementing it
+// (as in a for loop) will take the reader to the next character.
+//
+// Returns true on success. On failure, |*code_point| will be invalid.
+BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point_out);
+
+// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
+BASE_EXPORT bool ReadUnicodeCharacter(const char16* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Reads UTF-32 character. The usage is the same as the 8-bit version above.
+BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
+ int32 src_len,
+ int32* char_index,
+ uint32* code_point);
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Appends a UTF-8 character to the given 8-bit string. Returns the number of
+// bytes written.
+// TODO(brettw) Bug 79631: This function should not be exposed.
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point,
+ std::string* output);
+
+// Appends the given code point as a UTF-16 character to the given 16-bit
+// string. Returns the number of 16-bit values written.
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point, string16* output);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Appends the given UTF-32 character to the given 32-bit string. Returns the
+// number of 32-bit values written.
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
+ // This is the easy case, just append the character.
+ output->push_back(code_point);
+ return 1;
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Guesses the length of the output in UTF-8 in bytes, clears that output
+// string, and reserves that amount of space. We assume that the input
+// character types are unsigned, which will be true for UTF-16 and -32 on our
+// systems.
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
+
+// Prepares an output buffer (containing either UTF-16 or -32 data) given some
+// UTF-8 input that will be converted to it. See PrepareForUTF8Output().
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
+
+} // namespace base
+
+#endif // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_