Move utf_offset_string_conversions and utf_string_conversion_utils to strings.

Review URL: https://codereview.chromium.org/12087115 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@181183 0039d316-1c4b-4281-b951-d872f2087c98
author: brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-02-07 03:59:06 +0000
committer: brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-02-07 03:59:06 +0000
commit: a3f72189102fc85a7f9cfe60c61e124a2677f0ea (patch)
tree: fa5699e92f87a628740fd9b88960b6449541055d /base/strings
parent: 3d350326fd4def19e343890c55c5388e370c7c2f (diff)
download: chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.zip
chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.tar.gz
chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.tar.bz2
5 files changed, 658 insertions, 0 deletions
diff --git a/base/strings/utf_offset_string_conversions.cc b/base/strings/utf_offset_string_conversions.cc
new file mode 100644
index 0000000..5a6f0c0
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions.cc
@@ -0,0 +1,166 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_offset_string_conversions.h"
+
+#include <algorithm>
+
+#include "base/memory/scoped_ptr.h"
+#include "base/string_piece.h"
+#include "base/strings/utf_string_conversion_utils.h"
+
+namespace base {
+
+// Converts |src| (|src_len| code units) to the destination encoding, appending
+// to |*output|; invalid input is replaced by U+FFFD. If non-NULL, the entries
+// of |offsets_for_adjustment| are remapped to the output string (or set to
+// npos when unmappable). Returns false if any invalid sequence was replaced.
+template<typename SrcChar, typename DestStdString>
+bool ConvertUnicode(const SrcChar* src,
+                    size_t src_len,
+                    DestStdString* output,
+                    std::vector<size_t>* offsets_for_adjustment) {
+  if (offsets_for_adjustment) {
+    std::for_each(offsets_for_adjustment->begin(),
+                  offsets_for_adjustment->end(),
+                  LimitOffset<DestStdString>(src_len));
+  }
+
+  // ICU requires 32-bit numbers.
+  bool success = true;
+  OffsetAdjuster offset_adjuster(offsets_for_adjustment);
+  int32 src_len32 = static_cast<int32>(src_len);
+  for (int32 i = 0; i < src_len32; i++) {
+    uint32 code_point;
+    size_t original_i = i;
+    size_t chars_written = 0;
+    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
+      chars_written = WriteUnicodeCharacter(code_point, output);
+    } else {
+      chars_written = WriteUnicodeCharacter(0xFFFD, output);
+      success = false;
+    }
+    if (offsets_for_adjustment) {
+      // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
+      // character read, not after it (so that incrementing it in the loop
+      // increment will place it at the right location), so we need to account
+      // for that in determining the amount that was read.
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i,
+          i - original_i + 1, chars_written));
+    }
+  }
+  return success;
+}
+
+bool UTF8ToUTF16AndAdjustOffset(const char* src,
+                                size_t src_len,
+                                string16* output,
+                                size_t* offset_for_adjustment) {
+  // Wraps the optional single offset in a vector for the plural overload.
+  std::vector<size_t> offsets;
+  if (offset_for_adjustment)
+    offsets.push_back(*offset_for_adjustment);
+  bool ok = UTF8ToUTF16AndAdjustOffsets(src, src_len, output, &offsets);
+  if (offset_for_adjustment)
+    *offset_for_adjustment = offsets.front();
+  return ok;
+}
+
+// Clears |*output| and reserves space, then converts and remaps the offsets.
+bool UTF8ToUTF16AndAdjustOffsets(
+    const char* src, size_t src_len, string16* output,
+    std::vector<size_t>* offsets_for_adjustment) {
+  PrepareForUTF16Or32Output(src, src_len, output);
+  return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
+}
+
+string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8,
+                                    size_t* offset_for_adjustment) {
+  // Route the optional single offset through the vector-based overload; a
+  // NULL |offset_for_adjustment| means plain conversion with nothing to adjust.
+  std::vector<size_t> offsets;
+  if (offset_for_adjustment)
+    offsets.push_back(*offset_for_adjustment);
+  string16 utf16 = UTF8ToUTF16AndAdjustOffsets(utf8, &offsets);
+  if (offset_for_adjustment)
+    *offset_for_adjustment = offsets.front();
+  return utf16;
+}
+
+string16 UTF8ToUTF16AndAdjustOffsets(
+    const base::StringPiece& utf8,
+    std::vector<size_t>* offsets_for_adjustment) {
+  string16 utf16;  // Returned even when |utf8| contains invalid sequences.
+  PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &utf16);
+  ConvertUnicode(utf8.data(), utf8.length(), &utf16, offsets_for_adjustment);
+  return utf16;
+}
+
+std::string UTF16ToUTF8AndAdjustOffset(const base::StringPiece16& utf16,
+                                       size_t* offset_for_adjustment) {
+  // A NULL offset just converts; otherwise adjust it via a 1-element vector.
+  std::vector<size_t> offsets;
+  if (offset_for_adjustment)
+    offsets.push_back(*offset_for_adjustment);
+  std::string utf8 = UTF16ToUTF8AndAdjustOffsets(utf16, &offsets);
+  if (offset_for_adjustment)
+    *offset_for_adjustment = offsets.front();
+  return utf8;
+}
+
+std::string UTF16ToUTF8AndAdjustOffsets(
+    const base::StringPiece16& utf16,
+    std::vector<size_t>* offsets_for_adjustment) {
+  std::string utf8;  // Returned even when |utf16| has invalid surrogates.
+  PrepareForUTF8Output(utf16.data(), utf16.length(), &utf8);
+  ConvertUnicode(utf16.data(), utf16.length(), &utf8, offsets_for_adjustment);
+  return utf8;
+}
+
+// Describes one converted run: where it began and its old and new lengths.
+OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
+                                       size_t original_length,
+                                       size_t output_length)
+    : original_offset(original_offset),
+      original_length(original_length),
+      output_length(output_length) {}
+
+// Stores the caller's vector (may be NULL); it is rewritten on destruction.
+OffsetAdjuster::OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment)
+    : offsets_for_adjustment_(offsets_for_adjustment) {}
+
+OffsetAdjuster::~OffsetAdjuster() {  // Applies the adjustments in place.
+  if (offsets_for_adjustment_ && !adjustments_.empty()) {
+    std::vector<size_t>::iterator it = offsets_for_adjustment_->begin();
+    for (; it != offsets_for_adjustment_->end(); ++it)
+      AdjustOffset(it);
+  }
+}
+
+void OffsetAdjuster::Add(const Adjustment& adjustment) {
+  adjustments_.push_back(adjustment);  // Stored in the order Add() is called.
+}
+
+void OffsetAdjuster::AdjustOffset(std::vector<size_t>::iterator offset) {
+  if (*offset == string16::npos)
+    return;  // Already marked invalid; leave it untouched.
+  size_t adjustment = 0;  // Sum of (original - output) lengths of earlier runs.
+  for (std::vector<Adjustment>::const_iterator i = adjustments_.begin();
+       i != adjustments_.end(); ++i) {
+    if (*offset == i->original_offset && i->output_length == 0) {
+      *offset = string16::npos;  // Points at a run that produced no output.
+      return;
+    }
+    if (*offset <= i->original_offset)
+      break;  // Run starts at/after |*offset| (assumes ascending Add() order).
+    if (*offset < (i->original_offset + i->original_length)) {
+      *offset = string16::npos;  // Falls strictly inside a converted run.
+      return;
+    }
+    adjustment += (i->original_length - i->output_length);
+  }
+  *offset -= adjustment;
+}
+
+}  // namespace base
diff --git a/base/strings/utf_offset_string_conversions.h b/base/strings/utf_offset_string_conversions.h
new file mode 100644
index 0000000..98f29b9
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
+#define BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
+
+#include <string>
+#include <vector>
+
+#include "base/base_export.h"
+#include "base/string16.h"
+#include "base/string_piece.h"
+
+namespace base {
+
+// Like the conversions in utf_string_conversions.h, but also takes one or more
+// offsets (|offset[s]_for_adjustment|) into the source strings; each offset
+// will be adjusted to point at the same logical place in the result strings.
+// If this isn't possible because an offset points past the end of the source
+// strings or into the middle of a multibyte sequence, the offending offset will
+// be set to string16::npos. |offset[s]_for_adjustment| may be NULL.
+BASE_EXPORT bool UTF8ToUTF16AndAdjustOffset(const char* src,
+                                            size_t src_len,
+                                            string16* output,
+                                            size_t* offset_for_adjustment);
+BASE_EXPORT bool UTF8ToUTF16AndAdjustOffsets(
+    const char* src,
+    size_t src_len,
+    string16* output,
+    std::vector<size_t>* offsets_for_adjustment);
+
+BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8,
+                                                size_t* offset_for_adjustment);
+BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffsets(
+    const base::StringPiece& utf8,
+    std::vector<size_t>* offsets_for_adjustment);
+
+BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffset(
+    const base::StringPiece16& utf16,
+    size_t* offset_for_adjustment);
+BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffsets(
+    const base::StringPiece16& utf16,
+    std::vector<size_t>* offsets_for_adjustment);
+
+// Functor for std::for_each over a range of offsets: any offset that is not
+// strictly below |limit| (i.e. at or past it) is overwritten with T::npos.
+template <typename T>
+struct LimitOffset {
+  explicit LimitOffset(size_t limit) : limit_(limit) {}
+
+  void operator()(size_t& offset) {
+    const bool in_range = offset < limit_;
+    if (!in_range)
+      offset = T::npos;
+  }
+
+  // Exclusive upper bound for offsets that are left unchanged.
+  size_t limit_;
+};
+
+// Stack object which, on destruction, rewrites a vector of offsets according
+// to the Adjustments supplied via Add().  Each Adjustment gives the
+// |original_offset| of a substring and its length before and after
+// transforming.  Offsets past the start of a transformed substring but before
+// its end, or at a substring whose |output_length| is 0, become string16::npos;
+// other offsets shift by the net length change of the preceding substrings.
+class BASE_EXPORT OffsetAdjuster {
+ public:
+  struct BASE_EXPORT Adjustment {
+    Adjustment(size_t original_offset,
+               size_t original_length,
+               size_t output_length);
+
+    size_t original_offset;  // Where the substring starts in the source.
+    size_t original_length;  // Its length before transforming.
+    size_t output_length;  // Its length after transforming.
+  };
+
+  explicit OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment);
+  ~OffsetAdjuster();
+
+  void Add(const Adjustment& adjustment);
+
+ private:
+  void AdjustOffset(std::vector<size_t>::iterator offset);
+
+  std::vector<size_t>* offsets_for_adjustment_;  // May be NULL.
+  std::vector<Adjustment> adjustments_;  // In the order passed to Add().
+};
+
+}  // namespace base
+
+#endif  // BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
diff --git a/base/strings/utf_offset_string_conversions_unittest.cc b/base/strings/utf_offset_string_conversions_unittest.cc
new file mode 100644
index 0000000..885357b
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions_unittest.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <algorithm>
+
+#include "base/logging.h"
+#include "base/string_piece.h"
+#include "base/strings/utf_offset_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace base {
+
+namespace {
+
+static const size_t kNpos = string16::npos;  // Shorthand used by the tables.
+
+}  // namespace
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
+  struct UTF8ToUTF16Case {
+    const char* utf8;
+    size_t input_offset;  // Offset into |utf8|, in bytes.
+    size_t output_offset;  // Expected offset into the UTF-16 result.
+  } utf8_to_utf16_cases[] = {
+    {"", 0, kNpos},  // Offset is not below the input length.
+    {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},  // Inside a 3-byte sequence.
+    {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},  // Start of the second character.
+    {"\xed\xb0\x80z", 3, 1},  // 'z' after an encoded surrogate (U+DC00).
+    {"A\xF0\x90\x8C\x80z", 1, 1},  // Start of a 4-byte sequence.
+    {"A\xF0\x90\x8C\x80z", 2, kNpos},  // Inside the 4-byte sequence.
+    {"A\xF0\x90\x8C\x80z", 5, 3},  // 'z' after a surrogate pair in UTF-16.
+  };
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_utf16_cases); ++i) {
+    size_t offset = utf8_to_utf16_cases[i].input_offset;
+    UTF8ToUTF16AndAdjustOffset(utf8_to_utf16_cases[i].utf8, &offset);
+    EXPECT_EQ(utf8_to_utf16_cases[i].output_offset, offset);
+  }
+
+  struct UTF16ToUTF8Case {
+    char16 utf16[10];
+    size_t input_offset;  // Offset into |utf16|, in 16-bit units.
+    size_t output_offset;  // Expected offset into the UTF-8 result, in bytes.
+  } utf16_to_utf8_cases[] = {
+      {{}, 0, kNpos},  // Offset is not below the input length.
+      // Converted to 3-byte utf-8 sequences
+      {{0x5909, 0x63DB}, 2, kNpos},
+      {{0x5909, 0x63DB}, 1, 3},
+      // Converted to 2-byte utf-8 sequences
+      {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
+      {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
+      {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
+      // Surrogate pair
+      {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
+      {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
+      {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
+  };
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_utf8_cases); ++i) {
+    size_t offset = utf16_to_utf8_cases[i].input_offset;
+    UTF16ToUTF8AndAdjustOffset(utf16_to_utf8_cases[i].utf16, &offset);
+    EXPECT_EQ(utf16_to_utf8_cases[i].output_offset, offset);
+  }
+}
+
+TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
+  const size_t kLimit = 10;  // Offsets at or above this become npos.
+  const size_t kItems = 20;  // Offsets 0..19 are fed through the functor.
+  std::vector<size_t> size_ts;
+  for (size_t t = 0; t < kItems; ++t)
+    size_ts.push_back(t);
+  std::for_each(size_ts.begin(), size_ts.end(),
+                LimitOffset<string16>(kLimit));
+  size_t unlimited_count = 0;  // Number of offsets left unchanged.
+  for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
+       ++ti) {
+    if (*ti < kLimit && *ti != kNpos)
+      ++unlimited_count;
+  }
+  EXPECT_EQ(10U, unlimited_count);
+
+  // Reverse the values in the vector and try again.
+  size_ts.clear();
+  for (size_t t = kItems; t > 0; --t)
+    size_ts.push_back(t - 1);
+  std::for_each(size_ts.begin(), size_ts.end(),
+                LimitOffset<string16>(kLimit));
+  unlimited_count = 0;
+  for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
+       ++ti) {
+    if (*ti < kLimit && *ti != kNpos)
+      ++unlimited_count;
+  }
+  EXPECT_EQ(10U, unlimited_count);
+}
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
+  // Each case lists Adjustments as (original offset, original length, output
+  // length); in the diagrams a run of X's is one such transformed substring.
+  // 1: abcXXXdef ==> abcXdef
+  {
+    std::vector<size_t> offsets;
+    for (size_t t = 0; t < 9; ++t)
+      offsets.push_back(t);
+    {
+      OffsetAdjuster offset_adjuster(&offsets);
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(3, 3, 1));
+    }
+    size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6};
+    ASSERT_EQ(arraysize(expected_1), offsets.size());
+    for (size_t i = 0; i < arraysize(expected_1); ++i)
+      EXPECT_EQ(expected_1[i], offsets[i]);
+  }
+
+  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
+  {
+    std::vector<size_t> offsets;
+    for (size_t t = 0; t < 23; ++t)
+      offsets.push_back(t);
+    {
+      OffsetAdjuster offset_adjuster(&offsets);
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(0, 3, 1));
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(4, 4, 2));
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(10, 7, 4));
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(20, 3, 1));
+    }
+    size_t expected_2[] = {0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6,
+                           kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 10, 11, 12,
+                           13, kNpos, kNpos};
+    ASSERT_EQ(arraysize(expected_2), offsets.size());
+    for (size_t i = 0; i < arraysize(expected_2); ++i)
+      EXPECT_EQ(expected_2[i], offsets[i]);
+  }
+
+  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe (first and last runs produce nothing)
+  {
+    std::vector<size_t> offsets;
+    for (size_t t = 0; t < 17; ++t)
+      offsets.push_back(t);
+    {
+      OffsetAdjuster offset_adjuster(&offsets);
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(0, 3, 0));
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(4, 4, 4));
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(11, 3, 3));
+      offset_adjuster.Add(OffsetAdjuster::Adjustment(15, 2, 0));
+    }
+    size_t expected_3[] = {kNpos, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6,
+                           7, 8, kNpos, kNpos, 11, kNpos, kNpos};
+    ASSERT_EQ(arraysize(expected_3), offsets.size());
+    for (size_t i = 0; i < arraysize(expected_3); ++i)
+      EXPECT_EQ(expected_3[i], offsets[i]);
+  }
+}
+
+}  // namespace base
diff --git a/base/strings/utf_string_conversion_utils.cc b/base/strings/utf_string_conversion_utils.cc
new file mode 100644
index 0000000..09a003d
--- /dev/null
+++ b/base/strings/utf_string_conversion_utils.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_string_conversion_utils.h"
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace base {
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+bool ReadUnicodeCharacter(const char* src,
+                          int32 src_len,
+                          int32* char_index,
+                          uint32* code_point_out) {
+  // CBU8_NEXT expects to be able to use -1 to signal an error, so we must
+  // use a signed type for code_point.  But this function returns false
+  // on error anyway, so code_point_out is unsigned.
+  int32 code_point;
+  CBU8_NEXT(src, *char_index, src_len, code_point);
+  *code_point_out = static_cast<uint32>(code_point);
+
+  // The ICU macro above leaves |*char_index| just past the bytes it consumed;
+  // step back so it indexes the last byte read, as the header promises.
+  (*char_index)--;
+
+  // A -1 error value converts to a uint32 above 0x10FFFF, so it fails here.
+  return IsValidCodepoint(code_point);
+}
+
+bool ReadUnicodeCharacter(const char16* src,
+                          int32 src_len,
+                          int32* char_index,
+                          uint32* code_point) {
+  const int32 lead_index = *char_index;
+  const char16 lead = src[lead_index];
+  if (!CBU16_IS_SURROGATE(lead)) {
+    // Common case: a single 16-bit unit that is its own code point.
+    *code_point = lead;
+    return IsValidCodepoint(*code_point);
+  }
+  // A surrogate is accepted only as a lead unit followed, within |src_len|,
+  // by a trail unit.  Otherwise fail without touching the out-parameters.
+  const int32 trail_index = lead_index + 1;
+  if (!CBU16_IS_SURROGATE_LEAD(lead) || trail_index >= src_len ||
+      !CBU16_IS_TRAIL(src[trail_index])) {
+    return false;
+  }
+  // Combine the pair and leave |*char_index| on the trail unit.
+  *code_point = CBU16_GET_SUPPLEMENTARY(lead, src[trail_index]);
+  *char_index = trail_index;
+  return IsValidCodepoint(*code_point);
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+bool ReadUnicodeCharacter(const wchar_t* src,
+                          int32 src_len,
+                          int32* char_index,
+                          uint32* code_point) {
+  // UTF-32 needs no decoding: each element already is a code point, so
+  // |*char_index| is left unchanged and |src_len| is not consulted.
+  const uint32 value = src[*char_index];
+  *code_point = value;
+  return IsValidCodepoint(value);
+}
+#endif  // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+size_t WriteUnicodeCharacter(uint32 code_point, std::string* output) {
+  // ASCII is the common case and always encodes as exactly one byte.
+  if (code_point <= 0x7f) {
+    output->push_back(static_cast<char>(code_point));
+    return 1;
+  }
+
+  // Grow the string by the most CBU8_APPEND_UNSAFE can emit (up to 4 bytes),
+  // let the macro write into the new tail, then trim to what it produced.
+  const size_t old_length = output->length();
+  output->resize(old_length + CBU8_MAX_LENGTH);
+
+  // The macro advances |write_index| past the bytes it stores, so afterwards
+  // |write_index| is the new length of the string.
+  size_t write_index = old_length;
+  CBU8_APPEND_UNSAFE(&(*output)[0], write_index, code_point);
+
+  output->resize(write_index);
+  return write_index - old_length;
+}
+
+size_t WriteUnicodeCharacter(uint32 code_point, string16* output) {
+  if (CBU16_LENGTH(code_point) != 1) {
+    // Outside the BMP: make room for a surrogate pair and let ICU fill it in.
+    size_t write_index = output->length();
+    output->resize(write_index + CBU16_MAX_LENGTH);
+    CBU16_APPEND_UNSAFE(&(*output)[0], write_index, code_point);
+    return CBU16_MAX_LENGTH;
+  }
+  // Basic Multilingual Plane (BMP) code points fit in one 16-bit unit.
+  output->push_back(static_cast<char16>(code_point));
+  return 1;
+}
+
+// Generalized Unicode converter -----------------------------------------------
+
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src,
+                          size_t src_len,
+                          std::string* output) {
+  output->clear();
+  if (src_len == 0)
+    return;
+
+  // Only the first code unit is inspected.  If it is ASCII, guess that the
+  // whole input is ASCII (one byte each); otherwise guess that every unit is
+  // non-ASCII and needs three bytes.
+  const bool starts_with_ascii = src[0] < 0x80;
+  const size_t bytes_per_unit = starts_with_ascii ? 1 : 3;
+  output->reserve(src_len * bytes_per_unit);
+}
+
+// Instantiate versions we know callers will need.
+template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*);
+template void PrepareForUTF8Output(const char16*, size_t, std::string*);
+
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src,
+                               size_t src_len,
+                               STRING* output) {
+  output->clear();
+  if (src_len == 0)
+    return;
+
+  // Only the first byte is inspected (as unsigned, since char may be signed).
+  // An ASCII first byte is taken to mean all-ASCII input, which maps 1:1 to
+  // output units; otherwise assume two-byte UTF-8 sequences, i.e. half as
+  // many output units as input bytes.
+  const bool starts_with_ascii = static_cast<unsigned char>(src[0]) < 0x80;
+  const size_t expected_units = starts_with_ascii ? src_len : src_len / 2;
+  output->reserve(expected_units);
+}
+
+// Instantiate versions we know callers will need.
+template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*);
+template void PrepareForUTF16Or32Output(const char*, size_t, string16*);
+
+}  // namespace base
diff --git a/base/strings/utf_string_conversion_utils.h b/base/strings/utf_string_conversion_utils.h
new file mode 100644
index 0000000..8832369
--- /dev/null
+++ b/base/strings/utf_string_conversion_utils.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+
+// This should only be used by the various UTF string conversion files.
+
+#include "base/base_export.h"
+#include "base/string16.h"
+
+namespace base {
+
+inline bool IsValidCodepoint(uint32 code_point) {
+  // Accepts every value up to 0x10FFFF (the highest codepoint allowed) except
+  // the surrogate code points [0xD800, 0xDFFF].  Non-characters and
+  // unassigned codepoints are allowed.
+  const bool is_surrogate = code_point >= 0xD800u && code_point < 0xE000u;
+  return !is_surrogate && code_point <= 0x10FFFFu;
+}
+
+inline bool IsValidCharacter(uint32 code_point) {
+  // Like IsValidCodepoint(), minus U+FDD0..U+FDEF and U+xxFFFE / U+xxFFFF.
+  if (code_point < 0xD800u) return true;
+  if (code_point < 0xE000u || code_point > 0x10FFFFu) return false;
+  return (code_point < 0xFDD0u || code_point > 0xFDEFu) &&
+         (code_point & 0xFFFEu) != 0xFFFEu;
+}
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Reads a UTF-8 stream, placing the next code point into the given output
+// |*code_point|. |src| represents the entire string to read, and |*char_index|
+// is the character offset within the string to start reading at. |*char_index|
+// will be updated to index the last character read, such that incrementing it
+// (as in a for loop) will take the reader to the next character.
+//
+// Returns true on success. On false, |*code_point| will be invalid.
+BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
+                                      int32 src_len,
+                                      int32* char_index,
+                                      uint32* code_point_out);
+
+// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
+BASE_EXPORT bool ReadUnicodeCharacter(const char16* src,
+                                      int32 src_len,
+                                      int32* char_index,
+                                      uint32* code_point);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Reads UTF-32 character. The usage is the same as the 8-bit version above.
+BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
+                                      int32 src_len,
+                                      int32* char_index,
+                                      uint32* code_point);
+#endif  // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Appends a UTF-8 character to the given 8-bit string.  Returns the number of
+// bytes written.
+// TODO(brettw) Bug 79631: This function should not be exposed.
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point,
+                                         std::string* output);
+
+// Appends the given code point as a UTF-16 character to the given 16-bit
+// string.  Returns the number of 16-bit values written.
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32 code_point, string16* output);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Appends the given UTF-32 character to the given 32-bit string.  Returns the
+// number of 32-bit values written.
+inline size_t WriteUnicodeCharacter(uint32 code_point, std::wstring* output) {
+  // With 32-bit wchar_t every code point is exactly one element.
+  output->append(1, static_cast<wchar_t>(code_point));
+  return 1;
+}
+#endif  // defined(WCHAR_T_IS_UTF32)
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Guesses the length of the output in UTF-8 in bytes, clears that output
+// string, and reserves that amount of space.  We assume that the input
+// character types are unsigned, which will be true for UTF-16 and -32 on our
+// systems.
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
+
+// Prepares an output buffer (containing either UTF-16 or -32 data) given some
+// UTF-8 input that will be converted to it.  See PrepareForUTF8Output().
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
+
+}  // namespace base
+
+#endif  // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
author	brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-02-07 03:59:06 +0000
committer	brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-02-07 03:59:06 +0000
commit	a3f72189102fc85a7f9cfe60c61e124a2677f0ea (patch)
tree	fa5699e92f87a628740fd9b88960b6449541055d /base/strings
parent	3d350326fd4def19e343890c55c5388e370c7c2f (diff)
download	chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.zip chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.tar.gz chromium_src-a3f72189102fc85a7f9cfe60c61e124a2677f0ea.tar.bz2