Add functions to get the length or compute a substring of UTF8 and UTF16 encoded strings.

BUG=none TEST=Added new unit tests for each. Review URL: http://codereview.chromium.org/3133028 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57051 0039d316-1c4b-4281-b951-d872f2087c98
author: dmazzoni@chromium.org <dmazzoni@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-23 14:30:27 +0000
committer: dmazzoni@chromium.org <dmazzoni@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-23 14:30:27 +0000
commit: 0ca5c104bc719dc5e7dcdc19cca1576c27391e65 (patch)
tree: fbb7dbff560b787cf3007f72236b51b3a41b5a5e /base/i18n
parent: 9ea6435cf0c61350b6676821c12e7c95ee1f20ca (diff)
download: chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.zip
chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.tar.gz
chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.tar.bz2
3 files changed, 276 insertions, 0 deletions
diff --git a/base/i18n/char_iterator.cc b/base/i18n/char_iterator.cc
new file mode 100644
index 0000000..c323c5d
--- /dev/null
+++ b/base/i18n/char_iterator.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/char_iterator.h"
+
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
+
+namespace base {
+
+// Views the bytes of |str| and pre-decodes the first code point, if any.
+UTF8CharIterator::UTF8CharIterator(const std::string* str)
+    : str_(reinterpret_cast<const uint8_t*>(str->data())),
+      len_(str->size()),
+      array_pos_(0), next_pos_(0), char_pos_(0),
+      char_(0) {
+  if (len_ != 0) {
+    U8_NEXT(str_, next_pos_, len_, char_);
+  }
+}
+
+bool UTF8CharIterator::Advance() {
+  const bool can_advance = array_pos_ < len_;  // False once at the end.
+  if (can_advance) {
+    array_pos_ = next_pos_;
+    ++char_pos_;
+    if (next_pos_ < len_) {  // Pre-decode the following code point.
+      U8_NEXT(str_, next_pos_, len_, char_);
+    }
+  }
+  return can_advance;
+}
+
+// Views the code units of |str| and pre-decodes the first code point, if any.
+UTF16CharIterator::UTF16CharIterator(const string16* str)
+    : str_(reinterpret_cast<const char16*>(str->data())),
+      len_(str->size()),
+      array_pos_(0), next_pos_(0), char_pos_(0),
+      char_(0) {
+  if (len_ != 0) {
+    U16_NEXT(str_, next_pos_, len_, char_);
+  }
+}
+
+bool UTF16CharIterator::Advance() {
+  const bool can_advance = array_pos_ < len_;  // False once at the end.
+  if (can_advance) {
+    array_pos_ = next_pos_;
+    ++char_pos_;
+    if (next_pos_ < len_) {  // Pre-decode the following code point.
+      U16_NEXT(str_, next_pos_, len_, char_);
+    }
+  }
+  return can_advance;
+}
+
+}  // namespace base
diff --git a/base/i18n/char_iterator.h b/base/i18n/char_iterator.h
new file mode 100644
index 0000000..784c6e5d
--- /dev/null
+++ b/base/i18n/char_iterator.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_CHAR_ITERATOR_H_
+#define BASE_I18N_CHAR_ITERATOR_H_
+#pragma once
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/string16.h"
+
+// The CharIterator classes iterate through the characters in UTF8 and
+// UTF16 strings.  Example usage:
+//
+//   UTF8CharIterator iter(&str);
+//   while (!iter.end()) {
+//     LOG(INFO) << iter.get();
+//     iter.Advance();
+//   }
+
+#if defined(OS_WIN)
+typedef unsigned char uint8_t;
+#endif
+
+namespace base {
+
+class UTF8CharIterator {
+ public:
+  // Iterates over |*str|, which must outlive this UTF8CharIterator.
+  explicit UTF8CharIterator(const std::string* str);
+  ~UTF8CharIterator() {}
+
+  // Returns the byte offset within the string at which the current
+  // character starts.
+  int32 array_pos() const { return array_pos_; }
+
+  // Returns the logical index of the current character, independent of the
+  // number of bytes each character takes.
+  int32 char_pos() const { return char_pos_; }
+
+  // Returns the current character; not meaningful once end() is true.
+  int32 get() const { return char_; }
+
+  // Returns true if we're at the end of the string.
+  bool end() const { return array_pos_ == len_; }
+
+  // Advances to the next character.  Returns false, without moving, if
+  // we're already at the end of the string.
+  bool Advance();
+
+ private:
+  // The bytes of the string we're iterating over (not owned).
+  const uint8_t* str_;
+
+  // The length of the encoded string, in bytes.
+  int32 len_;
+
+  // Array index of the current character.
+  int32 array_pos_;
+
+  // Array index of the character after the current one.
+  int32 next_pos_;
+
+  // Logical index of the current character.
+  int32 char_pos_;
+
+  // The current character (code point).
+  int32 char_;
+
+  DISALLOW_COPY_AND_ASSIGN(UTF8CharIterator);
+};
+
+class UTF16CharIterator {
+ public:
+  // Iterates over |*str|, which must outlive this UTF16CharIterator.
+  explicit UTF16CharIterator(const string16* str);
+  ~UTF16CharIterator() {}
+
+  // Returns the code-unit offset within the string at which the current
+  // character starts.
+  int32 array_pos() const { return array_pos_; }
+
+  // Returns the logical index of the current character, independent of the
+  // number of codewords each character takes.
+  int32 char_pos() const { return char_pos_; }
+
+  // Returns the current character; not meaningful once end() is true.
+  int32 get() const { return char_; }
+
+  // Returns true if we're at the end of the string.
+  bool end() const { return array_pos_ == len_; }
+
+  // Advances to the next character.  Returns false, without moving, if
+  // we're already at the end of the string.
+  bool Advance();
+
+ private:
+  // The code units of the string we're iterating over (not owned).
+  const char16* str_;
+
+  // The length of the encoded string, in code units.
+  int32 len_;
+
+  // Array index of the current character.
+  int32 array_pos_;
+
+  // Array index of the character after the current one.
+  int32 next_pos_;
+
+  // Logical index of the current character.
+  int32 char_pos_;
+
+  // The current character (code point).
+  int32 char_;
+
+  DISALLOW_COPY_AND_ASSIGN(UTF16CharIterator);
+};
+
+}  // namespace base
+
+#endif  // BASE_I18N_CHAR_ITERATOR_H_
diff --git a/base/i18n/char_iterator_unittest.cc b/base/i18n/char_iterator_unittest.cc
new file mode 100644
index 0000000..4fe7ebb
--- /dev/null
+++ b/base/i18n/char_iterator_unittest.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/char_iterator.h"
+
+#include "base/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+TEST(CharIteratorsTest, TestUTF8) {
+  std::string no_text("");
+  base::UTF8CharIterator no_text_it(&no_text);
+  ASSERT_TRUE(no_text_it.end());
+  ASSERT_EQ(0, no_text_it.array_pos());
+  ASSERT_EQ(0, no_text_it.char_pos());
+  ASSERT_FALSE(no_text_it.Advance());
+
+  std::string text("s\303\273r");  // 's', U+00FB (two bytes), 'r'
+  base::UTF8CharIterator it(&text);
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(0, it.array_pos());
+  ASSERT_EQ(0, it.char_pos());
+  ASSERT_EQ('s', it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(1, it.array_pos());
+  ASSERT_EQ(1, it.char_pos());
+  ASSERT_EQ(251, it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(3, it.array_pos());
+  ASSERT_EQ(2, it.char_pos());
+  ASSERT_EQ('r', it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_TRUE(it.end());
+  ASSERT_EQ(4, it.array_pos());
+  ASSERT_EQ(3, it.char_pos());
+
+  // The value is unspecified at the end, but reading it must not crash.
+  it.get();
+
+  ASSERT_FALSE(it.Advance());
+}
+
+TEST(CharIteratorsTest, TestUTF16) {
+  string16 no_text = UTF8ToUTF16("");
+  base::UTF16CharIterator no_text_it(&no_text);
+  ASSERT_TRUE(no_text_it.end());
+  ASSERT_EQ(0, no_text_it.array_pos());
+  ASSERT_EQ(0, no_text_it.char_pos());
+  ASSERT_FALSE(no_text_it.Advance());
+
+  // The string under test holds 4 characters:
+  //   x
+  //   u with circumflex (U+00FB) - 2 bytes in UTF8, 1 codeword in UTF16
+  //   math double-struck A (U+1D538) - 4 bytes in UTF8, 2 codewords in UTF16
+  //   z
+  string16 text = UTF8ToUTF16("x\303\273\360\235\224\270z");
+  base::UTF16CharIterator it(&text);
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(0, it.array_pos());
+  ASSERT_EQ(0, it.char_pos());
+  ASSERT_EQ('x', it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(1, it.array_pos());
+  ASSERT_EQ(1, it.char_pos());
+  ASSERT_EQ(251, it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(2, it.array_pos());
+  ASSERT_EQ(2, it.char_pos());
+  ASSERT_EQ(120120, it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_FALSE(it.end());
+  ASSERT_EQ(4, it.array_pos());
+  ASSERT_EQ(3, it.char_pos());
+  ASSERT_EQ('z', it.get());
+  ASSERT_TRUE(it.Advance());
+
+  ASSERT_TRUE(it.end());
+  ASSERT_EQ(5, it.array_pos());
+  ASSERT_EQ(4, it.char_pos());
+
+  // The value is unspecified at the end, but reading it must not crash.
+  it.get();
+
+  ASSERT_FALSE(it.Advance());
+}
author	dmazzoni@chromium.org <dmazzoni@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-23 14:30:27 +0000
committer	dmazzoni@chromium.org <dmazzoni@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-23 14:30:27 +0000
commit	0ca5c104bc719dc5e7dcdc19cca1576c27391e65 (patch)
tree	fbb7dbff560b787cf3007f72236b51b3a41b5a5e /base/i18n
parent	9ea6435cf0c61350b6676821c12e7c95ee1f20ca (diff)
download	chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.zip chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.tar.gz chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.tar.bz2