diff options
author | dmazzoni@chromium.org <dmazzoni@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-23 14:30:27 +0000 |
---|---|---|
committer | dmazzoni@chromium.org <dmazzoni@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-23 14:30:27 +0000 |
commit | 0ca5c104bc719dc5e7dcdc19cca1576c27391e65 (patch) | |
tree | fbb7dbff560b787cf3007f72236b51b3a41b5a5e /base/i18n | |
parent | 9ea6435cf0c61350b6676821c12e7c95ee1f20ca (diff) | |
download | chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.zip chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.tar.gz chromium_src-0ca5c104bc719dc5e7dcdc19cca1576c27391e65.tar.bz2 |
Add functions to get the length or compute a substring of UTF8 and UTF16
encoded strings.
BUG=none
TEST=Added new unit tests for each.
Review URL: http://codereview.chromium.org/3133028
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57051 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/i18n')
-rw-r--r-- | base/i18n/char_iterator.cc | 58 | ||||
-rw-r--r-- | base/i18n/char_iterator.h | 123 | ||||
-rw-r--r-- | base/i18n/char_iterator_unittest.cc | 95 |
3 files changed, 276 insertions, 0 deletions
diff --git a/base/i18n/char_iterator.cc b/base/i18n/char_iterator.cc new file mode 100644 index 0000000..c323c5d --- /dev/null +++ b/base/i18n/char_iterator.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/char_iterator.h" + +#include "unicode/utf8.h" +#include "unicode/utf16.h" + +namespace base { + +UTF8CharIterator::UTF8CharIterator(const std::string* str) + : str_(reinterpret_cast<const uint8_t*>(str->data())), + len_(str->size()), + array_pos_(0), + next_pos_(0), + char_pos_(0), + char_(0) { + if (len_) + U8_NEXT(str_, next_pos_, len_, char_); +} + +bool UTF8CharIterator::Advance() { + if (array_pos_ >= len_) + return false; + + array_pos_ = next_pos_; + char_pos_++; + if (next_pos_ < len_) + U8_NEXT(str_, next_pos_, len_, char_); + + return true; +} + +UTF16CharIterator::UTF16CharIterator(const string16* str) + : str_(reinterpret_cast<const char16*>(str->data())), + len_(str->size()), + array_pos_(0), + next_pos_(0), + char_pos_(0), + char_(0) { + if (len_) + U16_NEXT(str_, next_pos_, len_, char_); +} + +bool UTF16CharIterator::Advance() { + if (array_pos_ >= len_) + return false; + + array_pos_ = next_pos_; + char_pos_++; + if (next_pos_ < len_) + U16_NEXT(str_, next_pos_, len_, char_); + + return true; +} + +} // namespace base diff --git a/base/i18n/char_iterator.h b/base/i18n/char_iterator.h new file mode 100644 index 0000000..784c6e5d --- /dev/null +++ b/base/i18n/char_iterator.h @@ -0,0 +1,123 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_I18N_CHAR_ITERATOR_H_ +#define BASE_I18N_CHAR_ITERATOR_H_ +#pragma once + +#include <string> + +#include "base/basictypes.h" +#include "base/string16.h" + +// The CharIterator classes iterate through the characters in UTF8 and +// UTF16 strings. Example usage: +// +// UTF8CharIterator iter(&str); +// while (!iter.End()) { +// LOG(INFO) << iter.get(); +// iter.Advance(); +// } + +#if defined(OS_WIN) +typedef unsigned char uint8_t; +#endif + +namespace base { + +class UTF8CharIterator { + public: + // Requires |str| to live as long as the UTF8CharIterator does. + UTF8CharIterator(const std::string* str); + ~UTF8CharIterator() {} + + // Return the starting array index of the current character within the + // string. + int32 array_pos() const { return array_pos_; } + + // Return the logical index of the current character, independent of the + // number of bytes each character takes. + int32 char_pos() const { return char_pos_; } + + // Return the current char. + int32 get() const { return char_; } + + // Returns true if we're at the end of the string. + bool end() const { return array_pos_ == len_; } + + // Advance to the next actual character. Returns false if we're at the + // end of the string. + bool Advance(); + + private: + // The string we're iterating over. + const uint8_t* str_; + + // The length of the encoded string. + int32 len_; + + // Array index. + int32 array_pos_; + + // The next array index. + int32 next_pos_; + + // Character index. + int32 char_pos_; + + // The current character. + int32 char_; + + DISALLOW_COPY_AND_ASSIGN(UTF8CharIterator); +}; + +class UTF16CharIterator { + public: + // Requires |str| to live as long as the UTF16CharIterator does. + UTF16CharIterator(const string16* str); + ~UTF16CharIterator() {} + + // Return the starting array index of the current character within the + // string. + int32 array_pos() const { return array_pos_; } + + // Return the logical index of the current character, independent of the + // number of codewords each character takes. + int32 char_pos() const { return char_pos_; } + + // Return the current char. + int32 get() const { return char_; } + + // Returns true if we're at the end of the string. + bool end() const { return array_pos_ == len_; } + + // Advance to the next actual character. Returns false if we're at the + // end of the string. + bool Advance(); + + private: + // The string we're iterating over. + const char16* str_; + + // The length of the encoded string. + int32 len_; + + // Array index. + int32 array_pos_; + + // The next array index. + int32 next_pos_; + + // Character index. + int32 char_pos_; + + // The current character. + int32 char_; + + DISALLOW_COPY_AND_ASSIGN(UTF16CharIterator); +}; + +} // namespace base + +#endif // BASE_I18N_CHAR_ITERATOR_H_ diff --git a/base/i18n/char_iterator_unittest.cc b/base/i18n/char_iterator_unittest.cc new file mode 100644 index 0000000..4fe7ebb --- /dev/null +++ b/base/i18n/char_iterator_unittest.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/char_iterator.h" + +#include "base/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +TEST(CharIteratorsTest, TestUTF8) { + std::string empty(""); + base::UTF8CharIterator empty_iter(&empty); + ASSERT_TRUE(empty_iter.end()); + ASSERT_EQ(0, empty_iter.array_pos()); + ASSERT_EQ(0, empty_iter.char_pos()); + ASSERT_FALSE(empty_iter.Advance()); + + std::string str("s\303\273r"); // [u with circumflex] + base::UTF8CharIterator iter(&str); + ASSERT_FALSE(iter.end()); + ASSERT_EQ(0, iter.array_pos()); + ASSERT_EQ(0, iter.char_pos()); + ASSERT_EQ('s', iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_FALSE(iter.end()); + ASSERT_EQ(1, iter.array_pos()); + ASSERT_EQ(1, iter.char_pos()); + ASSERT_EQ(251, iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_FALSE(iter.end()); + ASSERT_EQ(3, iter.array_pos()); + ASSERT_EQ(2, iter.char_pos()); + ASSERT_EQ('r', iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_TRUE(iter.end()); + ASSERT_EQ(4, iter.array_pos()); + ASSERT_EQ(3, iter.char_pos()); + + // Don't care what it returns, but this shouldn't crash + iter.get(); + + ASSERT_FALSE(iter.Advance()); +} + +TEST(CharIteratorsTest, TestUTF16) { + string16 empty = UTF8ToUTF16(""); + base::UTF16CharIterator empty_iter(&empty); + ASSERT_TRUE(empty_iter.end()); + ASSERT_EQ(0, empty_iter.array_pos()); + ASSERT_EQ(0, empty_iter.char_pos()); + ASSERT_FALSE(empty_iter.Advance()); + + // This test string contains 4 characters: + // x + // u with circumflex - 2 bytes in UTF8, 1 codeword in UTF16 + // math double-struck A - 4 bytes in UTF8, 2 codewords in UTF16 + // z + string16 str = UTF8ToUTF16("x\303\273\360\235\224\270z"); + base::UTF16CharIterator iter(&str); + ASSERT_FALSE(iter.end()); + ASSERT_EQ(0, iter.array_pos()); + ASSERT_EQ(0, iter.char_pos()); + ASSERT_EQ('x', iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_FALSE(iter.end()); + ASSERT_EQ(1, iter.array_pos()); + ASSERT_EQ(1, iter.char_pos()); + ASSERT_EQ(251, iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_FALSE(iter.end()); + ASSERT_EQ(2, iter.array_pos()); + ASSERT_EQ(2, iter.char_pos()); + ASSERT_EQ(120120, iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_FALSE(iter.end()); + ASSERT_EQ(4, iter.array_pos()); + ASSERT_EQ(3, iter.char_pos()); + ASSERT_EQ('z', iter.get()); + ASSERT_TRUE(iter.Advance()); + + ASSERT_TRUE(iter.end()); + ASSERT_EQ(5, iter.array_pos()); + ASSERT_EQ(4, iter.char_pos()); + + // Don't care what it returns, but this shouldn't crash + iter.get(); + + ASSERT_FALSE(iter.Advance()); +} |