From 55954d890dd1f817ea7f49374f1de14d826b2fa4 Mon Sep 17 00:00:00 2001 From: "cdn@chromium.org" Date: Wed, 15 Dec 2010 18:00:54 +0000 Subject: Commiting second word iterator patch for tsepez. this was originally reviewed at http://codereview.chromium.org/5796003/ Review URL: http://codereview.chromium.org/5707011 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@69278 0039d316-1c4b-4281-b951-d872f2087c98 --- base/base.gyp | 11 +--- base/i18n/break_iterator.cc | 76 +++++++++++++++++++++++ base/i18n/break_iterator.h | 100 ++++++++++++++++++++++++++++++ base/i18n/break_iterator_unittest.cc | 117 +++++++++++++++++++++++++++++++++++ base/i18n/word_iterator.cc | 71 --------------------- base/i18n/word_iterator.h | 98 ----------------------------- base/i18n/word_iterator_unittest.cc | 117 ----------------------------------- 7 files changed, 296 insertions(+), 294 deletions(-) create mode 100644 base/i18n/break_iterator.cc create mode 100644 base/i18n/break_iterator.h create mode 100644 base/i18n/break_iterator_unittest.cc delete mode 100644 base/i18n/word_iterator.cc delete mode 100644 base/i18n/word_iterator.h delete mode 100644 base/i18n/word_iterator_unittest.cc (limited to 'base') diff --git a/base/base.gyp b/base/base.gyp index f68359a..70cf465 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -31,6 +31,8 @@ 'base', ], 'sources': [ + 'i18n/break_iterator.cc', + 'i18n/break_iterator.h', 'i18n/char_iterator.cc', 'i18n/char_iterator.h', 'i18n/file_util_icu.cc', @@ -47,8 +49,6 @@ 'i18n/rtl.h', 'i18n/time_formatting.cc', 'i18n/time_formatting.h', - 'i18n/word_iterator.cc', - 'i18n/word_iterator.h', ], }, { @@ -87,11 +87,11 @@ 'gmock_unittest.cc', 'hmac_unittest.cc', 'id_map_unittest.cc', + 'i18n/break_iterator_unittest.cc', 'i18n/char_iterator_unittest.cc', 'i18n/file_util_icu_unittest.cc', 'i18n/icu_string_conversions_unittest.cc', 'i18n/rtl_unittest.cc', - 'i18n/word_iterator_unittest.cc', 'json/json_reader_unittest.cc', 'json/json_writer_unittest.cc', 
'json/string_escape_unittest.cc', @@ -173,11 +173,6 @@ 'win/scoped_variant_unittest.cc', 'worker_pool_unittest.cc', ], - 'include_dirs': [ - # word_iterator.h (used by word_iterator_unittest.cc) leaks an ICU - # #include for unicode/uchar.h. This should probably be cleaned up. - '../third_party/icu/public/common', - ], 'dependencies': [ 'base', 'base_i18n', diff --git a/base/i18n/break_iterator.cc b/base/i18n/break_iterator.cc new file mode 100644 index 0000000..f0f5240 --- /dev/null +++ b/base/i18n/break_iterator.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/break_iterator.h" + +#include "base/logging.h" +#include "unicode/ubrk.h" +#include "unicode/uchar.h" +#include "unicode/ustring.h" + +namespace base { + +const size_t npos = -1; + +BreakIterator::BreakIterator(const string16* str, BreakType break_type) + : iter_(NULL), + string_(str), + break_type_(break_type), + prev_(npos), + pos_(0) { +} + +BreakIterator::~BreakIterator() { + if (iter_) + ubrk_close(iter_); +} + +bool BreakIterator::Init() { + UErrorCode status = U_ZERO_ERROR; + UBreakIteratorType break_type; + switch (break_type_) { + case BREAK_WORD: + break_type = UBRK_WORD; + break; + case BREAK_SPACE: + break_type = UBRK_LINE; + break; + default: + NOTREACHED(); + break_type = UBRK_LINE; + } + iter_ = ubrk_open(break_type, NULL, + string_->data(), static_cast(string_->size()), + &status); + if (U_FAILURE(status)) { + NOTREACHED() << "ubrk_open failed"; + return false; + } + ubrk_first(iter_); // Move the iterator to the beginning of the string. 
+ return true; +} + +bool BreakIterator::Advance() { + prev_ = pos_; + const int32_t pos = ubrk_next(iter_); + if (pos == UBRK_DONE) { + pos_ = npos; + return false; + } else { + pos_ = static_cast(pos); + return true; + } +} + +bool BreakIterator::IsWord() const { + return (break_type_ == BREAK_WORD && + ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE); +} + +string16 BreakIterator::GetString() const { + DCHECK(prev_ != npos && pos_ != npos); + return string_->substr(prev_, pos_ - prev_); +} + +} // namespace base diff --git a/base/i18n/break_iterator.h b/base/i18n/break_iterator.h new file mode 100644 index 0000000..0e89060 --- /dev/null +++ b/base/i18n/break_iterator.h @@ -0,0 +1,100 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_I18N_BREAK_ITERATOR_H_ +#define BASE_I18N_BREAK_ITERATOR_H_ +#pragma once + +#include "base/basictypes.h" +#include "base/string16.h" + +// The BreakIterator class iterates through the words and word breaks +// in a UTF-16 string. +// +// It provides two modes, BREAK_WORD and BREAK_SPACE, which modify how +// trailing non-word characters are aggregated into the returned word. +// +// Under BREAK_WORD mode (more common), the non-word characters are +// not included with a returned word (e.g. in the UTF-16 equivalent of +// the string " foo bar! ", the word breaks are at the periods in +// ". .foo. .bar.!. ."). +// +// Under BREAK_SPACE mode (less common), the non-word characters are +// included in the word, breaking only when a space-equivalent character +// is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ", +// the word breaks are at the periods in ". .foo .bar! ."). +// +// To extract the words from a string, move a BREAK_WORD BreakIterator +// through the string and test whether IsWord() is true. 
E.g., +// BreakIterator iter(&str, BreakIterator::BREAK_WORD); +// if (!iter.Init()) return false; +// while (iter.Advance()) { +// if (iter.IsWord()) { +// // region [iter.prev(),iter.pos()) contains a word. +// VLOG(1) << "word: " << iter.GetString(); +// } +// } + +namespace base { + +class BreakIterator { + public: + enum BreakType { + BREAK_WORD, + BREAK_SPACE + }; + + // Requires |str| to live as long as the BreakIterator does. + BreakIterator(const string16* str, BreakType break_type); + ~BreakIterator(); + + // Init() must be called before any of the iterators are valid. + // Returns false if ICU failed to initialize. + bool Init(); + + // Return the current break position within the string, + // or BreakIterator::npos when done. + size_t pos() const { return pos_; } + // Return the value of pos() returned before Advance() was last called. + size_t prev() const { return prev_; } + + // Advance to the next break. Returns false if we've run past the end of + // the string. (Note that the very last "word break" is after the final + // character in the string, and when we advance to that position it's the + // last time Advance() returns true.) + bool Advance(); + + // Under BREAK_WORD mode, returns true if the break we just hit is the + // end of a word. (Otherwise, the break iterator just skipped over e.g. + // whitespace or punctuation.) Under BREAK_SPACE mode, this distinction + // doesn't apply and it always returns false. + bool IsWord() const; + + // Return the string between prev() and pos(). + // Advance() must have been called successfully at least once + // for pos() to have advanced to somewhere useful. + string16 GetString() const; + + private: + // ICU iterator, avoiding ICU ubrk.h dependence. + // This is actually an ICU UBreakIterator* type, which turns out to be + // a typedef for a void* in the ICU headers. Using void* directly prevents + // callers from needing access to the ICU public headers directory. 
+ void* iter_; + + // The string we're iterating over. + const string16* string_; + + // The breaking style (word/line). + BreakType break_type_; + + // Previous and current iterator positions. + size_t prev_, pos_; + + DISALLOW_COPY_AND_ASSIGN(BreakIterator); +}; + +} // namespace base + +#endif // BASE_I18N_BREAK_ITERATOR_H__ diff --git a/base/i18n/break_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc new file mode 100644 index 0000000..8add918 --- /dev/null +++ b/base/i18n/break_iterator_unittest.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/break_iterator.h" + +#include "base/string_piece.h" +#include "base/string_util.h" +#include "base/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +TEST(BreakIteratorTest, BreakWord) { + string16 space(UTF8ToUTF16(" ")); + + string16 str(UTF8ToUTF16(" foo bar! 
\npouet boom")); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(space, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(space, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("!"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(space, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(space, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakSpace) { + string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("bar! 
\n"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakWide16) { + // "Παγκόσμιος Ιστός" + const string16 str(WideToUTF16( + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); + const string16 word1(str.substr(0, 10)); + const string16 word2(str.substr(11, 5)); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(word1, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(word2, iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakWide32) { + // U+1D49C MATHEMATICAL SCRIPT CAPITAL A + const char* very_wide_char = "\xF0\x9D\x92\x9C"; + const string16 str( + UTF8ToUTF16(StringPrintf("%s a", very_wide_char))); + const string16 very_wide_word(str.substr(0, 2)); + + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(very_wide_word, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); +} diff --git a/base/i18n/word_iterator.cc b/base/i18n/word_iterator.cc deleted file mode 100644 index 7ad9c84..0000000 --- a/base/i18n/word_iterator.cc +++ 
/dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "base/i18n/word_iterator.h" - -#include "base/logging.h" -#include "unicode/ubrk.h" -#include "unicode/uchar.h" -#include "unicode/ustring.h" - -const size_t npos = -1; - -WordIterator::WordIterator(const string16* str, BreakType break_type) - : iter_(NULL), - string_(str), - break_type_(break_type), - prev_(npos), - pos_(0) { -} - -WordIterator::~WordIterator() { - if (iter_) - ubrk_close(iter_); -} - -bool WordIterator::Init() { - UErrorCode status = U_ZERO_ERROR; - UBreakIteratorType break_type; - switch (break_type_) { - case BREAK_WORD: - break_type = UBRK_WORD; - break; - case BREAK_LINE: - break_type = UBRK_LINE; - break; - default: - NOTREACHED(); - break_type = UBRK_LINE; - } - iter_ = ubrk_open(break_type, NULL, - string_->data(), static_cast(string_->size()), - &status); - if (U_FAILURE(status)) { - NOTREACHED() << "ubrk_open failed"; - return false; - } - ubrk_first(iter_); // Move the iterator to the beginning of the string. - return true; -} - -bool WordIterator::Advance() { - prev_ = pos_; - const int32_t pos = ubrk_next(iter_); - if (pos == UBRK_DONE) { - pos_ = npos; - return false; - } else { - pos_ = static_cast(pos); - return true; - } -} - -bool WordIterator::IsWord() const { - return (ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE); -} - -string16 WordIterator::GetWord() const { - DCHECK(prev_ != npos && pos_ != npos); - return string_->substr(prev_, pos_ - prev_); -} diff --git a/base/i18n/word_iterator.h b/base/i18n/word_iterator.h deleted file mode 100644 index ada86b9..0000000 --- a/base/i18n/word_iterator.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#ifndef BASE_I18N_WORD_ITERATOR_H_ -#define BASE_I18N_WORD_ITERATOR_H_ -#pragma once - -#include - -#include "base/basictypes.h" -#include "base/string16.h" - -// The WordIterator class iterates through the words and word breaks -// in a UTF-16 string. -// -// It provides two modes, BREAK_WORD and BREAK_LINE, which modify how -// trailing non-word characters are aggregated into the returned word. -// -// Under BREAK_WORD mode (more common), the non-word characters are -// not included with a returned word (e.g. in the UTF-16 equivalent of -// the string " foo bar! ", the word breaks are at the periods in -// ". .foo. .bar.!. ."). -// -// Under BREAK_LINE mode (less common), the non-word characters are -// included in the word, breaking only when a space-equivalent character -// is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ", -// the word breaks are at the periods in ". .foo .bar! ."). -// -// To extract the words from a string, move a BREAK_WORD WordIterator -// through the string and test whether IsWord() is true. E.g., -// WordIterator iter(&str, WordIterator::BREAK_WORD); -// if (!iter.Init()) return false; -// while (iter.Advance()) { -// if (iter.IsWord()) { -// // region [iter.prev(),iter.pos()) contains a word. -// VLOG(1) << "word: " << iter.GetWord(); -// } -// } - - -class WordIterator { - public: - enum BreakType { - BREAK_WORD, - BREAK_LINE - }; - - // Requires |str| to live as long as the WordIterator does. - WordIterator(const string16* str, BreakType break_type); - ~WordIterator(); - - // Init() must be called before any of the iterators are valid. - // Returns false if ICU failed to initialize. - bool Init(); - - // Return the current break position within the string, - // or WordIterator::npos when done. - size_t pos() const { return pos_; } - // Return the value of pos() returned before Advance() was last called. - size_t prev() const { return prev_; } - - // Advance to the next break. 
Returns false if we've run past the end of - // the string. (Note that the very last "word break" is after the final - // character in the string, and when we advance to that position it's the - // last time Advance() returns true.) - bool Advance(); - - // Returns true if the break we just hit is the end of a word. - // (Otherwise, the break iterator just skipped over e.g. whitespace - // or punctuation.) - bool IsWord() const; - - // Return the word between prev() and pos(). - // Advance() must have been called successfully at least once - // for pos() to have advanced to somewhere useful. - string16 GetWord() const; - - private: - // ICU iterator, avoiding ICU ubrk.h dependence. - // This is actually an ICU UBreakiterator* type, which turns out to be - // a typedef for a void* in the ICU headers. Using void* directly prevents - // callers from needing access to the ICU public headers directory. - void* iter_; - - // The string we're iterating over. - const string16* string_; - - // The breaking style (word/line). - BreakType break_type_; - - // Previous and current iterator positions. - size_t prev_, pos_; - - DISALLOW_COPY_AND_ASSIGN(WordIterator); -}; - -#endif // BASE_I18N_WORD_ITERATOR_H__ diff --git a/base/i18n/word_iterator_unittest.cc b/base/i18n/word_iterator_unittest.cc deleted file mode 100644 index 92aff76..0000000 --- a/base/i18n/word_iterator_unittest.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "base/i18n/word_iterator.h" - -#include "base/string_piece.h" -#include "base/string_util.h" -#include "base/utf_string_conversions.h" -#include "testing/gtest/include/gtest/gtest.h" - -TEST(WordIteratorTest, BreakWord) { - string16 space(UTF8ToUTF16(" ")); - - string16 str(UTF8ToUTF16(" foo bar! 
\npouet boom")); - WordIterator iter(&str, WordIterator::BREAK_WORD); - ASSERT_TRUE(iter.Init()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("!"), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetWord()); - EXPECT_FALSE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); -} - -TEST(WordIteratorTest, BreakLine) { - string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); - WordIterator iter(&str, WordIterator::BREAK_LINE); - ASSERT_TRUE(iter.Init()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("bar! 
\n"), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetWord()); - EXPECT_FALSE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); -} - -TEST(WordIteratorTest, BreakWide16) { - // "Παγκόσμιος Ιστός" - const string16 str(WideToUTF16( - L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" - L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); - const string16 word1(str.substr(0, 10)); - const string16 word2(str.substr(11, 5)); - WordIterator iter(&str, WordIterator::BREAK_WORD); - ASSERT_TRUE(iter.Init()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(word1, iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(word2, iter.GetWord()); - EXPECT_FALSE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); -} - -TEST(WordIteratorTest, BreakWide32) { - // U+1D49C MATHEMATICAL SCRIPT CAPITAL A - const char* very_wide_char = "\xF0\x9D\x92\x9C"; - const string16 str( - UTF8ToUTF16(StringPrintf("%s a", very_wide_char))); - const string16 very_wide_word(str.substr(0, 2)); - - WordIterator iter(&str, WordIterator::BREAK_WORD); - ASSERT_TRUE(iter.Init()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(very_wide_word, iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("a"), iter.GetWord()); - EXPECT_FALSE(iter.Advance()); - EXPECT_FALSE(iter.IsWord()); -} -- cgit v1.1