diff options
author | cdn@chromium.org <cdn@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-12-15 18:00:54 +0000 |
---|---|---|
committer | cdn@chromium.org <cdn@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-12-15 18:00:54 +0000 |
commit | 55954d890dd1f817ea7f49374f1de14d826b2fa4 (patch) | |
tree | dd9f64a4d64642a6d9d5c8de019dc5c778016071 /base | |
parent | 6df44fb660221182373b00ad27840040167205d7 (diff) | |
download | chromium_src-55954d890dd1f817ea7f49374f1de14d826b2fa4.zip chromium_src-55954d890dd1f817ea7f49374f1de14d826b2fa4.tar.gz chromium_src-55954d890dd1f817ea7f49374f1de14d826b2fa4.tar.bz2 |
Committing second word iterator patch for tsepez. This was originally reviewed at http://codereview.chromium.org/5796003/
Review URL: http://codereview.chromium.org/5707011
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@69278 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/base.gyp | 11 | ||||
-rw-r--r-- | base/i18n/break_iterator.cc (renamed from base/i18n/word_iterator.cc) | 23 | ||||
-rw-r--r-- | base/i18n/break_iterator.h (renamed from base/i18n/word_iterator.h) | 48 | ||||
-rw-r--r-- | base/i18n/break_iterator_unittest.cc (renamed from base/i18n/word_iterator_unittest.cc) | 62 |
4 files changed, 73 insertions, 71 deletions
diff --git a/base/base.gyp b/base/base.gyp index f68359a..70cf465 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -31,6 +31,8 @@ 'base', ], 'sources': [ + 'i18n/break_iterator.cc', + 'i18n/break_iterator.h', 'i18n/char_iterator.cc', 'i18n/char_iterator.h', 'i18n/file_util_icu.cc', @@ -47,8 +49,6 @@ 'i18n/rtl.h', 'i18n/time_formatting.cc', 'i18n/time_formatting.h', - 'i18n/word_iterator.cc', - 'i18n/word_iterator.h', ], }, { @@ -87,11 +87,11 @@ 'gmock_unittest.cc', 'hmac_unittest.cc', 'id_map_unittest.cc', + 'i18n/break_iterator_unittest.cc', 'i18n/char_iterator_unittest.cc', 'i18n/file_util_icu_unittest.cc', 'i18n/icu_string_conversions_unittest.cc', 'i18n/rtl_unittest.cc', - 'i18n/word_iterator_unittest.cc', 'json/json_reader_unittest.cc', 'json/json_writer_unittest.cc', 'json/string_escape_unittest.cc', @@ -173,11 +173,6 @@ 'win/scoped_variant_unittest.cc', 'worker_pool_unittest.cc', ], - 'include_dirs': [ - # word_iterator.h (used by word_iterator_unittest.cc) leaks an ICU - # #include for unicode/uchar.h. This should probably be cleaned up. - '../third_party/icu/public/common', - ], 'dependencies': [ 'base', 'base_i18n', diff --git a/base/i18n/word_iterator.cc b/base/i18n/break_iterator.cc index 7ad9c84..f0f5240 100644 --- a/base/i18n/word_iterator.cc +++ b/base/i18n/break_iterator.cc @@ -2,16 +2,18 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
-#include "base/i18n/word_iterator.h" +#include "base/i18n/break_iterator.h" #include "base/logging.h" #include "unicode/ubrk.h" #include "unicode/uchar.h" #include "unicode/ustring.h" +namespace base { + const size_t npos = -1; -WordIterator::WordIterator(const string16* str, BreakType break_type) +BreakIterator::BreakIterator(const string16* str, BreakType break_type) : iter_(NULL), string_(str), break_type_(break_type), @@ -19,19 +21,19 @@ WordIterator::WordIterator(const string16* str, BreakType break_type) pos_(0) { } -WordIterator::~WordIterator() { +BreakIterator::~BreakIterator() { if (iter_) ubrk_close(iter_); } -bool WordIterator::Init() { +bool BreakIterator::Init() { UErrorCode status = U_ZERO_ERROR; UBreakIteratorType break_type; switch (break_type_) { case BREAK_WORD: break_type = UBRK_WORD; break; - case BREAK_LINE: + case BREAK_SPACE: break_type = UBRK_LINE; break; default: @@ -49,7 +51,7 @@ bool WordIterator::Init() { return true; } -bool WordIterator::Advance() { +bool BreakIterator::Advance() { prev_ = pos_; const int32_t pos = ubrk_next(iter_); if (pos == UBRK_DONE) { @@ -61,11 +63,14 @@ bool WordIterator::Advance() { } } -bool WordIterator::IsWord() const { - return (ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE); +bool BreakIterator::IsWord() const { + return (break_type_ == BREAK_WORD && + ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE); } -string16 WordIterator::GetWord() const { +string16 BreakIterator::GetString() const { DCHECK(prev_ != npos && pos_ != npos); return string_->substr(prev_, pos_ - prev_); } + +} // namespace base diff --git a/base/i18n/word_iterator.h b/base/i18n/break_iterator.h index ada86b9..0e89060 100644 --- a/base/i18n/word_iterator.h +++ b/base/i18n/break_iterator.h @@ -2,19 +2,17 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
-#ifndef BASE_I18N_WORD_ITERATOR_H_ -#define BASE_I18N_WORD_ITERATOR_H_ +#ifndef BASE_I18N_BREAK_ITERATOR_H_ +#define BASE_I18N_BREAK_ITERATOR_H_ #pragma once -#include <vector> - #include "base/basictypes.h" #include "base/string16.h" -// The WordIterator class iterates through the words and word breaks +// The BreakIterator class iterates through the words and word breaks // in a UTF-16 string. // -// It provides two modes, BREAK_WORD and BREAK_LINE, which modify how +// It provides two modes, BREAK_WORD and BREAK_SPACE, which modify how // trailing non-word characters are aggregated into the returned word. // // Under BREAK_WORD mode (more common), the non-word characters are @@ -22,40 +20,41 @@ // the string " foo bar! ", the word breaks are at the periods in // ". .foo. .bar.!. ."). // -// Under BREAK_LINE mode (less common), the non-word characters are +// Under BREAK_SPACE mode (less common), the non-word characters are // included in the word, breaking only when a space-equivalent character // is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ", // the word breaks are at the periods in ". .foo .bar! ."). // -// To extract the words from a string, move a BREAK_WORD WordIterator +// To extract the words from a string, move a BREAK_WORD BreakIterator // through the string and test whether IsWord() is true. E.g., -// WordIterator iter(&str, WordIterator::BREAK_WORD); +// BreakIterator iter(&str, BreakIterator::BREAK_WORD); // if (!iter.Init()) return false; // while (iter.Advance()) { // if (iter.IsWord()) { // // region [iter.prev(),iter.pos()) contains a word. -// VLOG(1) << "word: " << iter.GetWord(); +// VLOG(1) << "word: " << iter.GetString(); // } // } +namespace base { -class WordIterator { +class BreakIterator { public: enum BreakType { BREAK_WORD, - BREAK_LINE + BREAK_SPACE }; - // Requires |str| to live as long as the WordIterator does. 
- WordIterator(const string16* str, BreakType break_type); - ~WordIterator(); + // Requires |str| to live as long as the BreakIterator does. + BreakIterator(const string16* str, BreakType break_type); + ~BreakIterator(); // Init() must be called before any of the iterators are valid. // Returns false if ICU failed to initialize. bool Init(); // Return the current break position within the string, - // or WordIterator::npos when done. + // or BreakIterator::npos when done. size_t pos() const { return pos_; } // Return the value of pos() returned before Advance() was last called. size_t prev() const { return prev_; } @@ -66,15 +65,16 @@ class WordIterator { // last time Advance() returns true.) bool Advance(); - // Returns true if the break we just hit is the end of a word. - // (Otherwise, the break iterator just skipped over e.g. whitespace - // or punctuation.) + // Under BREAK_WORD mode, returns true if the break we just hit is the + // end of a word. (Otherwise, the break iterator just skipped over e.g. + // whitespace or punctuation.) Under BREAK_SPACE mode, this distinction + // doesn't apply and it always retuns false. bool IsWord() const; - // Return the word between prev() and pos(). + // Return the string between prev() and pos(). // Advance() must have been called successfully at least once // for pos() to have advanced to somewhere useful. - string16 GetWord() const; + string16 GetString() const; private: // ICU iterator, avoiding ICU ubrk.h dependence. @@ -92,7 +92,9 @@ class WordIterator { // Previous and current iterator positions. 
size_t prev_, pos_; - DISALLOW_COPY_AND_ASSIGN(WordIterator); + DISALLOW_COPY_AND_ASSIGN(BreakIterator); }; -#endif // BASE_I18N_WORD_ITERATOR_H__ +} // namespace base + +#endif // BASE_I18N_BREAK_ITERATOR_H__ diff --git a/base/i18n/word_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc index 92aff76..8add918 100644 --- a/base/i18n/word_iterator_unittest.cc +++ b/base/i18n/break_iterator_unittest.cc @@ -2,116 +2,116 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "base/i18n/word_iterator.h" +#include "base/i18n/break_iterator.h" #include "base/string_piece.h" #include "base/string_util.h" #include "base/utf_string_conversions.h" #include "testing/gtest/include/gtest/gtest.h" -TEST(WordIteratorTest, BreakWord) { +TEST(BreakIteratorTest, BreakWord) { string16 space(UTF8ToUTF16(" ")); string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); - WordIterator iter(&str, WordIterator::BREAK_WORD); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); + EXPECT_EQ(space, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); + EXPECT_EQ(space, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("!"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("!"), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); + EXPECT_EQ(space, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - 
EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(space, iter.GetWord()); + EXPECT_EQ(space, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); } -TEST(WordIteratorTest, BreakLine) { +TEST(BreakIteratorTest, BreakSpace) { string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); - WordIterator iter(&str, WordIterator::BREAK_LINE); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE); ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString()); EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetWord()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("bar! 
\n"), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); } -TEST(WordIteratorTest, BreakWide16) { +TEST(BreakIteratorTest, BreakWide16) { // "Παγκόσμιος Ιστός" const string16 str(WideToUTF16( L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); const string16 word1(str.substr(0, 10)); const string16 word2(str.substr(11, 5)); - WordIterator iter(&str, WordIterator::BREAK_WORD); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(word1, iter.GetWord()); + EXPECT_EQ(word1, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(word2, iter.GetWord()); + EXPECT_EQ(word2, iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); } -TEST(WordIteratorTest, BreakWide32) { +TEST(BreakIteratorTest, BreakWide32) { // U+1D49C MATHEMATICAL SCRIPT CAPITAL A const char* very_wide_char = "\xF0\x9D\x92\x9C"; const string16 str( UTF8ToUTF16(StringPrintf("%s a", very_wide_char))); const string16 very_wide_word(str.substr(0, 2)); - WordIterator iter(&str, WordIterator::BREAK_WORD); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(very_wide_word, iter.GetWord()); + EXPECT_EQ(very_wide_word, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - 
EXPECT_EQ(UTF8ToUTF16(" "), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_TRUE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16("a"), iter.GetWord()); + EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); } |