diff options
author | tsepez@chromium.org <tsepez@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-12-21 21:25:03 +0000 |
---|---|---|
committer | tsepez@chromium.org <tsepez@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-12-21 21:25:03 +0000 |
commit | 8e574740bffe5733dfdb9960eeff6b5b47e384f3 (patch) | |
tree | e293e47a42c133dbd785d3e810bbaf2d714b3fc2 /base | |
parent | 2caf30e20420b411f4e70001632587ff6df7d476 (diff) | |
download | chromium_src-8e574740bffe5733dfdb9960eeff6b5b47e384f3.zip chromium_src-8e574740bffe5733dfdb9960eeff6b5b47e384f3.tar.gz chromium_src-8e574740bffe5733dfdb9960eeff6b5b47e384f3.tar.bz2 |
Add BREAK_NEWLINE mode to break_iterator.cc
BUG=49747
TEST=BreakIteratorTest.*
Review URL: http://codereview.chromium.org/5935002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@69874 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/i18n/break_iterator.cc | 41 | ||||
-rw-r--r-- | base/i18n/break_iterator.h | 42 | ||||
-rw-r--r-- | base/i18n/break_iterator_unittest.cc | 221 |
3 files changed, 263 insertions, 41 deletions
diff --git a/base/i18n/break_iterator.cc b/base/i18n/break_iterator.cc index acf37cd9..e1b5e29 100644 --- a/base/i18n/break_iterator.cc +++ b/base/i18n/break_iterator.cc @@ -34,11 +34,12 @@ bool BreakIterator::Init() { break_type = UBRK_WORD; break; case BREAK_SPACE: + case BREAK_NEWLINE: break_type = UBRK_LINE; break; default: - NOTREACHED(); - break_type = UBRK_LINE; + NOTREACHED() << "invalid break_type_"; + return false; } iter_ = ubrk_open(break_type, NULL, string_->data(), static_cast<int32_t>(string_->size()), @@ -53,14 +54,36 @@ bool BreakIterator::Init() { } bool BreakIterator::Advance() { + int32_t pos; + int32_t status; prev_ = pos_; - const int32_t pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); - if (pos == UBRK_DONE) { - pos_ = npos; - return false; - } else { - pos_ = static_cast<size_t>(pos); - return true; + switch (break_type_) { + case BREAK_WORD: + case BREAK_SPACE: + pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); + if (pos == UBRK_DONE) { + pos_ = npos; + return false; + } + pos_ = static_cast<size_t>(pos); + return true; + case BREAK_NEWLINE: + do { + pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); + if (pos == UBRK_DONE) { + break; + } + pos_ = static_cast<size_t>(pos); + status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); + } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT); + if (pos == UBRK_DONE && prev_ == pos_) { + pos_ = npos; + return false; + } + return true; + default: + NOTREACHED() << "invalid break_type_"; + return false; } } diff --git a/base/i18n/break_iterator.h b/base/i18n/break_iterator.h index 0e89060..9de7ac7 100644 --- a/base/i18n/break_iterator.h +++ b/base/i18n/break_iterator.h @@ -9,21 +9,27 @@ #include "base/basictypes.h" #include "base/string16.h" -// The BreakIterator class iterates through the words and word breaks -// in a UTF-16 string. +// The BreakIterator class iterates through the words, word breaks, and +// line breaks in a UTF-16 string. 
// -// It provides two modes, BREAK_WORD and BREAK_SPACE, which modify how -// trailing non-word characters are aggregated into the returned word. +// It provides several modes, BREAK_WORD, BREAK_SPACE, and BREAK_NEWLINE, +// which modify how characters are aggregated into the returned string. // -// Under BREAK_WORD mode (more common), the non-word characters are -// not included with a returned word (e.g. in the UTF-16 equivalent of -// the string " foo bar! ", the word breaks are at the periods in -// ". .foo. .bar.!. ."). +// Under BREAK_WORD mode, once a word is encountered any non-word +// characters are not included in the returned string (e.g. in the +// UTF-16 equivalent of the string " foo bar! ", the word breaks are at +// the periods in ". .foo. .bar.!. ."). // -// Under BREAK_SPACE mode (less common), the non-word characters are -// included in the word, breaking only when a space-equivalent character -// is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ", -// the word breaks are at the periods in ". .foo .bar! ."). +// Under BREAK_SPACE mode, once a word is encountered, any non-word +// characters are included in the returned string, breaking only when a +// space-equivalent character is encountered (e.g. in the +// UTF16-equivalent of the string " foo bar! ", the word breaks are at +// the periods in ". .foo .bar! ."). +// +// Under BREAK_NEWLINE mode, all characters are included in the returned +// string, breaking only when a newline-equivalent character is encountered +// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line +// breaks are at the periods in ".foo\n.bar!\n.\n."). // // To extract the words from a string, move a BREAK_WORD BreakIterator // through the string and test whether IsWord() is true. E.g., @@ -42,7 +48,8 @@ class BreakIterator { public: enum BreakType { BREAK_WORD, - BREAK_SPACE + BREAK_SPACE, + BREAK_NEWLINE, }; // Requires |str| to live as long as the BreakIterator does. 
@@ -56,19 +63,20 @@ class BreakIterator { // Return the current break position within the string, // or BreakIterator::npos when done. size_t pos() const { return pos_; } + // Return the value of pos() returned before Advance() was last called. size_t prev() const { return prev_; } // Advance to the next break. Returns false if we've run past the end of - // the string. (Note that the very last "word break" is after the final + // the string. (Note that the very last "break" is after the final // character in the string, and when we advance to that position it's the // last time Advance() returns true.) bool Advance(); // Under BREAK_WORD mode, returns true if the break we just hit is the // end of a word. (Otherwise, the break iterator just skipped over e.g. - // whitespace or punctuation.) Under BREAK_SPACE mode, this distinction - // doesn't apply and it always retuns false. + // whitespace or punctuation.) Under BREAK_SPACE and BREAK_NEWLINE modes, + // this distinction doesn't apply and it always returns false. bool IsWord() const; // Return the string between prev() and pos(). @@ -86,7 +94,7 @@ class BreakIterator { // The string we're iterating over. const string16* string_; - // The breaking style (word/line). + // The breaking style (word/space/newline). BreakType break_type_; // Previous and current iterator positions. diff --git a/base/i18n/break_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc index 8add918..bf4fdc1 100644 --- a/base/i18n/break_iterator_unittest.cc +++ b/base/i18n/break_iterator_unittest.cc @@ -9,9 +9,18 @@ #include "base/utf_string_conversions.h" #include "testing/gtest/include/gtest/gtest.h" +TEST(BreakIteratorTest, BreakWordEmpty) { + string16 empty; + base::BreakIterator iter(&empty, base::BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 
+ EXPECT_FALSE(iter.IsWord()); +} + TEST(BreakIteratorTest, BreakWord) { string16 space(UTF8ToUTF16(" ")); - string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); ASSERT_TRUE(iter.Init()); @@ -47,6 +56,66 @@ TEST(BreakIteratorTest, BreakWord) { EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakWide16) { + // Two greek words separated by space. + const string16 str(WideToUTF16( + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); + const string16 word1(str.substr(0, 10)); + const string16 word2(str.substr(11, 5)); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(word1, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(word2, iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 
+ EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakWide32) { + // U+1D49C MATHEMATICAL SCRIPT CAPITAL A + const char* very_wide_char = "\xF0\x9D\x92\x9C"; + const string16 str( + UTF8ToUTF16(StringPrintf("%s a", very_wide_char))); + const string16 very_wide_word(str.substr(0, 2)); + + base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(very_wide_word, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_TRUE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakSpaceEmpty) { + string16 empty; + base::BreakIterator iter(&empty, base::BreakIterator::BREAK_SPACE); + ASSERT_TRUE(iter.Init()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); } TEST(BreakIteratorTest, BreakSpace) { @@ -70,48 +139,170 @@ TEST(BreakIteratorTest, BreakSpace) { EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); } -TEST(BreakIteratorTest, BreakWide16) { - // "Παγκόσμιος Ιστός" +TEST(BreakIteratorTest, BreakSpaceSP) { + string16 str(UTF8ToUTF16(" foo bar! 
\npouet boom ")); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("boom "), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakSpacekWide16) { + // Two Greek words. const string16 str(WideToUTF16( L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); - const string16 word1(str.substr(0, 10)); + const string16 word1(str.substr(0, 11)); const string16 word2(str.substr(11, 5)); - base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE); ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); + EXPECT_FALSE(iter.IsWord()); EXPECT_EQ(word1, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); - EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); EXPECT_EQ(word2, iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 
+ EXPECT_FALSE(iter.IsWord()); } -TEST(BreakIteratorTest, BreakWide32) { +TEST(BreakIteratorTest, BreakSpaceWide32) { // U+1D49C MATHEMATICAL SCRIPT CAPITAL A const char* very_wide_char = "\xF0\x9D\x92\x9C"; const string16 str( UTF8ToUTF16(StringPrintf("%s a", very_wide_char))); - const string16 very_wide_word(str.substr(0, 2)); + const string16 very_wide_word(str.substr(0, 3)); - base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE); ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); + EXPECT_FALSE(iter.IsWord()); EXPECT_EQ(very_wide_word, iter.GetString()); EXPECT_TRUE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); - EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); + EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakLineEmpty) { + string16 empty; + base::BreakIterator iter(&empty, base::BreakIterator::BREAK_NEWLINE); + ASSERT_TRUE(iter.Init()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 
+ EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakLine) { + string16 nl(UTF8ToUTF16("\n")); + string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom")); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE); + ASSERT_TRUE(iter.Init()); EXPECT_TRUE(iter.Advance()); - EXPECT_TRUE(iter.IsWord()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(nl, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(nl, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakLineNL) { + string16 nl(UTF8ToUTF16("\n")); + string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n")); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(nl, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(nl, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakLineWide16) { + // Two Greek words separated by newline. 
+ const string16 str(WideToUTF16( + L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" + L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2")); + const string16 line1(str.substr(0, 11)); + const string16 line2(str.substr(11, 5)); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(line1, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(line2, iter.GetString()); + EXPECT_FALSE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); +} + +TEST(BreakIteratorTest, BreakLineWide32) { + // U+1D49C MATHEMATICAL SCRIPT CAPITAL A + const char* very_wide_char = "\xF0\x9D\x92\x9C"; + const string16 str( + UTF8ToUTF16(StringPrintf("%s\na", very_wide_char))); + const string16 very_wide_line(str.substr(0, 3)); + base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE); + ASSERT_TRUE(iter.Init()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); + EXPECT_EQ(very_wide_line, iter.GetString()); + EXPECT_TRUE(iter.Advance()); + EXPECT_FALSE(iter.IsWord()); EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); EXPECT_FALSE(iter.Advance()); EXPECT_FALSE(iter.IsWord()); + EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. + EXPECT_FALSE(iter.IsWord()); } |