summaryrefslogtreecommitdiffstats
path: root/base
diff options
context:
space:
mode:
authortsepez@chromium.org <tsepez@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-12-21 21:25:03 +0000
committertsepez@chromium.org <tsepez@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-12-21 21:25:03 +0000
commit8e574740bffe5733dfdb9960eeff6b5b47e384f3 (patch)
treee293e47a42c133dbd785d3e810bbaf2d714b3fc2 /base
parent2caf30e20420b411f4e70001632587ff6df7d476 (diff)
downloadchromium_src-8e574740bffe5733dfdb9960eeff6b5b47e384f3.zip
chromium_src-8e574740bffe5733dfdb9960eeff6b5b47e384f3.tar.gz
chromium_src-8e574740bffe5733dfdb9960eeff6b5b47e384f3.tar.bz2
Add BREAK_NEWLINE mode to break_iterator.cc
BUG=49747 TEST=BreakIteratorTest.* Review URL: http://codereview.chromium.org/5935002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@69874 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--base/i18n/break_iterator.cc41
-rw-r--r--base/i18n/break_iterator.h42
-rw-r--r--base/i18n/break_iterator_unittest.cc221
3 files changed, 263 insertions, 41 deletions
diff --git a/base/i18n/break_iterator.cc b/base/i18n/break_iterator.cc
index acf37cd9..e1b5e29 100644
--- a/base/i18n/break_iterator.cc
+++ b/base/i18n/break_iterator.cc
@@ -34,11 +34,12 @@ bool BreakIterator::Init() {
break_type = UBRK_WORD;
break;
case BREAK_SPACE:
+ case BREAK_NEWLINE:
break_type = UBRK_LINE;
break;
default:
- NOTREACHED();
- break_type = UBRK_LINE;
+ NOTREACHED() << "invalid break_type_";
+ return false;
}
iter_ = ubrk_open(break_type, NULL,
string_->data(), static_cast<int32_t>(string_->size()),
@@ -53,14 +54,36 @@ bool BreakIterator::Init() {
}
bool BreakIterator::Advance() {
+ int32_t pos;
+ int32_t status;
prev_ = pos_;
- const int32_t pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
- if (pos == UBRK_DONE) {
- pos_ = npos;
- return false;
- } else {
- pos_ = static_cast<size_t>(pos);
- return true;
+ switch (break_type_) {
+ case BREAK_WORD:
+ case BREAK_SPACE:
+ pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
+ if (pos == UBRK_DONE) {
+ pos_ = npos;
+ return false;
+ }
+ pos_ = static_cast<size_t>(pos);
+ return true;
+ case BREAK_NEWLINE:
+ do {
+ pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
+ if (pos == UBRK_DONE) {
+ break;
+ }
+ pos_ = static_cast<size_t>(pos);
+ status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
+ } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
+ if (pos == UBRK_DONE && prev_ == pos_) {
+ pos_ = npos;
+ return false;
+ }
+ return true;
+ default:
+ NOTREACHED() << "invalid break_type_";
+ return false;
}
}
diff --git a/base/i18n/break_iterator.h b/base/i18n/break_iterator.h
index 0e89060..9de7ac7 100644
--- a/base/i18n/break_iterator.h
+++ b/base/i18n/break_iterator.h
@@ -9,21 +9,27 @@
#include "base/basictypes.h"
#include "base/string16.h"
-// The BreakIterator class iterates through the words and word breaks
-// in a UTF-16 string.
+// The BreakIterator class iterates through the words, word breaks, and
+// line breaks in a UTF-16 string.
//
-// It provides two modes, BREAK_WORD and BREAK_SPACE, which modify how
-// trailing non-word characters are aggregated into the returned word.
+// It provides several modes, BREAK_WORD, BREAK_SPACE, and BREAK_NEWLINE,
+// which modify how characters are aggregated into the returned string.
//
-// Under BREAK_WORD mode (more common), the non-word characters are
-// not included with a returned word (e.g. in the UTF-16 equivalent of
-// the string " foo bar! ", the word breaks are at the periods in
-// ". .foo. .bar.!. .").
+// Under BREAK_WORD mode, once a word is encountered any non-word
+// characters are not included in the returned string (e.g. in the
+// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
+// the periods in ". .foo. .bar.!. .").
//
-// Under BREAK_SPACE mode (less common), the non-word characters are
-// included in the word, breaking only when a space-equivalent character
-// is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ",
-// the word breaks are at the periods in ". .foo .bar! .").
+// Under BREAK_SPACE mode, once a word is encountered, any non-word
+// characters are included in the returned string, breaking only when a
+// space-equivalent character is encountered (e.g. in the
+// UTF16-equivalent of the string " foo bar! ", the word breaks are at
+// the periods in ". .foo .bar! .").
+//
+// Under BREAK_NEWLINE mode, all characters are included in the returned
+// string, breaking only when a newline-equivalent character is encountered
+// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
+// breaks are at the periods in ".foo\n.bar!\n.\n.").
//
// To extract the words from a string, move a BREAK_WORD BreakIterator
// through the string and test whether IsWord() is true. E.g.,
@@ -42,7 +48,8 @@ class BreakIterator {
public:
enum BreakType {
BREAK_WORD,
- BREAK_SPACE
+ BREAK_SPACE,
+ BREAK_NEWLINE,
};
// Requires |str| to live as long as the BreakIterator does.
@@ -56,19 +63,20 @@ class BreakIterator {
// Return the current break position within the string,
// or BreakIterator::npos when done.
size_t pos() const { return pos_; }
+
// Return the value of pos() returned before Advance() was last called.
size_t prev() const { return prev_; }
// Advance to the next break. Returns false if we've run past the end of
- // the string. (Note that the very last "word break" is after the final
+ // the string. (Note that the very last "break" is after the final
// character in the string, and when we advance to that position it's the
// last time Advance() returns true.)
bool Advance();
// Under BREAK_WORD mode, returns true if the break we just hit is the
// end of a word. (Otherwise, the break iterator just skipped over e.g.
- // whitespace or punctuation.) Under BREAK_SPACE mode, this distinction
- // doesn't apply and it always retuns false.
+ // whitespace or punctuation.) Under BREAK_SPACE and BREAK_NEWLINE modes,
+ // this distinction doesn't apply and it always returns false.
bool IsWord() const;
// Return the string between prev() and pos().
@@ -86,7 +94,7 @@ class BreakIterator {
// The string we're iterating over.
const string16* string_;
- // The breaking style (word/line).
+ // The breaking style (word/space/newline).
BreakType break_type_;
// Previous and current iterator positions.
diff --git a/base/i18n/break_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc
index 8add918..bf4fdc1 100644
--- a/base/i18n/break_iterator_unittest.cc
+++ b/base/i18n/break_iterator_unittest.cc
@@ -9,9 +9,18 @@
#include "base/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
+TEST(BreakIteratorTest, BreakWordEmpty) {
+ string16 empty;
+ base::BreakIterator iter(&empty, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
TEST(BreakIteratorTest, BreakWord) {
string16 space(UTF8ToUTF16(" "));
-
string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
ASSERT_TRUE(iter.Init());
@@ -47,6 +56,66 @@ TEST(BreakIteratorTest, BreakWord) {
EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakWide16) {
+ // Two Greek words separated by space.
+ const string16 str(WideToUTF16(
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
+ const string16 word1(str.substr(0, 10));
+ const string16 word2(str.substr(11, 5));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(word1, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(word2, iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakWide32) {
+ // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+ const char* very_wide_char = "\xF0\x9D\x92\x9C";
+ const string16 str(
+ UTF8ToUTF16(StringPrintf("%s a", very_wide_char)));
+ const string16 very_wide_word(str.substr(0, 2));
+
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(very_wide_word, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_TRUE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpaceEmpty) {
+ string16 empty;
+ base::BreakIterator iter(&empty, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
}
TEST(BreakIteratorTest, BreakSpace) {
@@ -70,48 +139,170 @@ TEST(BreakIteratorTest, BreakSpace) {
EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
}
-TEST(BreakIteratorTest, BreakWide16) {
- // "Παγκόσμιος Ιστός"
+TEST(BreakIteratorTest, BreakSpaceSP) {
+ string16 str(UTF8ToUTF16(" foo bar! \npouet boom "));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("boom "), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakSpacekWide16) {
+ // Two Greek words.
const string16 str(WideToUTF16(
L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
- const string16 word1(str.substr(0, 10));
+ const string16 word1(str.substr(0, 11));
const string16 word2(str.substr(11, 5));
- base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
+ EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(word1, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
- EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
EXPECT_EQ(word2, iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
}
-TEST(BreakIteratorTest, BreakWide32) {
+TEST(BreakIteratorTest, BreakSpaceWide32) {
// U+1D49C MATHEMATICAL SCRIPT CAPITAL A
const char* very_wide_char = "\xF0\x9D\x92\x9C";
const string16 str(
UTF8ToUTF16(StringPrintf("%s a", very_wide_char)));
- const string16 very_wide_word(str.substr(0, 2));
+ const string16 very_wide_word(str.substr(0, 3));
- base::BreakIterator iter(&str, base::BreakIterator::BREAK_WORD);
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_SPACE);
ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
+ EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(very_wide_word, iter.GetString());
EXPECT_TRUE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
- EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineEmpty) {
+ string16 empty;
+ base::BreakIterator iter(&empty, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLine) {
+ string16 nl(UTF8ToUTF16("\n"));
+ string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
EXPECT_TRUE(iter.Advance());
- EXPECT_TRUE(iter.IsWord());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineNL) {
+ string16 nl(UTF8ToUTF16("\n"));
+ string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(nl, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineWide16) {
+ // Two Greek words separated by newline.
+ const string16 str(WideToUTF16(
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));
+ const string16 line1(str.substr(0, 11));
+ const string16 line2(str.substr(11, 5));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(line1, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(line2, iter.GetString());
+ EXPECT_FALSE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
+}
+
+TEST(BreakIteratorTest, BreakLineWide32) {
+ // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
+ const char* very_wide_char = "\xF0\x9D\x92\x9C";
+ const string16 str(
+ UTF8ToUTF16(StringPrintf("%s\na", very_wide_char)));
+ const string16 very_wide_line(str.substr(0, 3));
+ base::BreakIterator iter(&str, base::BreakIterator::BREAK_NEWLINE);
+ ASSERT_TRUE(iter.Init());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
+ EXPECT_EQ(very_wide_line, iter.GetString());
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_FALSE(iter.IsWord());
EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
EXPECT_FALSE(iter.Advance());
EXPECT_FALSE(iter.IsWord());
+ EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end.
+ EXPECT_FALSE(iter.IsWord());
}