diff options
author | juliusa <juliusa@google.com> | 2015-08-11 18:30:04 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-08-12 01:30:39 +0000 |
commit | 3fc3250d48a1e1d280936a9de4c0875d4ec72e3e (patch) | |
tree | 7782337c72cb103b0afb02545b515c4bec4ad45c /base | |
parent | 1167cb33649c82a33bd18f8c08333113458939ff (diff) | |
download | chromium_src-3fc3250d48a1e1d280936a9de4c0875d4ec72e3e.zip chromium_src-3fc3250d48a1e1d280936a9de4c0875d4ec72e3e.tar.gz chromium_src-3fc3250d48a1e1d280936a9de4c0875d4ec72e3e.tar.bz2 |
Creates BreakIterator::GetWordBreakStatus.
For multilingual spellchecking, we need a function that reports the
current state of the iterator, so that the spellchecker knows which
segments it has to check. That is, we need to know whether the iterator
has just found a word or a run of characters (such as punctuation or
whitespace) that can be skipped over.
TEST=*Skippable*
TEST=*BreakStatus*
BUG=5102
Review URL: https://codereview.chromium.org/1272683002
Cr-Commit-Position: refs/heads/master@{#342958}
Diffstat (limited to 'base')
-rw-r--r-- | base/i18n/break_iterator.cc | 8 | ||||
-rw-r--r-- | base/i18n/break_iterator.h | 27 | ||||
-rw-r--r-- | base/i18n/break_iterator_unittest.cc | 85 |
3 files changed, 118 insertions, 2 deletions
diff --git a/base/i18n/break_iterator.cc b/base/i18n/break_iterator.cc index e2ed667..bc20fff 100644 --- a/base/i18n/break_iterator.cc +++ b/base/i18n/break_iterator.cc @@ -138,10 +138,14 @@ bool BreakIterator::SetText(const base::char16* text, const size_t length) { } bool BreakIterator::IsWord() const { + return GetWordBreakStatus() == IS_WORD_BREAK; +} + +BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const { int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) - return false; - return status != UBRK_WORD_NONE; + return IS_LINE_OR_CHAR_BREAK; + return status == UBRK_WORD_NONE ? IS_SKIPPABLE_WORD : IS_WORD_BREAK; } bool BreakIterator::IsEndOfWord(size_t position) const { diff --git a/base/i18n/break_iterator.h b/base/i18n/break_iterator.h index 19fdbe0..9dbac7c 100644 --- a/base/i18n/break_iterator.h +++ b/base/i18n/break_iterator.h @@ -71,6 +71,19 @@ class BASE_I18N_EXPORT BreakIterator { RULE_BASED, }; + enum WordBreakStatus { + // The end of text that the iterator recognizes as word characters. + // Non-word characters are things like punctuation and spaces. + IS_WORD_BREAK, + // Characters that the iterator can skip past, such as punctuation, + // whitespace, and, if using RULE_BASED mode, characters from another + // character set. + IS_SKIPPABLE_WORD, + // Only used if not in BREAK_WORD or RULE_BASED mode. This is returned for + // newlines, line breaks, and character breaks. + IS_LINE_OR_CHAR_BREAK + }; + // Requires |str| to live as long as the BreakIterator does. BreakIterator(const StringPiece16& str, BreakType break_type); // Make a rule-based iterator. BreakType == RULE_BASED is implied. @@ -101,6 +114,20 @@ class BASE_I18N_EXPORT BreakIterator { // this distinction doesn't apply and it always returns false. 
bool IsWord() const; + // Under BREAK_WORD mode: + // - Returns IS_SKIPPABLE_WORD if non-word characters, such as punctuation or + // spaces, are found. + // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence + // of word characters. + // Under RULE_BASED mode: + // - Returns IS_SKIPPABLE_WORD if characters outside the rules' character set + // or non-word characters, such as punctuation or spaces, are found. + // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence + // of word characters that are in the rules' character set. + // Not under BREAK_WORD or RULE_BASED mode: + // - Returns IS_LINE_OR_CHAR_BREAK. + BreakIterator::WordBreakStatus GetWordBreakStatus() const; + // Under BREAK_WORD mode, returns true if |position| is at the end of word or // at the start of word. It always returns false under BREAK_LINE and // BREAK_NEWLINE modes. diff --git a/base/i18n/break_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc index 220a996..c535091 100644 --- a/base/i18n/break_iterator_unittest.cc +++ b/base/i18n/break_iterator_unittest.cc @@ -369,5 +369,90 @@ TEST(BreakIteratorTest, GetStringPiece) { EXPECT_EQ(StringPiece16(ASCIIToUTF16("string")), iter.GetStringPiece()); } +// Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting +// IS_LINE_OR_CHAR_BREAK. +TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) { + // A string containing the English word "foo", followed by two Khmer + // characters, the English word "Can", and then two Russian characters and + // punctuation. + base::string16 text( + base::WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438...")); + BreakIterator iter(text, BreakIterator::BREAK_LINE); + ASSERT_TRUE(iter.Init()); + + EXPECT_TRUE(iter.Advance()); + // Finds "foo" and the space. 
+ EXPECT_EQ(base::UTF8ToUTF16("foo "), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds the Khmer characters, the next space, and the newline. + EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1 \n"), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds "Can" and the space. + EXPECT_EQ(base::UTF8ToUTF16("Can "), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds the Russian characters and periods. + EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438..."), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK); + EXPECT_FALSE(iter.Advance()); +} + +// Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and +// IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we +// finish going over non-punctuation characters while IS_SKIPPABLE_WORD should +// be returned on punctuation and spaces. +TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) { + // A string containing the English word "foo", followed by two Khmer + // characters, the English word "Can", and then two Russian characters and + // punctuation. + base::string16 text( + base::WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438...")); + BreakIterator iter(text, BreakIterator::BREAK_WORD); + ASSERT_TRUE(iter.Init()); + + EXPECT_TRUE(iter.Advance()); + // Finds "foo". + EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds the space, and the Khmer characters. 
+ EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_TRUE(iter.Advance()); + EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds the space and the newline. + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_TRUE(iter.Advance()); + EXPECT_EQ(base::UTF8ToUTF16("\n"), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_TRUE(iter.Advance()); + // Finds "Can". + EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds the space and the Russian characters. + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_TRUE(iter.Advance()); + EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); + EXPECT_TRUE(iter.Advance()); + // Finds the trailing periods. + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_TRUE(iter.Advance()); + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_TRUE(iter.Advance()); + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); + EXPECT_FALSE(iter.Advance()); +} + } // namespace i18n } // namespace base |