summary refs log tree commit diff stats
path: root/base
diff options
context:
space:
mode:
author juliusa <juliusa@google.com> 2015-08-11 18:30:04 -0700
committer Commit bot <commit-bot@chromium.org> 2015-08-12 01:30:39 +0000
commit 3fc3250d48a1e1d280936a9de4c0875d4ec72e3e (patch)
tree 7782337c72cb103b0afb02545b515c4bec4ad45c /base
parent 1167cb33649c82a33bd18f8c08333113458939ff (diff)
download chromium_src-3fc3250d48a1e1d280936a9de4c0875d4ec72e3e.zip
chromium_src-3fc3250d48a1e1d280936a9de4c0875d4ec72e3e.tar.gz
chromium_src-3fc3250d48a1e1d280936a9de4c0875d4ec72e3e.tar.bz2
Creates BreakIterator::GetWordBreakStatus.
For multilingual spellchecking, we need a function to tell us the current state of the iterator so we know what the spellchecker needs to pay attention to. That is, we need to know if we've found a word or characters that can be skipped over.

TEST=*Skippable*
TEST=*BreakStatus*
BUG=5102

Review URL: https://codereview.chromium.org/1272683002

Cr-Commit-Position: refs/heads/master@{#342958}
Diffstat (limited to 'base')
-rw-r--r-- base/i18n/break_iterator.cc 8
-rw-r--r-- base/i18n/break_iterator.h 27
-rw-r--r-- base/i18n/break_iterator_unittest.cc 85
3 files changed, 118 insertions, 2 deletions
diff --git a/base/i18n/break_iterator.cc b/base/i18n/break_iterator.cc
index e2ed667..bc20fff 100644
--- a/base/i18n/break_iterator.cc
+++ b/base/i18n/break_iterator.cc
@@ -138,10 +138,14 @@ bool BreakIterator::SetText(const base::char16* text, const size_t length) {
}
bool BreakIterator::IsWord() const {
+ return GetWordBreakStatus() == IS_WORD_BREAK;
+}
+
+BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
- return false;
- return status != UBRK_WORD_NONE;
+ return IS_LINE_OR_CHAR_BREAK;
+ return status == UBRK_WORD_NONE ? IS_SKIPPABLE_WORD : IS_WORD_BREAK;
}
bool BreakIterator::IsEndOfWord(size_t position) const {
diff --git a/base/i18n/break_iterator.h b/base/i18n/break_iterator.h
index 19fdbe0..9dbac7c 100644
--- a/base/i18n/break_iterator.h
+++ b/base/i18n/break_iterator.h
@@ -71,6 +71,19 @@ class BASE_I18N_EXPORT BreakIterator {
RULE_BASED,
};
+ enum WordBreakStatus {
+ // The end of text that the iterator recognizes as word characters.
+ // Non-word characters are things like punctuation and spaces.
+ IS_WORD_BREAK,
+ // Characters that the iterator can skip past, such as punctuation,
+ // whitespace, and, if using RULE_BASED mode, characters from another
+ // character set.
+ IS_SKIPPABLE_WORD,
+ // Only used if not in BREAK_WORD or RULE_BASED mode. This is returned for
+ // newlines, line breaks, and character breaks.
+ IS_LINE_OR_CHAR_BREAK
+ };
+
// Requires |str| to live as long as the BreakIterator does.
BreakIterator(const StringPiece16& str, BreakType break_type);
// Make a rule-based iterator. BreakType == RULE_BASED is implied.
@@ -101,6 +114,20 @@ class BASE_I18N_EXPORT BreakIterator {
// this distinction doesn't apply and it always returns false.
bool IsWord() const;
+ // Under BREAK_WORD mode:
+ // - Returns IS_SKIPPABLE_WORD if non-word characters, such as punctuation or
+ // spaces, are found.
+ // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
+ // of word characters.
+ // Under RULE_BASED mode:
+ // - Returns IS_SKIPPABLE_WORD if characters outside the rules' character set
+ // or non-word characters, such as punctuation or spaces, are found.
+ // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
+ // of word characters that are in the rules' character set.
+ // Not under BREAK_WORD or RULE_BASED mode:
+ // - Returns IS_LINE_OR_CHAR_BREAK.
+ BreakIterator::WordBreakStatus GetWordBreakStatus() const;
+
// Under BREAK_WORD mode, returns true if |position| is at the end of word or
// at the start of word. It always returns false under BREAK_LINE and
// BREAK_NEWLINE modes.
diff --git a/base/i18n/break_iterator_unittest.cc b/base/i18n/break_iterator_unittest.cc
index 220a996..c535091 100644
--- a/base/i18n/break_iterator_unittest.cc
+++ b/base/i18n/break_iterator_unittest.cc
@@ -369,5 +369,90 @@ TEST(BreakIteratorTest, GetStringPiece) {
EXPECT_EQ(StringPiece16(ASCIIToUTF16("string")), iter.GetStringPiece());
}
+// Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting
+// IS_LINE_OR_CHAR_BREAK.
+TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) {
+ // A string containing the English word "foo", followed by two Khmer
+ // characters, the English word "Can", and then two Russian characters and
+ // punctuation.
+ base::string16 text(
+ base::WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438..."));
+ BreakIterator iter(text, BreakIterator::BREAK_LINE);
+ ASSERT_TRUE(iter.Init());
+
+ EXPECT_TRUE(iter.Advance());
+ // Finds "foo" and the space.
+ EXPECT_EQ(base::UTF8ToUTF16("foo "), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds the Khmer characters, the next space, and the newline.
+ EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1 \n"), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds "Can" and the space.
+ EXPECT_EQ(base::UTF8ToUTF16("Can "), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds the Russian characters and periods.
+ EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438..."), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
+ EXPECT_FALSE(iter.Advance());
+}
+
+// Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and
+// IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we
+// finish going over non-punctuation characters while IS_SKIPPABLE_WORD should
+// be returned on punctuation and spaces.
+TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) {
+ // A string containing the English word "foo", followed by two Khmer
+ // characters, the English word "Can", and then two Russian characters and
+ // punctuation.
+ base::string16 text(
+ base::WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438..."));
+ BreakIterator iter(text, BreakIterator::BREAK_WORD);
+ ASSERT_TRUE(iter.Init());
+
+ EXPECT_TRUE(iter.Advance());
+ // Finds "foo".
+ EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds the space, and the Khmer characters.
+ EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds the space and the newline.
+ EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_EQ(base::UTF8ToUTF16("\n"), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_TRUE(iter.Advance());
+ // Finds "Can".
+ EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds the space and the Russian characters.
+ EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
+ EXPECT_TRUE(iter.Advance());
+ // Finds the trailing periods.
+ EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_TRUE(iter.Advance());
+ EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
+ EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
+ EXPECT_FALSE(iter.Advance());
+}
+
} // namespace i18n
} // namespace base