summaryrefslogtreecommitdiffstats
path: root/chrome
diff options
context:
space:
mode:
Diffstat (limited to 'chrome')
-rwxr-xr-xchrome/chrome_tests.gypi1
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.cc463
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.h194
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc129
4 files changed, 496 insertions, 291 deletions
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index afe2ba5..66e995b 100755
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -884,6 +884,7 @@
'renderer/render_widget_unittest.cc',
'renderer/renderer_main_unittest.cc',
'renderer/spellchecker/spellcheck_unittest.cc',
+ 'renderer/spellchecker/spellcheck_worditerator_unittest.cc',
'renderer/translate/page_translator_unittest.cc',
'test/browser_with_test_window_test.cc',
'test/browser_with_test_window_test.h',
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 827d9ee..0806f50 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -10,265 +10,334 @@
#include "base/basictypes.h"
#include "base/string_util.h"
#include "chrome/renderer/spellchecker/spellcheck.h"
-
#include "third_party/icu/public/common/unicode/normlzr.h"
#include "third_party/icu/public/common/unicode/schriter.h"
-#include "third_party/icu/public/common/unicode/uchar.h"
#include "third_party/icu/public/common/unicode/uscript.h"
-#include "third_party/icu/public/common/unicode/uset.h"
#include "third_party/icu/public/i18n/unicode/ulocdata.h"
-SpellcheckCharAttribute::SpellcheckCharAttribute() {
- InitializeScriptTable();
-
- // Even though many dictionaries treats numbers and contractions as words and
- // treats USCRIPT_COMMON characters as word characters, the
- // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word
- // characters to strictly-distinguish contraction characters from word
- // characters.
- SetWordScript(USCRIPT_COMMON, false);
-
- // Initialize the table of characters used for contractions.
- // This array consists of the 'Midletter' and 'MidNumLet' characters of the
- // word-break property list provided by Unicode, Inc.:
- // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
- static const UChar32 kMidLetters[] = {
- L'\x003A', // MidLetter # COLON
- L'\x00B7', // MidLetter # MIDDLE DOT
- L'\x0387', // MidLetter # GREEK ANO TELEIA
- L'\x05F4', // MidLetter # HEBREW PUNCTUATION GERSHAYIM
- L'\x2027', // MidLetter # HYPHENATION POINT
- L'\xFE13', // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
- L'\xFE55', // MidLetter # SMALL COLON
- L'\xFF1A', // MidLetter # FULLWIDTH COLON
- L'\x0027', // MidNumLet # APOSTROPHE
- L'\x002E', // MidNumLet # FULL STOP
- L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK
- L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK
- L'\x2024', // MidNumLet # ONE DOT LEADER
- L'\xFE52', // MidNumLet # SMALL FULL STOP
- L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE
- L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP
- };
- for (size_t i = 0; i < arraysize(kMidLetters); ++i)
- middle_letters_[kMidLetters[i]] = true;
+///////////////////////////////////////////////////////////////////////////////
+// SpellcheckCharAttribute implementation:
+
+SpellcheckCharAttribute::SpellcheckCharAttribute()
+ : script_code_(USCRIPT_LATIN) {
}
SpellcheckCharAttribute::~SpellcheckCharAttribute() {
}
-// Sets the default language for this object.
-// This function retrieves the exemplar set to set up the default character
-// attributes.
void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
- UErrorCode status = U_ZERO_ERROR;
- ULocaleData* locale_data = ulocdata_open(language.c_str(), &status);
- if (U_FAILURE(status))
- return;
-
- // Retrieves the exemplar set of the given language and update the
- // character-attribute table to treat its characters as word characters.
- USet* exemplar_set = uset_open(1, 0);
- ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
- &status);
- ulocdata_close(locale_data);
- if (U_SUCCESS(status)) {
- int length = uset_size(exemplar_set);
- for (int i = 0; i < length; ++i) {
- UChar32 character = uset_charAt(exemplar_set, i);
- SetWordScript(GetScriptCode(character), true);
- }
+ CreateRuleSets(language);
+}
- // Many languages use combining characters to input their characters from
- // keyboards. On the other hand, this exemplar set does not always include
- // combining characters for such languages.
- // To treat such combining characters as word characters, we decompose
- // this exemplar set and treat the decomposed characters as word characters.
- icu::UnicodeString composed;
- for (int i = 0; i < length; ++i)
- composed.append(uset_charAt(exemplar_set, i));
-
- icu::UnicodeString decomposed;
- icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status);
- if (U_SUCCESS(status)) {
- icu::StringCharacterIterator iterator(decomposed);
- UChar32 character = iterator.first32();
- while (character != icu::CharacterIterator::DONE) {
- SetWordScript(GetScriptCode(character), true);
- character = iterator.next32();
- }
- }
- }
- uset_close(exemplar_set);
+// Returns a copy of one of the two rule-set strings built by CreateRuleSets():
+// |ruleset_allow_contraction_| when |allow_contraction| is true, otherwise
+// |ruleset_disallow_contraction_|.
+string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const {
+ return allow_contraction ?
+ ruleset_allow_contraction_ : ruleset_disallow_contraction_;
}
-// Returns whether or not the given character is a character used by the
-// selected dictionary.
-bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
- return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);
+// Builds |ruleset_allow_contraction_| and |ruleset_disallow_contraction_| for
+// |language| by filling in the three "%s" slots of kRuleTemplate: the script
+// name used for $ALetter, the expression used for $ALetterPlus, and an
+// optional forward rule that keeps contractions together.
+void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
+ // The template for our custom rule sets. Even though this template is based
+ // on the one of ICU 4.0, it changed the following points:
+ // * It discards characters not needed by our spellchecker (e.g. numbers,
+ // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
+ // * It allows customization of the $ALetter value (i.e. word characters).
+ // * It allows customization of the $ALetterPlus value (i.e. whether or not to
+ // use the dictionary data).
+ // * It allows choosing whether or not to split a text at contraction
+ // characters.
+ // This template only changes the forward-iteration rules. So, calling
+ // ubrk_prev() returns the same results as the original template.
+ static const char kRuleTemplate[] =
+ "!!chain;"
+ "$CR = [\\p{Word_Break = CR}];"
+ "$LF = [\\p{Word_Break = LF}];"
+ "$Newline = [\\p{Word_Break = Newline}];"
+ "$Extend = [\\p{Word_Break = Extend}];"
+ "$Format = [\\p{Word_Break = Format}];"
+ "$Katakana = [\\p{Word_Break = Katakana}];"
+ "$ALetter = [\\p{script=%s}];"
+ "$MidNumLet = [\\p{Word_Break = MidNumLet}];"
+ "$MidLetter = [\\p{Word_Break = MidLetter}];"
+ "$MidNum = [\\p{Word_Break = MidNum}];"
+ "$Numeric = [\\p{Word_Break = Numeric}];"
+ "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
+
+ "$dictionary = [:LineBreak = Complex_Context:];"
+ "$Control = [\\p{Grapheme_Cluster_Break = Control}]; "
+ "$ALetterPlus = %s;"
+
+ "$KatakanaEx = $Katakana ($Extend | $Format)*;"
+ "$ALetterEx = $ALetterPlus ($Extend | $Format)*;"
+ "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;"
+ "$MidLetterEx = $MidLetter ($Extend | $Format)*;"
+ "$MidNumEx = $MidNum ($Extend | $Format)*;"
+ "$NumericEx = $Numeric ($Extend | $Format)*;"
+ "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;"
+
+ "$Hiragana = [\\p{script=Hiragana}];"
+ "$Ideographic = [\\p{Ideographic}];"
+ "$HiraganaEx = $Hiragana ($Extend | $Format)*;"
+ "$IdeographicEx = $Ideographic ($Extend | $Format)*;"
+
+ "!!forward;"
+ "$CR $LF;"
+ "[^$CR $LF $Newline]? ($Extend | $Format)+;"
+ "$ALetterEx {200};"
+ "$ALetterEx $ALetterEx {200};"
+ "%s"
+
+ "!!reverse;"
+ "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;"
+ "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;"
+ "$BackNumericEx = ($Format | $Extend)* $Numeric;"
+ "$BackMidNumEx = ($Format | $Extend)* $MidNum;"
+ "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;"
+ "$BackKatakanaEx = ($Format | $Extend)* $Katakana;"
+ "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;"
+ "$LF $CR;"
+ "($Format | $Extend)* [^$CR $LF $Newline]?;"
+ "$BackALetterEx $BackALetterEx;"
+ "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;"
+ "$BackNumericEx $BackNumericEx;"
+ "$BackNumericEx $BackALetterEx;"
+ "$BackALetterEx $BackNumericEx;"
+ "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;"
+ "$BackKatakanaEx $BackKatakanaEx;"
+ "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |"
+ " $BackKatakanaEx | $BackExtendNumLetEx);"
+ "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)"
+ " $BackExtendNumLetEx;"
+
+ "!!safe_reverse;"
+ "($Extend | $Format)+ .?;"
+ "($MidLetter | $MidNumLet) $BackALetterEx;"
+ "($MidNum | $MidNumLet) $BackNumericEx;"
+ "$dictionary $dictionary;"
+
+ "!!safe_forward;"
+ "($Extend | $Format)+ .?;"
+ "($MidLetterEx | $MidNumLetEx) $ALetterEx;"
+ "($MidNumEx | $MidNumLetEx) $NumericEx;"
+ "$dictionary $dictionary;";
+
+ // Retrieve the script code used by the given language from ICU. When the
+ // given language consists of two or more scripts, we just use the first
+ // script.
+ // NOTE: when the lookup fails or yields no script, |script_code_| is left
+ // untouched, i.e. it keeps whatever value it had before this call.
+ UErrorCode error = U_ZERO_ERROR;
+ UScriptCode script_code[8];
+ int scripts = uscript_getCode(language.c_str(), script_code,
+ arraysize(script_code), &error);
+ if (U_SUCCESS(error) && scripts >= 1)
+ script_code_ = script_code[0];
+
+ // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary
+ // only for the languages which need it (i.e. Korean and Thai) to prevent ICU
+ // from returning dictionary words (i.e. Korean or Thai words) for languages
+ // which don't need them.
+ const char* aletter = uscript_getName(script_code_);
+ if (!aletter)
+ aletter = "Latin";
+
+ const char kWithDictionary[] = "[$ALetter [$dictionary-$Extend-$Control]]";
+ const char kWithoutDictionary[] = "$ALetter";
+ const char* aletter_plus = kWithoutDictionary;
+ if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
+ aletter_plus = kWithDictionary;
+
+ // Create two custom rule-sets: one allows contraction and the other doesn't.
+ // We save these strings in UTF-16 so we can use it without conversions. (ICU
+ // needs UTF-16 strings.)
+ // The only difference between the two is the third "%s" slot: the
+ // contraction rule is either inserted into the forward rules or left empty.
+ const char kAllowContraction[] =
+ "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};";
+ const char kDisallowContraction[] = "";
+
+ ruleset_allow_contraction_ = UTF8ToUTF16(StringPrintf(kRuleTemplate,
+ aletter, aletter_plus, kAllowContraction));
+ ruleset_disallow_contraction_ = UTF8ToUTF16(StringPrintf(kRuleTemplate,
+ aletter, aletter_plus, kDisallowContraction));
}
-// Returns whether or not the given character is a character used by
-// contractions.
-bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
- std::map<UChar32, bool>::const_iterator iterator;
- iterator = middle_letters_.find(character);
- if (iterator == middle_letters_.end())
- return false;
- return iterator->second;
+// Forwards the UTF-16 code unit |c| to the output function that matches
+// |script_code_| and returns that function's result. The selected function
+// decides whether (and in which form) |c| is appended to |output|.
+bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {
+ // Call the language-specific function if necessary.
+ // Otherwise, we call the default one.
+ switch (script_code_) {
+ case USCRIPT_ARABIC:
+ return OutputArabic(c, output);
+
+ case USCRIPT_HANGUL:
+ return OutputHangul(c, output);
+
+ case USCRIPT_HEBREW:
+ return OutputHebrew(c, output);
+
+ default:
+ return OutputDefault(c, output);
+ }
}
-// Initializes the mapping table.
-void SpellcheckCharAttribute::InitializeScriptTable() {
- for (size_t i = 0; i < arraysize(script_attributes_); ++i)
- script_attributes_[i] = false;
+// Arabic-specific filter: appends |c| to |output| only when it lies in the
+// range U+0621..U+064D and silently drops everything else. Always returns
+// true.
+bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const {
+ // Discard characters not from Arabic alphabets. We also discard vowel marks
+ // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from
+ // marking an Arabic word including vowel marks as misspelled. (We need to
+ // check these vowel marks manually and filter them out since their script
+ // codes are USCRIPT_ARABIC.)
+ // NOTE(review): the accepted range ends at U+064D, so U+064B..U+064D
+ // (presumably the tanween marks) still pass through -- confirm that keeping
+ // them is intended.
+ if (0x0621 <= c && c <= 0x064D)
+ output->push_back(c);
+ return true;
}
-// Retrieves the ICU script code.
-UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
- UErrorCode status = U_ZERO_ERROR;
- UScriptCode script_code = uscript_getScript(character, &status);
- return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;
+// Hangul-specific filter. A precomposed Hangul syllable (U+AC00..U+D7A3) is
+// appended to |output| as its leading-consonant, vowel and (when present)
+// trailing-consonant jamos. Any other code unit is handed to OutputDefault().
+bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
+ // Decompose a Hangul syllable to Hangul jamos.
+ // This code is copied from Unicode Standard Annex #15:
+ // <http://unicode.org/reports/tr15>.
+ const int kSBase = 0xAC00;
+ const int kLBase = 0x1100;
+ const int kVBase = 0x1161;
+ const int kTBase = 0x11A7;
+ const int kLCount = 19;
+ const int kVCount = 21;
+ const int kTCount = 28;
+ const int kNCount = kVCount * kTCount;
+ const int kSCount = kLCount * kNCount;
+
+ // |index| is already relative to kSBase, so the upper bound of the syllable
+ // block is kSCount (not kSBase + kSCount). Comparing against the larger
+ // value would treat every code unit at or above U+AC00 as a syllable and
+ // emit bogus jamos for it.
+ int index = c - kSBase;
+ if (index < 0 || index >= kSCount) {
+ // This is not a Hangul syllable. Call the default output function since we
+ // should output this character when it is a Hangul jamo.
+ return OutputDefault(c, output);
+ }
+
+ // This is a Hangul syllable. Decompose this syllable into Hangul jamos and
+ // output them.
+ int l = kLBase + index / kNCount;
+ int v = kVBase + (index % kNCount) / kTCount;
+ int t = kTBase + index % kTCount;
+ output->push_back(l);
+ output->push_back(v);
+ // |t| == kTBase means the syllable has no trailing consonant.
+ if (t != kTBase)
+ output->push_back(t);
+ return true;
}
-// Updates the mapping table from an ICU script code to its attribute, i.e.
-// whether not a script is used by the selected dictionary.
-void SpellcheckCharAttribute::SetWordScript(const int script_code,
- bool in_use) {
- if (script_code < 0 ||
- static_cast<size_t>(script_code) >= arraysize(script_attributes_))
- return;
- script_attributes_[script_code] = in_use;
+// Hebrew-specific filter: appends |c| to |output| only when it lies in the
+// range U+05D0..U+05EA and silently drops everything else. Always returns
+// true.
+bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
+ // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
+ // to prevent our Hebrew dictionary from marking a Hebrew word including
+ // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
+ // niqquds manually and filter them out since their script codes are
+ // USCRIPT_HEBREW.)
+ if (0x05D0 <= c && c <= 0x05EA)
+ output->push_back(c);
+ return true;
}
-// Returns whether or not the given script is used by the selected
-// dictionary.
-bool SpellcheckCharAttribute::IsWordScript(
- const UScriptCode script_code) const {
- if (script_code < 0 ||
- static_cast<size_t>(script_code) >= arraysize(script_attributes_))
- return false;
- return script_attributes_[script_code];
+// Default filter used for scripts without a dedicated output function.
+// Always returns true, whether or not |c| was appended to |output|.
+bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {
+ // Check the script code of this character and output it only if its script
+ // is the one used by the spellchecker language or USCRIPT_COMMON.
+ // |status| is not inspected after the lookup; only the returned script code
+ // is compared.
+ UErrorCode status = U_ZERO_ERROR;
+ UScriptCode script_code = uscript_getScript(c, &status);
+ if (script_code == script_code_ || script_code == USCRIPT_COMMON)
+ output->push_back(c);
+ return true;
}
+///////////////////////////////////////////////////////////////////////////////
+// SpellcheckWordIterator implementation:
+
SpellcheckWordIterator::SpellcheckWordIterator()
: word_(NULL),
length_(0),
- position_(0),
- allow_contraction_(false),
- attribute_(NULL) {
+ position_(UBRK_DONE),
+ attribute_(NULL),
+ iterator_(NULL) {
}
SpellcheckWordIterator::~SpellcheckWordIterator() {
+ Close();
}
-// Initialize a word-iterator object.
-void SpellcheckWordIterator::Initialize(
+// Binds this iterator to the text |word| (|length| UTF-16 code units) using
+// the rule set that |attribute| provides for |allow_contraction|. Returns
+// true on success. On failure it returns false and leaves this object with no
+// text and no ICU iterator, so GetNextWord() simply returns false.
+bool SpellcheckWordIterator::Initialize(
const SpellcheckCharAttribute* attribute,
const char16* word,
size_t length,
bool allow_contraction) {
+ // Create a custom ICU break iterator used in this object.
+ DCHECK(attribute);
+
+ // Release the iterator left over from a previous Initialize() call (it would
+ // leak otherwise) and forget the previous text, so a failure below cannot
+ // leave stale |word_| / |position_| values next to a missing iterator.
+ Close();
+ word_ = NULL;
+ length_ = 0;
+ position_ = UBRK_DONE;
+ attribute_ = NULL;
+
+ UErrorCode open_status = U_ZERO_ERROR;
+ UParseError parse_status;
+ string16 rule(attribute->GetRuleSet(allow_contraction));
+ iterator_ = ubrk_openRules(rule.c_str(), rule.length(), word, length,
+ &parse_status, &open_status);
+ if (U_FAILURE(open_status)) {
+ Close();
+ return false;
+ }
+
+ position_ = ubrk_first(iterator_);
+ if (position_ == UBRK_DONE) {
+ Close();
+ return false;
+ }
+
word_ = word;
-position_ = 0;
length_ = static_cast<int>(length);
-allow_contraction_ = allow_contraction;
attribute_ = attribute;
+ return true;
}
-// Retrieves a word (or a contraction).
-// When a contraction is enclosed with contraction characters (e.g. 'isn't',
-// 'rock'n'roll'), we should discard the beginning and the end of the
-// contraction but we should never split the contraction.
-// To handle this case easily, we should firstly extract a segment consisting
-// of word characters and contraction characters, and discard contraction
-// characters at the beginning and the end of the extracted segment.
bool SpellcheckWordIterator::GetNextWord(string16* word_string,
int* word_start,
int* word_length) {
- word_string->empty();
+ word_string->clear();
*word_start = 0;
*word_length = 0;
- while (position_ < length_) {
- int segment_start = 0;
- int segment_end = 0;
- GetSegment(&segment_start, &segment_end);
- TrimSegment(segment_start, segment_end, word_start, word_length);
- if (*word_length > 0)
- return Normalize(*word_start, *word_length, word_string);
- }
- return false;
-}
+ if (!word_ || position_ == UBRK_DONE)
+ return false;
-// Retrieves a segment consisting of word characters (and contraction
-// characters if the |allow_contraction_| value is true).
-// When the current position refers to a non-word character, this function
-// returns a non-empty segment consisting of the character itself. In this
-// case, the TrimSegment() function discards the character and returns an
-// empty word (i.e. |word_length| == 0).
-void SpellcheckWordIterator::GetSegment(int* segment_start,
- int* segment_end) {
- int position = position_;
- while (position < length_) {
- UChar32 character;
- U16_NEXT(word_, position, length_, character);
- if (!attribute_->IsWordChar(character)) {
- if (!allow_contraction_ || !attribute_->IsContractionChar(character))
- break;
+ // Find a word that can be checked for spelling. Our rule sets filter out
+ // invalid words (e.g. numbers and characters not supported by the
+ // spellchecker language) so this ubrk_getRuleStatus() call returns
+ // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
+ // words until we can find a valid word or reach the end of the input string.
+ int next = ubrk_next(iterator_);
+ while (next != UBRK_DONE) {
+ if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {
+ if (Normalize(position_, next - position_, word_string)) {
+ *word_start = position_;
+ *word_length = next - position_;
+ position_ = next;
+ return true;
+ }
}
+ position_ = next;
+ next = ubrk_next(iterator_);
}
- *segment_start = position_;
- *segment_end = position;
- position_ = position;
+
+ // There aren't any more words in the given text. Set the position to
+ // UBRK_DONE to prevent from calling ubrk_next() next time when this function
+ // is called.
+ position_ = UBRK_DONE;
+ return false;
}
-// Discards non-word characters at the beginning and the end of the given
-// segment.
-void SpellcheckWordIterator::TrimSegment(int segment_start,
- int segment_end,
- int* word_start,
- int* word_length) const {
- while (segment_start < segment_end) {
- UChar32 character;
- int segment_next = segment_start;
- U16_NEXT(word_, segment_next, segment_end, character);
- if (attribute_->IsWordChar(character)) {
- *word_start = segment_start;
- break;
- }
- segment_start = segment_next;
- }
- while (segment_end >= segment_start) {
- UChar32 character;
- int segment_prev = segment_end;
- U16_PREV(word_, segment_start, segment_prev, character);
- if (attribute_->IsWordChar(character)) {
- *word_length = segment_end - segment_start;
- break;
- }
- segment_end = segment_prev;
+// Closes the ICU break iterator, if one is open, and resets |iterator_| to
+// NULL so that calling this function again is a no-op.
+void SpellcheckWordIterator::Close() {
+ if (iterator_) {
+ ubrk_close(iterator_);
+ iterator_ = NULL;
}
}
-// Normalizes a non-terminated string into its canonical form so that
-// a spellchecker object can check spellings of words which contain ligatures,
-// full-width letters, etc.
-// USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
-// also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
-// etc. For its details, please read the script table in
-// "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
bool SpellcheckWordIterator::Normalize(int input_start,
int input_length,
string16* output_string) const {
- // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
- // does not only write NFKD and NFKC can compose ligatures into their ASCII
- // alternatives, but also write NFKC keeps accents of characters.
- // Therefore, NFKC seems to be the best option for hunspell.
+ // We use NFKC to normalize this token because NFKC can compose combined
+ // characters and decompose ligatures.
icu::UnicodeString input(FALSE, &word_[input_start], input_length);
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString output;
icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
- if (U_SUCCESS(status))
- output_string->assign(output.getTerminatedBuffer());
- return status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING;
+ if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
+ return false;
+
+ // Copy the normalized text to the output.
+ icu::StringCharacterIterator it(output);
+ for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
+ attribute_->OutputChar(c, output_string);
+
+ return !output_string->empty();
}
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h
index 7763314..aa54011 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.h
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h
@@ -10,105 +10,117 @@
#include "base/basictypes.h"
#include "base/string16.h"
-
-#include "unicode/uscript.h"
-
-// A class which handles character attributes dependent on a spellchecker and
-// its dictionary.
-// This class is used by the SpellcheckWordIterator class to determine whether
-// or not a character is one used by the spellchecker and its dictinary.
+#include "third_party/icu/public/common/unicode/ubrk.h"
+#include "third_party/icu/public/common/unicode/uscript.h"
+
+// A class which encapsulates language-specific operations used by
+// SpellcheckWordIterator.
+// When we set the spellchecker language, this class creates rule sets that
+// filter out the characters not supported by the spellchecker.
+// (Please read the comment in the SpellcheckWordIterator class about how to
+// use this class.)
class SpellcheckCharAttribute {
public:
SpellcheckCharAttribute();
-
~SpellcheckCharAttribute();
- // Sets the default language of the spell checker. This controls which
- // characters are considered parts of words of the given language.
+ // Sets the language of the spellchecker.
+ // This function creates the custom rule-sets used by SpellcheckWordIterator.
+ // Parameters
+ // * language [in] (std::string)
+ // The language-code string.
void SetDefaultLanguage(const std::string& language);
- // Returns whether or not the given character is a character used by the
- // selected dictionary.
+ // Returns a custom rule-set string used by the ICU break iterator.
// Parameters
- // * character [in] (UChar32)
- // Represents a Unicode character to be checked.
- // Return values
- // * true
- // The given character is a word character.
- // * false
- // The given character is not a word character.
- bool IsWordChar(UChar32 character) const;
+ // * allow_contraction [in] (bool)
+ // A flag to control whether or not this object splits a possible
+ // contraction. If this value is false, it returns a rule set that
+ // splits a possible contraction: "in'n'out" -> "in", "n", and "out".
+ string16 GetRuleSet(bool allow_contraction) const;
- // Returns whether or not the given character is a character used by
- // contractions.
- // Parameters
- // * character [in] (UChar32)
- // Represents a Unicode character to be checked.
- // Return values
- // * true
- // The given character is a character used by contractions.
- // * false
- // The given character is not a character used by contractions.
- bool IsContractionChar(UChar32 character) const;
+ // Output a character only if it is a word character.
+ bool OutputChar(UChar c, string16* output) const;
private:
- // Initializes the mapping table.
- void InitializeScriptTable();
-
- // Retrieves the ICU script code.
- UScriptCode GetScriptCode(UChar32 character) const;
+ // Creates the rule-set strings.
+ void CreateRuleSets(const std::string& language);
- // Updates an entry in the mapping table.
- void SetWordScript(const int script_code, bool in_use);
-
- // Returns whether or not the given script is used by the selected
- // dictionary.
- bool IsWordScript(const UScriptCode script_code) const;
+ // Language-specific output functions.
+ bool OutputArabic(UChar c, string16* output) const;
+ bool OutputHangul(UChar c, string16* output) const;
+ bool OutputHebrew(UChar c, string16* output) const;
+ bool OutputDefault(UChar c, string16* output) const;
private:
- // Represents a mapping table from a script code to a boolean value
- // representing whether or not the script is used by the selected dictionary.
- bool script_attributes_[USCRIPT_CODE_LIMIT];
+ // The custom rule-set strings used by ICU BreakIterator.
+ // Since it is not so easy to create custom rule-sets from a spellchecker
+ // language, this class saves these rule-set strings created when we set the
+ // language.
+ string16 ruleset_allow_contraction_;
+ string16 ruleset_disallow_contraction_;
- // Represents a table of characters used by contractions.
- std::map<UChar32, bool> middle_letters_;
+ // The script code used by this language.
+ UScriptCode script_code_;
DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
};
-// A class which implements methods for finding the location of word boundaries
-// used by the Spellchecker class.
-// This class is implemented on the following assumptions:
-// * An input string is encoded in UTF-16 (i.e. it may contain surrogate
-// pairs), and;
-// * The length of a string is the number of UTF-16 characters in the string
-// (i.e. the length of a non-BMP character becomes two).
+// A class which extracts words that can be checked for spelling from a longer
+// string.
+// The ICU word-break iterator does not discard some punctuation characters
+// attached to a word. For example, when we set a word "_hello_" to a
+// word-break iterator, it just returns "_hello_".
+// On the other hand, our spellchecker expects for us to discard such
+// punctuation characters.
+// To extract only the words that our spellchecker can check, this class uses
+// custom rule-sets created by the SpellcheckCharAttribute class.
+// Also, this class normalizes extracted words so our spellchecker can check
+// the spellings of a word that includes ligatures, combined characters,
+// full-width characters, etc.
+//
+// The following snippet is an example that extracts words with this class.
+//
+// // Creates the language-specific attributes for US English.
+// SpellcheckCharAttribute attribute;
+// attribute.SetDefaultLanguage("en-US");
+//
+// // Set up a SpellcheckWordIterator object which extracts English words,
+// // and retrieves them.
+// SpellcheckWordIterator iterator;
+// string16 text(UTF8ToUTF16("this is a test."));
+// iterator.Initialize(&attribute, text.c_str(), text.length(), true);
+//
+// string16 word;
+// int start;
+// int end;
+// while (iterator.GetNextWord(&word, &start, &end)) {
+// ...
+// }
+//
class SpellcheckWordIterator {
public:
SpellcheckWordIterator();
-
~SpellcheckWordIterator();
// Initializes a word-iterator object.
// Parameters
// * attribute [in] (const SpellcheckCharAttribute*)
- // Represents a set of character attributes used for filtering out
- // non-word characters.
+ // Character attributes used for filtering out non-word characters.
// * word [in] (const char16*)
- // Represents a string from which this object extracts words.
- // (This string does not have to be NUL-terminated.)
+ // A string from which this object extracts words. (This string does not
+ // have to be NUL-terminated.)
// * length [in] (size_t)
- // Represents the length of the given string, in UTF-16 characters.
- // This value should not include terminating NUL characters.
+ // The length of the given string, in UTF-16 characters.
// * allow_contraction [in] (bool)
- // Represents a flag to control whether or not this object should split a
- // possible contraction (e.g. "isn't", "in'n'out", etc.)
+ // A flag to control whether or not this object should split a possible
+ // contraction (e.g. "isn't", "in'n'out", etc.)
// Return values
// * true
// This word-iterator object is initialized successfully.
// * false
// An error occured while initializing this object.
- void Initialize(const SpellcheckCharAttribute* attribute,
+ bool Initialize(const SpellcheckCharAttribute* attribute,
const char16* word,
size_t length,
bool allow_contraction);
@@ -116,20 +128,20 @@ class SpellcheckWordIterator {
// Retrieves a word (or a contraction).
// Parameters
// * word_string [out] (string16*)
- // Represents a word (or a contraction) to be checked its spelling.
- // This |word_string| has been already normalized to its canonical form
- // (i.e. decomposed ligatures, replaced full-width latin characters to
- // its ASCII alternatives, etc.) so that a SpellChecker object can check
- // its spelling without any additional operations.
- // On the other hand, a substring of the input string
+ // A word (or a contraction) whose spelling is to be checked. This
+ // |word_string| has already been normalized to its canonical form (i.e.
+ // ligatures are decomposed, full-width Latin characters are replaced with
+ // their ASCII alternatives, etc.) so a SpellChecker object can check its
+ // spelling without any additional operations. We can use |word_start| and
+ // |word_length| to retrieve the non-normalized version of this string as
+ // shown in the following snippet.
// string16 str(&word[word_start], word_length);
- // represents the non-normalized version of this extracted word.
// * word_start [out] (int*)
- // Represents the offset of this word from the beginning of the input
- // string, in UTF-16 characters.
+ // The offset of this word from the beginning of the input string,
+ // in UTF-16 characters.
// * word_length [out] (int*)
- // Represents the length of an extracted word before normalization, in
- // UTF-16 characters.
+ // The length of an extracted word before normalization, in UTF-16
+ // characters.
// When the input string contains ligatures, this value may not be equal
// to the length of the |word_string|.
// Return values
@@ -142,20 +154,14 @@ class SpellcheckWordIterator {
int* word_length);
private:
- // Retrieves a segment consisting of word characters (and contraction
- // characters if the |allow_contraction| value is true).
- void GetSegment(int* segment_start,
- int* segment_end);
-
- // Discards non-word characters at the beginning and the end of the given
- // segment.
- void TrimSegment(int segment_start,
- int segment_end,
- int* word_start,
- int* word_length) const;
-
- // Normalizes the given segment of the |word_| variable and write its
- // canonical form to the |output_string|.
+ // Releases all the resources attached to this object.
+ void Close();
+
+ // Normalizes a non-terminated string so our spellchecker can check its
+ // spelling. A word returned from an ICU word-break iterator may include
+ // characters not supported by our spellchecker, e.g. ligatures, combining
+ // characters, full-width letters, etc. This function replaces such characters
+ // with alternative characters supported by our spellchecker.
bool Normalize(int input_start,
int input_length,
string16* output_string) const;
@@ -170,13 +176,13 @@ class SpellcheckWordIterator {
// The current position in the original string.
int position_;
- // The flag to control whether or not this object should extract possible
- // contractions.
- bool allow_contraction_;
-
- // The character attributes used for filtering out non-word characters.
+ // The language-specific attributes used for filtering out non-word
+ // characters.
const SpellcheckCharAttribute* attribute_;
+ // The ICU break iterator.
+ UBreakIterator* iterator_;
+
DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
};
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
new file mode 100644
index 0000000..a41c93b
--- /dev/null
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+#include <vector>
+
+#include "base/format_macros.h"
+#include "base/string_util.h"
+#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace {
+
+struct TestCase {  // One per-language expectation used by the SplitWord test.
+ const char* language;  // Language code handed to SetDefaultLanguage().
+ bool allow_contraction;  // Last argument handed to Initialize().
+ const wchar_t* expected_words;  // Expected words, separated by spaces.
+};
+
+} // namespace
+
+// Tests whether or not our SpellcheckWordIterator can extract only words used
+// by the specified language from a multi-language text.
+TEST(SpellcheckWordIteratorTest, SplitWord) {
+  // A multi-language input text (some words are not separated by whitespace).
+  // SpellcheckWordIterator should extract only the words of the specified
+  // language and normalize them so the spellchecker can check their spelling.
+  const wchar_t kTestText[] =
+      // Numbers
+      L"0123456789"
+      // Latin (including a contraction character and a ligature).
+      L"hello:hello a\xFB03x"
+      // Greek
+      L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
+      // Cyrillic
+      L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+      L"\x0443\x0439\x0442\x0435"
+      // Hebrew (including niqquds)
+      L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd"
+      // Arabic (including vowel marks)
+      L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
+      L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
+      L"\x0652\x0643\x064f\x0645\x0652"
+      // Hindi
+      L"\x0930\x093E\x091C\x0927\x093E\x0928"
+      // Thai
+      L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
+      L"\x0e23\x0e31\x0e1a"
+      // Hiraganas
+      L"\x3053\x3093\x306B\x3061\x306F"
+      // CJKV ideographs
+      L"\x4F60\x597D"
+      // Hangul Syllables
+      L"\xC548\xB155\xD558\xC138\xC694"
+      // Full-width latin
+      L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F";
+
+  // The languages and expected results used in this test.
+  static const TestCase kTestCases[] = {
+    {
+      // English (keep contraction words)
+      "en-US", true, L"hello:hello affix Hello"
+    }, {
+      // English (split contraction words)
+      "en-US", false, L"hello hello affix Hello"
+    }, {
+      // Greek
+      "el-GR", true,
+      L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
+    }, {
+      // Russian
+      "ru-RU", true,
+      L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+      L"\x0443\x0439\x0442\x0435"
+    }, {
+      // Hebrew
+      "he-IL", true,
+      L"\x05e9\x05dc\x05d5\x05dd"
+    }, {
+      // Arabic
+      "ar", true,
+      L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
+      L"\x0644\x064a\x0643\x0645"
+    }, {
+      // Hindi
+      "hi-IN", true,
+      L"\x0930\x093E\x091C\x0927\x093E\x0928"
+    }, {
+      // Thai
+      "th-TH", true,
+      L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
+      L"\x0e23\x0e31\x0e1a"
+    }, {
+      // Korean
+      "ko-KR", true,
+      L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
+      L"\x1109\x1166\x110b\x116d"
+    },
+  };
+
+  for (size_t i = 0; i < arraysize(kTestCases); ++i) {
+    SCOPED_TRACE(StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
+                              kTestCases[i].language));
+
+    SpellcheckCharAttribute attributes;
+    attributes.SetDefaultLanguage(kTestCases[i].language);
+
+    string16 input(WideToUTF16(kTestText));
+    SpellcheckWordIterator iterator;
+    EXPECT_TRUE(iterator.Initialize(&attributes, input.c_str(), input.length(),
+                                    kTestCases[i].allow_contraction));
+
+    std::vector<string16> expected_words;
+    SplitString(WideToUTF16(kTestCases[i].expected_words), ' ',
+                &expected_words);
+
+    string16 actual_word;
+    int actual_start, actual_length;
+    size_t index = 0;
+    while (iterator.GetNextWord(&actual_word, &actual_start, &actual_length)) {
+      if (index < expected_words.size())
+        EXPECT_EQ(expected_words[index], actual_word);
+      ++index;
+    }
+    // Fails on missing words as well as on unexpected extra words.
+    EXPECT_EQ(expected_words.size(), index);
+  }
+}