diff options
author | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-02-16 09:02:41 +0000 |
---|---|---|
committer | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-02-16 09:02:41 +0000 |
commit | 59e2c39984b9a34bbb72db1f3a02be171dc9726d (patch) | |
tree | 5bf95d487360cb6cf59e0c01c4e89d5c9d9d82b8 /chrome | |
parent | 2b19e2feeac2a01b2068595bd2913a194a6527e5 (diff) | |
download | chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.zip chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.tar.gz chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.tar.bz2 |
Re-implement SpellcheckWordIterator with ICU.
This change re-implements the SpellcheckWordIterator class to use ICU custom rules so we can use the ICU dictionary to handle Thai and Korean.
Also, this change adds a couple of new features to the class to improve spell-checking quality:
* Decompose Hangul syllables into Korean Jamos.
This helps us support Korean spell-checking.
* Filter out some characters not needed by our spell-checker (e.g. Hebrew niqquds and Arabic vowel signs).
This prevents us from marking a word that includes these characters as misspelled.
BUG=8487
TEST=unit_test.exe --gtest_filter=SpellcheckWordIteratorTest*
Review URL: http://codereview.chromium.org/577020
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@39082 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rwxr-xr-x | chrome/chrome_tests.gypi | 1 | ||||
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator.cc | 463 | ||||
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator.h | 194 | ||||
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc | 129 |
4 files changed, 496 insertions, 291 deletions
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi index afe2ba5..66e995b 100755 --- a/chrome/chrome_tests.gypi +++ b/chrome/chrome_tests.gypi @@ -884,6 +884,7 @@ 'renderer/render_widget_unittest.cc', 'renderer/renderer_main_unittest.cc', 'renderer/spellchecker/spellcheck_unittest.cc', + 'renderer/spellchecker/spellcheck_worditerator_unittest.cc', 'renderer/translate/page_translator_unittest.cc', 'test/browser_with_test_window_test.cc', 'test/browser_with_test_window_test.h', diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc index 827d9ee..0806f50 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc @@ -10,265 +10,334 @@ #include "base/basictypes.h" #include "base/string_util.h" #include "chrome/renderer/spellchecker/spellcheck.h" - #include "third_party/icu/public/common/unicode/normlzr.h" #include "third_party/icu/public/common/unicode/schriter.h" -#include "third_party/icu/public/common/unicode/uchar.h" #include "third_party/icu/public/common/unicode/uscript.h" -#include "third_party/icu/public/common/unicode/uset.h" #include "third_party/icu/public/i18n/unicode/ulocdata.h" -SpellcheckCharAttribute::SpellcheckCharAttribute() { - InitializeScriptTable(); - - // Even though many dictionaries treats numbers and contractions as words and - // treats USCRIPT_COMMON characters as word characters, the - // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word - // characters to strictly-distinguish contraction characters from word - // characters. - SetWordScript(USCRIPT_COMMON, false); - - // Initialize the table of characters used for contractions. 
- // This array consists of the 'Midletter' and 'MidNumLet' characters of the - // word-break property list provided by Unicode, Inc.: - // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt - static const UChar32 kMidLetters[] = { - L'\x003A', // MidLetter # COLON - L'\x00B7', // MidLetter # MIDDLE DOT - L'\x0387', // MidLetter # GREEK ANO TELEIA - L'\x05F4', // MidLetter # HEBREW PUNCTUATION GERSHAYIM - L'\x2027', // MidLetter # HYPHENATION POINT - L'\xFE13', // MidLetter # PRESENTATION FORM FOR VERTICAL COLON - L'\xFE55', // MidLetter # SMALL COLON - L'\xFF1A', // MidLetter # FULLWIDTH COLON - L'\x0027', // MidNumLet # APOSTROPHE - L'\x002E', // MidNumLet # FULL STOP - L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK - L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK - L'\x2024', // MidNumLet # ONE DOT LEADER - L'\xFE52', // MidNumLet # SMALL FULL STOP - L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE - L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP - }; - for (size_t i = 0; i < arraysize(kMidLetters); ++i) - middle_letters_[kMidLetters[i]] = true; +/////////////////////////////////////////////////////////////////////////////// +// SpellcheckCharAttribute implementation: + +SpellcheckCharAttribute::SpellcheckCharAttribute() + : script_code_(USCRIPT_LATIN) { } SpellcheckCharAttribute::~SpellcheckCharAttribute() { } -// Sets the default language for this object. -// This function retrieves the exemplar set to set up the default character -// attributes. void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { - UErrorCode status = U_ZERO_ERROR; - ULocaleData* locale_data = ulocdata_open(language.c_str(), &status); - if (U_FAILURE(status)) - return; - - // Retrieves the exemplar set of the given language and update the - // character-attribute table to treat its characters as word characters. 
- USet* exemplar_set = uset_open(1, 0); - ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, - &status); - ulocdata_close(locale_data); - if (U_SUCCESS(status)) { - int length = uset_size(exemplar_set); - for (int i = 0; i < length; ++i) { - UChar32 character = uset_charAt(exemplar_set, i); - SetWordScript(GetScriptCode(character), true); - } + CreateRuleSets(language); +} - // Many languages use combining characters to input their characters from - // keyboards. On the other hand, this exemplar set does not always include - // combining characters for such languages. - // To treat such combining characters as word characters, we decompose - // this exemplar set and treat the decomposed characters as word characters. - icu::UnicodeString composed; - for (int i = 0; i < length; ++i) - composed.append(uset_charAt(exemplar_set, i)); - - icu::UnicodeString decomposed; - icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status); - if (U_SUCCESS(status)) { - icu::StringCharacterIterator iterator(decomposed); - UChar32 character = iterator.first32(); - while (character != icu::CharacterIterator::DONE) { - SetWordScript(GetScriptCode(character), true); - character = iterator.next32(); - } - } - } - uset_close(exemplar_set); +string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const { + return allow_contraction ? + ruleset_allow_contraction_ : ruleset_disallow_contraction_; } -// Returns whether or not the given character is a character used by the -// selected dictionary. -bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { - return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); +void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { + // The template for our custom rule sets. Even though this template is based + // on the one of ICU 4.0, it changed the following points: + // * It discards characters not needed by our spellchecker (e.g. 
numbers, + // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on). + // * It allows customization of the $ALetter value (i.e. word characters). + // * It allows customization of the $ALetterPlus value (i.e. whether or not to + // use the dictionary data). + // * It allows choosing whether or not to split a text at contraction + // characters. + // This template only changes the forward-iteration rules. So, calling + // ubrk_prev() returns the same results as the original template. + static const char kRuleTemplate[] = + "!!chain;" + "$CR = [\\p{Word_Break = CR}];" + "$LF = [\\p{Word_Break = LF}];" + "$Newline = [\\p{Word_Break = Newline}];" + "$Extend = [\\p{Word_Break = Extend}];" + "$Format = [\\p{Word_Break = Format}];" + "$Katakana = [\\p{Word_Break = Katakana}];" + "$ALetter = [\\p{script=%s}];" + "$MidNumLet = [\\p{Word_Break = MidNumLet}];" + "$MidLetter = [\\p{Word_Break = MidLetter}];" + "$MidNum = [\\p{Word_Break = MidNum}];" + "$Numeric = [\\p{Word_Break = Numeric}];" + "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" + + "$dictionary = [:LineBreak = Complex_Context:];" + "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " + "$ALetterPlus = %s;" + + "$KatakanaEx = $Katakana ($Extend | $Format)*;" + "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" + "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" + "$MidLetterEx = $MidLetter ($Extend | $Format)*;" + "$MidNumEx = $MidNum ($Extend | $Format)*;" + "$NumericEx = $Numeric ($Extend | $Format)*;" + "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" + + "$Hiragana = [\\p{script=Hiragana}];" + "$Ideographic = [\\p{Ideographic}];" + "$HiraganaEx = $Hiragana ($Extend | $Format)*;" + "$IdeographicEx = $Ideographic ($Extend | $Format)*;" + + "!!forward;" + "$CR $LF;" + "[^$CR $LF $Newline]? 
($Extend | $Format)+;" + "$ALetterEx {200};" + "$ALetterEx $ALetterEx {200};" + "%s" + + "!!reverse;" + "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" + "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" + "$BackNumericEx = ($Format | $Extend)* $Numeric;" + "$BackMidNumEx = ($Format | $Extend)* $MidNum;" + "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" + "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" + "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" + "$LF $CR;" + "($Format | $Extend)* [^$CR $LF $Newline]?;" + "$BackALetterEx $BackALetterEx;" + "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;" + "$BackNumericEx $BackNumericEx;" + "$BackNumericEx $BackALetterEx;" + "$BackALetterEx $BackNumericEx;" + "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;" + "$BackKatakanaEx $BackKatakanaEx;" + "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |" + " $BackKatakanaEx | $BackExtendNumLetEx);" + "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)" + " $BackExtendNumLetEx;" + + "!!safe_reverse;" + "($Extend | $Format)+ .?;" + "($MidLetter | $MidNumLet) $BackALetterEx;" + "($MidNum | $MidNumLet) $BackNumericEx;" + "$dictionary $dictionary;" + + "!!safe_forward;" + "($Extend | $Format)+ .?;" + "($MidLetterEx | $MidNumLetEx) $ALetterEx;" + "($MidNumEx | $MidNumLetEx) $NumericEx;" + "$dictionary $dictionary;"; + + // Retrieve the script code used by the given language from ICU. When the + // given language consists of two or more scripts, we just use the first + // script. + UErrorCode error = U_ZERO_ERROR; + UScriptCode script_code[8]; + int scripts = uscript_getCode(language.c_str(), script_code, + arraysize(script_code), &error); + if (U_SUCCESS(error) && scripts >= 1) + script_code_ = script_code[0]; + + // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary + // only for the languages which need it (i.e. Korean and Thai) to prevent ICU + // from returning dictionary words (i.e. 
Korean or Thai words) for languages + // which don't need them. + const char* aletter = uscript_getName(script_code_); + if (!aletter) + aletter = "Latin"; + + const char kWithDictionary[] = "[$ALetter [$dictionary-$Extend-$Control]]"; + const char kWithoutDictionary[] = "$ALetter"; + const char* aletter_plus = kWithoutDictionary; + if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) + aletter_plus = kWithDictionary; + + // Create two custom rule-sets: one allows contraction and the other doesn't. + // We save these strings in UTF-16 so we can use it without conversions. (ICU + // needs UTF-16 strings.) + const char kAllowContraction[] = + "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; + const char kDisallowContraction[] = ""; + + ruleset_allow_contraction_ = UTF8ToUTF16(StringPrintf(kRuleTemplate, + aletter, aletter_plus, kAllowContraction)); + ruleset_disallow_contraction_ = UTF8ToUTF16(StringPrintf(kRuleTemplate, + aletter, aletter_plus, kDisallowContraction)); } -// Returns whether or not the given character is a character used by -// contractions. -bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { - std::map<UChar32, bool>::const_iterator iterator; - iterator = middle_letters_.find(character); - if (iterator == middle_letters_.end()) - return false; - return iterator->second; +bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { + // Call the language-specific function if necessary. + // Otherwise, we call the default one. + switch (script_code_) { + case USCRIPT_ARABIC: + return OutputArabic(c, output); + + case USCRIPT_HANGUL: + return OutputHangul(c, output); + + case USCRIPT_HEBREW: + return OutputHebrew(c, output); + + default: + return OutputDefault(c, output); + } } -// Initializes the mapping table. 
-void SpellcheckCharAttribute::InitializeScriptTable() { - for (size_t i = 0; i < arraysize(script_attributes_); ++i) - script_attributes_[i] = false; +bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const { + // Discard characters not from Arabic alphabets. We also discard vowel marks + // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from + // marking an Arabic word including vowel marks as misspelled. (We need to + // check these vowel marks manually and filter them out since their script + // codes are USCRIPT_ARABIC.) + if (0x0621 <= c && c <= 0x064D) + output->push_back(c); + return true; } -// Retrieves the ICU script code. -UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { - UErrorCode status = U_ZERO_ERROR; - UScriptCode script_code = uscript_getScript(character, &status); - return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; +bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const { + // Decompose a Hangul syllable to Hangul jamos. + // This code is copied from Unicode Standard Annex #15: + // <http://unicode.org/reports/tr15>. + const int kSBase = 0xAC00; + const int kLBase = 0x1100; + const int kVBase = 0x1161; + const int kTBase = 0x11A7; + const int kLCount = 19; + const int kVCount = 21; + const int kTCount = 28; + const int kNCount = kVCount * kTCount; + const int kSCount = kLCount * kNCount; + + int index = c - kSBase; + if (index < 0 || index >= kSBase + kSCount) { + // This is not a Hangul syllable. Call the default output function since we + // should output this character when it is a Hangul jamo. + return OutputDefault(c, output); + } + + // This is a Hangul syllable. Decompose this syllable into Hangul jamos and + // output them. 
+ int l = kLBase + index / kNCount; + int v = kVBase + (index % kNCount) / kTCount; + int t = kTBase + index % kTCount; + output->push_back(l); + output->push_back(v); + if (t != kTBase) + output->push_back(t); + return true; } -// Updates the mapping table from an ICU script code to its attribute, i.e. -// whether not a script is used by the selected dictionary. -void SpellcheckCharAttribute::SetWordScript(const int script_code, - bool in_use) { - if (script_code < 0 || - static_cast<size_t>(script_code) >= arraysize(script_attributes_)) - return; - script_attributes_[script_code] = in_use; +bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { + // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds + // to prevent our Hebrew dictionay from marking a Hebrew word including + // niqquds as misspelled. (Same as Arabic vowel marks, we need to check + // niqquds manually and filter them out since their script codes are + // USCRIPT_HEBREW.) + if (0x05D0 <= c && c <= 0x05EA) + output->push_back(c); + return true; } -// Returns whether or not the given script is used by the selected -// dictionary. -bool SpellcheckCharAttribute::IsWordScript( - const UScriptCode script_code) const { - if (script_code < 0 || - static_cast<size_t>(script_code) >= arraysize(script_attributes_)) - return false; - return script_attributes_[script_code]; +bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { + // Check the script code of this character and output only if it is the one + // used by the spellchecker language. 
+ UErrorCode status = U_ZERO_ERROR; + UScriptCode script_code = uscript_getScript(c, &status); + if (script_code == script_code_ || script_code == USCRIPT_COMMON) + output->push_back(c); + return true; } +/////////////////////////////////////////////////////////////////////////////// +// SpellcheckWordIterator implementation: + SpellcheckWordIterator::SpellcheckWordIterator() : word_(NULL), length_(0), - position_(0), - allow_contraction_(false), - attribute_(NULL) { + position_(UBRK_DONE), + attribute_(NULL), + iterator_(NULL) { } SpellcheckWordIterator::~SpellcheckWordIterator() { + Close(); } -// Initialize a word-iterator object. -void SpellcheckWordIterator::Initialize( +bool SpellcheckWordIterator::Initialize( const SpellcheckCharAttribute* attribute, const char16* word, size_t length, bool allow_contraction) { + // Create a custom ICU break iterator used in this object. + DCHECK(attribute); + UErrorCode open_status = U_ZERO_ERROR; + UParseError parse_status; + string16 rule(attribute->GetRuleSet(allow_contraction)); + iterator_ = ubrk_openRules(rule.c_str(), rule.length(), word, length, + &parse_status, &open_status); + if (U_FAILURE(open_status)) + return false; + + position_ = ubrk_first(iterator_); + if (position_ == UBRK_DONE) + return false; + word_ = word; - position_ = 0; length_ = static_cast<int>(length); - allow_contraction_ = allow_contraction; attribute_ = attribute; + return true; } -// Retrieves a word (or a contraction). -// When a contraction is enclosed with contraction characters (e.g. 'isn't', -// 'rock'n'roll'), we should discard the beginning and the end of the -// contraction but we should never split the contraction. -// To handle this case easily, we should firstly extract a segment consisting -// of word characters and contraction characters, and discard contraction -// characters at the beginning and the end of the extracted segment. 
bool SpellcheckWordIterator::GetNextWord(string16* word_string, int* word_start, int* word_length) { - word_string->empty(); + word_string->clear(); *word_start = 0; *word_length = 0; - while (position_ < length_) { - int segment_start = 0; - int segment_end = 0; - GetSegment(&segment_start, &segment_end); - TrimSegment(segment_start, segment_end, word_start, word_length); - if (*word_length > 0) - return Normalize(*word_start, *word_length, word_string); - } - return false; -} + if (!word_ || position_ == UBRK_DONE) + return false; -// Retrieves a segment consisting of word characters (and contraction -// characters if the |allow_contraction_| value is true). -// When the current position refers to a non-word character, this function -// returns a non-empty segment consisting of the character itself. In this -// case, the TrimSegment() function discards the character and returns an -// empty word (i.e. |word_length| == 0). -void SpellcheckWordIterator::GetSegment(int* segment_start, - int* segment_end) { - int position = position_; - while (position < length_) { - UChar32 character; - U16_NEXT(word_, position, length_, character); - if (!attribute_->IsWordChar(character)) { - if (!allow_contraction_ || !attribute_->IsContractionChar(character)) - break; + // Find a word that can be checked for spelling. Our rule sets filter out + // invalid words (e.g. numbers and characters not supported by the + // spellchecker language) so this ubrk_getRuleStatus() call returns + // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such + // words until we can find a valid word or reach the end of the input string. 
+ int next = ubrk_next(iterator_); + while (next != UBRK_DONE) { + if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { + if (Normalize(position_, next - position_, word_string)) { + *word_start = position_; + *word_length = next - position_; + position_ = next; + return true; + } } + position_ = next; + next = ubrk_next(iterator_); } - *segment_start = position_; - *segment_end = position; - position_ = position; + + // There aren't any more words in the given text. Set the position to + // UBRK_DONE to prevent from calling ubrk_next() next time when this function + // is called. + position_ = UBRK_DONE; + return false; } -// Discards non-word characters at the beginning and the end of the given -// segment. -void SpellcheckWordIterator::TrimSegment(int segment_start, - int segment_end, - int* word_start, - int* word_length) const { - while (segment_start < segment_end) { - UChar32 character; - int segment_next = segment_start; - U16_NEXT(word_, segment_next, segment_end, character); - if (attribute_->IsWordChar(character)) { - *word_start = segment_start; - break; - } - segment_start = segment_next; - } - while (segment_end >= segment_start) { - UChar32 character; - int segment_prev = segment_end; - U16_PREV(word_, segment_start, segment_prev, character); - if (attribute_->IsWordChar(character)) { - *word_length = segment_end - segment_start; - break; - } - segment_end = segment_prev; +void SpellcheckWordIterator::Close() { + if (iterator_) { + ubrk_close(iterator_); + iterator_ = NULL; } } -// Normalizes a non-terminated string into its canonical form so that -// a spellchecker object can check spellings of words which contain ligatures, -// full-width letters, etc. -// USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but -// also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, -// etc. For its details, please read the script table in -// "http://www.unicode.org/Public/UNIDATA/Scripts.txt". 
bool SpellcheckWordIterator::Normalize(int input_start, int input_length, string16* output_string) const { - // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" - // does not only write NFKD and NFKC can compose ligatures into their ASCII - // alternatives, but also write NFKC keeps accents of characters. - // Therefore, NFKC seems to be the best option for hunspell. + // We use NFKC to normalize this token because NFKC can compose combined + // characters and decompose ligatures. icu::UnicodeString input(FALSE, &word_[input_start], input_length); UErrorCode status = U_ZERO_ERROR; icu::UnicodeString output; icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); - if (U_SUCCESS(status)) - output_string->assign(output.getTerminatedBuffer()); - return status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING; + if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) + return false; + + // Copy the normalized text to the output. + icu::StringCharacterIterator it(output); + for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) + attribute_->OutputChar(c, output_string); + + return !output_string->empty(); } diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h index 7763314..aa54011 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator.h +++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h @@ -10,105 +10,117 @@ #include "base/basictypes.h" #include "base/string16.h" - -#include "unicode/uscript.h" - -// A class which handles character attributes dependent on a spellchecker and -// its dictionary. -// This class is used by the SpellcheckWordIterator class to determine whether -// or not a character is one used by the spellchecker and its dictinary. 
+#include "third_party/icu/public/common/unicode/ubrk.h" +#include "third_party/icu/public/common/unicode/uscript.h" + +// A class which encapsulates language-specific operations used by +// SpellcheckWordIterator. +// When we set the spellchecker language, this class creates rule sets that +// filter out the characters not supported by the spellchecker. +// (Please read the comment in the SpellcheckWordIterator class about how to +// use this class.) class SpellcheckCharAttribute { public: SpellcheckCharAttribute(); - ~SpellcheckCharAttribute(); - // Sets the default language of the spell checker. This controls which - // characters are considered parts of words of the given language. + // Sets the language of the spellchecker. + // This function creates the custom rule-sets used by SpellcheckWordIterator. + // Parameters + // * language [in] (std::string) + // The language-code string. void SetDefaultLanguage(const std::string& language); - // Returns whether or not the given character is a character used by the - // selected dictionary. + // Returns a custom rule-set string used by the ICU break iterator. // Parameters - // * character [in] (UChar32) - // Represents a Unicode character to be checked. - // Return values - // * true - // The given character is a word character. - // * false - // The given character is not a word character. - bool IsWordChar(UChar32 character) const; + // * allow_contraction [in] (bool) + // A flag to control whether or not this object splits a possible + // contraction. If this value is false, it returns a rule set that + // splits a possible contraction: "in'n'out" -> "in", "n", and "out". + string16 GetRuleSet(bool allow_contraction) const; - // Returns whether or not the given character is a character used by - // contractions. - // Parameters - // * character [in] (UChar32) - // Represents a Unicode character to be checked. - // Return values - // * true - // The given character is a character used by contractions. 
- // * false - // The given character is not a character used by contractions. - bool IsContractionChar(UChar32 character) const; + // Output a character only if it is a word character. + bool OutputChar(UChar c, string16* output) const; private: - // Initializes the mapping table. - void InitializeScriptTable(); - - // Retrieves the ICU script code. - UScriptCode GetScriptCode(UChar32 character) const; + // Creates the rule-set strings. + void CreateRuleSets(const std::string& language); - // Updates an entry in the mapping table. - void SetWordScript(const int script_code, bool in_use); - - // Returns whether or not the given script is used by the selected - // dictionary. - bool IsWordScript(const UScriptCode script_code) const; + // Language-specific output functions. + bool OutputArabic(UChar c, string16* output) const; + bool OutputHangul(UChar c, string16* output) const; + bool OutputHebrew(UChar c, string16* output) const; + bool OutputDefault(UChar c, string16* output) const; private: - // Represents a mapping table from a script code to a boolean value - // representing whether or not the script is used by the selected dictionary. - bool script_attributes_[USCRIPT_CODE_LIMIT]; + // The custom rule-set strings used by ICU BreakIterator. + // Since it is not so easy to create custom rule-sets from a spellchecker + // language, this class saves these rule-set strings created when we set the + // language. + string16 ruleset_allow_contraction_; + string16 ruleset_disallow_contraction_; - // Represents a table of characters used by contractions. - std::map<UChar32, bool> middle_letters_; + // The script code used by this language. + UScriptCode script_code_; DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); }; -// A class which implements methods for finding the location of word boundaries -// used by the Spellchecker class. -// This class is implemented on the following assumptions: -// * An input string is encoded in UTF-16 (i.e. 
it may contain surrogate -// pairs), and; -// * The length of a string is the number of UTF-16 characters in the string -// (i.e. the length of a non-BMP character becomes two). +// A class which extracts words that can be checked for spelling from a longer +// string. +// The ICU word-break iterator does not discard some punctuation characters +// attached to a word. For example, when we set a word "_hello_" to a +// word-break iterator, it just returns "_hello_". +// On the other hand, our spellchecker expects for us to discard such +// punctuation characters. +// To extract only the words that our spellchecker can check, this class uses +// custom rule-sets created by the SpellcheckCharAttribute class. +// Also, this class normalizes extracted words so our spellchecker can check +// the spellings of a word that includes ligatures, combined characters, +// full-width characters, etc. +// +// The following snippet is an example that extracts words with this class. +// +// // Creates the language-specific attributes for US English. +// SpellcheckCharAttribute attribute; +// attribute.SetDefaultLanguage("en-US"); +// +// // Set up a SpellcheckWordIterator object which extracts English words, +// // and retrieves them. +// SpellcheckWordIterator iterator; +// string16 text(UTF8ToUTF16("this is a test.")); +// iterator.Initialize(&attribute, text.c_str(), text_.length(), true); +// +// string16 word; +// int start; +// int end; +// while (iterator.GetNextWord(&word, &start, &end)) { +// ... +// } +// class SpellcheckWordIterator { public: SpellcheckWordIterator(); - ~SpellcheckWordIterator(); // Initializes a word-iterator object. // Parameters // * attribute [in] (const SpellcheckCharAttribute*) - // Represents a set of character attributes used for filtering out - // non-word characters. + // Character attributes used for filtering out non-word characters. // * word [in] (const char16*) - // Represents a string from which this object extracts words. 
- // (This string does not have to be NUL-terminated.) + // A string from which this object extracts words. (This string does not + // have to be NUL-terminated.) // * length [in] (size_t) - // Represents the length of the given string, in UTF-16 characters. - // This value should not include terminating NUL characters. + // The length of the given string, in UTF-16 characters. // * allow_contraction [in] (bool) - // Represents a flag to control whether or not this object should split a - // possible contraction (e.g. "isn't", "in'n'out", etc.) + // A flag to control whether or not this object should split a possible + // contraction (e.g. "isn't", "in'n'out", etc.) // Return values // * true // This word-iterator object is initialized successfully. // * false // An error occured while initializing this object. - void Initialize(const SpellcheckCharAttribute* attribute, + bool Initialize(const SpellcheckCharAttribute* attribute, const char16* word, size_t length, bool allow_contraction); @@ -116,20 +128,20 @@ class SpellcheckWordIterator { // Retrieves a word (or a contraction). // Parameters // * word_string [out] (string16*) - // Represents a word (or a contraction) to be checked its spelling. - // This |word_string| has been already normalized to its canonical form - // (i.e. decomposed ligatures, replaced full-width latin characters to - // its ASCII alternatives, etc.) so that a SpellChecker object can check - // its spelling without any additional operations. - // On the other hand, a substring of the input string + // A word (or a contraction) to be checked its spelling. This + // |word_string| has been already normalized to its canonical form (i.e. + // decomposed ligatures, replaced full-width latin characters to its ASCII + // alternatives, etc.) so a SpellChecker object can check its spelling + // without any additional operations. 
We can use |word_start| and + // |word_length| to retrieve the non-normalizedversion of this string as + // shown in the following snippet. // string16 str(&word[word_start], word_length); - // represents the non-normalized version of this extracted word. // * word_start [out] (int*) - // Represents the offset of this word from the beginning of the input - // string, in UTF-16 characters. + // The offset of this word from the beginning of the input string, + // in UTF-16 characters. // * word_length [out] (int*) - // Represents the length of an extracted word before normalization, in - // UTF-16 characters. + // The length of an extracted word before normalization, in UTF-16 + // characters. // When the input string contains ligatures, this value may not be equal // to the length of the |word_string|. // Return values @@ -142,20 +154,14 @@ class SpellcheckWordIterator { int* word_length); private: - // Retrieves a segment consisting of word characters (and contraction - // characters if the |allow_contraction| value is true). - void GetSegment(int* segment_start, - int* segment_end); - - // Discards non-word characters at the beginning and the end of the given - // segment. - void TrimSegment(int segment_start, - int segment_end, - int* word_start, - int* word_length) const; - - // Normalizes the given segment of the |word_| variable and write its - // canonical form to the |output_string|. + // Releases all the resources attached to this object. + void Close(); + + // Normalizes a non-terminated string so our spellchecker can check its + // spelling. A word returned from an ICU word-break iterator may include + // characters not supported by our spellchecker, e.g. ligatures, combining + // characters, full-width letters, etc. This function replaces such characters + // with alternative characters supported by our spellchecker. 
bool Normalize(int input_start, int input_length, string16* output_string) const; @@ -170,13 +176,13 @@ class SpellcheckWordIterator { // The current position in the original string. int position_; - // The flag to control whether or not this object should extract possible - // contractions. - bool allow_contraction_; - - // The character attributes used for filtering out non-word characters. + // The language-specific attributes used for filtering out non-word + // characters. const SpellcheckCharAttribute* attribute_; + // The ICU break iterator. + UBreakIterator* iterator_; + DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); }; diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc new file mode 100644 index 0000000..a41c93b --- /dev/null +++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <string> +#include <vector> + +#include "base/format_macros.h" +#include "base/string_util.h" +#include "chrome/renderer/spellchecker/spellcheck_worditerator.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace { + +struct TestCase { + const char* language; + bool allow_contraction; + const wchar_t* expected_words; +}; + +} // namespace + +// Tests whether or not our SpellcheckWordIterator can extract only words used +// by the specified language from a multi-language text. +TEST(SpellcheckWordIteratorTest, SplitWord) { + // An input text. This text includes words of several languages. (Some words + // are not separated with whitespace characters.) Our SpellcheckWordIterator + // should extract only the words used by the specified language from this text + // and normalize them so our spell-checker can check their spellings. 
+ const wchar_t kTestText[] = + // Numbers + L"0123456789" + // Latin (including a contraction character and a ligature). + L"hello:hello a\xFB03x" + // Greek + L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5" + // Cyrillic + L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" + L"\x0443\x0439\x0442\x0435" + // Hebrew (including niqquds) + L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd" + // Arabic (including vowel marks) + L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627" + L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a" + L"\x0652\x0643\x064f\x0645\x0652" + // Hindi + L"\x0930\x093E\x091C\x0927\x093E\x0928" + // Thai + L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04" + L"\x0e23\x0e31\x0e1a" + // Hiraganas + L"\x3053\x3093\x306B\x3061\x306F" + // CJKV ideographs + L"\x4F60\x597D" + // Hangul Syllables + L"\xC548\xB155\xD558\xC138\xC694" + // Full-width latin + L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F"; + + // The languages and expected results used in this test. + static const TestCase kTestCases[] = { + { + // English (keep contraction words) + "en-US", true, L"hello:hello affix Hello" + }, { + // English (split contraction words) + "en-US", false, L"hello hello affix Hello" + }, { + // Greek + "el-GR", true, + L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5" + }, { + // Russian + "ru-RU", true, + L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" + L"\x0443\x0439\x0442\x0435" + }, { + // Hebrew + "he-IL", true, + L"\x05e9\x05dc\x05d5\x05dd" + }, { + // Arabic + "ar", true, + L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639" + L"\x0644\x064a\x0643\x0645" + }, { + // Hindi + "hi-IN", true, + L"\x0930\x093E\x091C\x0927\x093E\x0928" + }, { + // Thai + "th-TH", true, + L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04" + L"\x0e23\x0e31\x0e1a" + }, { + // Korean + "ko-KR", true, + L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161" + L"\x1109\x1166\x110b\x116d" + }, + }; + + for (size_t i = 0; i < arraysize(kTestCases); ++i) { + 
SCOPED_TRACE(StringPrintf("kTestCases[%" PRIuS "]: language=%s", i, + kTestCases[i].language)); + + SpellcheckCharAttribute attributes; + attributes.SetDefaultLanguage(kTestCases[i].language); + + string16 input(WideToUTF16(kTestText)); + SpellcheckWordIterator iterator; + EXPECT_TRUE(iterator.Initialize(&attributes, input.c_str(), input.length(), + kTestCases[i].allow_contraction)); + + std::vector<string16> expected_words; + SplitString(WideToUTF16(kTestCases[i].expected_words), ' ', + &expected_words); + + string16 actual_word; + int actual_start, actual_end; + size_t index = 0; + while (iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) { + EXPECT_TRUE(index < expected_words.size()); + if (index < expected_words.size()) + EXPECT_EQ(expected_words[index], actual_word); + ++index; + } + } +} |