Re-implement SpellcheckWordIterator with ICU.

This change re-implements the SpellcheckWordIterator class to use ICU custom rules so we can use the ICU dictionary to handle Thai and Korean. Also, this class has added a couple of new features to improve the spell-checking quality:
* Decompose Hangul syllables into Korean Jamos. This helps us support Korean spell-checking.
* Filter out some characters not needed by our spell-checker (e.g. Hebrew niqquds and Arabic vowel signs). This prevents us from marking a word that includes these characters as misspelled.

BUG=8487
TEST=unit_test.exe --gtest_filter=SpellcheckWordIteratorTest*
Review URL: http://codereview.chromium.org/577020

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@39082 0039d316-1c4b-4281-b951-d872f2087c98
author: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-02-16 09:02:41 +0000
committer: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-02-16 09:02:41 +0000
commit: 59e2c39984b9a34bbb72db1f3a02be171dc9726d (patch)
tree: 5bf95d487360cb6cf59e0c01c4e89d5c9d9d82b8 /chrome
parent: 2b19e2feeac2a01b2068595bd2913a194a6527e5 (diff)
download: chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.zip
chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.tar.gz
chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.tar.bz2
4 files changed, 496 insertions, 291 deletions
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index afe2ba5..66e995b 100755
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -884,6 +884,7 @@
         'renderer/render_widget_unittest.cc',
         'renderer/renderer_main_unittest.cc',
         'renderer/spellchecker/spellcheck_unittest.cc',
+        'renderer/spellchecker/spellcheck_worditerator_unittest.cc',
         'renderer/translate/page_translator_unittest.cc',
         'test/browser_with_test_window_test.cc',
         'test/browser_with_test_window_test.h',
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 827d9ee..0806f50 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -10,265 +10,334 @@
 #include "base/basictypes.h"
 #include "base/string_util.h"
 #include "chrome/renderer/spellchecker/spellcheck.h"
-
 #include "third_party/icu/public/common/unicode/normlzr.h"
 #include "third_party/icu/public/common/unicode/schriter.h"
-#include "third_party/icu/public/common/unicode/uchar.h"
 #include "third_party/icu/public/common/unicode/uscript.h"
-#include "third_party/icu/public/common/unicode/uset.h"
 #include "third_party/icu/public/i18n/unicode/ulocdata.h"
 
-SpellcheckCharAttribute::SpellcheckCharAttribute() {
-  InitializeScriptTable();
-
-  // Even though many dictionaries treats numbers and contractions as words and
-  // treats USCRIPT_COMMON characters as word characters, the
-  // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word
-  // characters to strictly-distinguish contraction characters from word
-  // characters.
-  SetWordScript(USCRIPT_COMMON, false);
-
-  // Initialize the table of characters used for contractions.
-  // This array consists of the 'Midletter' and 'MidNumLet' characters of the
-  // word-break property list provided by Unicode, Inc.:
-  //   http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-  static const UChar32 kMidLetters[] = {
-      L'\x003A',  // MidLetter # COLON
-      L'\x00B7',  // MidLetter # MIDDLE DOT
-      L'\x0387',  // MidLetter # GREEK ANO TELEIA
-      L'\x05F4',  // MidLetter # HEBREW PUNCTUATION GERSHAYIM
-      L'\x2027',  // MidLetter # HYPHENATION POINT
-      L'\xFE13',  // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
-      L'\xFE55',  // MidLetter # SMALL COLON
-      L'\xFF1A',  // MidLetter # FULLWIDTH COLON
-      L'\x0027',  // MidNumLet # APOSTROPHE
-      L'\x002E',  // MidNumLet # FULL STOP
-      L'\x2018',  // MidNumLet # LEFT SINGLE QUOTATION MARK
-      L'\x2019',  // MidNumLet # RIGHT SINGLE QUOTATION MARK
-      L'\x2024',  // MidNumLet # ONE DOT LEADER
-      L'\xFE52',  // MidNumLet # SMALL FULL STOP
-      L'\xFF07',  // MidNumLet # FULLWIDTH APOSTROPHE
-      L'\xFF0E',  // MidNumLet # FULLWIDTH FULL STOP
-  };
-  for (size_t i = 0; i < arraysize(kMidLetters); ++i)
-    middle_letters_[kMidLetters[i]] = true;
+///////////////////////////////////////////////////////////////////////////////
+// SpellcheckCharAttribute implementation:
+
+SpellcheckCharAttribute::SpellcheckCharAttribute()
+    : script_code_(USCRIPT_LATIN) {
 }
 
 SpellcheckCharAttribute::~SpellcheckCharAttribute() {
 }
 
-// Sets the default language for this object.
-// This function retrieves the exemplar set to set up the default character
-// attributes.
 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
-  UErrorCode status = U_ZERO_ERROR;
-  ULocaleData* locale_data = ulocdata_open(language.c_str(), &status);
-  if (U_FAILURE(status))
-    return;
-
-  // Retrieves the exemplar set of the given language and update the
-  // character-attribute table to treat its characters as word characters.
-  USet* exemplar_set = uset_open(1, 0);
-  ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
-                          &status);
-  ulocdata_close(locale_data);
-  if (U_SUCCESS(status)) {
-    int length = uset_size(exemplar_set);
-    for (int i = 0; i < length; ++i) {
-      UChar32 character = uset_charAt(exemplar_set, i);
-      SetWordScript(GetScriptCode(character), true);
-    }
+  CreateRuleSets(language);
+}
 
-    // Many languages use combining characters to input their characters from
-    // keyboards. On the other hand, this exemplar set does not always include
-    // combining characters for such languages.
-    // To treat such combining characters as word characters, we decompose
-    // this exemplar set and treat the decomposed characters as word characters.
-    icu::UnicodeString composed;
-    for (int i = 0; i < length; ++i)
-      composed.append(uset_charAt(exemplar_set, i));
-
-    icu::UnicodeString decomposed;
-    icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status);
-    if (U_SUCCESS(status)) {
-      icu::StringCharacterIterator iterator(decomposed);
-      UChar32 character = iterator.first32();
-      while (character != icu::CharacterIterator::DONE) {
-        SetWordScript(GetScriptCode(character), true);
-        character = iterator.next32();
-      }
-    }
-  }
-  uset_close(exemplar_set);
+string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const {
+  return allow_contraction ?
+      ruleset_allow_contraction_ : ruleset_disallow_contraction_;
 }
 
-// Returns whether or not the given character is a character used by the
-// selected dictionary.
-bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
-  return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);
+void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
+  // The template for our custom rule sets. Even though this template is based
+  // on the one of ICU 4.0, it changed the following points:
+  // * It discards characters not needed by our spellchecker (e.g. numbers,
+  //   punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
+  // * It allows customization of the $ALetter value (i.e. word characters).
+  // * It allows customization of the $ALetterPlus value (i.e. whether or not to
+  //   use the dictionary data).
+  // * It allows choosing whether or not to split a text at contraction
+  //   characters.
+  // This template only changes the forward-iteration rules. So, calling
+  // ubrk_prev() returns the same results as the original template.
+  static const char kRuleTemplate[] =
+      "!!chain;"
+      "$CR           = [\\p{Word_Break = CR}];"
+      "$LF           = [\\p{Word_Break = LF}];"
+      "$Newline      = [\\p{Word_Break = Newline}];"
+      "$Extend       = [\\p{Word_Break = Extend}];"
+      "$Format       = [\\p{Word_Break = Format}];"
+      "$Katakana     = [\\p{Word_Break = Katakana}];"
+      "$ALetter      = [\\p{script=%s}];"
+      "$MidNumLet    = [\\p{Word_Break = MidNumLet}];"
+      "$MidLetter    = [\\p{Word_Break = MidLetter}];"
+      "$MidNum       = [\\p{Word_Break = MidNum}];"
+      "$Numeric      = [\\p{Word_Break = Numeric}];"
+      "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
+
+      "$dictionary   = [:LineBreak = Complex_Context:];"
+      "$Control        = [\\p{Grapheme_Cluster_Break = Control}]; "
+      "$ALetterPlus  = %s;"
+
+      "$KatakanaEx     = $Katakana     ($Extend |  $Format)*;"
+      "$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;"
+      "$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;"
+      "$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;"
+      "$MidNumEx       = $MidNum       ($Extend |  $Format)*;"
+      "$NumericEx      = $Numeric      ($Extend |  $Format)*;"
+      "$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;"
+
+      "$Hiragana       = [\\p{script=Hiragana}];"
+      "$Ideographic    = [\\p{Ideographic}];"
+      "$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;"
+      "$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;"
+
+      "!!forward;"
+      "$CR $LF;"
+      "[^$CR $LF $Newline]? ($Extend |  $Format)+;"
+      "$ALetterEx {200};"
+      "$ALetterEx $ALetterEx {200};"
+      "%s"
+
+      "!!reverse;"
+      "$BackALetterEx     = ($Format | $Extend)* $ALetterPlus;"
+      "$BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;"
+      "$BackNumericEx     = ($Format | $Extend)* $Numeric;"
+      "$BackMidNumEx      = ($Format | $Extend)* $MidNum;"
+      "$BackMidLetterEx   = ($Format | $Extend)* $MidLetter;"
+      "$BackKatakanaEx    = ($Format | $Extend)* $Katakana;"
+      "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;"
+      "$LF $CR;"
+      "($Format | $Extend)*  [^$CR $LF $Newline]?;"
+      "$BackALetterEx $BackALetterEx;"
+      "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;"
+      "$BackNumericEx $BackNumericEx;"
+      "$BackNumericEx $BackALetterEx;"
+      "$BackALetterEx $BackNumericEx;"
+      "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;"
+      "$BackKatakanaEx $BackKatakanaEx;"
+      "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |"
+      " $BackKatakanaEx | $BackExtendNumLetEx);"
+      "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)"
+      " $BackExtendNumLetEx;"
+
+      "!!safe_reverse;"
+      "($Extend | $Format)+ .?;"
+      "($MidLetter | $MidNumLet) $BackALetterEx;"
+      "($MidNum | $MidNumLet) $BackNumericEx;"
+      "$dictionary $dictionary;"
+
+      "!!safe_forward;"
+      "($Extend | $Format)+ .?;"
+      "($MidLetterEx | $MidNumLetEx) $ALetterEx;"
+      "($MidNumEx | $MidNumLetEx) $NumericEx;"
+      "$dictionary $dictionary;";
+
+  // Retrieve the script code used by the given language from ICU. When the
+  // given language consists of two or more scripts, we just use the first
+  // script.
+  UErrorCode error = U_ZERO_ERROR;
+  UScriptCode script_code[8];
+  int scripts = uscript_getCode(language.c_str(), script_code,
+                                arraysize(script_code), &error);
+  if (U_SUCCESS(error) && scripts >= 1)
+    script_code_ = script_code[0];
+
+  // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary
+  // only for the languages which need it (i.e. Korean and Thai) to prevent ICU
+  // from returning dictionary words (i.e. Korean or Thai words) for languages
+  // which don't need them.
+  const char* aletter = uscript_getName(script_code_);
+  if (!aletter)
+    aletter = "Latin";
+
+  const char kWithDictionary[] = "[$ALetter [$dictionary-$Extend-$Control]]";
+  const char kWithoutDictionary[] = "$ALetter";
+  const char* aletter_plus = kWithoutDictionary;
+  if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
+    aletter_plus = kWithDictionary;
+
+  // Create two custom rule-sets: one allows contraction and the other doesn't.
+  // We save these strings in UTF-16 so we can use it without conversions. (ICU
+  // needs UTF-16 strings.)
+  const char kAllowContraction[] =
+      "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};";
+  const char kDisallowContraction[] = "";
+
+  ruleset_allow_contraction_ = UTF8ToUTF16(StringPrintf(kRuleTemplate,
+      aletter, aletter_plus, kAllowContraction));
+  ruleset_disallow_contraction_ = UTF8ToUTF16(StringPrintf(kRuleTemplate,
+      aletter, aletter_plus, kDisallowContraction));
 }
 
-// Returns whether or not the given character is a character used by
-// contractions.
-bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
-  std::map<UChar32, bool>::const_iterator iterator;
-  iterator = middle_letters_.find(character);
-  if (iterator == middle_letters_.end())
-    return false;
-  return iterator->second;
+bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {
+  // Call the language-specific function if necessary.
+  // Otherwise, we call the default one.
+  switch (script_code_) {
+    case USCRIPT_ARABIC:
+      return OutputArabic(c, output);
+
+    case USCRIPT_HANGUL:
+      return OutputHangul(c, output);
+
+    case USCRIPT_HEBREW:
+      return OutputHebrew(c, output);
+
+    default:
+      return OutputDefault(c, output);
+  }
 }
 
-// Initializes the mapping table.
-void SpellcheckCharAttribute::InitializeScriptTable() {
-  for (size_t i = 0; i < arraysize(script_attributes_); ++i)
-    script_attributes_[i] = false;
+bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const {
+  // Discard characters not from Arabic alphabets. We also discard vowel marks
+  // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from
+  // marking an Arabic word including vowel marks as misspelled. (We need to
+  // check these vowel marks manually and filter them out since their script
+  // codes are USCRIPT_ARABIC.)
+  if (0x0621 <= c && c <= 0x064D)
+    output->push_back(c);
+  return true;
 }
 
-// Retrieves the ICU script code.
-UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
-  UErrorCode status = U_ZERO_ERROR;
-  UScriptCode script_code = uscript_getScript(character, &status);
-  return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;
+bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
+  // Decompose a Hangul syllable to Hangul jamos.
+  // This code is copied from Unicode Standard Annex #15:
+  // <http://unicode.org/reports/tr15>.
+  const int kSBase = 0xAC00;
+  const int kLBase = 0x1100;
+  const int kVBase = 0x1161;
+  const int kTBase = 0x11A7;
+  const int kLCount = 19;
+  const int kVCount = 21;
+  const int kTCount = 28;
+  const int kNCount = kVCount * kTCount;
+  const int kSCount = kLCount * kNCount;
+
+  int index = c - kSBase;
+  if (index < 0 || index >= kSCount) {
+    // This is not a Hangul syllable (i.e. outside U+AC00..U+D7A3). Call the
+    // default output function since we should output it when it is a jamo.
+    return OutputDefault(c, output);
+  }
+
+  // This is a Hangul syllable. Decompose this syllable into Hangul jamos and
+  // output them.
+  int l = kLBase + index / kNCount;
+  int v = kVBase + (index % kNCount) / kTCount;
+  int t = kTBase + index % kTCount;
+  output->push_back(l);
+  output->push_back(v);
+  if (t != kTBase)
+    output->push_back(t);
+  return true;
 }
 
-// Updates the mapping table from an ICU script code to its attribute, i.e.
-// whether not a script is used by the selected dictionary.
-void SpellcheckCharAttribute::SetWordScript(const int script_code,
-                                            bool in_use) {
-  if (script_code < 0 ||
-      static_cast<size_t>(script_code) >= arraysize(script_attributes_))
-    return;
-  script_attributes_[script_code] = in_use;
+bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
+  // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
+  // to prevent our Hebrew dictionary from marking a Hebrew word including
+  // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
+  // niqquds manually and filter them out since their script codes are
+  // USCRIPT_HEBREW.)
+  if (0x05D0 <= c && c <= 0x05EA)
+    output->push_back(c);
+  return true;
 }
 
-// Returns whether or not the given script is used by the selected
-// dictionary.
-bool SpellcheckCharAttribute::IsWordScript(
-    const UScriptCode script_code) const {
-  if (script_code < 0 ||
-      static_cast<size_t>(script_code) >= arraysize(script_attributes_))
-    return false;
-  return script_attributes_[script_code];
+bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {
+  // Check the script code of this character and output only if it is the one
+  // used by the spellchecker language.
+  UErrorCode status = U_ZERO_ERROR;
+  UScriptCode script_code = uscript_getScript(c, &status);
+  if (script_code == script_code_ || script_code == USCRIPT_COMMON)
+    output->push_back(c);
+  return true;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// SpellcheckWordIterator implementation:
+
 SpellcheckWordIterator::SpellcheckWordIterator()
     : word_(NULL),
       length_(0),
-      position_(0),
-      allow_contraction_(false),
-      attribute_(NULL) {
+      position_(UBRK_DONE),
+      attribute_(NULL),
+      iterator_(NULL) {
 }
 
 SpellcheckWordIterator::~SpellcheckWordIterator() {
+  Close();
 }
 
-// Initialize a word-iterator object.
-void SpellcheckWordIterator::Initialize(
+bool SpellcheckWordIterator::Initialize(
     const SpellcheckCharAttribute* attribute,
     const char16* word,
     size_t length,
     bool allow_contraction) {
+  // Create a custom ICU break iterator used in this object.
+  DCHECK(attribute);
+  UErrorCode open_status = U_ZERO_ERROR;
+  UParseError parse_status;
+  string16 rule(attribute->GetRuleSet(allow_contraction));
+  iterator_ = ubrk_openRules(rule.c_str(), rule.length(), word, length,
+                             &parse_status, &open_status);
+  if (U_FAILURE(open_status))
+    return false;
+
+  position_ = ubrk_first(iterator_);
+  if (position_ == UBRK_DONE)
+    return false;
+
   word_ = word;
-  position_ = 0;
   length_ = static_cast<int>(length);
-  allow_contraction_ = allow_contraction;
   attribute_ = attribute;
+  return true;
 }
 
-// Retrieves a word (or a contraction).
-// When a contraction is enclosed with contraction characters (e.g. 'isn't',
-// 'rock'n'roll'), we should discard the beginning and the end of the
-// contraction but we should never split the contraction.
-// To handle this case easily, we should firstly extract a segment consisting
-// of word characters and contraction characters, and discard contraction
-// characters at the beginning and the end of the extracted segment.
 bool SpellcheckWordIterator::GetNextWord(string16* word_string,
                                          int* word_start,
                                          int* word_length) {
-  word_string->empty();
+  word_string->clear();
   *word_start = 0;
   *word_length = 0;
-  while (position_ < length_) {
-    int segment_start = 0;
-    int segment_end = 0;
-    GetSegment(&segment_start, &segment_end);
-    TrimSegment(segment_start, segment_end, word_start, word_length);
-    if (*word_length > 0)
-      return Normalize(*word_start, *word_length, word_string);
-  }
 
-  return false;
-}
+  if (!word_ || position_ == UBRK_DONE)
+    return false;
 
-// Retrieves a segment consisting of word characters (and contraction
-// characters if the |allow_contraction_| value is true).
-// When the current position refers to a non-word character, this function
-// returns a non-empty segment consisting of the character itself. In this
-// case, the TrimSegment() function discards the character and returns an
-// empty word (i.e. |word_length| == 0).
-void SpellcheckWordIterator::GetSegment(int* segment_start,
-                                        int* segment_end) {
-  int position = position_;
-  while (position <  length_) {
-    UChar32 character;
-    U16_NEXT(word_, position, length_, character);
-    if (!attribute_->IsWordChar(character)) {
-      if (!allow_contraction_ || !attribute_->IsContractionChar(character))
-        break;
+  // Find a word that can be checked for spelling. Our rule sets filter out
+  // invalid words (e.g. numbers and characters not supported by the
+  // spellchecker language) so this ubrk_getRuleStatus() call returns
+  // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
+  // words until we can find a valid word or reach the end of the input string.
+  int next = ubrk_next(iterator_);
+  while (next != UBRK_DONE) {
+    if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {
+      if (Normalize(position_, next - position_, word_string)) {
+        *word_start = position_;
+        *word_length = next - position_;
+        position_ = next;
+        return true;
+      }
     }
+    position_ = next;
+    next = ubrk_next(iterator_);
   }
-  *segment_start = position_;
-  *segment_end = position;
-  position_ = position;
+
+  // There aren't any more words in the given text. Set the position to
+  // UBRK_DONE to prevent from calling ubrk_next() next time when this function
+  // is called.
+  position_ = UBRK_DONE;
+  return false;
 }
 
-// Discards non-word characters at the beginning and the end of the given
-// segment.
-void SpellcheckWordIterator::TrimSegment(int segment_start,
-                                         int segment_end,
-                                         int* word_start,
-                                         int* word_length) const {
-  while (segment_start < segment_end) {
-    UChar32 character;
-    int segment_next = segment_start;
-    U16_NEXT(word_, segment_next, segment_end, character);
-    if (attribute_->IsWordChar(character)) {
-      *word_start = segment_start;
-      break;
-    }
-    segment_start = segment_next;
-  }
-  while (segment_end >= segment_start) {
-    UChar32 character;
-    int segment_prev = segment_end;
-    U16_PREV(word_, segment_start, segment_prev, character);
-    if (attribute_->IsWordChar(character)) {
-      *word_length = segment_end - segment_start;
-      break;
-    }
-    segment_end = segment_prev;
+void SpellcheckWordIterator::Close() {
+  if (iterator_) {
+    ubrk_close(iterator_);
+    iterator_ = NULL;
   }
 }
 
-// Normalizes a non-terminated string into its canonical form so that
-// a spellchecker object can check spellings of words which contain ligatures,
-// full-width letters, etc.
-// USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
-// also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
-// etc. For its details, please read the script table in
-// "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
 bool SpellcheckWordIterator::Normalize(int input_start,
                                        int input_length,
                                        string16* output_string) const {
-  // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
-  // does not only write NFKD and NFKC can compose ligatures into their ASCII
-  // alternatives, but also write NFKC keeps accents of characters.
-  // Therefore, NFKC seems to be the best option for hunspell.
+  // We use NFKC to normalize this token because NFKC can compose combined
+  // characters and decompose ligatures.
   icu::UnicodeString input(FALSE, &word_[input_start], input_length);
   UErrorCode status = U_ZERO_ERROR;
   icu::UnicodeString output;
   icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
-  if (U_SUCCESS(status))
-    output_string->assign(output.getTerminatedBuffer());
-  return status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING;
+  if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
+    return false;
+
+  // Copy the normalized text to the output.
+  icu::StringCharacterIterator it(output);
+  for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
+    attribute_->OutputChar(c, output_string);
+
+  return !output_string->empty();
 }
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h
index 7763314..aa54011 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.h
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h
@@ -10,105 +10,117 @@
 
 #include "base/basictypes.h"
 #include "base/string16.h"
-
-#include "unicode/uscript.h"
-
-// A class which handles character attributes dependent on a spellchecker and
-// its dictionary.
-// This class is used by the SpellcheckWordIterator class to determine whether
-// or not a character is one used by the spellchecker and its dictinary.
+#include "third_party/icu/public/common/unicode/ubrk.h"
+#include "third_party/icu/public/common/unicode/uscript.h"
+
+// A class which encapsulates language-specific operations used by
+// SpellcheckWordIterator.
+// When we set the spellchecker language, this class creates rule sets that
+// filter out the characters not supported by the spellchecker.
+// (Please read the comment in the SpellcheckWordIterator class about how to
+// use this class.)
 class SpellcheckCharAttribute {
  public:
   SpellcheckCharAttribute();
-
   ~SpellcheckCharAttribute();
 
-  // Sets the default language of the spell checker. This controls which
-  // characters are considered parts of words of the given language.
+  // Sets the language of the spellchecker.
+  // This function creates the custom rule-sets used by SpellcheckWordIterator.
+  // Parameters
+  //   * language [in] (std::string)
+  //     The language-code string.
   void SetDefaultLanguage(const std::string& language);
 
-  // Returns whether or not the given character is a character used by the
-  // selected dictionary.
+  // Returns a custom rule-set string used by the ICU break iterator.
   // Parameters
-  //   * character [in] (UChar32)
-  //     Represents a Unicode character to be checked.
-  // Return values
-  //   * true
-  //     The given character is a word character.
-  //   * false
-  //     The given character is not a word character.
-  bool IsWordChar(UChar32 character) const;
+  //   * allow_contraction [in] (bool)
+  //     A flag to control whether or not this object splits a possible
+  //     contraction. If this value is false, it returns a rule set that
+  //     splits a possible contraction: "in'n'out" -> "in", "n", and "out".
+  string16 GetRuleSet(bool allow_contraction) const;
 
-  // Returns whether or not the given character is a character used by
-  // contractions.
-  // Parameters
-  //   * character [in] (UChar32)
-  //     Represents a Unicode character to be checked.
-  // Return values
-  //   * true
-  //     The given character is a character used by contractions.
-  //   * false
-  //     The given character is not a character used by contractions.
-  bool IsContractionChar(UChar32 character) const;
+  // Output a character only if it is a word character.
+  bool OutputChar(UChar c, string16* output) const;
 
  private:
-  // Initializes the mapping table.
-  void InitializeScriptTable();
-
-  // Retrieves the ICU script code.
-  UScriptCode GetScriptCode(UChar32 character) const;
+  // Creates the rule-set strings.
+  void CreateRuleSets(const std::string& language);
 
-  // Updates an entry in the mapping table.
-  void SetWordScript(const int script_code, bool in_use);
-
-  // Returns whether or not the given script is used by the selected
-  // dictionary.
-  bool IsWordScript(const UScriptCode script_code) const;
+  // Language-specific output functions.
+  bool OutputArabic(UChar c, string16* output) const;
+  bool OutputHangul(UChar c, string16* output) const;
+  bool OutputHebrew(UChar c, string16* output) const;
+  bool OutputDefault(UChar c, string16* output) const;
 
  private:
-  // Represents a mapping table from a script code to a boolean value
-  // representing whether or not the script is used by the selected dictionary.
-  bool script_attributes_[USCRIPT_CODE_LIMIT];
+  // The custom rule-set strings used by ICU BreakIterator.
+  // Since it is not so easy to create custom rule-sets from a spellchecker
+  // language, this class saves these rule-set strings created when we set the
+  // language.
+  string16 ruleset_allow_contraction_;
+  string16 ruleset_disallow_contraction_;
 
-  // Represents a table of characters used by contractions.
-  std::map<UChar32, bool> middle_letters_;
+  // The script code used by this language.
+  UScriptCode script_code_;
 
   DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
 };
 
-// A class which implements methods for finding the location of word boundaries
-// used by the Spellchecker class.
-// This class is implemented on the following assumptions:
-//   * An input string is encoded in UTF-16 (i.e. it may contain surrogate
-//     pairs), and;
-//   * The length of a string is the number of UTF-16 characters in the string
-//     (i.e. the length of a non-BMP character becomes two).
+// A class which extracts words that can be checked for spelling from a longer
+// string.
+// The ICU word-break iterator does not discard some punctuation characters
+// attached to a word. For example, when we set a word "_hello_" to a
+// word-break iterator, it just returns "_hello_".
+// On the other hand, our spellchecker expects for us to discard such
+// punctuation characters.
+// To extract only the words that our spellchecker can check, this class uses
+// custom rule-sets created by the SpellcheckCharAttribute class.
+// Also, this class normalizes extracted words so our spellchecker can check
+// the spellings of a word that includes ligatures, combined characters,
+// full-width characters, etc.
+//
+// The following snippet is an example that extracts words with this class.
+//
+//   // Creates the language-specific attributes for US English.
+//   SpellcheckCharAttribute attribute;
+//   attribute.SetDefaultLanguage("en-US");
+//
+//   // Set up a SpellcheckWordIterator object which extracts English words,
+//   // and retrieves them.
+//   SpellcheckWordIterator iterator;
+//   string16 text(UTF8ToUTF16("this is a test."));
+//   iterator.Initialize(&attribute, text.c_str(), text.length(), true);
+//
+//   string16 word;
+//   int start;
+//   int end;
+//   while (iterator.GetNextWord(&word, &start, &end)) {
+//     ...
+//   }
+//
 class SpellcheckWordIterator {
  public:
   SpellcheckWordIterator();
-
   ~SpellcheckWordIterator();
 
   // Initializes a word-iterator object.
   // Parameters
   //   * attribute [in] (const SpellcheckCharAttribute*)
-  //     Represents a set of character attributes used for filtering out
-  //     non-word characters.
+  //     Character attributes used for filtering out non-word characters.
   //   * word [in] (const char16*)
-  //     Represents a string from which this object extracts words.
-  //     (This string does not have to be NUL-terminated.)
+  //     A string from which this object extracts words. (This string does not
+  //     have to be NUL-terminated.)
   //   * length [in] (size_t)
-  //     Represents the length of the given string, in UTF-16 characters.
-  //     This value should not include terminating NUL characters.
+  //     The length of the given string, in UTF-16 characters.
   //   * allow_contraction [in] (bool)
-  //     Represents a flag to control whether or not this object should split a
-  //     possible contraction (e.g. "isn't", "in'n'out", etc.)
+  //     A flag to control whether or not this object should split a possible
+  //     contraction (e.g. "isn't", "in'n'out", etc.)
   // Return values
   //   * true
   //     This word-iterator object is initialized successfully.
   //   * false
   //     An error occured while initializing this object.
-  void Initialize(const SpellcheckCharAttribute* attribute,
+  bool Initialize(const SpellcheckCharAttribute* attribute,
                   const char16* word,
                   size_t length,
                   bool allow_contraction);
@@ -116,20 +128,20 @@ class SpellcheckWordIterator {
   // Retrieves a word (or a contraction).
   // Parameters
   //   * word_string [out] (string16*)
-  //     Represents a word (or a contraction) to be checked its spelling.
-  //     This |word_string| has been already normalized to its canonical form
-  //     (i.e. decomposed ligatures, replaced full-width latin characters to
-  //     its ASCII alternatives, etc.) so that a SpellChecker object can check
-  //     its spelling without any additional operations.
-  //     On the other hand, a substring of the input string
+  //     A word (or a contraction) whose spelling is to be checked. This
+  //     |word_string| has been already normalized to its canonical form (i.e.
+  //     decomposed ligatures, replaced full-width latin characters to its ASCII
+  //     alternatives, etc.) so a SpellChecker object can check its spelling
+  //     without any additional operations. We can use |word_start| and
+  //     |word_length| to retrieve the non-normalized version of this string as
+  //     shown in the following snippet.
   //       string16 str(&word[word_start], word_length);
-  //     represents the non-normalized version of this extracted word.
   //   * word_start [out] (int*)
-  //     Represents the offset of this word from the beginning of the input
-  //     string, in UTF-16 characters.
+  //     The offset of this word from the beginning of the input string,
+  //     in UTF-16 characters.
   //   * word_length [out] (int*)
-  //     Represents the length of an extracted word before normalization, in
-  //     UTF-16 characters.
+  //     The length of an extracted word before normalization, in UTF-16
+  //     characters.
   //     When the input string contains ligatures, this value may not be equal
   //     to the length of the |word_string|.
   // Return values
@@ -142,20 +154,14 @@ class SpellcheckWordIterator {
                    int* word_length);
 
  private:
-  // Retrieves a segment consisting of word characters (and contraction
-  // characters if the |allow_contraction| value is true).
-  void GetSegment(int* segment_start,
-                  int* segment_end);
-
-  // Discards non-word characters at the beginning and the end of the given
-  // segment.
-  void TrimSegment(int segment_start,
-                   int segment_end,
-                   int* word_start,
-                   int* word_length) const;
-
-  // Normalizes the given segment of the |word_| variable and write its
-  // canonical form to the |output_string|.
+  // Releases all the resources attached to this object.
+  void Close();
+
+  // Normalizes a non-terminated string so our spellchecker can check its
+  // spelling. A word returned from an ICU word-break iterator may include
+  // characters not supported by our spellchecker, e.g. ligatures, combining
+  // characters, full-width letters, etc. This function replaces such characters
+  // with alternative characters supported by our spellchecker.
   bool Normalize(int input_start,
                  int input_length,
                  string16* output_string) const;
@@ -170,13 +176,13 @@ class SpellcheckWordIterator {
   // The current position in the original string.
   int position_;
 
-  // The flag to control whether or not this object should extract possible
-  // contractions.
-  bool allow_contraction_;
-
-  // The character attributes used for filtering out non-word characters.
+  // The language-specific attributes used for filtering out non-word
+  // characters.
   const SpellcheckCharAttribute* attribute_;
 
+  // The ICU break iterator.
+  UBreakIterator* iterator_;
+
   DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
 };
 
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
new file mode 100644
index 0000000..a41c93b
--- /dev/null
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+#include <vector>
+
+#include "base/format_macros.h"
+#include "base/string_util.h"
+#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace {
+
+struct TestCase {
+  const char* language;  // Language code given to SetDefaultLanguage().
+  bool allow_contraction;  // Flag forwarded to Initialize().
+  const wchar_t* expected_words;  // Expected words, separated by spaces.
+};
+
+}  // namespace
+
+// Tests whether or not our SpellcheckWordIterator can extract only words used
+// by the specified language from a multi-language text.
+TEST(SpellcheckWordIteratorTest, SplitWord) {
+  // An input text. This text includes words of several languages. (Some words
+  // are not separated with whitespace characters.) Our SpellcheckWordIterator
+  // should extract only the words used by the specified language from this text
+  // and normalize them so our spell-checker can check their spellings.
+  const wchar_t kTestText[] =
+      // Numbers
+      L"0123456789"
+      // Latin (including a contraction character and a ligature).
+      L"hello:hello a\xFB03x"
+      // Greek
+      L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
+      // Cyrillic
+      L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+      L"\x0443\x0439\x0442\x0435"
+      // Hebrew (including niqquds)
+      L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd"
+      // Arabic (including vowel marks)
+      L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
+      L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
+      L"\x0652\x0643\x064f\x0645\x0652"
+      // Hindi
+      L"\x0930\x093E\x091C\x0927\x093E\x0928"
+      // Thai
+      L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
+      L"\x0e23\x0e31\x0e1a"
+      // Hiraganas
+      L"\x3053\x3093\x306B\x3061\x306F"
+      // CJKV ideographs
+      L"\x4F60\x597D"
+      // Hangul Syllables
+      L"\xC548\xB155\xD558\xC138\xC694"
+      // Full-width latin
+      L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F";
+
+  // The languages and expected results used in this test.
+  static const TestCase kTestCases[] = {
+    {
+      // English (keep contraction words)
+      "en-US", true, L"hello:hello affix Hello"
+    }, {
+      // English (split contraction words)
+      "en-US", false, L"hello hello affix Hello"
+    }, {
+      // Greek
+      "el-GR", true,
+      L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
+    }, {
+      // Russian
+      "ru-RU", true,
+      L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
+      L"\x0443\x0439\x0442\x0435"
+    }, {
+      // Hebrew
+      "he-IL", true,
+      L"\x05e9\x05dc\x05d5\x05dd"
+    }, {
+      // Arabic
+      "ar", true,
+      L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
+      L"\x0644\x064a\x0643\x0645"
+    }, {
+      // Hindi
+      "hi-IN", true,
+      L"\x0930\x093E\x091C\x0927\x093E\x0928"
+    }, {
+      // Thai
+      "th-TH", true,
+      L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
+      L"\x0e23\x0e31\x0e1a"
+    }, {
+      // Korean
+      "ko-KR", true,
+      L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
+      L"\x1109\x1166\x110b\x116d"
+    },
+  };
+
+  for (size_t i = 0; i < arraysize(kTestCases); ++i) {
+    SCOPED_TRACE(StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
+                              kTestCases[i].language));
+
+    SpellcheckCharAttribute attributes;
+    attributes.SetDefaultLanguage(kTestCases[i].language);
+
+    string16 input(WideToUTF16(kTestText));
+    SpellcheckWordIterator iterator;
+    EXPECT_TRUE(iterator.Initialize(&attributes, input.c_str(), input.length(),
+                                    kTestCases[i].allow_contraction));
+
+    std::vector<string16> expected_words;
+    SplitString(WideToUTF16(kTestCases[i].expected_words), ' ',
+                &expected_words);
+
+    string16 actual_word;
+    int actual_start, actual_length;
+    size_t index = 0;
+    while (iterator.GetNextWord(&actual_word, &actual_start, &actual_length)) {
+      if (index < expected_words.size())
+        EXPECT_EQ(expected_words[index], actual_word);
+      ++index;
+    }
+    EXPECT_EQ(expected_words.size(), index);  // No missing or extra words.
+  }
+}
author	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-02-16 09:02:41 +0000
committer	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-02-16 09:02:41 +0000
commit	59e2c39984b9a34bbb72db1f3a02be171dc9726d (patch)
tree	5bf95d487360cb6cf59e0c01c4e89d5c9d9d82b8 /chrome
parent	2b19e2feeac2a01b2068595bd2913a194a6527e5 (diff)
download	chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.zip chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.tar.gz chromium_src-59e2c39984b9a34bbb72db1f3a02be171dc9726d.tar.bz2