C++ readability for hbono.

The original CL: http://codereview.chromium.org/577020 Review URL: http://codereview.chromium.org/2449002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@49918 0039d316-1c4b-4281-b951-d872f2087c98
author: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-06-16 10:12:09 +0000
committer: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-06-16 10:12:09 +0000
commit: 2264d904da86cb735e16c54b12c8e71584f8edff (patch)
tree: e2f998fae428ab9dc7ca23f316818feb99dce25b
parent: 5779cab05e046d7a8a83c55be87d1a3fc01d11ed (diff)
download: chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.zip
chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.gz
chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.bz2
3 files changed, 120 insertions, 113 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 2b58393..f01b104 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -1,7 +1,9 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+// Implements a custom word iterator used for our spellchecker.
+
 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
 
 #include <map>
@@ -15,7 +17,6 @@
 #include "third_party/icu/public/common/unicode/uscript.h"
 #include "third_party/icu/public/i18n/unicode/ulocdata.h"
 
-///////////////////////////////////////////////////////////////////////////////
 // SpellcheckCharAttribute implementation:
 
 SpellcheckCharAttribute::SpellcheckCharAttribute()
@@ -35,8 +36,11 @@ string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const {
 }
 
 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
-  // The template for our custom rule sets. Even though this template is based
-  // on the one of ICU 4.0, it changed the following points:
+  // The template for our custom rule sets, which is based on the word-break
+  // rules of ICU 4.0:
+  // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.
+  // The major differences from the original one are listed below:
+  // * It discards comments in the original rules.
   // * It discards characters not needed by our spellchecker (e.g. numbers,
   //   punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
   // * It allows customization of the $ALetter value (i.e. word characters).
@@ -119,9 +123,11 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
       "($MidNumEx | $MidNumLetEx) $NumericEx;"
       "$dictionary $dictionary;";
 
-  // Retrieve the script code used by the given language from ICU. When the
+  // Retrieve the script codes used by the given language from ICU. When the
   // given language consists of two or more scripts, we just use the first
-  // script.
+  // script. The size of returned script codes is always < 8. Therefore, we use
+  // an array of size 8 so we can include all script codes without insufficient
+  // buffer errors.
   UErrorCode error = U_ZERO_ERROR;
   UScriptCode script_code[8];
   int scripts = uscript_getCode(language.c_str(), script_code,
@@ -143,7 +149,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
   if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
     aletter_plus = kWithDictionary;
 
-  // Create two custom rule-sets: one allows contraction and the other doesn't.
+  // Create two custom rule-sets: one allows contraction and the other does not.
   // We save these strings in UTF-16 so we can use it without conversions. (ICU
   // needs UTF-16 strings.)
   const char kAllowContraction[] =
@@ -186,28 +192,42 @@ bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const {
 }
 
 bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
-  // Decompose a Hangul syllable to Hangul jamos.
-  // This code is copied from Unicode Standard Annex #15:
-  // <http://unicode.org/reports/tr15>.
-  const int kSBase = 0xAC00;
-  const int kLBase = 0x1100;
-  const int kVBase = 0x1161;
-  const int kTBase = 0x11A7;
-  const int kLCount = 19;
-  const int kVCount = 21;
-  const int kTCount = 28;
+  // Decompose a Hangul character to a Hangul vowel and consonants used by our
+  // spellchecker. A Hangul character of Unicode is a ligature consisting of a
+  // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",
+  // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as
+  // a point of a cubic linear space consisting of (first consonant, vowel, last
+  // consonant). Therefore, we can compose a Hangul character from a vowel and
+  // two consonants with linear composition:
+  //   character =  0xAC00 +
+  //                (first consonant - 0x1100) * 28 * 21 +
+  //                (vowel           - 0x1161) * 28 +
+  //                (last consonant  - 0x11A7);
+  // We can also decompose a Hangul character with linear decomposition:
+  //   first consonant = (character - 0xAC00) / 28 / 21;
+  //   vowel           = (character - 0xAC00) / 28 % 21;
+  //   last consonant  = (character - 0xAC00) % 28;
+  // This code is copied from Unicode Standard Annex #15
+  // <http://unicode.org/reports/tr15> with some comments added.
+  const int kSBase = 0xAC00;  // U+AC00: the top of Hangul characters.
+  const int kLBase = 0x1100;  // U+1100: the top of Hangul first consonants.
+  const int kVBase = 0x1161;  // U+1161: the top of Hangul vowels.
+  const int kTBase = 0x11A7;  // U+11A7: the top of Hangul last consonants.
+  const int kLCount = 19;     // The number of Hangul first consonants.
+  const int kVCount = 21;     // The number of Hangul vowels.
+  const int kTCount = 28;     // The number of Hangul last consonants.
   const int kNCount = kVCount * kTCount;
   const int kSCount = kLCount * kNCount;
 
   int index = c - kSBase;
   if (index < 0 || index >= kSBase + kSCount) {
     // This is not a Hangul syllable. Call the default output function since we
-    // should output this character when it is a Hangul jamo.
+    // should output this character when it is a Hangul vowel or consonant.
     return OutputDefault(c, output);
   }
 
-  // This is a Hangul syllable. Decompose this syllable into Hangul jamos and
-  // output them.
+  // This is a Hangul character. Decompose this character into Hangul vowels
+  // and consonants.
   int l = kLBase + index / kNCount;
   int v = kVBase + (index % kNCount) / kTCount;
   int t = kTBase + index % kTCount;
@@ -220,7 +240,7 @@ bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
 
 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
   // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
-  // to prevent our Hebrew dictionay from marking a Hebrew word including
+  // to prevent our Hebrew dictionary from marking a Hebrew word including
   // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
   // niqquds manually and filter them out since their script codes are
   // USCRIPT_HEBREW.)
@@ -239,7 +259,6 @@ bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {
   return true;
 }
 
-///////////////////////////////////////////////////////////////////////////////
 // SpellcheckWordIterator implementation:
 
 SpellcheckWordIterator::SpellcheckWordIterator()
@@ -325,8 +344,13 @@ void SpellcheckWordIterator::Close() {
 bool SpellcheckWordIterator::Normalize(int input_start,
                                        int input_length,
                                        string16* output_string) const {
-  // We use NFKC to normalize this token because NFKC can compose combined
-  // characters and decompose ligatures.
+  // We use NFKC (Normalization Form, Compatible decomposition, followed by
+  // canonical Composition) defined in Unicode Standard Annex #15 to normalize
+  // this token because it is the most suitable normalization algorithm for our
+  // spellchecker. Nevertheless, it is not a perfect algorithm for our
+  // spellchecker and we need manual normalization as well. The normalized
+  // text does not have to be NUL-terminated since its characters are copied to
+  // string16, which adds a NUL character when needed.
   icu::UnicodeString input(FALSE, &word_[input_start], input_length);
   UErrorCode status = U_ZERO_ERROR;
   icu::UnicodeString output;
@@ -341,3 +365,4 @@ bool SpellcheckWordIterator::Normalize(int input_start,
 
   return !output_string->empty();
 }
+
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h
index aa54011..ce2a98d 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.h
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h
@@ -1,7 +1,11 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+// Defines an iterator class that enumerates words supported by our spellchecker
+// from multi-language text. This class is used for filtering out characters
+// not supported by our spellchecker.
+
 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
 
@@ -14,49 +18,50 @@
 #include "third_party/icu/public/common/unicode/uscript.h"
 
 // A class which encapsulates language-specific operations used by
-// SpellcheckWordIterator.
-// When we set the spellchecker language, this class creates rule sets that
-// filter out the characters not supported by the spellchecker.
-// (Please read the comment in the SpellcheckWordIterator class about how to
-// use this class.)
+// SpellcheckWordIterator. When we set the spellchecker language, this class
+// creates rule sets that filter out the characters not supported by the
+// spellchecker. (Please read the comment in the SpellcheckWordIterator class
+// about how to use this class.)
 class SpellcheckCharAttribute {
  public:
   SpellcheckCharAttribute();
   ~SpellcheckCharAttribute();
 
-  // Sets the language of the spellchecker.
-  // This function creates the custom rule-sets used by SpellcheckWordIterator.
-  // Parameters
-  //   * language [in] (std::string)
-  //     The language-code string.
+  // Sets the language of the spellchecker. When this function is called with an
+  // ISO language code, this function creates the custom rule-sets used by
+  // the ICU break iterator so it can extract only words used by the language.
+  // GetRuleSet() returns the rule-sets created in this function.
   void SetDefaultLanguage(const std::string& language);
 
-  // Returns a custom rule-set string used by the ICU break iterator.
-  // Parameters
-  //   * allow_contraction [in] (bool)
-  //     A flag to control whether or not this object splits a possible
-  //     contraction. If this value is false, it returns a rule set that
-  //    splits a possible contraction: "in'n'out" -> "in", "n", and "out".
+  // Returns a custom rule-set string used by the ICU break iterator. This class
+  // has two rule-sets, one splits a contraction and the other does not, so we
+  // can split a concatenated word (e.g. "seven-year-old") into words (e.g.
+  // "seven", "year", and "old") and check their spellings. The result string is
+  // encoded in UTF-16 since ICU needs UTF-16 strings.
   string16 GetRuleSet(bool allow_contraction) const;
 
-  // Output a character only if it is a word character.
+  // Outputs a character only if it is a word character. (Please read the
+  // comments in CreateRuleSets() to see why we need this function.)
   bool OutputChar(UChar c, string16* output) const;
 
  private:
-  // Creates the rule-set strings.
+  // Creates the rule-sets that return words possibly used by the given
+  // language. Unfortunately, these rule-sets are not perfect and have some
+  // false-positives. For example, they return combined accent marks even though
+  // we need English words only. We call OutputChar() to filter out such
+  // false-positive characters.
   void CreateRuleSets(const std::string& language);
 
-  // Language-specific output functions.
+  // Outputs a character only if it is one used by the given language. These
+  // functions are called from OutputChar().
   bool OutputArabic(UChar c, string16* output) const;
   bool OutputHangul(UChar c, string16* output) const;
   bool OutputHebrew(UChar c, string16* output) const;
   bool OutputDefault(UChar c, string16* output) const;
 
- private:
-  // The custom rule-set strings used by ICU BreakIterator.
-  // Since it is not so easy to create custom rule-sets from a spellchecker
-  // language, this class saves these rule-set strings created when we set the
-  // language.
+  // The custom rule-set strings used by ICU break iterator. Since it is not so
+  // easy to create custom rule-sets from an ISO language code, this class
+  // saves these rule-set strings created when we set the language.
   string16 ruleset_allow_contraction_;
   string16 ruleset_disallow_contraction_;
 
@@ -66,19 +71,20 @@ class SpellcheckCharAttribute {
   DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
 };
 
-// A class which extracts words that can be checked for spelling from a longer
-// string.
-// The ICU word-break iterator does not discard some punctuation characters
-// attached to a word. For example, when we set a word "_hello_" to a
-// word-break iterator, it just returns "_hello_".
-// On the other hand, our spellchecker expects for us to discard such
-// punctuation characters.
-// To extract only the words that our spellchecker can check, this class uses
-// custom rule-sets created by the SpellcheckCharAttribute class.
-// Also, this class normalizes extracted words so our spellchecker can check
-// the spellings of a word that includes ligatures, combined characters,
-// full-width characters, etc.
-//
+// A class which extracts words that can be checked for spelling from a
+// multi-language string. The ICU word-break iterator does not discard some
+// punctuation characters attached to a word. For example, when we set a word
+// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
+// it discard characters not used by the language. For example, it returns
+// Russian words even though we need English words only. To extract only the
+// words whose spellings our spellchecker can check, this class uses custom
+// rule-sets created by the SpellcheckCharAttribute class. Also, this class
+// normalizes extracted words so our spellchecker can check the spellings of
+// words that include ligatures, combined characters, full-width characters,
+// etc. This class uses UTF-16 strings as its input and output strings since
+// UTF-16 is the native encoding of ICU and this avoids unnecessary conversions
+// when changing the encoding of this string for our spellchecker. (Chrome can
+// use two or more spellcheckers and we cannot assume their encodings.)
 // The following snippet is an example that extracts words with this class.
 //
 //   // Creates the language-specific attributes for US English.
@@ -86,15 +92,15 @@ class SpellcheckCharAttribute {
 //   attribute.SetDefaultLanguage("en-US");
 //
 //   // Set up a SpellcheckWordIterator object which extracts English words,
-//   // and retrieves them.
+//   // and retrieve them.
 //   SpellcheckWordIterator iterator;
 //   string16 text(UTF8ToUTF16("this is a test."));
 //   iterator.Initialize(&attribute, text.c_str(), text_.length(), true);
 //
 //   string16 word;
-//   int start;
-//   int end;
-//   while (iterator.GetNextWord(&word, &start, &end)) {
+//   int offset;
+//   int length;
+//   while (iterator.GetNextWord(&word, &offset, &length)) {
 //     ...
 //   }
 //
@@ -103,52 +109,25 @@ class SpellcheckWordIterator {
   SpellcheckWordIterator();
   ~SpellcheckWordIterator();
 
-  // Initializes a word-iterator object.
-  // Parameters
-  //   * attribute [in] (const SpellcheckCharAttribute*)
-  //     Character attributes used for filtering out non-word characters.
-  //   * word [in] (const char16*)
-  //     A string from which this object extracts words. (This string does not
-  //     have to be NUL-terminated.)
-  //   * length [in] (size_t)
-  //     The length of the given string, in UTF-16 characters.
-  //   * allow_contraction [in] (bool)
-  //     A flag to control whether or not this object should split a possible
-  //     contraction (e.g. "isn't", "in'n'out", etc.)
-  // Return values
-  //   * true
-  //     This word-iterator object is initialized successfully.
-  //   * false
-  //     An error occured while initializing this object.
+  // Initializes a word-iterator object with the language-specific attribute and
+  // a multi-language text (it does not have to be NUL-terminated). If we need
+  // to split contractions and concatenated words, call this function with its
+  // 'allow_contraction' parameter false.
   bool Initialize(const SpellcheckCharAttribute* attribute,
                   const char16* word,
                   size_t length,
                   bool allow_contraction);
 
-  // Retrieves a word (or a contraction).
-  // Parameters
-  //   * word_string [out] (string16*)
-  //     A word (or a contraction) to be checked its spelling. This
-  //     |word_string| has been already normalized to its canonical form (i.e.
-  //     decomposed ligatures, replaced full-width latin characters to its ASCII
-  //     alternatives, etc.) so a SpellChecker object can check its spelling
-  //     without any additional operations. We can use |word_start| and
-  //     |word_length| to retrieve the non-normalizedversion of this string as
-  //     shown in the following snippet.
-  //       string16 str(&word[word_start], word_length);
-  //   * word_start [out] (int*)
-  //     The offset of this word from the beginning of the input string,
-  //     in UTF-16 characters.
-  //   * word_length [out] (int*)
-  //     The length of an extracted word before normalization, in UTF-16
-  //     characters.
-  //     When the input string contains ligatures, this value may not be equal
-  //     to the length of the |word_string|.
-  // Return values
-  //   * true
-  //     Found a word (or a contraction) to be checked its spelling.
-  //   * false
-  //     Not found any more words or contractions to be checked their spellings.
+  // Retrieves a word (or a contraction), stores its copy to 'word_string', and
+  // stores the position and the length of the input word to 'word_start' and
+  // 'word_length'. Since this function normalizes the output word, the length
+  // of 'word_string' may differ from 'word_length'. Therefore, when we call
+  // functions that change the input text, such as string16::replace(), we need
+  // to use 'word_start' and 'word_length' as listed in the following snippet.
+  //
+  //   while(iterator.GetNextWord(&word, &offset, &length))
+  //     text.replace(offset, length, word);
+  //
   bool GetNextWord(string16* word_string,
                    int* word_start,
                    int* word_length);
@@ -157,16 +136,17 @@ class SpellcheckWordIterator {
   // Releases all the resources attached to this object.
   void Close();
 
-  // Normalizes a non-terminated string so our spellchecker can check its
-  // spelling. A word returned from an ICU word-break iterator may include
-  // characters not supported by our spellchecker, e.g. ligatures, combining
-  // characters, full-width letters, etc. This function replaces such characters
-  // with alternative characters supported by our spellchecker.
+  // Normalizes a non-terminated string returned from an ICU word-break
+  // iterator. A word returned from an ICU break iterator may include characters
+  // not supported by our spellchecker, e.g. ligatures, combining characters,
+  // full-width letters, etc. This function replaces such characters with
+  // alternative characters supported by our spellchecker. This function also
+  // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
+  // characters.
   bool Normalize(int input_start,
                  int input_length,
                  string16* output_string) const;
 
- private:
   // The pointer to the input string from which we are extracting words.
   const char16* word_;
 
@@ -187,3 +167,4 @@ class SpellcheckWordIterator {
 };
 
 #endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
+
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
index 37e4f94..43af29f 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -128,3 +128,4 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
     }
   }
 }
+
author	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-06-16 10:12:09 +0000
committer	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-06-16 10:12:09 +0000
commit	2264d904da86cb735e16c54b12c8e71584f8edff (patch)
tree	e2f998fae428ab9dc7ca23f316818feb99dce25b
parent	5779cab05e046d7a8a83c55be87d1a3fc01d11ed (diff)
download	chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.zip chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.gz chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.bz2