diff options
author | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-16 10:12:09 +0000 |
---|---|---|
committer | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-16 10:12:09 +0000 |
commit | 2264d904da86cb735e16c54b12c8e71584f8edff (patch) | |
tree | e2f998fae428ab9dc7ca23f316818feb99dce25b | |
parent | 5779cab05e046d7a8a83c55be87d1a3fc01d11ed (diff) | |
download | chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.zip chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.gz chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.bz2 |
C++ readability for hbono.
The original CL: http://codereview.chromium.org/577020
Review URL: http://codereview.chromium.org/2449002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@49918 0039d316-1c4b-4281-b951-d872f2087c98
3 files changed, 120 insertions, 113 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc index 2b58393..f01b104 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc @@ -1,7 +1,9 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +// Implements a custom word iterator used for our spellchecker. + #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" #include <map> @@ -15,7 +17,6 @@ #include "third_party/icu/public/common/unicode/uscript.h" #include "third_party/icu/public/i18n/unicode/ulocdata.h" -/////////////////////////////////////////////////////////////////////////////// // SpellcheckCharAttribute implementation: SpellcheckCharAttribute::SpellcheckCharAttribute() @@ -35,8 +36,11 @@ string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const { } void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { - // The template for our custom rule sets. Even though this template is based - // on the one of ICU 4.0, it changed the following points: + // The template for our custom rule sets, which is based on the word-break + // rules of ICU 4.0: + // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>. + // The major differences from the original one are listed below: + // * It discards comments in the original rules. // * It discards characters not needed by our spellchecker (e.g. numbers, // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on). // * It allows customization of the $ALetter value (i.e. word characters). 
@@ -119,9 +123,11 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { "($MidNumEx | $MidNumLetEx) $NumericEx;" "$dictionary $dictionary;"; - // Retrieve the script code used by the given language from ICU. When the + // Retrieve the script codes used by the given language from ICU. When the // given language consists of two or more scripts, we just use the first - // script. + // script. The size of returned script codes is always < 8. Therefore, we use + // an array of size 8 so we can include all script codes without insufficient + // buffer errors. UErrorCode error = U_ZERO_ERROR; UScriptCode script_code[8]; int scripts = uscript_getCode(language.c_str(), script_code, @@ -143,7 +149,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) aletter_plus = kWithDictionary; - // Create two custom rule-sets: one allows contraction and the other doesn't. + // Create two custom rule-sets: one allows contraction and the other does not. // We save these strings in UTF-16 so we can use it without conversions. (ICU // needs UTF-16 strings.) const char kAllowContraction[] = @@ -186,28 +192,42 @@ bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const { } bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const { - // Decompose a Hangul syllable to Hangul jamos. - // This code is copied from Unicode Standard Annex #15: - // <http://unicode.org/reports/tr15>. - const int kSBase = 0xAC00; - const int kLBase = 0x1100; - const int kVBase = 0x1161; - const int kTBase = 0x11A7; - const int kLCount = 19; - const int kVCount = 21; - const int kTCount = 28; + // Decompose a Hangul character to a Hangul vowel and consonants used by our + // spellchecker. A Hangul character of Unicode is a ligature consisting of a + // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G", + // U+1161 "a", and U+11A8 "g". 
That is, we can treat each Hangul character as + // a point of a cubic linear space consisting of (first consonant, vowel, last + // consonant). Therefore, we can compose a Hangul character from a vowel and + // two consonants with linear composition: + // character = 0xAC00 + + // (first consonant - 0x1100) * 28 * 21 + + // (vowel - 0x1161) * 28 + + // (last consonant - 0x11A7); + // We can also decompose a Hangul character with linear decomposition: + // first consonant = (character - 0xAC00) / 28 / 21; + // vowel = (character - 0xAC00) / 28 % 21; + // last consonant = (character - 0xAC00) % 28; + // This code is copied from Unicode Standard Annex #15 + // <http://unicode.org/reports/tr15> with some comments added. + const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters. + const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants. + const int kVBase = 0x1161; // U+1161: the top of Hangul vowels. + const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants. + const int kLCount = 19; // The number of Hangul first consonants. + const int kVCount = 21; // The number of Hangul vowels. + const int kTCount = 28; // The number of Hangul last consonants. const int kNCount = kVCount * kTCount; const int kSCount = kLCount * kNCount; int index = c - kSBase; if (index < 0 || index >= kSBase + kSCount) { // This is not a Hangul syllable. Call the default output function since we - // should output this character when it is a Hangul jamo. + // should output this character when it is a Hangul jamo. return OutputDefault(c, output); } - // This is a Hangul syllable. Decompose this syllable into Hangul jamos and - // output them. + // This is a Hangul character. Decompose this character into Hangul vowels + // and consonants. 
int l = kLBase + index / kNCount; int v = kVBase + (index % kNCount) / kTCount; int t = kTBase + index % kTCount; @@ -220,7 +240,7 @@ bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const { bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds - // to prevent our Hebrew dictionay from marking a Hebrew word including + // to prevent our Hebrew dictionary from marking a Hebrew word including // niqquds as misspelled. (Same as Arabic vowel marks, we need to check // niqquds manually and filter them out since their script codes are // USCRIPT_HEBREW.) @@ -239,7 +259,6 @@ bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { return true; } -/////////////////////////////////////////////////////////////////////////////// // SpellcheckWordIterator implementation: SpellcheckWordIterator::SpellcheckWordIterator() @@ -325,8 +344,13 @@ void SpellcheckWordIterator::Close() { bool SpellcheckWordIterator::Normalize(int input_start, int input_length, string16* output_string) const { - // We use NFKC to normalize this token because NFKC can compose combined - // characters and decompose ligatures. + // We use NFKC (Normalization Form, Compatible decomposition, followed by + // canonical Composition) defined in Unicode Standard Annex #15 to normalize + // this token because it is the most suitable normalization algorithm for our + // spellchecker. Nevertheless, it is not a perfect algorithm for our + // spellchecker and we need manual normalization as well. The normalized + // text does not have to be NUL-terminated since its characters are copied to + // string16, which adds a NUL character when needed. 
icu::UnicodeString input(FALSE, &word_[input_start], input_length); UErrorCode status = U_ZERO_ERROR; icu::UnicodeString output; @@ -341,3 +365,4 @@ bool SpellcheckWordIterator::Normalize(int input_start, return !output_string->empty(); } + diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h index aa54011..ce2a98d 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator.h +++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h @@ -1,7 +1,11 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +// Defines an iterator class that enumerates words supported by our spellchecker +// from multi-language text. This class is used for filtering out characters +// not supported by our spellchecker. + #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ @@ -14,49 +18,50 @@ #include "third_party/icu/public/common/unicode/uscript.h" // A class which encapsulates language-specific operations used by -// SpellcheckWordIterator. -// When we set the spellchecker language, this class creates rule sets that -// filter out the characters not supported by the spellchecker. -// (Please read the comment in the SpellcheckWordIterator class about how to -// use this class.) +// SpellcheckWordIterator. When we set the spellchecker language, this class +// creates rule sets that filter out the characters not supported by the +// spellchecker. (Please read the comment in the SpellcheckWordIterator class +// about how to use this class.) class SpellcheckCharAttribute { public: SpellcheckCharAttribute(); ~SpellcheckCharAttribute(); - // Sets the language of the spellchecker. - // This function creates the custom rule-sets used by SpellcheckWordIterator. 
- // Parameters - // * language [in] (std::string) - // The language-code string. + // Sets the language of the spellchecker. When this function is called with an + // ISO language code, this function creates the custom rule-sets used by + // the ICU break iterator so it can extract only words used by the language. + // GetRuleSet() returns the rule-sets created in this function. void SetDefaultLanguage(const std::string& language); - // Returns a custom rule-set string used by the ICU break iterator. - // Parameters - // * allow_contraction [in] (bool) - // A flag to control whether or not this object splits a possible - // contraction. If this value is false, it returns a rule set that - // splits a possible contraction: "in'n'out" -> "in", "n", and "out". + // Returns a custom rule-set string used by the ICU break iterator. This class + // has two rule-sets, one splits a contraction and the other does not, so we + // can split a concatenated word (e.g. "seven-year-old") into words (e.g. + // "seven", "year", and "old") and check their spellings. The result string is + // encoded in UTF-16 since ICU needs UTF-16 strings. string16 GetRuleSet(bool allow_contraction) const; - // Output a character only if it is a word character. + // Outputs a character only if it is a word character. (Please read the + // comments in CreateRuleSets() for why we need this function.) bool OutputChar(UChar c, string16* output) const; private: - // Creates the rule-set strings. + // Creates the rule-sets that return words possibly used by the given + // language. Unfortunately, these rule-sets are not perfect and have some + // false-positives. For example, they return combined accent marks even though + // we need English words only. We call OutputChar() to filter out such + // false-positive characters. void CreateRuleSets(const std::string& language); - // Language-specific output functions. + // Outputs a character only if it is one used by the given language. 
These + // functions are called from OutputChar(). bool OutputArabic(UChar c, string16* output) const; bool OutputHangul(UChar c, string16* output) const; bool OutputHebrew(UChar c, string16* output) const; bool OutputDefault(UChar c, string16* output) const; - private: - // The custom rule-set strings used by ICU BreakIterator. - // Since it is not so easy to create custom rule-sets from a spellchecker - // language, this class saves these rule-set strings created when we set the - // language. + // The custom rule-set strings used by ICU break iterator. Since it is not so + // easy to create custom rule-sets from an ISO language code, this class + // saves these rule-set strings created when we set the language. string16 ruleset_allow_contraction_; string16 ruleset_disallow_contraction_; @@ -66,19 +71,20 @@ class SpellcheckCharAttribute { DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); }; -// A class which extracts words that can be checked for spelling from a longer -// string. -// The ICU word-break iterator does not discard some punctuation characters -// attached to a word. For example, when we set a word "_hello_" to a -// word-break iterator, it just returns "_hello_". -// On the other hand, our spellchecker expects for us to discard such -// punctuation characters. -// To extract only the words that our spellchecker can check, this class uses -// custom rule-sets created by the SpellcheckCharAttribute class. -// Also, this class normalizes extracted words so our spellchecker can check -// the spellings of a word that includes ligatures, combined characters, -// full-width characters, etc. -// +// A class which extracts words that can be checked for spelling from a +// multi-language string. The ICU word-break iterator does not discard some +// punctuation characters attached to a word. For example, when we set a word +// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does +// it discard characters not used by the language. 
For example, it returns +// Russian words even though we need English words only. To extract only the +// words whose spellings our spellchecker can check, this class uses custom +// rule-sets created by the SpellcheckCharAttribute class. Also, this class +// normalizes extracted words so our spellchecker can check the spellings of +// words that include ligatures, combined characters, full-width characters, +// etc. This class uses UTF-16 strings as its input and output strings since +// UTF-16 is the native encoding of ICU and using it avoids unnecessary conversions +// when changing the encoding of this string for our spellchecker. (Chrome can +// use two or more spellcheckers and we cannot assume their encodings.) // The following snippet is an example that extracts words with this class. // // // Creates the language-specific attributes for US English. @@ -86,15 +92,15 @@ class SpellcheckCharAttribute { // attribute.SetDefaultLanguage("en-US"); // // // Set up a SpellcheckWordIterator object which extracts English words, -// // and retrieves them. +// // and retrieve them. // SpellcheckWordIterator iterator; // string16 text(UTF8ToUTF16("this is a test.")); // iterator.Initialize(&attribute, text.c_str(), text.length(), true); // // string16 word; -// int start; -// int end; -// while (iterator.GetNextWord(&word, &start, &end)) { +// int offset; +// int length; +// while (iterator.GetNextWord(&word, &offset, &length)) { // ... // } // @@ -103,52 +109,25 @@ class SpellcheckWordIterator { SpellcheckWordIterator(); ~SpellcheckWordIterator(); - // Initializes a word-iterator object. - // Parameters - // * attribute [in] (const SpellcheckCharAttribute*) - // Character attributes used for filtering out non-word characters. - // * word [in] (const char16*) - // A string from which this object extracts words. (This string does not - // have to be NUL-terminated.) - // * length [in] (size_t) - // The length of the given string, in UTF-16 characters. 
- // * allow_contraction [in] (bool) - // A flag to control whether or not this object should split a possible - // contraction (e.g. "isn't", "in'n'out", etc.) - // Return values - // * true - // This word-iterator object is initialized successfully. - // * false - // An error occured while initializing this object. + // Initializes a word-iterator object with the language-specific attribute and + // a multi-language text (it does not have to be NUL-terminated). If we need + // to split contractions and concatenated words, call this function with its + // 'allow_contraction' parameter set to false. bool Initialize(const SpellcheckCharAttribute* attribute, const char16* word, size_t length, bool allow_contraction); - // Retrieves a word (or a contraction). - // Parameters - // * word_string [out] (string16*) - // A word (or a contraction) to be checked its spelling. This - // |word_string| has been already normalized to its canonical form (i.e. - // decomposed ligatures, replaced full-width latin characters to its ASCII - // alternatives, etc.) so a SpellChecker object can check its spelling - // without any additional operations. We can use |word_start| and - // |word_length| to retrieve the non-normalizedversion of this string as - // shown in the following snippet. - // string16 str(&word[word_start], word_length); - // * word_start [out] (int*) - // The offset of this word from the beginning of the input string, - // in UTF-16 characters. - // * word_length [out] (int*) - // The length of an extracted word before normalization, in UTF-16 - // characters. - // When the input string contains ligatures, this value may not be equal - // to the length of the |word_string|. - // Return values - // * true - // Found a word (or a contraction) to be checked its spelling. - // * false - // Not found any more words or contractions to be checked their spellings. 
+ // Retrieves a word (or a contraction), stores its copy to 'word_string', and + // stores the position and the length of the input word to 'word_start' and + // 'word_length'. Since this function normalizes the output word, the length of 'word_string' may + // be different from the 'word_length'. Therefore, when we call functions that + // change the input text, such as string16::replace(), we need to use + // 'word_start' and 'word_length' as listed in the following snippet. + // + // while(iterator.GetNextWord(&word, &offset, &length)) + // text.replace(offset, length, word); + // bool GetNextWord(string16* word_string, int* word_start, int* word_length); @@ -157,16 +136,17 @@ class SpellcheckWordIterator { // Releases all the resources attached to this object. void Close(); - // Normalizes a non-terminated string so our spellchecker can check its - // spelling. A word returned from an ICU word-break iterator may include - // characters not supported by our spellchecker, e.g. ligatures, combining - // characters, full-width letters, etc. This function replaces such characters - // with alternative characters supported by our spellchecker. + // Normalizes a non-terminated string returned from an ICU word-break + // iterator. A word returned from an ICU break iterator may include characters + // not supported by our spellchecker, e.g. ligatures, combining characters, + // full-width letters, etc. This function replaces such characters with + // alternative characters supported by our spellchecker. This function also + // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive + // characters. bool Normalize(int input_start, int input_length, string16* output_string) const; - private: // The pointer to the input string from which we are extracting words. 
const char16* word_; @@ -187,3 +167,4 @@ class SpellcheckWordIterator { }; #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ + diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc index 37e4f94..43af29f 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc @@ -128,3 +128,4 @@ TEST(SpellcheckWordIteratorTest, SplitWord) { } } } + |