summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-06-16 10:12:09 +0000
committerhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-06-16 10:12:09 +0000
commit2264d904da86cb735e16c54b12c8e71584f8edff (patch)
treee2f998fae428ab9dc7ca23f316818feb99dce25b
parent5779cab05e046d7a8a83c55be87d1a3fc01d11ed (diff)
downloadchromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.zip
chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.gz
chromium_src-2264d904da86cb735e16c54b12c8e71584f8edff.tar.bz2
C++ readability for hbono.
The original CL: http://codereview.chromium.org/577020 Review URL: http://codereview.chromium.org/2449002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@49918 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.cc73
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.h159
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc1
3 files changed, 120 insertions, 113 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 2b58393..f01b104 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -1,7 +1,9 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+// Implements a custom word iterator used for our spellchecker.
+
#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
#include <map>
@@ -15,7 +17,6 @@
#include "third_party/icu/public/common/unicode/uscript.h"
#include "third_party/icu/public/i18n/unicode/ulocdata.h"
-///////////////////////////////////////////////////////////////////////////////
// SpellcheckCharAttribute implementation:
SpellcheckCharAttribute::SpellcheckCharAttribute()
@@ -35,8 +36,11 @@ string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const {
}
void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
- // The template for our custom rule sets. Even though this template is based
- // on the one of ICU 4.0, it changed the following points:
+ // The template for our custom rule sets, which is based on the word-break
+ // rules of ICU 4.0:
+ // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.
+ // The major differences from the original one are listed below:
+ // * It discards comments in the original rules.
// * It discards characters not needed by our spellchecker (e.g. numbers,
// punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
// * It allows customization of the $ALetter value (i.e. word characters).
@@ -119,9 +123,11 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
"($MidNumEx | $MidNumLetEx) $NumericEx;"
"$dictionary $dictionary;";
- // Retrieve the script code used by the given language from ICU. When the
+ // Retrieve the script codes used by the given language from ICU. When the
// given language consists of two or more scripts, we just use the first
- // script.
+ // script. The size of returned script codes is always < 8. Therefore, we use
+ // an array of size 8 so we can include all script codes without insufficient
+ // buffer errors.
UErrorCode error = U_ZERO_ERROR;
UScriptCode script_code[8];
int scripts = uscript_getCode(language.c_str(), script_code,
@@ -143,7 +149,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
aletter_plus = kWithDictionary;
- // Create two custom rule-sets: one allows contraction and the other doesn't.
+ // Create two custom rule-sets: one allows contraction and the other does not.
// We save these strings in UTF-16 so we can use it without conversions. (ICU
// needs UTF-16 strings.)
const char kAllowContraction[] =
@@ -186,28 +192,42 @@ bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const {
}
bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
- // Decompose a Hangul syllable to Hangul jamos.
- // This code is copied from Unicode Standard Annex #15:
- // <http://unicode.org/reports/tr15>.
- const int kSBase = 0xAC00;
- const int kLBase = 0x1100;
- const int kVBase = 0x1161;
- const int kTBase = 0x11A7;
- const int kLCount = 19;
- const int kVCount = 21;
- const int kTCount = 28;
+ // Decompose a Hangul character to a Hangul vowel and consonants used by our
+ // spellchecker. A Hangul character of Unicode is a ligature consisting of a
+ // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",
+ // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as
+ // a point of a cubic linear space consisting of (first consonant, vowel, last
+ // consonant). Therefore, we can compose a Hangul character from a vowel and
+ // two consonants with linear composition:
+ // character = 0xAC00 +
+ // (first consonant - 0x1100) * 28 * 21 +
+ // (vowel - 0x1161) * 28 +
+ // (last consonant - 0x11A7);
+ // We can also decompose a Hangul character with linear decomposition:
+ // first consonant = (character - 0xAC00) / 28 / 21;
+ // vowel = (character - 0xAC00) / 28 % 21;
+ // last consonant = (character - 0xAC00) % 28;
+ // This code is copied from Unicode Standard Annex #15
+ // <http://unicode.org/reports/tr15>, with some comments added.
+ const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters.
+ const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants.
+ const int kVBase = 0x1161; // U+1161: the top of Hangul vowels.
+ const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants.
+ const int kLCount = 19; // The number of Hangul first consonants.
+ const int kVCount = 21; // The number of Hangul vowels.
+ const int kTCount = 28; // The number of Hangul last consonants.
const int kNCount = kVCount * kTCount;
const int kSCount = kLCount * kNCount;
int index = c - kSBase;
if (index < 0 || index >= kSBase + kSCount) {
// This is not a Hangul syllable. Call the default output function since we
- // should output this character when it is a Hangul jamo.
+ // should output this character when it is a Hangul jamo.
return OutputDefault(c, output);
}
- // This is a Hangul syllable. Decompose this syllable into Hangul jamos and
- // output them.
+ // This is a Hangul character. Decompose this character into Hangul vowels
+ // and consonants.
int l = kLBase + index / kNCount;
int v = kVBase + (index % kNCount) / kTCount;
int t = kTBase + index % kTCount;
@@ -220,7 +240,7 @@ bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
// Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
- // to prevent our Hebrew dictionay from marking a Hebrew word including
+ // to prevent our Hebrew dictionary from marking a Hebrew word including
// niqquds as misspelled. (Same as Arabic vowel marks, we need to check
// niqquds manually and filter them out since their script codes are
// USCRIPT_HEBREW.)
@@ -239,7 +259,6 @@ bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {
return true;
}
-///////////////////////////////////////////////////////////////////////////////
// SpellcheckWordIterator implementation:
SpellcheckWordIterator::SpellcheckWordIterator()
@@ -325,8 +344,13 @@ void SpellcheckWordIterator::Close() {
bool SpellcheckWordIterator::Normalize(int input_start,
int input_length,
string16* output_string) const {
- // We use NFKC to normalize this token because NFKC can compose combined
- // characters and decompose ligatures.
+ // We use NFKC (Normalization Form, Compatible decomposition, followed by
+ // canonical Composition) defined in Unicode Standard Annex #15 to normalize
+ // this token because it is the most suitable normalization algorithm for our
+ // spellchecker. Nevertheless, it is not a perfect algorithm for our
+ // spellchecker and we need manual normalization as well. The normalized
+ // text does not have to be NUL-terminated since its characters are copied to
+ // string16, which adds a NUL character when needed.
icu::UnicodeString input(FALSE, &word_[input_start], input_length);
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString output;
@@ -341,3 +365,4 @@ bool SpellcheckWordIterator::Normalize(int input_start,
return !output_string->empty();
}
+
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chrome/renderer/spellchecker/spellcheck_worditerator.h
index aa54011..ce2a98d 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.h
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.h
@@ -1,7 +1,11 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+// Defines an iterator class that enumerates words supported by our spellchecker
+// from multi-language text. This class is used for filtering out characters
+// not supported by our spellchecker.
+
#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
@@ -14,49 +18,50 @@
#include "third_party/icu/public/common/unicode/uscript.h"
// A class which encapsulates language-specific operations used by
-// SpellcheckWordIterator.
-// When we set the spellchecker language, this class creates rule sets that
-// filter out the characters not supported by the spellchecker.
-// (Please read the comment in the SpellcheckWordIterator class about how to
-// use this class.)
+// SpellcheckWordIterator. When we set the spellchecker language, this class
+// creates rule sets that filter out the characters not supported by the
+// spellchecker. (Please read the comment in the SpellcheckWordIterator class
+// about how to use this class.)
class SpellcheckCharAttribute {
public:
SpellcheckCharAttribute();
~SpellcheckCharAttribute();
- // Sets the language of the spellchecker.
- // This function creates the custom rule-sets used by SpellcheckWordIterator.
- // Parameters
- // * language [in] (std::string)
- // The language-code string.
+ // Sets the language of the spellchecker. When this function is called with an
+ // ISO language code, this function creates the custom rule-sets used by
+ // the ICU break iterator so it can extract only words used by the language.
+ // GetRuleSet() returns the rule-sets created in this function.
void SetDefaultLanguage(const std::string& language);
- // Returns a custom rule-set string used by the ICU break iterator.
- // Parameters
- // * allow_contraction [in] (bool)
- // A flag to control whether or not this object splits a possible
- // contraction. If this value is false, it returns a rule set that
- // splits a possible contraction: "in'n'out" -> "in", "n", and "out".
+ // Returns a custom rule-set string used by the ICU break iterator. This class
+ // has two rule-sets, one splits a contraction and the other does not, so we
+ // can split a concatenated word (e.g. "seven-year-old") into words (e.g.
+ // "seven", "year", and "old") and check their spellings. The result string is
+ // encoded in UTF-16 since ICU needs UTF-16 strings.
string16 GetRuleSet(bool allow_contraction) const;
- // Output a character only if it is a word character.
+ // Outputs a character only if it is a word character. (Please read the
+ // comments in CreateRuleSets() for why we need this function.)
bool OutputChar(UChar c, string16* output) const;
private:
- // Creates the rule-set strings.
+ // Creates the rule-sets that return words possibly used by the given
+ // language. Unfortunately, these rule-sets are not perfect and have some
+ // false-positives. For example, they return combined accent marks even though
+ // we need English words only. We call OutputChar() to filter out such
+ // false-positive characters.
void CreateRuleSets(const std::string& language);
- // Language-specific output functions.
+ // Outputs a character only if it is one used by the given language. These
+ // functions are called from OutputChar().
bool OutputArabic(UChar c, string16* output) const;
bool OutputHangul(UChar c, string16* output) const;
bool OutputHebrew(UChar c, string16* output) const;
bool OutputDefault(UChar c, string16* output) const;
- private:
- // The custom rule-set strings used by ICU BreakIterator.
- // Since it is not so easy to create custom rule-sets from a spellchecker
- // language, this class saves these rule-set strings created when we set the
- // language.
+ // The custom rule-set strings used by ICU break iterator. Since it is not so
+ // easy to create custom rule-sets from an ISO language code, this class
+ // saves these rule-set strings created when we set the language.
string16 ruleset_allow_contraction_;
string16 ruleset_disallow_contraction_;
@@ -66,19 +71,20 @@ class SpellcheckCharAttribute {
DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
};
-// A class which extracts words that can be checked for spelling from a longer
-// string.
-// The ICU word-break iterator does not discard some punctuation characters
-// attached to a word. For example, when we set a word "_hello_" to a
-// word-break iterator, it just returns "_hello_".
-// On the other hand, our spellchecker expects for us to discard such
-// punctuation characters.
-// To extract only the words that our spellchecker can check, this class uses
-// custom rule-sets created by the SpellcheckCharAttribute class.
-// Also, this class normalizes extracted words so our spellchecker can check
-// the spellings of a word that includes ligatures, combined characters,
-// full-width characters, etc.
-//
+// A class which extracts words that can be checked for spelling from a
+// multi-language string. The ICU word-break iterator does not discard some
+// punctuation characters attached to a word. For example, when we set a word
+// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
+// it discard characters not used by the language. For example, it returns
+// Russian words even though we need English words only. To extract only the
+// words whose spellings our spellchecker can check, this class uses custom
+// rule-sets created by the SpellcheckCharAttribute class. Also, this class
+// normalizes extracted words so our spellchecker can check the spellings of
+// words that include ligatures, combined characters, full-width characters,
+// etc. This class uses UTF-16 strings as its input and output strings since
+// UTF-16 is the native encoding of ICU and this avoids unnecessary conversions
+// when changing the encoding of this string for our spellchecker. (Chrome can
+// use two or more spellcheckers and we cannot assume their encodings.)
// The following snippet is an example that extracts words with this class.
//
// // Creates the language-specific attributes for US English.
@@ -86,15 +92,15 @@ class SpellcheckCharAttribute {
// attribute.SetDefaultLanguage("en-US");
//
// // Set up a SpellcheckWordIterator object which extracts English words,
-// // and retrieves them.
+// // and retrieve them.
// SpellcheckWordIterator iterator;
// string16 text(UTF8ToUTF16("this is a test."));
// iterator.Initialize(&attribute, text.c_str(), text.length(), true);
//
// string16 word;
-// int start;
-// int end;
-// while (iterator.GetNextWord(&word, &start, &end)) {
+// int offset;
+// int length;
+// while (iterator.GetNextWord(&word, &offset, &length)) {
// ...
// }
//
@@ -103,52 +109,25 @@ class SpellcheckWordIterator {
SpellcheckWordIterator();
~SpellcheckWordIterator();
- // Initializes a word-iterator object.
- // Parameters
- // * attribute [in] (const SpellcheckCharAttribute*)
- // Character attributes used for filtering out non-word characters.
- // * word [in] (const char16*)
- // A string from which this object extracts words. (This string does not
- // have to be NUL-terminated.)
- // * length [in] (size_t)
- // The length of the given string, in UTF-16 characters.
- // * allow_contraction [in] (bool)
- // A flag to control whether or not this object should split a possible
- // contraction (e.g. "isn't", "in'n'out", etc.)
- // Return values
- // * true
- // This word-iterator object is initialized successfully.
- // * false
- // An error occured while initializing this object.
+ // Initializes a word-iterator object with the language-specific attribute and
+ // a multi-language text (it does not have to be NUL-terminated). If we need
+ // to split contractions and concatenated words, call this function with its
+ // 'allow_contraction' parameter false.
bool Initialize(const SpellcheckCharAttribute* attribute,
const char16* word,
size_t length,
bool allow_contraction);
- // Retrieves a word (or a contraction).
- // Parameters
- // * word_string [out] (string16*)
- // A word (or a contraction) to be checked its spelling. This
- // |word_string| has been already normalized to its canonical form (i.e.
- // decomposed ligatures, replaced full-width latin characters to its ASCII
- // alternatives, etc.) so a SpellChecker object can check its spelling
- // without any additional operations. We can use |word_start| and
- // |word_length| to retrieve the non-normalizedversion of this string as
- // shown in the following snippet.
- // string16 str(&word[word_start], word_length);
- // * word_start [out] (int*)
- // The offset of this word from the beginning of the input string,
- // in UTF-16 characters.
- // * word_length [out] (int*)
- // The length of an extracted word before normalization, in UTF-16
- // characters.
- // When the input string contains ligatures, this value may not be equal
- // to the length of the |word_string|.
- // Return values
- // * true
- // Found a word (or a contraction) to be checked its spelling.
- // * false
- // Not found any more words or contractions to be checked their spellings.
+ // Retrieves a word (or a contraction), stores its copy to 'word_string', and
+ // stores the position and length of the input word to 'word_start' and
+ // 'word_length'. Since this function normalizes the output word, the length
+ // of 'word_string' may differ from 'word_length'. Therefore, when we call
+ // functions that change the input text, such as string16::replace(), we need
+ // to use 'word_start' and 'word_length' as listed in the following snippet.
+ //
+ // while(iterator.GetNextWord(&word, &offset, &length))
+ // text.replace(offset, length, word);
+ //
bool GetNextWord(string16* word_string,
int* word_start,
int* word_length);
@@ -157,16 +136,17 @@ class SpellcheckWordIterator {
// Releases all the resources attached to this object.
void Close();
- // Normalizes a non-terminated string so our spellchecker can check its
- // spelling. A word returned from an ICU word-break iterator may include
- // characters not supported by our spellchecker, e.g. ligatures, combining
- // characters, full-width letters, etc. This function replaces such characters
- // with alternative characters supported by our spellchecker.
+ // Normalizes a non-terminated string returned from an ICU word-break
+ // iterator. A word returned from an ICU break iterator may include characters
+ // not supported by our spellchecker, e.g. ligatures, combining characters,
+ // full-width letters, etc. This function replaces such characters with
+ // alternative characters supported by our spellchecker. This function also
+ // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
+ // characters.
bool Normalize(int input_start,
int input_length,
string16* output_string) const;
- private:
// The pointer to the input string from which we are extracting words.
const char16* word_;
@@ -187,3 +167,4 @@ class SpellcheckWordIterator {
};
#endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
+
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
index 37e4f94..43af29f 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -128,3 +128,4 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
}
}
}
+