// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ #include #include #include "base/basictypes.h" #include "base/string16.h" #include "third_party/icu/public/common/unicode/ubrk.h" #include "third_party/icu/public/common/unicode/uscript.h" // A class which encapsulates language-specific operations used by // SpellcheckWordIterator. // When we set the spellchecker language, this class creates rule sets that // filter out the characters not supported by the spellchecker. // (Please read the comment in the SpellcheckWordIterator class about how to // use this class.) class SpellcheckCharAttribute { public: SpellcheckCharAttribute(); ~SpellcheckCharAttribute(); // Sets the language of the spellchecker. // This function creates the custom rule-sets used by SpellcheckWordIterator. // Parameters // * language [in] (std::string) // The language-code string. void SetDefaultLanguage(const std::string& language); // Returns a custom rule-set string used by the ICU break iterator. // Parameters // * allow_contraction [in] (bool) // A flag to control whether or not this object splits a possible // contraction. If this value is false, it returns a rule set that // splits a possible contraction: "in'n'out" -> "in", "n", and "out". string16 GetRuleSet(bool allow_contraction) const; // Output a character only if it is a word character. bool OutputChar(UChar c, string16* output) const; private: // Creates the rule-set strings. void CreateRuleSets(const std::string& language); // Language-specific output functions. bool OutputArabic(UChar c, string16* output) const; bool OutputHangul(UChar c, string16* output) const; bool OutputHebrew(UChar c, string16* output) const; bool OutputDefault(UChar c, string16* output) const; private: // The custom rule-set strings used by ICU BreakIterator. // Since it is not so easy to create custom rule-sets from a spellchecker // language, this class saves these rule-set strings created when we set the // language. string16 ruleset_allow_contraction_; string16 ruleset_disallow_contraction_; // The script code used by this language. UScriptCode script_code_; DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); }; // A class which extracts words that can be checked for spelling from a longer // string. // The ICU word-break iterator does not discard some punctuation characters // attached to a word. For example, when we set a word "_hello_" to a // word-break iterator, it just returns "_hello_". // On the other hand, our spellchecker expects for us to discard such // punctuation characters. // To extract only the words that our spellchecker can check, this class uses // custom rule-sets created by the SpellcheckCharAttribute class. // Also, this class normalizes extracted words so our spellchecker can check // the spellings of a word that includes ligatures, combined characters, // full-width characters, etc. // // The following snippet is an example that extracts words with this class. // // // Creates the language-specific attributes for US English. // SpellcheckCharAttribute attribute; // attribute.SetDefaultLanguage("en-US"); // // // Set up a SpellcheckWordIterator object which extracts English words, // // and retrieves them. // SpellcheckWordIterator iterator; // string16 text(UTF8ToUTF16("this is a test.")); // iterator.Initialize(&attribute, text.c_str(), text_.length(), true); // // string16 word; // int start; // int end; // while (iterator.GetNextWord(&word, &start, &end)) { // ... // } // class SpellcheckWordIterator { public: SpellcheckWordIterator(); ~SpellcheckWordIterator(); // Initializes a word-iterator object. // Parameters // * attribute [in] (const SpellcheckCharAttribute*) // Character attributes used for filtering out non-word characters. // * word [in] (const char16*) // A string from which this object extracts words. (This string does not // have to be NUL-terminated.) // * length [in] (size_t) // The length of the given string, in UTF-16 characters. // * allow_contraction [in] (bool) // A flag to control whether or not this object should split a possible // contraction (e.g. "isn't", "in'n'out", etc.) // Return values // * true // This word-iterator object is initialized successfully. // * false // An error occured while initializing this object. bool Initialize(const SpellcheckCharAttribute* attribute, const char16* word, size_t length, bool allow_contraction); // Retrieves a word (or a contraction). // Parameters // * word_string [out] (string16*) // A word (or a contraction) to be checked its spelling. This // |word_string| has been already normalized to its canonical form (i.e. // decomposed ligatures, replaced full-width latin characters to its ASCII // alternatives, etc.) so a SpellChecker object can check its spelling // without any additional operations. We can use |word_start| and // |word_length| to retrieve the non-normalizedversion of this string as // shown in the following snippet. // string16 str(&word[word_start], word_length); // * word_start [out] (int*) // The offset of this word from the beginning of the input string, // in UTF-16 characters. // * word_length [out] (int*) // The length of an extracted word before normalization, in UTF-16 // characters. // When the input string contains ligatures, this value may not be equal // to the length of the |word_string|. // Return values // * true // Found a word (or a contraction) to be checked its spelling. // * false // Not found any more words or contractions to be checked their spellings. bool GetNextWord(string16* word_string, int* word_start, int* word_length); private: // Releases all the resources attached to this object. void Close(); // Normalizes a non-terminated string so our spellchecker can check its // spelling. A word returned from an ICU word-break iterator may include // characters not supported by our spellchecker, e.g. ligatures, combining // characters, full-width letters, etc. This function replaces such characters // with alternative characters supported by our spellchecker. bool Normalize(int input_start, int input_length, string16* output_string) const; private: // The pointer to the input string from which we are extracting words. const char16* word_; // The length of the original string. int length_; // The current position in the original string. int position_; // The language-specific attributes used for filtering out non-word // characters. const SpellcheckCharAttribute* attribute_; // The ICU break iterator. UBreakIterator* iterator_; DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); }; #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_