// Copyright 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__
#define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__

#include <map>
#include <string>

#include "base/basictypes.h"
#include "unicode/uscript.h"

// A class which handles character attributes dependent on a spellchecker and
// its dictionary.
// This class is used by the SpellcheckWordIterator class to determine whether
// or not a character is one used by the spellchecker and its dictionary.
class SpellcheckCharAttribute { public: SpellcheckCharAttribute(); ~SpellcheckCharAttribute(); // Sets the default language of the spell checker. This controls which // characters are considered parts of words of the given language. void SetDefaultLanguage(const std::wstring& language); // Returns whether or not the given character is a character used by the // selected dictionary. // Parameters // * character [in] (UChar32) // Represents a Unicode character to be checked. // Return values // * true // The given character is a word character. // * false // The given character is not a word character. bool IsWordChar(UChar32 character) const; // Returns whether or not the given character is a character used by // contractions. // Parameters // * character [in] (UChar32) // Represents a Unicode character to be checked. // Return values // * true // The given character is a character used by contractions. // * false // The given character is not a character used by contractions. bool IsContractionChar(UChar32 character) const; private: // Initializes the mapping table. void InitializeScriptTable(); // Retrieves the ICU script code. UScriptCode GetScriptCode(UChar32 character) const; // Updates an entry in the mapping table. void SetWordScript(const int script_code, bool in_use); // Returns whether or not the given script is used by the selected // dictionary. bool IsWordScript(const UScriptCode script_code) const; private: // Represents a mapping table from a script code to a boolean value // representing whether or not the script is used by the selected dictionary. bool script_attributes_[USCRIPT_CODE_LIMIT]; // Represents a table of characters used by contractions. std::map middle_letters_; DISALLOW_EVIL_CONSTRUCTORS(SpellcheckCharAttribute); }; // A class which implements methods for finding the location of word boundaries // used by the Spellchecker class. // This class is implemented on the following assumptions: // * An input string is encoded in UTF-16 (i.e. 
it may contain surrogate // pairs), and; // * The length of a string is the number of UTF-16 characters in the string // (i.e. the length of a non-BMP character becomes two). class SpellcheckWordIterator { public: SpellcheckWordIterator(); ~SpellcheckWordIterator(); // Initializes a word-iterator object. // Parameters // * attribute [in] (const SpellcheckCharAttribute*) // Represents a set of character attributes used for filtering out // non-word characters. // * word [in] (const wchar_t*) // Represents a string from which this object extracts words. // (This string does not have to be NUL-terminated.) // * length [in] (size_t) // Represents the length of the given string, in UTF-16 characters. // This value should not include terminating NUL characters. // * allow_contraction [in] (bool) // Represents a flag to control whether or not this object should split a // possible contraction (e.g. "isn't", "in'n'out", etc.) // Return values // * true // This word-iterator object is initialized successfully. // * false // An error occured while initializing this object. void Initialize(const SpellcheckCharAttribute* attribute, const wchar_t* word, size_t length, bool allow_contraction); // Retrieves a word (or a contraction). // Parameters // * word_string [out] (std::wstring*) // Represents a word (or a contraction) to be checked its spelling. // This |word_string| has been already normalized to its canonical form // (i.e. decomposed ligatures, replaced full-width latin characters to // its ASCII alternatives, etc.) so that a SpellChecker object can check // its spelling without any additional operations. // On the other hand, a substring of the input string // std::wstring str(&word[word_start], word_length); // represents the non-normalized version of this extracted word. // * word_start [out] (int*) // Represents the offset of this word from the beginning of the input // string, in UTF-16 characters. 
// * word_length [out] (int*) // Represents the length of an extracted word before normalization, in // UTF-16 characters. // When the input string contains ligatures, this value may not be equal // to the length of the |word_string|. // Return values // * true // Found a word (or a contraction) to be checked its spelling. // * false // Not found any more words or contractions to be checked their spellings. bool GetNextWord(std::wstring* word_string, int* word_start, int* word_length); private: // Retrieves a segment consisting of word characters (and contraction // characters if the |allow_contraction| value is true). void GetSegment(int* segment_start, int* segment_end); // Discards non-word characters at the beginning and the end of the given // segment. void TrimSegment(int segment_start, int segment_end, int* word_start, int* word_length) const; // Normalizes the given segment of the |word_| variable and write its // canonical form to the |output_string|. bool Normalize(int input_start, int input_length, std::wstring* output_string) const; private: // The pointer to the input string from which we are extracting words. const wchar_t* word_; // The length of the original string. int length_; // The current position in the original string. int position_; // The flag to control whether or not this object should extract possible // contractions. bool allow_contraction_; // The character attributes used for filtering out non-word characters. const SpellcheckCharAttribute* attribute_; DISALLOW_EVIL_CONSTRUCTORS(SpellcheckWordIterator); }; #endif // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__