diff options
author | initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-07-26 23:55:29 +0000 |
---|---|---|
committer | initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-07-26 23:55:29 +0000 |
commit | 09911bf300f1a419907a9412154760efd0b7abc3 (patch) | |
tree | f131325fb4e2ad12c6d3504ab75b16dd92facfed /chrome/browser/spellcheck_worditerator.h | |
parent | 586acc5fe142f498261f52c66862fa417c3d52d2 (diff) | |
download | chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.zip chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.gz chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.bz2 |
Add chrome to the repository.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@15 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/spellcheck_worditerator.h')
-rw-r--r-- | chrome/browser/spellcheck_worditerator.h | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/chrome/browser/spellcheck_worditerator.h b/chrome/browser/spellcheck_worditerator.h new file mode 100644 index 0000000..02792ddc --- /dev/null +++ b/chrome/browser/spellcheck_worditerator.h @@ -0,0 +1,207 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__ +#define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__ + +#include <map> +#include <string> + +#include "base/basictypes.h" + +#include "unicode/uscript.h" + +// A class which handles character attributes dependent on a spellchecker and +// its dictionary. +// This class is used by the SpellcheckWordIterator class to determine whether +// or not a character is one used by the spellchecker and its dictionary. +class SpellcheckCharAttribute { + public: + SpellcheckCharAttribute(); + + ~SpellcheckCharAttribute(); + + // Sets the default language of the spell checker. This controls which + // characters are considered parts of words of the given language. + void SetDefaultLanguage(const std::wstring& language); + + // Returns whether or not the given character is a character used by the + // selected dictionary. + // Parameters + // * character [in] (UChar32) + // Represents a Unicode character to be checked. + // Return values + // * true + // The given character is a word character. + // * false + // The given character is not a word character. + bool IsWordChar(UChar32 character) const; + + // Returns whether or not the given character is a character used by + // contractions. + // Parameters + // * character [in] (UChar32) + // Represents a Unicode character to be checked. + // Return values + // * true + // The given character is a character used by contractions. + // * false + // The given character is not a character used by contractions. + bool IsContractionChar(UChar32 character) const; + + private: + // Initializes the mapping table. + void InitializeScriptTable(); + + // Retrieves the ICU script code of the given character. + UScriptCode GetScriptCode(UChar32 character) const; + + // Updates an entry in the mapping table. + // NOTE(review): |script_code| is a plain int here, while IsWordScript() + // takes a UScriptCode. + void SetWordScript(const int script_code, bool in_use); + + // Returns whether or not the given script is used by the selected + // dictionary. 
+ bool IsWordScript(const UScriptCode script_code) const; + + private: + // Represents a mapping table from a script code to a boolean value + // representing whether or not the script is used by the selected dictionary. + // The table has USCRIPT_CODE_LIMIT entries. + bool script_attributes_[USCRIPT_CODE_LIMIT]; + + // Represents a table of characters used by contractions. + std::map<UChar32, bool> middle_letters_; + + DISALLOW_EVIL_CONSTRUCTORS(SpellcheckCharAttribute); +}; + +// A class which implements methods for finding the location of word boundaries +// used by the Spellchecker class. +// This class is implemented on the following assumptions: +// * An input string is encoded in UTF-16 (i.e. it may contain surrogate +// pairs); and +// * The length of a string is the number of UTF-16 code units in the string +// (i.e. the length of a non-BMP character becomes two). +// NOTE(review): the input is typed as wchar_t; the UTF-16 assumption holds +// only where wchar_t is 16 bits wide -- confirm for other platforms. +class SpellcheckWordIterator { + public: + SpellcheckWordIterator(); + + ~SpellcheckWordIterator(); + + // Initializes a word-iterator object. + // Parameters + // * attribute [in] (const SpellcheckCharAttribute*) + // Represents a set of character attributes used for filtering out + // non-word characters. + // * word [in] (const wchar_t*) + // Represents a string from which this object extracts words. + // (This string does not have to be NUL-terminated.) + // * length [in] (size_t) + // Represents the length of the given string, in UTF-16 code units. + // This value should not include terminating NUL characters. + // * allow_contraction [in] (bool) + // Represents a flag to control whether or not this object should split a + // possible contraction (e.g. "isn't", "in'n'out", etc.) + // The GetSegment() comment below indicates that contraction characters + // are kept inside a segment when this value is true. + // NOTE(review): this method is declared void, so it cannot report an + // initialization error to its caller. + // NOTE(review): |attribute| and |word| are presumably retained in + // |attribute_| and |word_| and must then outlive this object -- confirm + // against the implementation. + void Initialize(const SpellcheckCharAttribute* attribute, + const wchar_t* word, + size_t length, + bool allow_contraction); + + // Retrieves a word (or a contraction). 
+ // Parameters + // * word_string [out] (std::wstring*) + // Represents a word (or a contraction) whose spelling is to be checked. + // This |word_string| has already been normalized to its canonical form + // (i.e. decomposed ligatures, replaced full-width Latin characters with + // their ASCII alternatives, etc.) so that a SpellChecker object can check + // its spelling without any additional operations. + // On the other hand, a substring of the input string + // std::wstring str(&word[word_start], word_length); + // represents the non-normalized version of this extracted word. + // * word_start [out] (int*) + // Represents the offset of this word from the beginning of the input + // string, in UTF-16 code units. + // * word_length [out] (int*) + // Represents the length of an extracted word before normalization, in + // UTF-16 code units. + // When the input string contains ligatures, this value may not be equal + // to the length of the |word_string|. + // Return values + // * true + // Found a word (or a contraction) whose spelling is to be checked. + // * false + // No more words or contractions remain to be checked. + bool GetNextWord(std::wstring* word_string, + int* word_start, + int* word_length); + + private: + // Retrieves a segment consisting of word characters (and contraction + // characters if the |allow_contraction_| value is true). + void GetSegment(int* segment_start, + int* segment_end); + + // Discards non-word characters at the beginning and the end of the given + // segment. + void TrimSegment(int segment_start, + int segment_end, + int* word_start, + int* word_length) const; + + // Normalizes the given segment of the |word_| variable and writes its + // canonical form to the |output_string|. + // NOTE(review): the meaning of the bool return value is not documented + // here; presumably true on success -- confirm against the implementation. + bool Normalize(int input_start, + int input_length, + std::wstring* output_string) const; + + private: + // The pointer to the input string from which we are extracting words. + const wchar_t* word_; + + // The length of the original string. + // NOTE(review): Initialize() takes the length as a size_t while this member + // is an int -- confirm how large values are handled in the implementation. 
+ int length_; + + // The current position in the original string. + int position_; + + // The flag to control whether or not this object should extract possible + // contractions. + bool allow_contraction_; + + // The character attributes used for filtering out non-word characters. + const SpellcheckCharAttribute* attribute_; + + DISALLOW_EVIL_CONSTRUCTORS(SpellcheckWordIterator); +}; + +#endif // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__ |