1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
// Copyright 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__
#define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__
#include <map>
#include <string>
#include "base/basictypes.h"
#include "unicode/uscript.h"
// A class which handles character attributes dependent on a spellchecker and
// its dictionary.
// This class is used by the SpellcheckWordIterator class to determine whether
// or not a character is one used by the spellchecker and its dictionary.
class SpellcheckCharAttribute {
public:
SpellcheckCharAttribute();
~SpellcheckCharAttribute();
// Sets the default language of the spell checker. This controls which
// characters are considered parts of words of the given language.
// Parameters
// * language [in] (const std::wstring&)
// Represents the default language of the spell checker.
// NOTE(review): the expected format of this string (e.g. a locale code)
// is not visible in this header; confirm against the implementation.
void SetDefaultLanguage(const std::wstring& language);
// Returns whether or not the given character is a character used by the
// selected dictionary.
// Parameters
// * character [in] (UChar32)
// Represents a Unicode character to be checked.
// Return values
// * true
// The given character is a word character.
// * false
// The given character is not a word character.
bool IsWordChar(UChar32 character) const;
// Returns whether or not the given character is a character used by
// contractions.
// Parameters
// * character [in] (UChar32)
// Represents a Unicode character to be checked.
// Return values
// * true
// The given character is a character used by contractions.
// * false
// The given character is not a character used by contractions.
bool IsContractionChar(UChar32 character) const;
private:
// Initializes the mapping table (presumably |script_attributes_|; confirm
// against the implementation).
void InitializeScriptTable();
// Retrieves the ICU script code of the given Unicode character.
UScriptCode GetScriptCode(UChar32 character) const;
// Updates an entry in the mapping table: |script_code| selects the entry and
// |in_use| is the new value for it.
// NOTE(review): |script_code| is a plain int here while IsWordScript() takes
// a UScriptCode; the valid range is presumably [0, USCRIPT_CODE_LIMIT) to
// match the size of |script_attributes_|. Confirm that the implementation
// checks it.
void SetWordScript(const int script_code, bool in_use);
// Returns whether or not the given script is used by the selected
// dictionary.
bool IsWordScript(const UScriptCode script_code) const;
private:
// Represents a mapping table from a script code to a boolean value
// representing whether or not the script is used by the selected dictionary.
// The table has one slot per ICU script code (USCRIPT_CODE_LIMIT entries).
bool script_attributes_[USCRIPT_CODE_LIMIT];
// Represents a table of characters used by contractions, keyed by the
// Unicode code point of the character.
std::map<UChar32, bool> middle_letters_;
// NOTE(review): this macro comes from outside this file (presumably
// base/basictypes.h) and is expected to forbid copy construction and
// assignment; confirm.
DISALLOW_EVIL_CONSTRUCTORS(SpellcheckCharAttribute);
};
// A class which implements methods for finding the location of word boundaries
// used by the Spellchecker class.
// This class is implemented on the following assumptions:
// * An input string is encoded in UTF-16 (i.e. it may contain surrogate
// pairs), and;
// * The length of a string is the number of UTF-16 characters in the string
// (i.e. the length of a non-BMP character becomes two).
// NOTE(review): the input is passed as wchar_t, so the UTF-16 assumption
// presumably relies on a platform where wchar_t is 16 bits wide; confirm
// before using this class on other platforms.
class SpellcheckWordIterator {
public:
SpellcheckWordIterator();
~SpellcheckWordIterator();
// Initializes a word-iterator object.
// Parameters
// * attribute [in] (const SpellcheckCharAttribute*)
// Represents a set of character attributes used for filtering out
// non-word characters.
// * word [in] (const wchar_t*)
// Represents a string from which this object extracts words.
// (This string does not have to be NUL-terminated.)
// * length [in] (size_t)
// Represents the length of the given string, in UTF-16 characters.
// This value should not include terminating NUL characters.
// * allow_contraction [in] (bool)
// Represents a flag to control whether or not this object should split a
// possible contraction (e.g. "isn't", "in'n'out", etc.)
// Return values
// None. This function returns void, so it cannot report an initialization
// error to its caller.
// NOTE(review): this object keeps raw pointers (|word_| and |attribute_|), so
// the |word| buffer and the |attribute| object presumably must outlive the
// iteration; confirm against the implementation.
// NOTE(review): |length| is a size_t while the |length_| member is an int;
// check how the implementation handles values that do not fit in an int.
void Initialize(const SpellcheckCharAttribute* attribute,
const wchar_t* word,
size_t length,
bool allow_contraction);
// Retrieves a word (or a contraction).
// Parameters
// * word_string [out] (std::wstring*)
// Represents a word (or a contraction) whose spelling is to be checked.
// This |word_string| has been already normalized to its canonical form
// (i.e. decomposed ligatures, replaced full-width latin characters to
// its ASCII alternatives, etc.) so that a SpellChecker object can check
// its spelling without any additional operations.
// On the other hand, a substring of the input string
// std::wstring str(&word[word_start], word_length);
// represents the non-normalized version of this extracted word.
// * word_start [out] (int*)
// Represents the offset of this word from the beginning of the input
// string, in UTF-16 characters.
// * word_length [out] (int*)
// Represents the length of an extracted word before normalization, in
// UTF-16 characters.
// When the input string contains ligatures, this value may not be equal
// to the length of the |word_string|.
// Return values
// * true
// Found a word (or a contraction) whose spelling is to be checked.
// * false
// Found no more words or contractions whose spelling is to be checked.
bool GetNextWord(std::wstring* word_string,
int* word_start,
int* word_length);
private:
// Retrieves a segment consisting of word characters (and contraction
// characters if the |allow_contraction| value is true).
// The segment boundaries are written to |segment_start| and |segment_end|.
// NOTE(review): unlike TrimSegment() and Normalize(), this function is not
// const, so it presumably advances |position_|; whether |segment_end| is
// inclusive or exclusive is not visible here. Confirm both.
void GetSegment(int* segment_start,
int* segment_end);
// Discards non-word characters at the beginning and the end of the given
// segment, and writes the result to |word_start| and |word_length|.
void TrimSegment(int segment_start,
int segment_end,
int* word_start,
int* word_length) const;
// Normalizes the given segment of the |word_| variable and writes its
// canonical form to the |output_string|.
// NOTE(review): the meaning of the bool return value is not documented in
// this header (presumably true on success); confirm.
bool Normalize(int input_start,
int input_length,
std::wstring* output_string) const;
private:
// The pointer to the input string from which we are extracting words.
// The type is a non-owning raw pointer to const data.
const wchar_t* word_;
// The length of the original string.
int length_;
// The current position in the original string.
int position_;
// The flag to control whether or not this object should extract possible
// contractions.
bool allow_contraction_;
// The character attributes used for filtering out non-word characters.
const SpellcheckCharAttribute* attribute_;
// NOTE(review): this macro comes from outside this file (presumably
// base/basictypes.h) and is expected to forbid copy construction and
// assignment; confirm.
DISALLOW_EVIL_CONSTRUCTORS(SpellcheckWordIterator);
};
#endif // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__
|