chrome/browser/spellcheck_worditerator.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__
#define CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__

#include <map>
#include <string>

#include "base/basictypes.h"

#include "unicode/uscript.h"

// A class which handles character attributes dependent on a spellchecker and
// its dictionary.
// This class is used by the SpellcheckWordIterator class to determine whether
// or not a character is one used by the spellchecker and its dictionary.
// It answers two questions about a Unicode code point: whether it is a word
// character (IsWordChar) and whether it is a character used by contractions
// (IsContractionChar).
class SpellcheckCharAttribute {
 public:
  SpellcheckCharAttribute();

  ~SpellcheckCharAttribute();

  // Sets the default language of the spell checker. This controls which
  // characters are considered parts of words of the given language.
  // NOTE(review): the expected format of |language| (e.g. a locale name such
  // as "en-US") is not visible in this header -- confirm against the
  // implementation.
  void SetDefaultLanguage(const std::wstring& language);

  // Returns whether or not the given character is a character used by the
  // selected dictionary.
  // Parameters
  //   * character [in] (UChar32)
  //     Represents a Unicode character to be checked.
  // Return values
  //   * true
  //     The given character is a word character.
  //   * false
  //     The given character is not a word character.
  bool IsWordChar(UChar32 character) const;

  // Returns whether or not the given character is a character used by
  // contractions.
  // Parameters
  //   * character [in] (UChar32)
  //     Represents a Unicode character to be checked.
  // Return values
  //   * true
  //     The given character is a character used by contractions.
  //   * false
  //     The given character is not a character used by contractions.
  bool IsContractionChar(UChar32 character) const;

 private:
  // Private helper methods.

  // Initializes the mapping table (presumably |script_attributes_|).
  void InitializeScriptTable();

  // Retrieves the ICU script code of the given character.
  UScriptCode GetScriptCode(UChar32 character) const;

  // Updates an entry in the mapping table: records whether or not the script
  // identified by |script_code| is in use.
  // NOTE(review): |script_code| is a plain int here while IsWordScript() takes
  // a UScriptCode; presumably it must be a valid index below
  // USCRIPT_CODE_LIMIT -- confirm that the implementation checks the range.
  void SetWordScript(const int script_code, bool in_use);

  // Returns whether or not the given script is used by the selected
  // dictionary.
  bool IsWordScript(const UScriptCode script_code) const;

 private:
  // Private data members.

  // Represents a mapping table from a script code to a boolean value
  // representing whether or not the script is used by the selected dictionary.
  // The table has one entry per ICU script code (USCRIPT_CODE_LIMIT entries).
  bool script_attributes_[USCRIPT_CODE_LIMIT];

  // Represents a table of characters used by contractions.
  // NOTE(review): the meaning of the mapped bool value (as opposed to mere
  // presence of the key) is not visible in this header -- confirm.
  std::map<UChar32, bool> middle_letters_;

  // NOTE(review): this macro is defined outside this file (presumably in
  // base/basictypes.h) and presumably disables copy construction and
  // assignment -- confirm.
  DISALLOW_EVIL_CONSTRUCTORS(SpellcheckCharAttribute);
};

// A class which implements methods for finding the location of word boundaries
// used by the Spellchecker class.
// This class is implemented on the following assumptions:
//   * An input string is encoded in UTF-16 (i.e. it may contain surrogate
//     pairs), and;
//   * The length of a string is the number of UTF-16 characters in the string
//     (i.e. the length of a non-BMP character becomes two).
// NOTE(review): the input is passed as wchar_t, which is not 16 bits wide on
// every platform; the UTF-16 assumption presumably holds only where it is --
// confirm.
//
// Intended usage, as far as this header shows: call Initialize() once with the
// text to scan, then call GetNextWord() repeatedly until it returns false.
class SpellcheckWordIterator {
 public:
  SpellcheckWordIterator();

  ~SpellcheckWordIterator();

  // Initializes a word-iterator object.
  // Parameters
  //   * attribute [in] (const SpellcheckCharAttribute*)
  //     Represents a set of character attributes used for filtering out
  //     non-word characters.
  //   * word [in] (const wchar_t*)
  //     Represents a string from which this object extracts words.
  //     (This string does not have to be NUL-terminated.)
  //   * length [in] (size_t)
  //     Represents the length of the given string, in UTF-16 characters.
  //     This value should not include terminating NUL characters.
  //   * allow_contraction [in] (bool)
  //     Represents a flag to control whether or not this object should split a
  //     possible contraction (e.g. "isn't", "in'n'out", etc.)
  // Return values
  //   None. This method is declared void, so it does not report
  //   initialization errors to its caller.
  // NOTE(review): this class keeps pointer members (|word_|, |attribute_|), so
  // presumably |word| and |attribute| must stay valid for as long as this
  // iterator is used -- confirm against the implementation.
  // NOTE(review): |length| is a size_t while the |length_| member is an int;
  // presumably very large lengths are not expected -- confirm.
  void Initialize(const SpellcheckCharAttribute* attribute,
                  const wchar_t* word,
                  size_t length,
                  bool allow_contraction);

  // Retrieves a word (or a contraction).
  // Parameters
  //   * word_string [out] (std::wstring*)
  //     Represents a word (or a contraction) whose spelling is to be checked.
  //     This |word_string| has been already normalized to its canonical form
  //     (i.e. decomposed ligatures, replaced full-width latin characters with
  //     their ASCII alternatives, etc.) so that a SpellChecker object can
  //     check its spelling without any additional operations.
  //     On the other hand, a substring of the input string
  //       std::wstring str(&word[word_start], word_length);
  //     represents the non-normalized version of this extracted word.
  //   * word_start [out] (int*)
  //     Represents the offset of this word from the beginning of the input
  //     string, in UTF-16 characters.
  //   * word_length [out] (int*)
  //     Represents the length of an extracted word before normalization, in
  //     UTF-16 characters.
  //     When the input string contains ligatures, this value may not be equal
  //     to the length of the |word_string|.
  // Return values
  //   * true
  //     Found a word (or a contraction) whose spelling is to be checked.
  //   * false
  //     No more words or contractions remain to be checked.
  bool GetNextWord(std::wstring* word_string,
                   int* word_start,
                   int* word_length);

 private:
  // Private helper methods.

  // Retrieves a segment consisting of word characters (and contraction
  // characters if the |allow_contraction| value is true).
  // The segment boundaries are written to |segment_start| and |segment_end|.
  // NOTE(review): whether |segment_end| is inclusive or exclusive is not
  // visible in this header -- confirm against the implementation.
  void GetSegment(int* segment_start,
                  int* segment_end);

  // Discards non-word characters at the beginning and the end of the given
  // segment, and writes the resulting range to |word_start| and
  // |word_length|.
  void TrimSegment(int segment_start,
                   int segment_end,
                   int* word_start,
                   int* word_length) const;

  // Normalizes the given segment of the |word_| variable and writes its
  // canonical form to the |output_string|.
  // NOTE(review): the meaning of the bool result (presumably true on success)
  // is not documented here -- confirm against the implementation.
  bool Normalize(int input_start,
                 int input_length,
                 std::wstring* output_string) const;

 private:
  // Private data members.

  // The pointer to the input string from which we are extracting words.
  const wchar_t* word_;

  // The length of the original string.
  int length_;

  // The current position in the original string.
  int position_;

  // The flag to control whether or not this object should extract possible
  // contractions.
  bool allow_contraction_;

  // The character attributes used for filtering out non-word characters.
  const SpellcheckCharAttribute* attribute_;

  // NOTE(review): this macro is defined outside this file (presumably in
  // base/basictypes.h) and presumably disables copy construction and
  // assignment -- confirm.
  DISALLOW_EVIL_CONSTRUCTORS(SpellcheckWordIterator);
};

#endif  // CHROME_BROWSER_SPELLCHECK_WORDITERATOR_H__