summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/spellchecker/spellcheck_worditerator.h
blob: aa54011e87a205af9b735234a786e9b2e4a69899 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_

#include <map>
#include <string>

#include "base/basictypes.h"
#include "base/string16.h"
#include "third_party/icu/public/common/unicode/ubrk.h"
#include "third_party/icu/public/common/unicode/uscript.h"

// A class which encapsulates language-specific operations used by
// SpellcheckWordIterator.
// When we set the spellchecker language, this class creates rule sets that
// filter out the characters not supported by the spellchecker.
// (Please read the comment in the SpellcheckWordIterator class about how to
// use this class.)
class SpellcheckCharAttribute {
 public:
  // NOTE(review): the usage example in this header calls SetDefaultLanguage()
  // right after construction; presumably an object is not usable until a
  // language has been set -- confirm against the .cc file.
  SpellcheckCharAttribute();
  ~SpellcheckCharAttribute();

  // Sets the language of the spellchecker.
  // This function creates the custom rule-sets used by SpellcheckWordIterator.
  // Parameters
  //   * language [in] (std::string)
  //     The language-code string (e.g. "en-US").
  void SetDefaultLanguage(const std::string& language);

  // Returns a custom rule-set string used by the ICU break iterator.
  // The string is returned by value, so the caller gets its own copy.
  // Parameters
  //   * allow_contraction [in] (bool)
  //     A flag to control whether or not this object splits a possible
  //     contraction. If this value is false, it returns a rule set that
  //     splits a possible contraction: "in'n'out" -> "in", "n", and "out".
  //     (Presumably |true| selects |ruleset_allow_contraction_| and |false|
  //     selects |ruleset_disallow_contraction_| -- confirm in the .cc file.)
  string16 GetRuleSet(bool allow_contraction) const;

  // Output a character only if it is a word character.
  // Parameters
  //   * c [in] (UChar)
  //     A UTF-16 code unit to be examined.
  //   * output [out] (string16*)
  //     The string that receives |c| when it is a word character.
  // NOTE(review): the meaning of the return value is not documented here;
  // presumably it reports whether |c| was written to |output| -- confirm.
  bool OutputChar(UChar c, string16* output) const;

 private:
  // Creates the rule-set strings.
  // Parameters
  //   * language [in] (std::string)
  //     The language-code string for which the rule sets are created.
  void CreateRuleSets(const std::string& language);

  // Language-specific output functions.
  // These share the signature of OutputChar(). NOTE(review): presumably
  // OutputChar() picks one of them according to |script_code_| -- confirm.
  bool OutputArabic(UChar c, string16* output) const;
  bool OutputHangul(UChar c, string16* output) const;
  bool OutputHebrew(UChar c, string16* output) const;
  bool OutputDefault(UChar c, string16* output) const;

 private:
  // The custom rule-set strings used by ICU BreakIterator.
  // Since it is not so easy to create custom rule-sets from a spellchecker
  // language, this class saves these rule-set strings created when we set the
  // language.
  string16 ruleset_allow_contraction_;
  string16 ruleset_disallow_contraction_;

  // The script code used by this language.
  UScriptCode script_code_;

  // Per the macro name, this class is not meant to be copied or assigned.
  DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
};

// A class which extracts words that can be checked for spelling from a longer
// string.
// The ICU word-break iterator does not discard some punctuation characters
// attached to a word. For example, when we set a word "_hello_" to a
// word-break iterator, it just returns "_hello_".
// On the other hand, our spellchecker expects us to discard such punctuation
// characters.
// To extract only the words that our spellchecker can check, this class uses
// custom rule-sets created by the SpellcheckCharAttribute class.
// Also, this class normalizes extracted words so our spellchecker can check
// the spellings of a word that includes ligatures, combined characters,
// full-width characters, etc.
//
// The following snippet is an example that extracts words with this class.
//
//   // Creates the language-specific attributes for US English.
//   SpellcheckCharAttribute attribute;
//   attribute.SetDefaultLanguage("en-US");
//
//   // Set up a SpellcheckWordIterator object which extracts English words,
//   // and retrieves them.
//   SpellcheckWordIterator iterator;
//   string16 text(UTF8ToUTF16("this is a test."));
//   iterator.Initialize(&attribute, text.c_str(), text.length(), true);
//
//   string16 word;
//   int start;
//   int length;
//   while (iterator.GetNextWord(&word, &start, &length)) {
//     ...
//   }
//
class SpellcheckWordIterator {
 public:
  // An object must be set up with Initialize() before GetNextWord() is called
  // (see the Initialize() comment below).
  SpellcheckWordIterator();
  ~SpellcheckWordIterator();

  // Initializes a word-iterator object.
  // Parameters
  //   * attribute [in] (const SpellcheckCharAttribute*)
  //     Character attributes used for filtering out non-word characters.
  //   * word [in] (const char16*)
  //     A string from which this object extracts words. (This string does not
  //     have to be NUL-terminated.)
  //   * length [in] (size_t)
  //     The length of the given string, in UTF-16 characters.
  //   * allow_contraction [in] (bool)
  //     A flag to control whether or not this object should split a possible
  //     contraction (e.g. "isn't", "in'n'out", etc.)
  // Return values
  //   * true
  //     This word-iterator object is initialized successfully.
  //   * false
  //     An error occurred while initializing this object.
  // NOTE(review): this class keeps raw pointers (|attribute_| and |word_|), so
  // presumably both |attribute| and |word| must outlive this iterator --
  // confirm against the .cc file.
  bool Initialize(const SpellcheckCharAttribute* attribute,
                  const char16* word,
                  size_t length,
                  bool allow_contraction);

  // Retrieves a word (or a contraction).
  // Parameters
  //   * word_string [out] (string16*)
  //     A word (or a contraction) whose spelling is to be checked. This
  //     |word_string| has already been normalized to its canonical form (i.e.
  //     decomposed ligatures, replaced full-width latin characters with their
  //     ASCII alternatives, etc.) so a SpellChecker object can check its
  //     spelling without any additional operations. We can use |word_start|
  //     and |word_length| to retrieve the non-normalized version of this
  //     string as shown in the following snippet, where |word| is the input
  //     string given to Initialize().
  //       string16 str(&word[word_start], word_length);
  //   * word_start [out] (int*)
  //     The offset of this word from the beginning of the input string,
  //     in UTF-16 characters.
  //   * word_length [out] (int*)
  //     The length of an extracted word before normalization, in UTF-16
  //     characters.
  //     When the input string contains ligatures, this value may not be equal
  //     to the length of the |word_string|.
  // Return values
  //   * true
  //     Found a word (or a contraction) whose spelling is to be checked.
  //   * false
  //     No more words or contractions to be checked were found.
  bool GetNextWord(string16* word_string,
                   int* word_start,
                   int* word_length);

 private:
  // Releases all the resources attached to this object.
  void Close();

  // Normalizes a non-terminated string so our spellchecker can check its
  // spelling. A word returned from an ICU word-break iterator may include
  // characters not supported by our spellchecker, e.g. ligatures, combining
  // characters, full-width letters, etc. This function replaces such characters
  // with alternative characters supported by our spellchecker.
  // Parameters
  //   * input_start [in] (int)
  //     The start of the text to normalize. (Presumably an offset into
  //     |word_|, in UTF-16 characters -- confirm.)
  //   * input_length [in] (int)
  //     The length of the text to normalize. (Presumably in UTF-16
  //     characters -- confirm.)
  //   * output_string [out] (string16*)
  //     The string that receives the normalized text.
  // NOTE(review): the meaning of the return value is not documented here;
  // presumably it reports whether the normalization succeeded -- confirm.
  bool Normalize(int input_start,
                 int input_length,
                 string16* output_string) const;

 private:
  // The pointer to the input string from which we are extracting words.
  const char16* word_;

  // The length of the original string.
  // NOTE(review): Initialize() receives the length as size_t while this member
  // is an int; confirm the .cc file handles the narrowing conversion.
  int length_;

  // The current position in the original string.
  int position_;

  // The language-specific attributes used for filtering out non-word
  // characters.
  const SpellcheckCharAttribute* attribute_;

  // The ICU break iterator.
  UBreakIterator* iterator_;

  // Per the macro name, this class is not meant to be copied or assigned.
  DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
};

#endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_