summaryrefslogtreecommitdiffstats
path: root/content/renderer/hyphenator/hyphenator.cc
blob: b94ba3cc6981a6aa227b1b6a7122c9ed34f9c860 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/renderer/hyphenator/hyphenator.h"

#include "base/files/memory_mapped_file.h"
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "content/common/hyphenator_messages.h"
#include "content/public/renderer/render_thread.h"
#include "third_party/hyphen/hyphen.h"
#include "third_party/icu/source/common/unicode/uscript.h"

namespace {

// A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds
// only the length of converted UTF-16 characters. This class is used for
// creating a mapping from the position of a UTF-8 string to a position of a
// UTF-16 string without unnecessary conversions. Even though the following
// snippet produces the same mapping, it needs to convert same characters many
// times. This class incrementally counts the number of converted UTF-16
// characters to avoid this problem.
//
//   scoped_ptr<size_t[]> position(new size_t[text.length()]);
//   for (size_t i = 0; i < text.length(); ++i)
//     position[i] = UTF8ToUTF16(text.substr(0, i)).length();
//
class UTF16TextLength {
 public:
  UTF16TextLength();
  ~UTF16TextLength();

  // Returns the number of UTF-16 code units counted so far, i.e. the UTF-16
  // position that corresponds to the UTF-8 bytes appended up to now.
  int utf16_length() const { return utf16_length_; }

  // Appends one byte of a UTF-8 sequence to this converter. The position does
  // not move while a multi-byte sequence is still incomplete; once the bytes
  // buffered so far decode to a code point, the position advances by one for
  // a BMP character and by two for a non-BMP character.
  void Append(char c);

 private:
  // The length of the converted UTF-16 text, in UTF-16 code units.
  int utf16_length_;

  // The UTF-8 bytes received since the last completed code point. Cleared
  // every time a code point is decoded.
  std::string utf8_text_;

  DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);
};

// Starts with a UTF-16 length of zero and no buffered UTF-8 bytes.
UTF16TextLength::UTF16TextLength() : utf16_length_(0) {}

// Nothing to release explicitly; the members clean up after themselves.
UTF16TextLength::~UTF16TextLength() {}

void UTF16TextLength::Append(char c) {
  // Buffer the new byte, then try to decode a single code point from the
  // start of everything buffered so far.
  utf8_text_.push_back(c);

  const int buffered_bytes = static_cast<int>(utf8_text_.length());
  int offset = 0;
  int code_point = 0;
  U8_NEXT(utf8_text_.data(), offset, buffered_bytes, code_point);

  // No code point could be decoded yet: keep the bytes and wait for more.
  if (code_point == U_SENTINEL)
    return;

  // A code point was decoded: count the UTF-16 code units it needs and start
  // over with an empty buffer for the next sequence.
  utf16_length_ += U16_LENGTH(code_point);
  utf8_text_.clear();
}

// A class that encapsulates a hyphenation query. This class owns resources
// temporarily needed for hyphenating one word, and deletes them when it is
// deleted as listed in the following snippet.
//
//   std::vector<int> hyphens;
//   Query query(UTF8ToUTF16("hyphenate"));
//   query.Hyphenate(dict, &hyphens);
//
class Query {
 public:
  // Prepares |word| for hyphenation: strips its trailing characters that
  // belong to the common script, converts the rest to UTF-8 and allocates the
  // buffer that receives the hyphenation result. |word| must not be empty.
  explicit Query(const string16& word);
  ~Query();

  // Hyphenates a word with the specified dictionary. This function hyphenates
  // the word provided to its constructor and returns a list of hyphenation
  // points, positions where we can insert hyphens. The positions written to
  // |hyphen_offsets| are counted in UTF-16 characters. Returns true only when
  // the library call succeeds and at least one hyphenation point is found.
  bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);

 private:
  // A word to be hyphenated, encoded in UTF-8.
  std::string word_utf8_;

  // Return variables from the hyphen library. |hyphen_vector_| is allocated
  // by the constructor; |rep_|, |pos_| and |cut_| are handed to
  // hnj_hyphen_hyphenate2() as output pointers and released with free() in
  // the destructor.
  scoped_ptr<char[]> hyphen_vector_;
  char** rep_;
  int* pos_;
  int* cut_;

  DISALLOW_COPY_AND_ASSIGN(Query);
};

Query::Query(const string16& word)
    : rep_(NULL),
      pos_(NULL),
      cut_(NULL) {
  DCHECK(!word.empty());

  // WebKit passes words with their trailing punctuation still attached, and
  // those characters keep the hyphen library from applying some of its rules
  // (i.e. from adding hyphens). Walk backwards from the end of the word, one
  // code point at a time, and drop every trailing code point whose script is
  // USCRIPT_COMMON.
  const char16* utf16 = word.data();
  int kept_length = static_cast<int>(word.length());
  while (kept_length > 0) {
    // U16_PREV moves |cursor| back over the code point that ends at
    // |kept_length| and stores that code point in |code_point|.
    int cursor = kept_length;
    int code_point = 0;
    U16_PREV(utf16, 0, cursor, code_point);

    UErrorCode status = U_ZERO_ERROR;
    const bool is_common_script =
        uscript_getScript(code_point, &status) == USCRIPT_COMMON;
    if (!is_common_script)
      break;
    kept_length = cursor;
  }
  UTF16ToUTF8(word.c_str(), kept_length, &word_utf8_);

  // Allocate the hyphen vector filled in by hnj_hyphen_hyphenate2(). Its size
  // is |word_utf8_.length()| + 5 as written in Line 112 of
  // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
  hyphen_vector_.reset(new char[word_utf8_.length() + 5]);
}

Query::~Query() {
  // Release the buffers handed back by the hyphen library. free() ignores
  // NULL pointers, so only the dereference of |rep_| needs a guard.
  if (rep_) {
    // One slot is visited per byte of the UTF-8 word.
    // NOTE(review): assumes the library sizes |rep_| by the word length passed
    // to hnj_hyphen_hyphenate2() -- confirm against third_party/hyphen.
    const size_t slot_count = word_utf8_.length();
    for (size_t slot = 0; slot != slot_count; ++slot)
      free(rep_[slot]);
    free(rep_);
  }
  free(pos_);
  free(cut_);
}

bool Query::Hyphenate(HyphenDict* dictionary,
                      std::vector<int>* hyphen_offsets) {
  DCHECK(dictionary);
  DCHECK(hyphen_offsets);

  // Ask the hyphen library for the hyphenation points of the UTF-8 word. A
  // non-zero return value is treated as failure; |hyphen_offsets| is left
  // untouched in that case.
  const int word_size = static_cast<int>(word_utf8_.length());
  if (hnj_hyphen_hyphenate2(dictionary, word_utf8_.data(), word_size,
                            hyphen_vector_.get(), NULL, &rep_, &pos_,
                            &cut_) != 0) {
    return false;
  }

  // WebKit needs hyphenation points counted in UTF-16 characters, whereas the
  // hyphen library returns them counted in UTF-8 characters. Incrementally
  // track the UTF-16 length of the text consumed so far and record it
  // whenever the matching entry of the hyphen vector has its lowest bit set.
  hyphen_offsets->clear();
  UTF16TextLength utf16_position;
  size_t byte_index = 0;
  for (std::string::const_iterator it = word_utf8_.begin();
       it != word_utf8_.end(); ++it, ++byte_index) {
    utf16_position.Append(*it);
    const bool is_hyphenation_point = (hyphen_vector_[byte_index] & 1) != 0;
    if (is_hyphenation_point)
      hyphen_offsets->push_back(utf16_position.utf16_length());
  }
  return !hyphen_offsets->empty();
}

}  // namespace

namespace content {

// Opens |file| for reading through base::FdopenPlatformFile() and keeps the
// resulting handle. The dictionary itself is not loaded here; Initialize()
// does that on demand.
Hyphenator::Hyphenator(base::PlatformFile file)
    : dictionary_(NULL),
      dictionary_file_(base::FdopenPlatformFile(file, "r")),
      result_(0) {}

// Frees the loaded hyphenation dictionary, if one was ever loaded.
Hyphenator::~Hyphenator() {
  if (dictionary_ == NULL)
    return;
  hnj_hyphen_free(dictionary_);
}

// Loads the hyphenation dictionary from |dictionary_file_| unless one is
// already loaded. Returns true when a dictionary is available afterwards.
bool Hyphenator::Initialize() {
  // Already loaded: nothing to do.
  if (dictionary_ != NULL)
    return true;

  // Without an open dictionary file there is nothing to load from.
  if (dictionary_file_.get() == NULL)
    return false;

  dictionary_ = hnj_hyphen_load_file(dictionary_file_.get());
  return dictionary_ != NULL;
}

// Remembers |locale|, registers this object as an observer of |thread| and
// sends a HyphenatorHostMsg_OpenDictionary message for |locale|. Returns false
// when |thread| is NULL, otherwise the result of sending the message.
bool Hyphenator::Attach(RenderThread* thread, const string16& locale) {
  if (thread == NULL)
    return false;

  locale_ = locale;
  thread->AddObserver(this);
  const bool sent = thread->Send(new HyphenatorHostMsg_OpenDictionary(locale));
  return sent;
}

// Returns true when |locale| is exactly the locale given to Attach().
bool Hyphenator::CanHyphenate(const string16& locale) {
  return locale_ == locale;
}

// Returns the largest hyphenation point of |word| that is strictly smaller
// than |before_index|, counted in UTF-16 characters. Returns 0 when the
// dictionary cannot be initialized, |word| is empty, or no such point exists.
size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
                                             size_t before_index) {
  // Initialize() is evaluated first on purpose: it loads the dictionary as a
  // side effect even when |word| turns out to be empty.
  if (!Initialize() || word.empty())
    return 0;

  // When WebKit finds a line-break, it calls this function twice or more with
  // the same word to find the best hyphenation point. The last query is cached
  // so the hyphen library is called only when the word actually changes.
  const bool cache_hit = (word_ == word);
  if (!cache_hit) {
    word_ = word;
    Query query(word);
    result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);
  }
  if (!result_)
    return 0;

  // Scan the cached offsets from last to first and return the first one that
  // lies before |before_index|.
  for (size_t remaining = hyphen_offsets_.size(); remaining > 0; --remaining) {
    const size_t offset = static_cast<size_t>(hyphen_offsets_[remaining - 1]);
    if (offset < before_index)
      return offset;
  }
  return 0;
}

// Dispatches a control message through the IPC message map below. A
// HyphenatorMsg_SetDictionary message is routed to OnSetDictionary(); for any
// other message |handled| is set to false. Returns |handled|.
bool Hyphenator::OnControlMessageReceived(const IPC::Message& message) {
  // Assume the message is ours until the map says otherwise.
  bool handled = true;
  IPC_BEGIN_MESSAGE_MAP(Hyphenator, message)
    IPC_MESSAGE_HANDLER(HyphenatorMsg_SetDictionary, OnSetDictionary)
    IPC_MESSAGE_UNHANDLED(handled = false)
  IPC_END_MESSAGE_MAP()
  return handled;
}

// Handles HyphenatorMsg_SetDictionary: replaces the current dictionary file
// with |file|. An invalid |file| is ignored and leaves the current dictionary
// and cached query untouched.
void Hyphenator::OnSetDictionary(IPC::PlatformFileForTransit file) {
  base::PlatformFile rule_file =
      IPC::PlatformFileForTransitToPlatformFile(file);
  if (rule_file == base::kInvalidPlatformFileValue)
    return;
  // Delete the current dictionary and save the given file to this object. We
  // initialize the hyphen library the first time when WebKit actually
  // hyphenates a word, i.e. when WebKit calls the ComputeLastHyphenLocation
  // function. (WebKit does not always hyphenate words even when it calls the
  // CanHyphenate function, e.g. WebKit does not have to hyphenate words when it
  // does not have to break text into lines.)
  if (dictionary_) {
    hnj_hyphen_free(dictionary_);
    dictionary_ = NULL;
  }
  // Forget the cached query. Its offsets were computed with the dictionary
  // being replaced, and ComputeLastHyphenLocation() would otherwise keep
  // returning them for the same word after the new dictionary is loaded.
  // ComputeLastHyphenLocation() never caches an empty word, so clearing
  // |word_| guarantees the next query is recomputed.
  word_.clear();
  hyphen_offsets_.clear();
  result_ = false;
  dictionary_file_.Set(base::FdopenPlatformFile(rule_file, "r"));
}

}  // namespace content