1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
|
// Copyright 2008 Google Inc. All Rights Reserved.
#include "chrome/third_party/hunspell/google/bdict_writer.h"
#include "base/logging.h"
#include "base/string_util.h"
#include "chrome/third_party/hunspell/google/bdict.h"
namespace hunspell {
// Represents one node the word trie in memory. This does not have to be very
// efficient since it is only used when building.
class DicNode {
public:
enum StorageType {
UNDEFINED, // Uninitialized storage type.
LEAF, // Word with no additional string data.
LEAFMORE, // Word with additional suffix following.
LIST16, // List of sub-nodes with 16-bit relative offsets.
LIST8, // List of sub-nodes with 8-bit relative offsets.
LOOKUP32, // Lookup table with 32-bit absolute offsets.
LOOKUP16, // LOokup table with 16-bit relative offsets.
};
DicNode() : addition(0), storage(UNDEFINED) {
}
~DicNode() {
for (size_t i = 0; i < children.size(); i++)
delete children[i];
}
bool is_leaf() const { return children.empty(); }
// When non-zero, this character is the additional level that this
// node represents. This will be 0 for some leaf nodes when there is no
// addition and for the root node.
char addition;
std::vector<DicNode*> children;
// When there are no children, this is a leaf node and this "addition string"
// is appended to the result. When there are children, this will be empty.
std::string leaf_addition;
// For leaf nodes, this are the indices into the affix table.
std::vector<int> affix_indices;
// Initially uninitialized, ComputeStorage() will fill this in with the
// desired serialization method.
StorageType storage;
};
namespace {
void SerializeTrie(const DicNode* node, std::string* output);
// Returns true if the nth character in the given word is |ch|. Will return
// false when there is no nth character. Note that this will also match an
// implicit NULL at the end of the string.
bool NthCharacterIs(const std::string& word, size_t n, char ch) {
if (word.length() < n) // Want to allow n == length() to catch the NULL.
return false;
return word.c_str()[n] == ch; // Use c_str() to get NULL terminator.
}
// Recursively build the trie data structure for the range in the |words| list
// in [begin, end). It is assumed that all words in that range will have the
// same |node_depth - 2| characters at the beginning. This node will key off of
// the |node_depth - 1| character, with a special case for the root.
//
// |prefix_chars| is how deep this node is in the trie (and corresponds to how
// many letters of the word we will skip). The root level will have
// |prefix_chars| of 0.
//
// The given |node| will be filled with the data. The return value is the
// index into the |words| vector of the next word to process. It will be
// equal to |end| when all words have been consumed.
size_t BuildTrie(const BDictWriter::WordList& words,
size_t begin, size_t end,
size_t node_depth, DicNode* node) {
// Find the prefix that this node represents.
const std::string& begin_str = words[begin].first;
if (begin_str.length() < node_depth) {
// Singleton.
node->addition = 0;
node->affix_indices = words[begin].second;
return begin + 1;
}
// Now find the range of words sharing this prefix.
size_t match_count;
if (node_depth == 0 && begin == 0) {
// Special case the root node.
match_count = end - begin;
node->addition = 0;
} else {
match_count = 0;
node->addition = begin_str[node_depth - 1];
// We know the strings should have [0, node_depth-1) characters at the
// beginning already matching, so we only need to check the new one.
while (begin + match_count < end &&
NthCharacterIs(words[begin + match_count].first,
node_depth - 1, node->addition))
match_count++;
}
if (match_count == 1) {
// Just found a leaf node with no other words sharing its prefix. Save any
// remaining characters and we're done.
node->affix_indices = words[begin].second;
node->leaf_addition = begin_str.substr(node_depth);
return begin + 1;
}
// We have a range of words, add them as children of this node.
size_t i = begin;
while (i < begin + match_count) {
DicNode* cur = new DicNode;
i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
node->children.push_back(cur);
}
return begin + match_count;
}
// Lookup tables are complicated. They can have a magic 0th entry not counted
// in the table dimensions, and also have indices only for the used sub-range.
// This function will compute the starting point and size of a lookup table,
// in addition to whether it should have the magic 0th entry for the given
// list of child nodes.
void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
bool* has_0th_entry,
int* first_item,
int* list_size) {
*has_0th_entry = false;
*first_item = 0;
*list_size = 0;
if (children.empty())
return;
size_t first_offset = 0;
if (children[0]->addition == 0) {
*has_0th_entry = true;
first_offset++;
}
if (children.size() == first_offset)
return;
*first_item = static_cast<unsigned char>(children[first_offset]->addition);
unsigned char last_item = children[children.size() - 1]->addition;
*list_size = last_item - *first_item + 1;
}
// Recursively fills in the storage strategy for this node and each of its
// children. This must be done before actually serializing because the storage
// mode will depend on the size of the children.
size_t ComputeTrieStorage(DicNode* node) {
if (node->is_leaf()) {
// The additional affix list holds affixes when there is more than one. Each
// entry is two bytes, plus an additional FFFF terminator.
size_t supplimentary_size = 0;
if (node->affix_indices.size() > 1 ||
node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID)
supplimentary_size = node->affix_indices.size() * 2;
if (node->leaf_addition.empty()) {
node->storage = DicNode::LEAF;
return 2 + supplimentary_size;
}
node->storage = DicNode::LEAFMORE;
// Signature & affix (2) + null for leaf_addition (1) = 3
return 3 + node->leaf_addition.size() + supplimentary_size;
}
// Recursively compute the size of the children for non-leaf nodes.
size_t child_size = 0;
for (size_t i = 0; i < node->children.size(); i++)
child_size += ComputeTrieStorage(node->children[i]);
// Fixed size is only 1 byte which is the ID byte and the count combined.
static const int kListHeaderSize = 1;
// Lists can only store up to 16 items.
static const int kListThreshold = 16;
if (node->children.size() < kListThreshold && child_size <= 0xFF) {
node->storage = DicNode::LIST8;
return kListHeaderSize + node->children.size() * 2 + child_size;
}
if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
node->storage = DicNode::LIST16;
// Fixed size is one byte plus 3 for each table entry.
return kListHeaderSize + node->children.size() * 3 + child_size;
}
static const int kTableHeaderSize = 2; // Type + table size.
bool has_0th_item;
int first_table_item, table_item_count;
ComputeLookupStrategyDetails(node->children, &has_0th_item,
&first_table_item, &table_item_count);
if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
table_item_count * 2 < 0xFFFF) {
// Use 16-bit addressing since the children will fit.
node->storage = DicNode::LOOKUP16;
return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
child_size;
}
// Use 32-bit addressing as a last resort.
node->storage = DicNode::LOOKUP32;
return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
child_size;
}
// Serializes the given node when it is DicNode::LEAF* to the output.
void SerializeLeaf(const DicNode* node, std::string* output) {
// The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;
// We may store the first value with the node or in the supplimentary list.
size_t first_affix_in_supplimentary_list = 1;
if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
// There are not enough bits for this value, move it to the supplimentary
// list where there are more bits per value.
first_affix_in_supplimentary_list = 0;
first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
}
unsigned char id_byte = (first_affix >> 8) &
BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;
// The next two bits indicates an additional string and more affixes.
if (node->storage == DicNode::LEAFMORE)
id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0)
id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
output->push_back(id_byte);
// Following is the low 8 bits of the affix index.
output->push_back(first_affix & 0xff);
// Handle the optional addition with NULL terminator.
if (node->storage == DicNode::LEAFMORE) {
for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
output->push_back(node->leaf_addition.c_str()[i]);
}
// Handle any following affixes. We already wrote the 0th one.
if (node->affix_indices.size() > first_affix_in_supplimentary_list) {
for (size_t i = first_affix_in_supplimentary_list;
i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
i++) {
output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
output->push_back(
static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
}
// Terminator for affix list. We use 0xFFFF.
output->push_back(static_cast<unsigned char>(0xFF));
output->push_back(static_cast<unsigned char>(0xFF));
}
}
// Serializes the given node when it is DicNode::LIST* to the output.
void SerializeList(const DicNode* node, std::string* output) {
bool is_8_bit = node->storage == DicNode::LIST8;
unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
(is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
id_byte |= node->children.size(); // We assume the size is < 4 bits.
output->push_back(id_byte);
// Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
int bytes_per_entry = (is_8_bit ? 2 : 3);
size_t table_begin = output->size();
output->resize(output->size() + node->children.size() * bytes_per_entry);
size_t children_begin = output->size();
for (size_t i = 0; i < node->children.size(); i++) {
// First is the character this entry represents.
(*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;
// Next is the 8- or 16-bit offset.
size_t offset = output->size() - children_begin;
if (is_8_bit) {
DCHECK(offset <= 0xFF);
(*output)[table_begin + i * bytes_per_entry + 1] =
static_cast<char>(offset & 0xFF);
} else {
unsigned short* output16 = reinterpret_cast<unsigned short*>(
&(*output)[table_begin + i * bytes_per_entry + 1]);
*output16 = static_cast<unsigned short>(offset);
}
// Now append the children's data.
SerializeTrie(node->children[i], output);
}
}
// Serializes the given node when it is DicNode::LOOKUP* to the output.
void SerializeLookup(const DicNode* node, std::string* output) {
unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;
bool has_0th_item;
int first_table_item, table_item_count;
ComputeLookupStrategyDetails(node->children, &has_0th_item,
&first_table_item, &table_item_count);
// Set the extra bits in the ID byte.
bool is_32_bit = (node->storage == DicNode::LOOKUP32);
if (is_32_bit)
id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
if (has_0th_item)
id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;
size_t begin_offset = output->size();
output->push_back(id_byte);
output->push_back(static_cast<char>(first_table_item));
output->push_back(static_cast<char>(table_item_count));
// Save room for the lookup table and the optional 0th item.
int bytes_per_entry = (is_32_bit ? 4 : 2);
size_t zeroth_item_offset = output->size();
if (has_0th_item)
output->resize(output->size() + bytes_per_entry);
size_t table_begin = output->size();
output->resize(output->size() + table_item_count * bytes_per_entry);
// Append the children.
for (size_t i = 0; i < node->children.size(); i++) {
size_t offset = output->size();
// Compute the location at which we'll store the offset of the child data.
// We may be writing the magic 0th item.
size_t offset_offset;
if (i == 0 && has_0th_item) {
offset_offset = zeroth_item_offset;
} else {
int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item;
offset_offset = table_begin + table_index * bytes_per_entry;
}
// Write the offset.
if (is_32_bit) {
// Use 32-bit absolute offsets.
// FIXME(brettw) use bit cast.
unsigned* offset32 = reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
*offset32 = static_cast<unsigned>(output->size());
} else {
// Use 16-bit relative offsets.
unsigned short* offset16 = reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
*offset16 = static_cast<unsigned short>(output->size() - begin_offset);
}
SerializeTrie(node->children[i], output);
}
}
// Recursively serializes this node and all of its children to the output.
void SerializeTrie(const DicNode* node, std::string* output) {
if (node->storage == DicNode::LEAF ||
node->storage == DicNode::LEAFMORE) {
SerializeLeaf(node, output);
} else if (node->storage == DicNode::LIST8 ||
node->storage == DicNode::LIST16) {
SerializeList(node, output);
} else if (node->storage == DicNode::LOOKUP16 ||
node->storage == DicNode::LOOKUP32) {
SerializeLookup(node, output);
}
}
/*
void SerializeStringList(const std::vector<std::string>& list,
std::string* output) {
for (size_t i = 0; i < list.size(); i++) {
if (i != 0)
output->push_back('\n');
output->append(list[i]);
}
output->push_back(0);
}
*/
// Appends the given uint32 to the given string.
void AppendUint32(uint32 a, std::string* output) {
size_t offset = output->size();
output->resize(offset + 4);
memcpy(&(*output)[offset], &a, sizeof(uint32));
}
// Serializes the given list of strings with 0 bytes separating them. The end
// will be marked by a double-0.
void SerializeStringListNullTerm(const std::vector<std::string>& strings,
std::string* output) {
for (size_t i = 0; i < strings.size(); i++) {
// Can't tolerate empty strings since the'll mark the end.
if (strings[i].empty())
output->push_back(' ');
else
output->append(strings[i]);
output->push_back(0);
}
output->push_back(0);
}
void SerializeReplacements(
const std::vector< std::pair<std::string, std::string> >& repl,
std::string* output) {
for (size_t i = 0; i < repl.size(); i++) {
output->append(repl[i].first);
output->push_back(0);
output->append(repl[i].second);
output->push_back(0);
}
output->push_back(0);
}
} // namespace
BDictWriter::BDictWriter() : trie_root_(NULL) {
}
BDictWriter::~BDictWriter() {
delete trie_root_;
}
void BDictWriter::SetWords(const WordList& words) {
trie_root_ = new DicNode;
BuildTrie(words, 0, words.size(), 0, trie_root_);
}
std::string BDictWriter::GetBDict() const {
std::string ret;
// Save room for the header. This will be populated at the end.
ret.resize(sizeof(hunspell::BDict::Header));
// Serialize the affix portion.
size_t aff_offset = ret.size();
SerializeAff(&ret);
// Serialize the dictionary words.
size_t dic_offset = ret.size();
ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
SerializeTrie(trie_root_, &ret);
// Fill the header last, now that we have the data.
hunspell::BDict::Header* header =
reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
header->signature = hunspell::BDict::SIGNATURE;
header->major_version = hunspell::BDict::MAJOR_VERSION;
header->minor_version = hunspell::BDict::MINOR_VERSION;
header->aff_offset = static_cast<uint32>(aff_offset);
header->dic_offset = static_cast<uint32>(dic_offset);
return ret;
}
void BDictWriter::SerializeAff(std::string* output) const {
// Reserve enough room for the header.
size_t header_offset = output->size();
output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));
// Write the comment.
output->push_back('\n');
output->append(comment_);
output->push_back('\n');
// We need a magic first AF line that lists the number of following ones.
size_t affix_group_offset = output->size();
output->append(StringPrintf("AF %d", affix_groups_.size()));
output->push_back(0);
SerializeStringListNullTerm(affix_groups_, output);
size_t affix_rule_offset = output->size();
SerializeStringListNullTerm(affix_rules_, output);
size_t rep_offset = output->size();
SerializeReplacements(replacements_, output);
size_t other_offset = output->size();
SerializeStringListNullTerm(other_commands_, output);
// Add the header now that we know the offsets.
hunspell::BDict::AffHeader* header =
reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
header->affix_group_offset = static_cast<uint32>(affix_group_offset);
header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
header->rep_offset = static_cast<uint32>(rep_offset);
header->other_offset = static_cast<uint32>(other_offset);
}
} // namespace hunspell
|