summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-02-04 02:30:40 +0000
committerhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-02-04 02:30:40 +0000
commitd718a62d075bdd6f45a128bd5d82358d73799300 (patch)
treeac102e896fcc1c4ad8532559cda99142ade00986
parentca9608b447a4b42d74d1d8343cb98c0f7ff123f4 (diff)
downloadchromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.zip
chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.gz
chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.bz2
The first step towards a Turkish spell-checker. This is a set of fixes for supporting the Turkish dictionary provided by the tr-spell project (*1). As I wrote in http://crbug.com/4782, this issue consists of three issues: one is against our convert_dict tool, and two are against our hunspell client. (*1) http://code.google.com/p/tr-spell/ Unfortunately, the BDIC file converted from this Turkish dictionary is huge (7.1MB) because the dictionary has a lot of affix rules (> 18,000) and most of the BDIC file is occupied by "AF" lines.
BUG=4782
Review URL: http://codereview.chromium.org/18041
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@9122 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--chrome/third_party/hunspell/google/bdict_writer.cc12
-rw-r--r--chrome/third_party/hunspell/src/hunspell/hashmgr.cxx31
-rw-r--r--chrome/third_party/hunspell/src/hunspell/htypes.hxx8
3 files changed, 36 insertions, 15 deletions
diff --git a/chrome/third_party/hunspell/google/bdict_writer.cc b/chrome/third_party/hunspell/google/bdict_writer.cc
index fcb060d..6ec080b 100644
--- a/chrome/third_party/hunspell/google/bdict_writer.cc
+++ b/chrome/third_party/hunspell/google/bdict_writer.cc
@@ -161,9 +161,17 @@ size_t ComputeTrieStorage(DicNode* node) {
// The additional affix list holds affixes when there is more than one. Each
// entry is two bytes, plus an additional FFFF terminator.
size_t supplimentary_size = 0;
- if (node->affix_indices.size() > 1 ||
- node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID)
+ if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
+ // We cannot store the first affix ID of the affix list into a leaf node.
+ // In this case, we have to store all the affix IDs and a terminator
+ // into a supplimentary list.
+ supplimentary_size = node->affix_indices.size() * 2 + 2;
+ } else if (node->affix_indices.size() > 1) {
+ // We can store the first affix ID of the affix list into a leaf node.
+ // In this case, we need to store the remaining affix IDs and a
+ // terminator into a supplimentary list.
supplimentary_size = node->affix_indices.size() * 2;
+ }
if (node->leaf_addition.empty()) {
node->storage = DicNode::LEAF;
diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx
index 1926c852..ec6f4f3 100644
--- a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx
+++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx
@@ -559,27 +559,32 @@ int HashMgr::load_config()
{
utf8 = 1; // We always use UTF-8.
- // Read in all the AF lines which tell us the rules for each affix group ID.
+ // Read in the regular commands from the affix file. We care about the FLAG
+ // line becuase the AF lines depend on this value, and the IGNORE line.
+ // The rest of the commands will be read by the affix manager.
char line[MAXDELEN+1];
- hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator();
- while (iterator.AdvanceAndCopy(line, MAXDELEN)) {
- int rv = parse_aliasf(line, &iterator);
- if (rv)
- return rv;
- }
-
- // Read in the regular commands from the affix file. We only care about the
- // IGNORE line here. The rest of the commands will be read by the affix
- // manager.
- iterator = bdict_reader->GetOtherLineIterator();
+ hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator();
while (iterator.AdvanceAndCopy(line, MAXDELEN)) {
// Parse in the ignored characters (for example, Arabic optional
// diacritics characters.
if (strncmp(line,"IGNORE",6) == 0) {
parse_array(line, &ignorechars, &ignorechars_utf16,
&ignorechars_utf16_len, "IGNORE", utf8);
- break; // All done.
}
+ // Retrieve the format of an AF line.
+ if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
+ if (strstr(line, "long")) flag_mode = FLAG_LONG;
+ if (strstr(line, "num")) flag_mode = FLAG_NUM;
+ if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
+ }
+ }
+
+ // Read in all the AF lines which tell us the rules for each affix group ID.
+ iterator = bdict_reader->GetAfLineIterator();
+ while (iterator.AdvanceAndCopy(line, MAXDELEN)) {
+ int rv = parse_aliasf(line, &iterator);
+ if (rv)
+ return rv;
}
return 0;
diff --git a/chrome/third_party/hunspell/src/hunspell/htypes.hxx b/chrome/third_party/hunspell/src/hunspell/htypes.hxx
index ddd1d83..f8d685a 100644
--- a/chrome/third_party/hunspell/src/hunspell/htypes.hxx
+++ b/chrome/third_party/hunspell/src/hunspell/htypes.hxx
@@ -1,7 +1,15 @@
#ifndef _HTYPES_HXX_
#define _HTYPES_HXX_
+#ifdef HUNSPELL_CHROME_CLIENT
+// This is a workaround for preventing errors in parsing Turkish BDICs, which
+// contain very long AF lines (~ 12,000 chars).
+// TODO(hbono) change the HashMgr::parse_aliasf() function to be able to parse
+// longer lines than MAXDELEN.
+#define MAXDELEN (8192 * 2)
+#else
#define MAXDELEN 8192
+#endif // HUNSPELL_CHROME_CLIENT
#define ROTATE_LEN 5