The first step towards a Turkish spell-checker. This is a set of fixes for supporting the Turkish dictionary provided by the tr-spell project (*1). As I wrote in http://crbug.com/4782, this issue consists of three sub-issues: one is against our convert_dict tool, and two are against our hunspell client. (*1) http://code.google.com/p/tr-spell/ Unfortunately, the BDIC file converted from this Turkish dictionary is huge (7.1MB) because the dictionary has a lot of affix rules (> 18,000) and most of the BDIC file is occupied by "AF" lines.

BUG=4782 Review URL: http://codereview.chromium.org/18041 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@9122 0039d316-1c4b-4281-b951-d872f2087c98
author: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-02-04 02:30:40 +0000
committer: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-02-04 02:30:40 +0000
commit: d718a62d075bdd6f45a128bd5d82358d73799300 (patch)
tree: ac102e896fcc1c4ad8532559cda99142ade00986
parent: ca9608b447a4b42d74d1d8343cb98c0f7ff123f4 (diff)
download: chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.zip
chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.gz
chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.bz2
3 files changed, 36 insertions, 15 deletions
diff --git a/chrome/third_party/hunspell/google/bdict_writer.cc b/chrome/third_party/hunspell/google/bdict_writer.cc
index fcb060d..6ec080b 100644
--- a/chrome/third_party/hunspell/google/bdict_writer.cc
+++ b/chrome/third_party/hunspell/google/bdict_writer.cc
@@ -161,9 +161,17 @@ size_t ComputeTrieStorage(DicNode* node) {
     // The additional affix list holds affixes when there is more than one. Each
     // entry is two bytes, plus an additional FFFF terminator.
     size_t supplimentary_size = 0;
-    if (node->affix_indices.size() > 1 ||
-        node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID)
+    if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
+      // We cannot store the first affix ID of the affix list into a leaf node.
+      // In this case, we have to store all the affix IDs and a terminator
+      // into a supplimentary list.
+      supplimentary_size = node->affix_indices.size() * 2 + 2;
+    } else if (node->affix_indices.size() > 1) {
+      // We can store the first affix ID of the affix list into a leaf node.
+      // In this case, we need to store the remaining affix IDs and a
+      // terminator into a supplimentary list.
       supplimentary_size = node->affix_indices.size() * 2;
+    }
 
     if (node->leaf_addition.empty()) {
       node->storage = DicNode::LEAF;
diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx
index 1926c852..ec6f4f3 100644
--- a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx
+++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx
@@ -559,27 +559,32 @@ int HashMgr::load_config()
 {
   utf8 = 1;  // We always use UTF-8.
 
-  // Read in all the AF lines which tell us the rules for each affix group ID.
+  // Read in the regular commands from the affix file. We care about the FLAG
+  // line because the AF lines depend on this value, and the IGNORE line.
+  // The rest of the commands will be read by the affix manager.
   char line[MAXDELEN+1];
-  hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator();
-  while (iterator.AdvanceAndCopy(line, MAXDELEN)) {
-    int rv = parse_aliasf(line, &iterator);
-    if (rv)
-      return rv;
-  }
-
-  // Read in the regular commands from the affix file. We only care about the
-  // IGNORE line here. The rest of the commands will be read by the affix
-  // manager.
-  iterator = bdict_reader->GetOtherLineIterator();
+  hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator();
   while (iterator.AdvanceAndCopy(line, MAXDELEN)) {
     // Parse in the ignored characters (for example, Arabic optional
     // diacritics characters.
     if (strncmp(line,"IGNORE",6) == 0) {
       parse_array(line, &ignorechars, &ignorechars_utf16,
                   &ignorechars_utf16_len, "IGNORE", utf8);
-      break;  // All done.
     }
+    // Retrieve the format of an AF line.
+    if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
+      if (strstr(line, "long")) flag_mode = FLAG_LONG;
+      if (strstr(line, "num")) flag_mode = FLAG_NUM;
+      if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
+    }
+  }
+
+  // Read in all the AF lines which tell us the rules for each affix group ID.
+  iterator = bdict_reader->GetAfLineIterator();
+  while (iterator.AdvanceAndCopy(line, MAXDELEN)) {
+    int rv = parse_aliasf(line, &iterator);
+    if (rv)
+      return rv;
   }
 
   return 0;
diff --git a/chrome/third_party/hunspell/src/hunspell/htypes.hxx b/chrome/third_party/hunspell/src/hunspell/htypes.hxx
index ddd1d83..f8d685a 100644
--- a/chrome/third_party/hunspell/src/hunspell/htypes.hxx
+++ b/chrome/third_party/hunspell/src/hunspell/htypes.hxx
@@ -1,7 +1,15 @@
 #ifndef _HTYPES_HXX_
 #define _HTYPES_HXX_
 
+#ifdef HUNSPELL_CHROME_CLIENT
+// This is a workaround for preventing errors in parsing Turkish BDICs, which
+// contain very long AF lines (~ 12,000 chars).
+// TODO(hbono) change the HashMgr::parse_aliasf() function to be able to parse
+// longer lines than MAXDELEN.
+#define MAXDELEN    (8192 * 2)
+#else
 #define MAXDELEN    8192
+#endif  // HUNSPELL_CHROME_CLIENT
 
 #define ROTATE_LEN   5
author	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-02-04 02:30:40 +0000
committer	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-02-04 02:30:40 +0000
commit	d718a62d075bdd6f45a128bd5d82358d73799300 (patch)
tree	ac102e896fcc1c4ad8532559cda99142ade00986
parent	ca9608b447a4b42d74d1d8343cb98c0f7ff123f4 (diff)
download	chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.zip chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.gz chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.bz2