diff options
author | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-02-04 02:30:40 +0000 |
---|---|---|
committer | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-02-04 02:30:40 +0000 |
commit | d718a62d075bdd6f45a128bd5d82358d73799300 (patch) | |
tree | ac102e896fcc1c4ad8532559cda99142ade00986 | |
parent | ca9608b447a4b42d74d1d8343cb98c0f7ff123f4 (diff) | |
download | chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.zip chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.gz chromium_src-d718a62d075bdd6f45a128bd5d82358d73799300.tar.bz2 |
The first step towards a Turkish spell-checker. This is a set of fixes for supporting the Turkish dictionary provided by the tr-spell project (*1). As I wrote in http://crbug.com/4782, this issue consists of three issues: one is against our convert_dict tool, and two are against our hunspell client. (*1) http://code.google.com/p/tr-spell/ Unfortunately, the BDIC file converted from this Turkish dictionary is huge (7.1MB) because the dictionary has a lot of affix rules (> 18,000) and most of the BDIC file is occupied by "AF" lines.
BUG=4782
Review URL: http://codereview.chromium.org/18041
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@9122 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | chrome/third_party/hunspell/google/bdict_writer.cc | 12 | ||||
-rw-r--r-- | chrome/third_party/hunspell/src/hunspell/hashmgr.cxx | 31 | ||||
-rw-r--r-- | chrome/third_party/hunspell/src/hunspell/htypes.hxx | 8 |
3 files changed, 36 insertions, 15 deletions
diff --git a/chrome/third_party/hunspell/google/bdict_writer.cc b/chrome/third_party/hunspell/google/bdict_writer.cc index fcb060d..6ec080b 100644 --- a/chrome/third_party/hunspell/google/bdict_writer.cc +++ b/chrome/third_party/hunspell/google/bdict_writer.cc @@ -161,9 +161,17 @@ size_t ComputeTrieStorage(DicNode* node) { // The additional affix list holds affixes when there is more than one. Each // entry is two bytes, plus an additional FFFF terminator. size_t supplimentary_size = 0; - if (node->affix_indices.size() > 1 || - node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) + if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) { + // We cannot store the first affix ID of the affix list into a leaf node. + // In this case, we have to store all the affix IDs and a terminator + // into a supplimentary list. + supplimentary_size = node->affix_indices.size() * 2 + 2; + } else if (node->affix_indices.size() > 1) { + // We can store the first affix ID of the affix list into a leaf node. + // In this case, we need to store the remaining affix IDs and a + // terminator into a supplimentary list. supplimentary_size = node->affix_indices.size() * 2; + } if (node->leaf_addition.empty()) { node->storage = DicNode::LEAF; diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx index 1926c852..ec6f4f3 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx @@ -559,27 +559,32 @@ int HashMgr::load_config() { utf8 = 1; // We always use UTF-8. - // Read in all the AF lines which tell us the rules for each affix group ID. + // Read in the regular commands from the affix file. We care about the FLAG + // line becuase the AF lines depend on this value, and the IGNORE line. + // The rest of the commands will be read by the affix manager. 
char line[MAXDELEN+1]; - hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator(); - while (iterator.AdvanceAndCopy(line, MAXDELEN)) { - int rv = parse_aliasf(line, &iterator); - if (rv) - return rv; - } - - // Read in the regular commands from the affix file. We only care about the - // IGNORE line here. The rest of the commands will be read by the affix - // manager. - iterator = bdict_reader->GetOtherLineIterator(); + hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator(); while (iterator.AdvanceAndCopy(line, MAXDELEN)) { // Parse in the ignored characters (for example, Arabic optional // diacritics characters. if (strncmp(line,"IGNORE",6) == 0) { parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8); - break; // All done. } + // Retrieve the format of an AF line. + if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { + if (strstr(line, "long")) flag_mode = FLAG_LONG; + if (strstr(line, "num")) flag_mode = FLAG_NUM; + if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; + } + } + + // Read in all the AF lines which tell us the rules for each affix group ID. + iterator = bdict_reader->GetAfLineIterator(); + while (iterator.AdvanceAndCopy(line, MAXDELEN)) { + int rv = parse_aliasf(line, &iterator); + if (rv) + return rv; } return 0; diff --git a/chrome/third_party/hunspell/src/hunspell/htypes.hxx b/chrome/third_party/hunspell/src/hunspell/htypes.hxx index ddd1d83..f8d685a 100644 --- a/chrome/third_party/hunspell/src/hunspell/htypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/htypes.hxx @@ -1,7 +1,15 @@ #ifndef _HTYPES_HXX_ #define _HTYPES_HXX_ +#ifdef HUNSPELL_CHROME_CLIENT +// This is a workaround for preventing errors in parsing Turkish BDICs, which +// contain very long AF lines (~ 12,000 chars). +// TODO(hbono) change the HashMgr::parse_aliasf() function to be able to parse +// longer lines than MAXDELEN. 
+#define MAXDELEN (8192 * 2) +#else #define MAXDELEN 8192 +#endif // HUNSPELL_CHROME_CLIENT #define ROTATE_LEN 5 |