diff options
author | mhm@chromium.org <mhm@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-08-03 18:13:58 +0000 |
---|---|---|
committer | mhm@chromium.org <mhm@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-08-03 18:13:58 +0000 |
commit | b614cbb088107996d8e56d30cdecefca9f14d5b6 (patch) | |
tree | 47c38ae97993d4ab786e916844d20ece42f75b01 /chrome | |
parent | 951073ea674d8436fbcfbd6a0310fe20fb2e6a4a (diff) | |
download | chromium_src-b614cbb088107996d8e56d30cdecefca9f14d5b6.zip chromium_src-b614cbb088107996d8e56d30cdecefca9f14d5b6.tar.gz chromium_src-b614cbb088107996d8e56d30cdecefca9f14d5b6.tar.bz2 |
Reverting hunspell upgrade 22263,22257,22252,22243. Causing many valgrind warnings.
TBR: brettw, dank, jshin
Review URL: http://codereview.chromium.org/159797
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@22287 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
33 files changed, 2777 insertions, 5235 deletions
diff --git a/chrome/browser/spellcheck_unittest.cc b/chrome/browser/spellcheck_unittest.cc index 1d51b5b..86eadff 100644 --- a/chrome/browser/spellcheck_unittest.cc +++ b/chrome/browser/spellcheck_unittest.cc @@ -607,11 +607,11 @@ TEST_F(SpellCheckTest, SpellCheckSuggestions_EN_US) { {L"wate", false, 0, 0, L"water"}, {L"wate", false, 0, 0, L"waste"}, {L"wate", false, 0, 0, L"sate"}, - {L"wate", false, 0, 0, L"ate"}, + {L"wate", false, 0, 0, L"rate"}, {L"jum", false, 0, 0, L"jump"}, - {L"jum", false, 0, 0, L"hum"}, + {L"jum", false, 0, 0, L"rum"}, {L"jum", false, 0, 0, L"sum"}, - {L"jum", false, 0, 0, L"um"}, + {L"jum", false, 0, 0, L"tum"}, #endif //!OS_MACOSX // TODO (Sidchat): add many more examples. }; diff --git a/chrome/browser/spellchecker.cc b/chrome/browser/spellchecker.cc index f7635b6..f3bb9d3 100644 --- a/chrome/browser/spellchecker.cc +++ b/chrome/browser/spellchecker.cc @@ -613,7 +613,7 @@ void SpellChecker::AddCustomWordsToHunspell() { if (hunspell_.get()) { for (std::vector<std::string>::iterator it = list_of_words.begin(); it != list_of_words.end(); ++it) { - hunspell_->add(it->c_str()); + hunspell_->put_word(it->c_str()); } } } @@ -744,7 +744,7 @@ void SpellChecker::AddWord(const std::wstring& word) { // Add the word to hunspell. std::string word_to_add = WideToUTF8(word); if (!word_to_add.empty()) - hunspell_->add(word_to_add.c_str()); + hunspell_->put_word(word_to_add.c_str()); // Now add the word to the custom dictionary file. Task* write_word_task = diff --git a/chrome/test/data/valgrind/unit_tests.gtest.txt b/chrome/test/data/valgrind/unit_tests.gtest.txt index 8c0fb75..3f75f25 100644 --- a/chrome/test/data/valgrind/unit_tests.gtest.txt +++ b/chrome/test/data/valgrind/unit_tests.gtest.txt @@ -1,4 +1,2 @@ # Takes 65 seconds to run. IPCSyncChannelTest.ChattyServer -# Bug18254 - Hunspell 1.2.8 caused 28 suppression. -SpellCheckTest.SpellCheckSuggestions_EN_US diff --git a/chrome/third_party/hunspell/README.chromium b/chrome/third_party/hunspell/README.chromium index e19a1b1..5e6f6f7 100644 --- a/chrome/third_party/hunspell/README.chromium +++ b/chrome/third_party/hunspell/README.chromium @@ -6,13 +6,24 @@ This is a partial copy of Hunspell 1.1.5, with the following changes: reference in src/hunspell/csutil.cxx changed accordingly * Change the input params of the constructors to receive a FILE* instead of a file path. This is required to use hunspell in the sandbox. -* Remove all HUNSPELL_WARNING parameters since we are not using HashMgr - anymore, just show the msg not the line number. -* Remove the key variable from Hunspell, HashMgr and AffixMgr since Bdict - is being used instead. + The patch is in google.patch. The English dictionary distributed by Firefox has been checked in to the dictionaries directory. It has several additions over the default myspell/hunspell dictionary. +* Workaround for non-ASCII characters + +Visual Studio on Japanese Windows assumes the source files to be +encoded in Shift_JIS. The compiler is unhappy with non-ASCII letters +in the source files of Hunspell. The same problem happens with other +CJK Windows as well. Here is the workaround for this problem: + +Convert 8-bit bytes to hexadecimal escaped forms by + + % perl -i -De 's/([\x80-\xff])/sprintf("\\x%02x", $1)/ge' src/*.cxx + + +Note that Hunspell upstream is going to fix this problem. We'll no +longer need the workaround if the problem is fixed in the upstream. diff --git a/chrome/third_party/hunspell/google.patch b/chrome/third_party/hunspell/google.patch new file mode 100644 index 0000000..ae7fd9d --- /dev/null +++ b/chrome/third_party/hunspell/google.patch @@ -0,0 +1,212 @@ +Index: src/hunspell/affixmgr.cxx +=================================================================== +--- src/hunspell/affixmgr.cxx (revision 3811) ++++ src/hunspell/affixmgr.cxx (working copy) +@@ -25,7 +27,7 @@ + #endif + #endif + +-AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) ++AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) + { + // register hash manager and load affix data from aff file + pHMgr = ptr; +@@ -104,8 +106,8 @@ + contclasses[j] = 0; + } + +- if (parse_file(affpath)) { +- HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); ++ if (parse_file(aff_handle)) { ++ HUNSPELL_WARNING(stderr, "Failure loading aff file\n"); + wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); + } + +@@ -232,7 +234,7 @@ + + + // read in aff file and build up prefix and suffix entry objects +-int AffixMgr::parse_file(const char * affpath) ++int AffixMgr::parse_file(FILE* aff_handle) + { + + // io buffers +@@ -250,11 +252,12 @@ + + // open the affix file + FILE * afflst; +- afflst = fopen(affpath,"r"); ++ afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); + if (!afflst) { +- HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); ++ HUNSPELL_WARNING(stderr, "error: could not open affix description file\n"); + return 1; + } ++ fseek(afflst, 0, SEEK_SET); + + // step one is to parse the affix file building up the internal + // affix data structures +Index: src/hunspell/affixmgr.hxx +=================================================================== +--- src/hunspell/affixmgr.hxx (revision 3811) ++++ src/hunspell/affixmgr.hxx (working copy) +@@ -93,7 +93,7 @@ + + public: + +- AffixMgr(const char * affpath, HashMgr * ptr); ++ AffixMgr(FILE* aff_handle, HashMgr * ptr); + ~AffixMgr(); + struct hentry * affix_check(const char * word, int len, + const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); +@@ -179,7 +179,7 @@ + int get_checksharps(void); + + private: +- int parse_file(const char * affpath); ++ int parse_file(FILE* aff_handle); + // int parse_string(char * line, char ** out, const char * name); + int parse_flag(char * line, unsigned short * out, const char * name); + int parse_num(char * line, int * out, const char * name); +Index: src/hunspell/hashmgr.cxx +=================================================================== +--- src/hunspell/hashmgr.cxx (revision 3811) ++++ src/hunspell/hashmgr.cxx (working copy) +@@ -29,7 +31,7 @@ + + // build a hash table from a munched word list + +-HashMgr::HashMgr(const char * tpath, const char * apath) ++HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) + { + tablesize = 0; + tableptr = NULL; +@@ -43,8 +45,8 @@ + aliasf = NULL; + numaliasm = 0; + aliasm = NULL; +- load_config(apath); +- int ec = load_tables(tpath); ++ load_config(aff_handle); ++ int ec = load_tables(dic_handle); + if (ec) { + /* error condition - what should we do here */ + HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec); +@@ -240,7 +242,7 @@ + } + + // load a munched word list and build a hash table on the fly +-int HashMgr::load_tables(const char * tpath) ++int HashMgr::load_tables(FILE* t_handle) + { + int wl, al; + char * ap; +@@ -248,8 +250,9 @@ + unsigned short * flags; + + // raw dictionary - munched file +- FILE * rawdict = fopen(tpath, "r"); ++ FILE * rawdict = _fdopen(_dup(_fileno(t_handle)), "r"); + if (rawdict == NULL) return 1; ++ fseek(rawdict, 0, SEEK_SET); + + // first read the first line of file to get hash table size */ + char ts[MAXDELEN]; +@@ -442,7 +445,7 @@ + } + + // read in aff file and set flag mode +-int HashMgr::load_config(const char * affpath) ++int HashMgr::load_config(FILE* aff_handle) + { + int firstline = 1; + +@@ -451,11 +454,12 @@ + + // open the affix file + FILE * afflst; +- afflst = fopen(affpath,"r"); ++ afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); + if (!afflst) { +- HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath); ++ HUNSPELL_WARNING(stderr, "Error - could not open affix description file\n"); + return 1; + } ++ fseek(afflst, 0, SEEK_SET); + + // read in each line ignoring any that do not + // start with a known line type indicator +Index: src/hunspell/hashmgr.hxx +=================================================================== +--- src/hunspell/hashmgr.hxx (revision 3811) ++++ src/hunspell/hashmgr.hxx (working copy) +@@ -25,7 +25,7 @@ + + + public: +- HashMgr(const char * tpath, const char * apath); ++ HashMgr(FILE* t_handle, FILE* a_handle); + ~HashMgr(); + + struct hentry * lookup(const char *) const; +@@ -46,9 +46,9 @@ + + + private: +- int load_tables(const char * tpath); ++ int load_tables(FILE* t_handle); + int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); +- int load_config(const char * affpath); ++ int load_config(FILE* aff_handle); + int parse_aliasf(char * line, FILE * af); + #ifdef HUNSPELL_EXPERIMENTAL + int parse_aliasm(char * line, FILE * af); +Index: src/hunspell/hunspell.cxx +=================================================================== +--- src/hunspell/hunspell.cxx (revision 3811) ++++ src/hunspell/hunspell.cxx (working copy) +@@ -20,7 +20,7 @@ + #endif + #endif + +-Hunspell::Hunspell(const char * affpath, const char * dpath) ++Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) + { + encoding = NULL; + csconv = NULL; +@@ -28,11 +28,11 @@ + complexprefixes = 0; + + /* first set up the hash manager */ +- pHMgr = new HashMgr(dpath, affpath); ++ pHMgr = new HashMgr(dic_handle, aff_handle); + + /* next set up the affix manager */ + /* it needs access to the hash manager lookup methods */ +- pAMgr = new AffixMgr(affpath,pHMgr); ++ pAMgr = new AffixMgr(aff_handle, pHMgr); + + /* get the preferred try string and the dictionary */ + /* encoding from the Affix Manager for that dictionary */ +@@ -1694,9 +1694,9 @@ + + #endif // END OF HUNSPELL_EXPERIMENTAL CODE + +-Hunhandle *Hunspell_create(const char * affpath, const char * dpath) ++Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) + { +- return (Hunhandle*)(new Hunspell(affpath, dpath)); ++ return (Hunhandle*)(new Hunspell(aff_handle, dic_handle)); + } + + void Hunspell_destroy(Hunhandle *pHunspell) +Index: src/hunspell/hunspell.hxx +=================================================================== +--- src/hunspell/hunspell.hxx (revision 3811) ++++ src/hunspell/hunspell.hxx (working copy) +@@ -48,7 +48,7 @@ + * input: path of affix file and dictionary file + */ + +- Hunspell(const char * affpath, const char * dpath); ++ Hunspell(FILE* aff_handle, FILE* dic_handle); diff --git a/chrome/third_party/hunspell/hunspell.gyp b/chrome/third_party/hunspell/hunspell.gyp index 217fa5e..a19f103 100644 --- a/chrome/third_party/hunspell/hunspell.gyp +++ b/chrome/third_party/hunspell/hunspell.gyp @@ -16,7 +16,6 @@ '../../../third_party/icu38/icu38.gyp:icuuc', ], 'defines': [ - 'HUNSPELL_STATIC', 'HUNSPELL_CHROME_CLIENT', 'OPENOFFICEORG', ], @@ -36,31 +35,21 @@ 'src/hunspell/csutil.hxx', 'src/hunspell/dictmgr.cxx', 'src/hunspell/dictmgr.hxx', - 'src/hunspell/filemgr.cxx', - 'src/hunspell/filemgr.hxx', 'src/hunspell/hashmgr.cxx', 'src/hunspell/hashmgr.hxx', 'src/hunspell/htypes.hxx', 'src/hunspell/hunspell.cxx', 'src/hunspell/hunspell.h', 'src/hunspell/hunspell.hxx', - 'src/hunspell/hunzip.cxx', - 'src/hunspell/hunzip.hxx', 'src/hunspell/langnum.hxx', - 'src/hunspell/phonet.cxx', - 'src/hunspell/phonet.hxx', - 'src/hunspell/replist.cxx', - 'src/hunspell/replist.hxx', 'src/hunspell/suggestmgr.cxx', 'src/hunspell/suggestmgr.hxx', 'src/hunspell/utf_info.hxx', - 'src/hunspell/w_char.hxx', 'src/parsers/textparser.cxx', 'src/parsers/textparser.hxx', ], 'direct_dependent_settings': { 'defines': [ - 'HUNSPELL_STATIC', 'HUNSPELL_CHROME_CLIENT', 'USE_HUNSPELL', ], diff --git a/chrome/third_party/hunspell/src/hunspell/affentry.cxx b/chrome/third_party/hunspell/src/hunspell/affentry.cxx index 7c2dab4..517646f 100644 --- a/chrome/third_party/hunspell/src/hunspell/affentry.cxx +++ b/chrome/third_party/hunspell/src/hunspell/affentry.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -17,7 +17,7 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef WIN32 +#ifndef W32 using namespace std; #endif #endif @@ -29,23 +29,22 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - - aflag = dp->aflag; // flag + + aflag = dp->aflag; // flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // length of the condition - opts = dp->opts; // cross product flag + numconds = dp->numconds; // number of conditions to match + opts = dp->opts; // cross product flag // then copy over all of the conditions - if (opts & aeLONGCOND) { - memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); - c.l.conds2 = dp->c.l.conds2; - } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); + memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); next = NULL; nextne = NULL; nexteq = NULL; +#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; +#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -59,8 +58,15 @@ PfxEntry::~PfxEntry() pmyMgr = NULL; appnd = NULL; strip = NULL; - if (opts & aeLONGCOND) free(c.l.conds2); + if (opts & aeUTF8) { + for (int i = 0; i < numconds; i++) { + if (conds.utf8.wchars[i]) + free(conds.utf8.wchars[i]); + } + } +#ifdef HUNSPELL_EXPERIMENTAL if (morphcode && !(opts & aeALIASM)) free(morphcode); +#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -69,9 +75,8 @@ char * PfxEntry::add(const char * word, int len) { char tword[MAXWORDUTF8LEN + 4]; - if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && - (len >= numconds) && test_condition(word) && - (!stripl || (strncmp(word, strip, stripl) == 0)) && + if ((len > stripl) && (len >= numconds) && test_condition(word) && + (!stripl || (strncmp(word, strip, stripl) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add prefix */ char * pp = tword; @@ -82,87 +87,51 @@ char * PfxEntry::add(const char * word, int len) strcpy(pp, (word + stripl)); return mystrdup(tword); } - return NULL; + return NULL; } -inline char * PfxEntry::nextchar(char * p) { - if (p) { - p++; - if (opts & aeLONGCOND) { - // jump to the 2nd part of the condition - if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; - // end of the MAXCONDLEN length condition - } else if (p == c.conds + MAXCONDLEN) return NULL; - return *p ? p : NULL; - } - return NULL; -} inline int PfxEntry::test_condition(const char * st) { - const char * pos = NULL; // group with pos input position - bool neg = false; // complementer - bool ingroup = false; // character in the group - if (numconds == 0) return 1; - char * p = c.conds; - while (1) { - switch (*p) { - case '\0': return 1; - case '[': { - neg = false; - ingroup = false; - p = nextchar(p); - pos = st; break; - } - case '^': { p = nextchar(p); neg = true; break; } - case ']': { - if ((neg && ingroup) || (!neg && !ingroup)) return 0; - pos = NULL; - p = nextchar(p); - // skip the next character - if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); - if (*st == '\0' && p) return 0; // word <= condition - break; - } - case '.': if (!pos) { // dots are not metacharacters in groups: [.] - p = nextchar(p); - // skip the next character - for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); - if (*st == '\0' && p) return 0; // word <= condition - break; - } - default: { - if (*st == *p) { - st++; - p = nextchar(p); - if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte - while (p && (*p & 0xc0) == 0x80) { // character - if (*p != *st) { - if (!pos) return 0; - st = pos; - break; - } - p = nextchar(p); - st++; - } - if (pos && st != pos) { - ingroup = true; - while (p && *p != ']' && (p = nextchar(p))); - } - } else if (pos) { - ingroup = true; - while (p && *p != ']' && (p = nextchar(p))); - } - } else if (pos) { // group - p = nextchar(p); - } else return 0; + int cond; + unsigned char * cp = (unsigned char *)st; + if (!(opts & aeUTF8)) { // 256-character codepage + for (cond = 0; cond < numconds; cond++) { + if ((conds.base[*cp++] & (1 << cond)) == 0) return 0; + } + } else { // UTF-8 encoding + unsigned short wc; + for (cond = 0; cond < numconds; cond++) { + // a simple 7-bit ASCII character in UTF-8 + if ((*cp >> 7) == 0) { + // also check limit (end of word) + if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0; + // UTF-8 multibyte character + } else { + // not dot wildcard in rule + if (!conds.utf8.all[cond]) { + if (conds.utf8.neg[cond]) { + u8_u16((w_char *) &wc, 1, (char *) cp); + if (conds.utf8.wchars[cond] && + flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short) conds.utf8.wlen[cond])) return 0; + } else { + if (!conds.utf8.wchars[cond]) return 0; + u8_u16((w_char *) &wc, 1, (char *) cp); + if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short)conds.utf8.wlen[cond])) return 0; + } } + // jump to next UTF-8 character + for(cp++; (*cp & 0xc0) == 0x80; cp++); + } } - if (!p) return 1; } + return 1; } -// check if this prefix entry matches + +// check if this prefix entry matches struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -176,7 +145,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound tmpl = len - appndl; - if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -197,8 +166,8 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with needaffix flag - ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && + // forbid single prefixes with pseudoroot flag + ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) @@ -206,14 +175,14 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound he = he->next_homonym; // check homonyms } while (he); } - - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix //if ((opts & aeXPRODUCT) && in_compound) { if ((opts & aeXPRODUCT)) { - he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); if (he) return he; } @@ -222,7 +191,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::check_twosfx(const char * word, int len, char in_compound, const FLAG needflag) { @@ -237,8 +206,7 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && - (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -257,8 +225,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // cross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -270,7 +238,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, return NULL; } -// check if this prefix entry matches +#ifdef HUNSPELL_EXPERIMENTAL +// check if this prefix entry matches char * PfxEntry::check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag) { @@ -284,8 +253,7 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && - (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -304,8 +272,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -317,7 +285,7 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -325,7 +293,7 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const char tmpword[MAXWORDUTF8LEN + 4]; char result[MAXLNLEN]; char * st; - + *result = '\0'; // on entry prefix is 0 length or already matches the beginning of the word. @@ -335,8 +303,7 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const tmpl = len - appndl; - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && - (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -357,56 +324,41 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with needaffix flag - ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && + // forbid single prefixes with pseudoroot flag + ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) { - if (morphcode) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, morphcode, MAXLNLEN); - } else mystrcat(result,getKey(), MAXLNLEN); - if (!HENTRY_FIND(he, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, HENTRY_WORD(he), MAXLNLEN); - } - // store the pointer of the hash entry - if (HENTRY_DATA(he)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); - } else { - // return with debug information - char * flag = pmyMgr->encode_flag(getFlag()); - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_FLAG, MAXLNLEN); - mystrcat(result, flag, MAXLNLEN); - free(flag); + if (morphcode) strcat(result, morphcode); else strcat(result,getKey()); + if (he->description) { + if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word); + strcat(result,he->description); } - mystrcat(result, "\n", MAXLNLEN); + strcat(result, "\n"); } he = he->next_homonym; } while (he); } - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, FLAG_NULL, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } } } } - + if (*result) return mystrdup(result); return NULL; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) { @@ -414,22 +366,22 @@ SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - aflag = dp->aflag; // char flag + aflag = dp->aflag; // char flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // length of the condition - opts = dp->opts; // cross product flag + numconds = dp->numconds; // number of conditions to match + opts = dp->opts; // cross product flag // then copy over all of the conditions - if (opts & aeLONGCOND) { - memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); - c.l.conds2 = dp->c.l.conds2; - } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); + memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); rappnd = myrevstrdup(appnd); + +#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; +#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -443,9 +395,15 @@ SfxEntry::~SfxEntry() if (strip) free(strip); pmyMgr = NULL; appnd = NULL; - strip = NULL; - if (opts & aeLONGCOND) free(c.l.conds2); + strip = NULL; + if (opts & aeUTF8) { + for (int i = 0; i < numconds; i++) { + if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); + } + } +#ifdef HUNSPELL_EXPERIMENTAL if (morphcode && !(opts & aeALIASM)) free(morphcode); +#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -455,8 +413,7 @@ char * SfxEntry::add(const char * word, int len) char tword[MAXWORDUTF8LEN + 4]; /* make sure all conditions match */ - if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && - (len >= numconds) && test_condition(word + len, word) && + if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) && (!stripl || (strcmp(word + len - stripl, strip) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add suffix */ @@ -471,114 +428,56 @@ char * SfxEntry::add(const char * word, int len) return NULL; } -inline char * SfxEntry::nextchar(char * p) { - if (p) { - p++; - if (opts & aeLONGCOND) { - // jump to the 2nd part of the condition - if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; - // end of the MAXCONDLEN length condition - } else if (p == c.conds + MAXCONDLEN) return NULL; - return *p ? p : NULL; - } - return NULL; -} inline int SfxEntry::test_condition(const char * st, const char * beg) { - const char * pos = NULL; // group with pos input position - bool neg = false; // complementer - bool ingroup = false; // character in the group - if (numconds == 0) return 1; - char * p = c.conds; - st--; - int i = 1; - while (1) { - switch (*p) { - case '\0': return 1; - case '[': { p = nextchar(p); pos = st; break; } - case '^': { p = nextchar(p); neg = true; break; } - case ']': { if (!neg && !ingroup) return 0; - i++; - // skip the next character - if (!ingroup) { - for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); - st--; - } - pos = NULL; - neg = false; - ingroup = false; - p = nextchar(p); - if (st < beg && p) return 0; // word <= condition - break; - } - case '.': if (!pos) { // dots are not metacharacters in groups: [.] - p = nextchar(p); - // skip the next character - for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); - if (st < beg) { // word <= condition - if (p) return 0; else return 1; - } - if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character - st--; - if (st < beg) { // word <= condition - if (p) return 0; else return 1; - } + int cond; + unsigned char * cp = (unsigned char *) st; + if (!(opts & aeUTF8)) { // 256-character codepage + // D\xf6m\xf6lki affix algorithm + for (cond = numconds; --cond >= 0; ) { + if ((conds.base[*--cp] & (1 << cond)) == 0) return 0; + } + } else { // UTF-8 encoding + unsigned short wc; + for (cond = numconds; --cond >= 0; ) { + // go to next character position and check limit + if ((char *) --cp < beg) return 0; + // a simple 7-bit ASCII character in UTF-8 + if ((*cp >> 7) == 0) { + if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0; + // UTF-8 multibyte character + } else { + // go to first character of UTF-8 multibyte character + for (; (*cp & 0xc0) == 0x80; cp--); + // not dot wildcard in rule + if (!conds.utf8.all[cond]) { + if (conds.utf8.neg[cond]) { + u8_u16((w_char *) &wc, 1, (char *) cp); + if (conds.utf8.wchars[cond] && + flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short) conds.utf8.wlen[cond])) return 0; + } else { + if (!conds.utf8.wchars[cond]) return 0; + u8_u16((w_char *) &wc, 1, (char *) cp); + if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short)conds.utf8.wlen[cond])) return 0; } - break; - } - default: { - if (*st == *p) { - p = nextchar(p); - if ((opts & aeUTF8) && (*st & 0x80)) { - st--; - while (p && (st >= beg)) { - if (*p != *st) { - if (!pos) return 0; - st = pos; - break; - } - // first byte of the UTF-8 multibyte character - if ((*p & 0xc0) != 0x80) break; - p = nextchar(p); - st--; - } - if (pos && st != pos) { - if (neg) return 0; - else if (i == numconds) return 1; - ingroup = true; - while (p && *p != ']' && (p = nextchar(p))); - st--; - } - if (p && *p != ']') p = nextchar(p); - } else if (pos) { - if (neg) return 0; - else if (i == numconds) return 1; - ingroup = true; - while (p && *p != ']' && (p = nextchar(p))); -// if (p && *p != ']') p = nextchar(p); - st--; - } - if (!pos) { - i++; - st--; - } - if (st < beg && p && *p != ']') return 0; // word <= condition - } else if (pos) { // group - p = nextchar(p); - } else return 0; } + } } - if (!p) return 1; } + return 1; } -// see if this suffix is present in the word + + +// see if this suffix is present in the word struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, const FLAG badflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -598,9 +497,8 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, tmpl = len - appndl; // the second condition is not enough for UTF-8 strings // it checked in test_condition() - - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && - (tmpl + stripl >= numconds)) { + + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -616,8 +514,7 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, // now make sure all of the conditions on characters // are met. Please see the appendix at the end of - // this file for more info on exactly what is being - // tested + // this file for more info on exactly what is being // tested // if all conditions are met then check if resulting // root word in the dictionary @@ -631,21 +528,21 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, do { // check conditional suffix (enabled by prefix) if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && - TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - (((optflags & aeXPRODUCT) == 0) || + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || TESTAFF(he->astr, ep->getFlag(), he->alen) || // enabled by prefix ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) ) && // handle cont. class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // check only in compound homonyms (bad flags) (!badflag || !TESTAFF(he->astr, badflag, he->alen) - ) && + ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) @@ -653,12 +550,12 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, he = he->next_homonym; // check homonyms } while (he); - // obsolote stemming code (used only by the + // obsolote stemming code (used only by the // experimental SuffixMgr:suggest_pos_stems) // store resulting root in wlst } else if (wlst && (*ns < maxSug)) { int cwrd = 1; - for (int k=0; k < *ns; k++) + for (int k=0; k < *ns; k++) if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; if (cwrd) { wlst[*ns] = mystrdup(tmpword); @@ -675,11 +572,11 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, return NULL; } -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -699,8 +596,7 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && - (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -724,7 +620,7 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, if (test_condition((char *) cp, (char *) tmpword)) { if (ppfx) { // handle conditional suffix - if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); else he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); @@ -737,18 +633,19 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, return NULL; } -// see if two-level suffix is present in the word +#ifdef HUNSPELL_EXPERIMENTAL +// see if two-level suffix is present in the word char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; PfxEntry* ep = (PfxEntry *) ppfx; char * st; char result[MAXLNLEN]; - + *result = '\0'; // if this suffix is being cross checked with a prefix @@ -764,8 +661,7 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && - (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -793,17 +689,16 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { if (((PfxEntry *) ppfx)->getMorph()) { - mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); + strcat(result, ((PfxEntry *) ppfx)->getMorph()); } - mystrcat(result,st, MAXLNLEN); + strcat(result,st); free(st); mychomp(result); } } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); mychomp(result); } @@ -811,7 +706,7 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); mychomp(result); } @@ -821,28 +716,28 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } return NULL; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE // get next homonym with same affix -struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, const FLAG cclass, const FLAG needflag) { PfxEntry* ep = (PfxEntry *) ppfx; - FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; while (he->next_homonym) { he = he->next_homonym; - if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - ((optflags & aeXPRODUCT) == 0 || - TESTAFF(he->astr, eFlag, he->alen) || + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || + TESTAFF(he->astr, ep->getFlag(), he->alen) || // handle conditional suffix - ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) + ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) ) && // handle cont. class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) diff --git a/chrome/third_party/hunspell/src/hunspell/affentry.hxx b/chrome/third_party/hunspell/src/hunspell/affentry.hxx index ef1f86d..bb21773 100644 --- a/chrome/third_party/hunspell/src/hunspell/affentry.hxx +++ b/chrome/third_party/hunspell/src/hunspell/affentry.hxx @@ -54,7 +54,6 @@ public: inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; } - inline char * nextchar(char * p); inline int test_condition(const char * st); }; @@ -124,9 +123,7 @@ public: inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; } - inline char * nextchar(char * p); inline int test_condition(const char * st, const char * begin); - }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx b/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx index 29bc9f7..9f53a67 100644 --- a/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -20,24 +20,21 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef WIN32 +#ifndef W32 using namespace std; #endif #endif #ifdef HUNSPELL_CHROME_CLIENT -AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md) +AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr* ptr) { bdict_reader = reader; #else -AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) +AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) { #endif // register hash manager and load affix data from aff file - pHMgr = ptr[0]; - alldic = ptr; - maxdic = md; - keystring = NULL; + pHMgr = ptr; trystring = NULL; encoding=NULL; utf8 = 0; @@ -48,15 +45,10 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) numbreak = 0; reptable = NULL; numrep = 0; - iconvtable = NULL; - oconvtable = NULL; checkcpdtable = NULL; - // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) - simplifiedcpd = 0; numcheckcpd = 0; defcpdtable = NULL; numdefcpd = 0; - phone = NULL; compoundflag = FLAG_NULL; // permits word in compound forms compoundbegin = FLAG_NULL; // may be first word in compound forms compoundmiddle = FLAG_NULL; // may be middle word in compound forms @@ -68,12 +60,11 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds checkcompoundtriple = 0; // forbid compounds with triple letters - simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt) - forbiddenword = FORBIDDENWORD; // forbidden word signing flag + forbiddenword = FLAG_NULL; // forbidden word signing flag nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag lang = NULL; // language langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) - needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes + pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes cpdwordmax = -1; // default: unlimited wordcount in compound words cpdmin = -1; // undefined cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words @@ -97,14 +88,14 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) lemma_present = FLAG_NULL; circumfix = FLAG_NULL; onlyincompound = FLAG_NULL; + flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file maxngramsugs = -1; // undefined nosplitsugs = 0; sugswithdots = 0; keepcase = 0; checksharps = 0; - substandard = FLAG_NULL; - fullstrip = 0; + derived = NULL; // XXX not threadsafe variable for experimental stemming sfx = NULL; pfx = NULL; @@ -118,14 +109,14 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) #ifdef HUNSPELL_CHROME_CLIENT if (parse_file()) { #else - for (int j=0; j < CONTSIZE; j++) { contclasses[j] = 0; } - if (parse_file(affpath, key)) { + if (parse_file(aff_handle)) { #endif HUNSPELL_WARNING(stderr, "Failure loading aff file\n"); + wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); } if (cpdmin == -1) cpdmin = MINCPDLEN; @@ -163,8 +154,6 @@ AffixMgr::~AffixMgr() sStart[j] = NULL; } - if (keystring) free(keystring); - keystring=NULL; if (trystring) free(trystring); trystring=NULL; if (encoding) free(encoding); @@ -189,26 +178,16 @@ AffixMgr::~AffixMgr() breaktable = NULL; } numbreak = 0; - if (reptable) { + if (reptable) { for (int j=0; j < numrep; j++) { free(reptable[j].pattern); free(reptable[j].pattern2); + reptable[j].pattern = NULL; + reptable[j].pattern2 = NULL; } free(reptable); reptable = NULL; } - if (iconvtable) delete iconvtable; - if (oconvtable) delete oconvtable; - if (phone && phone->rules) { - for (int j=0; j < phone->num + 1; j++) { - free(phone->rules[j * 2]); - free(phone->rules[j * 2 + 1]); - } - free(phone->rules); - free(phone); - phone = NULL; - } - if (defcpdtable) { for (int j=0; j < numdefcpd; j++) { free(defcpdtable[j].def); @@ -222,10 +201,8 @@ AffixMgr::~AffixMgr() for (int j=0; j < numcheckcpd; j++) { free(checkcpdtable[j].pattern); free(checkcpdtable[j].pattern2); - free(checkcpdtable[j].pattern3); checkcpdtable[j].pattern = NULL; checkcpdtable[j].pattern2 = NULL; - checkcpdtable[j].pattern3 = NULL; } free(checkcpdtable); checkcpdtable = NULL; @@ -240,7 +217,7 @@ AffixMgr::~AffixMgr() FREE_FLAG(compoundroot); FREE_FLAG(forbiddenword); FREE_FLAG(nosuggest); - FREE_FLAG(needaffix); + FREE_FLAG(pseudoroot); FREE_FLAG(lemma_present); FREE_FLAG(circumfix); FREE_FLAG(onlyincompound); @@ -259,20 +236,30 @@ AffixMgr::~AffixMgr() if (ignorechars) free(ignorechars); if (ignorechars_utf16) free(ignorechars_utf16); if (version) free(version); + if (derived) free(derived); checknum=0; } // read in aff file and build up prefix and suffix entry objects #ifdef HUNSPELL_CHROME_CLIENT + // Hack to avoid having to comment out all the fclose calls below on errors. + #define fclose(a); + int AffixMgr::parse_file() #else -int AffixMgr::parse_file(FILE* aff_handle, const char * key) +int AffixMgr::parse_file(FILE* aff_handle) #endif { - char * line = new char[MAXLNLEN+1]; // io buffers - char ft; // affix type - + // io buffers + char line[MAXLNLEN+1]; + + // affix type + char ft; + + // first line indicator for removing byte order mark + int firstline = 1; + // open the affix file #ifdef HUNSPELL_CHROME_CLIENT // We're always UTF-8 @@ -299,65 +286,44 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) char dupflags[CONTSIZE]; char dupflags_ini = 1; - // first line indicator for removing byte order mark - int firstline = 1; - - // open the affix file - FileMgr * afflst = new FileMgr(affpath, key); + FILE * afflst; + afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); if (!afflst) { - HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); + HUNSPELL_WARNING(stderr, "error: could not open affix description file\n"); return 1; } + fseek(afflst, 0, SEEK_SET); // step one is to parse the affix file building up the internal // affix data structures + // read in each line ignoring any that do not // start with a known line type indicator - while ((line = afflst->getline())) { + while (fgets(line,MAXLNLEN,afflst)) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { + if (strncmp(line,"\xef\xbb\xbf",3) == 0) { memmove(line, line+3, strlen(line+3)+1); HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n"); } } #endif - /* parse in the keyboard string */ - if (strncmp(line,"KEY",3) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_string(line, &keystring, 0)) { -#else - if (parse_string(line, &keystring, afflst->getlinenum())) { - delete afflst; -#endif - return 1; - } - } - /* parse in the try string */ if (strncmp(line,"TRY",3) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_string(line, &trystring, 0)) { -#else - if (parse_string(line, &trystring, afflst->getlinenum())) { - delete afflst; -#endif + if (parse_string(line, &trystring, "TRY")) { + fclose(afflst); return 1; } } /* parse in the name of the character set used by the .dict and .aff */ if (strncmp(line,"SET",3) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_string(line, &encoding, 0)) { -#else - if (parse_string(line, &encoding, afflst->getlinenum())) { - delete afflst; -#endif + if (parse_string(line, &encoding, "SET")) { + fclose(afflst); return 1; } if (strcmp(encoding, "UTF-8") == 0) { @@ -376,12 +342,8 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the flag used by the controlled compound words */ if (strncmp(line,"COMPOUNDFLAG",12) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundflag)) { -#else - if (parse_flag(line, &compoundflag, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) { + fclose(afflst); return 1; } } @@ -389,21 +351,13 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { if (complexprefixes) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundend)) { -#else - if (parse_flag(line, &compoundend, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) { + fclose(afflst); return 1; } } else { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundbegin)) { -#else - if (parse_flag(line, &compoundbegin, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) { + fclose(afflst); return 1; } } @@ -411,33 +365,21 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundmiddle)) { -#else - if (parse_flag(line, &compoundmiddle, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) { + fclose(afflst); return 1; } } /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDEND",11) == 0) { if (complexprefixes) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundbegin)) { -#else - if (parse_flag(line, &compoundbegin, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) { + fclose(afflst); return 1; } } else { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundend)) { -#else - if (parse_flag(line, &compoundend, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundend, "COMPOUNDEND")) { + fclose(afflst); return 1; } } @@ -445,48 +387,32 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the data used by compound_check() method */ if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_num(line, &cpdwordmax)) { -#else - if (parse_num(line, &cpdwordmax, afflst)) { - delete afflst; -#endif + if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) { + fclose(afflst); return 1; } } /* parse in the flag sign compounds in dictionary */ if (strncmp(line,"COMPOUNDROOT",12) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundroot)) { -#else - if (parse_flag(line, &compoundroot, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) { + fclose(afflst); return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundpermitflag)) { -#else - if (parse_flag(line, &compoundpermitflag, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) { + fclose(afflst); return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &compoundforbidflag)) { -#else - if (parse_flag(line, &compoundforbidflag, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) { + fclose(afflst); return 1; } } @@ -503,105 +429,69 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) checkcompoundtriple = 1; } - if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) { - simplifiedtriple = 1; - } - if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) { checkcompoundcase = 1; } if (strncmp(line,"NOSUGGEST",9) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &nosuggest)) { -#else - if (parse_flag(line, &nosuggest, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &nosuggest, "NOSUGGEST")) { + fclose(afflst); return 1; } } /* parse in the flag used by forbidden words */ if (strncmp(line,"FORBIDDENWORD",13) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &forbiddenword)) { -#else - if (parse_flag(line, &forbiddenword, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) { + fclose(afflst); return 1; } } /* parse in the flag used by forbidden words */ if (strncmp(line,"LEMMA_PRESENT",13) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &lemma_present)) { -#else - if (parse_flag(line, &lemma_present, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) { + fclose(afflst); return 1; } } /* parse in the flag used by circumfixes */ if (strncmp(line,"CIRCUMFIX",9) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &circumfix)) { -#else - if (parse_flag(line, &circumfix, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &circumfix, "CIRCUMFIX")) { + fclose(afflst); return 1; } } /* parse in the flag used by fogemorphemes */ if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &onlyincompound)) { -#else - if (parse_flag(line, &onlyincompound, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) { + fclose(afflst); return 1; } } - /* parse in the flag used by `needaffixs' */ + /* parse in the flag used by `pseudoroots' */ if (strncmp(line,"PSEUDOROOT",10) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &needaffix)) { -#else - if (parse_flag(line, &needaffix, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) { + fclose(afflst); return 1; } } - /* parse in the flag used by `needaffixs' */ + /* parse in the flag used by `pseudoroots' */ if (strncmp(line,"NEEDAFFIX",9) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &needaffix)) { -#else - if (parse_flag(line, &needaffix, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) { + fclose(afflst); return 1; } } /* parse in the minimal length for words in compounds */ if (strncmp(line,"COMPOUNDMIN",11) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_num(line, &cpdmin)) { -#else - if (parse_num(line, &cpdmin, afflst)) { - delete afflst; -#endif + if (parse_num(line, &cpdmin, "COMPOUNDMIN")) { + fclose(afflst); return 1; } if (cpdmin < 1) cpdmin = 1; @@ -609,24 +499,16 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the max. words and syllables in compounds */ if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT if (parse_cpdsyllable(line)) { -#else - if (parse_cpdsyllable(line, afflst)) { - delete afflst; -#endif + fclose(afflst); return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"SYLLABLENUM",11) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_string(line, &cpdsyllablenum, 0)) { -#else - if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { - delete afflst; -#endif + if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) { + fclose(afflst); return 1; } } @@ -638,74 +520,34 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the extra word characters */ if (strncmp(line,"WORDCHARS",9) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, 0)) { -#else - if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) { - delete afflst; -#endif + if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) { + fclose(afflst); return 1; } } /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ if (strncmp(line,"IGNORE",6) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, 0)) { -#else - if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) { - delete afflst; -#endif + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { + fclose(afflst); return 1; } } -#ifndef HUNSPELL_CHROME_CLIENT /* parse in the typical fault correcting table */ +#ifndef HUNSPELL_CHROME_CLIENT if (strncmp(line,"REP",3) == 0) { if (parse_reptable(line, afflst)) { - delete afflst; + fclose(afflst); return 1; } } #endif - /* parse in the input conversion table */ - if (strncmp(line,"ICONV",5) == 0) { - if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif - return 1; - } - } - - /* parse in the input conversion table */ - if (strncmp(line,"OCONV",5) == 0) { - if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif - return 1; - } - } - - /* parse in the phonetic translation table */ - if (strncmp(line,"PHONE",5) == 0) { - if (parse_phonetable(line, afflst)) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif - return 1; - } - } - /* parse in the checkcompoundpattern table */ if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { if (parse_checkcpdtable(line, afflst)) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif + fclose(afflst); return 1; } } @@ -713,9 +555,7 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the defcompound table */ if (strncmp(line,"COMPOUNDRULE",12) == 0) { if (parse_defcpdtable(line, afflst)) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif + fclose(afflst); return 1; } } @@ -723,9 +563,7 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the related character map table */ if (strncmp(line,"MAP",3) == 0) { if (parse_maptable(line, afflst)) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif + fclose(afflst); return 1; } } @@ -733,38 +571,30 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) /* parse in the word breakpoints table */ if (strncmp(line,"BREAK",5) == 0) { if (parse_breaktable(line, afflst)) { -#ifndef HUNSPELL_CHROME_CLIENT - delete afflst; -#endif + fclose(afflst); return 1; } } /* parse in the language for language specific codes */ if (strncmp(line,"LANG",4) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_string(line, &lang, 0)) { -#else - if (parse_string(line, &lang, afflst->getlinenum())) { - delete afflst; -#endif + if (parse_string(line, &lang, "LANG")) { + fclose(afflst); return 1; } langnum = get_lang_num(lang); } if (strncmp(line,"VERSION",7) == 0) { - for(line = line + 7; *line == ' ' || *line == '\t'; line++); - version = mystrdup(line); + if (parse_string(line, &version, "VERSION")) { + fclose(afflst); + return 1; + } } if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_num(line, &maxngramsugs)) { -#else - if (parse_num(line, &maxngramsugs, afflst)) { - delete afflst; -#endif + if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) { + fclose(afflst); return 1; } } @@ -773,34 +603,14 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) nosplitsugs=1; } - if (strncmp(line,"FULLSTRIP",9) == 0) { - fullstrip=1; - } - if (strncmp(line,"SUGSWITHDOTS",12) == 0) { sugswithdots=1; } /* parse in the flag used by forbidden words */ if (strncmp(line,"KEEPCASE",8) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &keepcase)) { -#else - if (parse_flag(line, &keepcase, afflst)) { - delete afflst; -#endif - return 1; - } - } - - /* parse in the flag used by the affix generator */ - if (strncmp(line,"SUBSTANDARD",11) == 0) { -#ifdef HUNSPELL_CHROME_CLIENT - if (parse_flag(line, &substandard)) { -#else - if (parse_flag(line, &substandard, afflst)) { - delete afflst; -#endif + if (parse_flag(line, &keepcase, "KEEPCASE")) { + fclose(afflst); return 1; } } @@ -820,7 +630,7 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) dupflags_ini = 0; } if (parse_affix(line, ft, afflst, dupflags)) { - delete afflst; + fclose(afflst); process_pfx_tree_to_list(); process_sfx_tree_to_list(); return 1; @@ -830,7 +640,7 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) } #ifndef HUNSPELL_CHROME_CLIENT - delete afflst; + fclose(afflst); #endif // convert affix trees to sorted list @@ -863,8 +673,8 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) process_pfx_order(); process_sfx_order(); - /* get encoding for CHECKCOMPOUNDCASE */ - if (!utf8) { + // expand wordchars string, based on csutil (for external tokenization) + char * enc = get_encoding(); csconv = get_current_cs(enc); free(enc); @@ -885,20 +695,16 @@ int AffixMgr::parse_file(FILE* aff_handle, const char * key) } wordchars = mystrdup(expw); - } - // default BREAK definition - if (!breaktable) { - breaktable = (char **) malloc(sizeof(char *) * 3); + // temporary BREAK definition for German dash handling (OOo issue 64400) + if ((langnum == LANG_de) && (!breaktable)) { + breaktable = (char **) malloc(sizeof(char *)); if (!breaktable) return 1; breaktable[0] = mystrdup("-"); - breaktable[1] = mystrdup("^-"); - breaktable[2] = mystrdup("-$"); - if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3; + numbreak = 1; } return 0; } - #ifdef HUNSPELL_CHROME_CLIENT #undef fclose #endif @@ -1171,52 +977,197 @@ int AffixMgr::process_sfx_order() return 0; } -// add flags to the result for dictionary debugging -void AffixMgr::debugflag(char * result, unsigned short flag) { - char * st = encode_flag(flag); - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_FLAG, MAXLNLEN); - if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } -} -// calculate the character length of the condition -int AffixMgr::condlen(char * st) -{ - int l = 0; - bool group = false; - for(; *st; st++) { - if (*st == '[') { - group = true; - l++; - } else if (*st == ']') group = false; - else if (!group && (!utf8 || - (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; - } - return l; -} + +// takes aff file condition string and creates the +// conds array - please see the appendix at the end of the +// file affentry.cxx which describes what is going on here +// in much more detail int AffixMgr::encodeit(struct affentry * ptr, char * cs) { - if (strcmp(cs,".") != 0) { - ptr->numconds = (char) condlen(cs); - strncpy(ptr->c.conds, cs, MAXCONDLEN); - // long condition (end of conds padded by strncpy) - if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { - ptr->opts += aeLONGCOND; - ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); - if (!ptr->c.l.conds2) return 1; - } - } else { + unsigned char c; + int i, j, k; + unsigned char mbr[MAXLNLEN]; + w_char wmbr[MAXLNLEN]; + w_char * wpos = wmbr; + + // now clear the conditions array */ + for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0; + + // now parse the string to create the conds array */ + int nc = strlen(cs); + unsigned char neg = 0; // complement indicator + int grp = 0; // group indicator + unsigned char n = 0; // number of conditions + int ec = 0; // end condition indicator + int nm = 0; // number of member in group + + // if no condition just return + if (strcmp(cs,".")==0) { ptr->numconds = 0; - ptr->c.conds[0] = '\0'; + return 0; } + + i = 0; + while (i < nc) { + c = *((unsigned char *)(cs + i)); + + // start group indicator + if (c == '[') { + grp = 1; + c = 0; + } + + // complement flag + if ((grp == 1) && (c == '^')) { + neg = 1; + c = 0; + } + + // end goup indicator + if (c == ']') { + ec = 1; + c = 0; + } + + // add character of group to list + if ((grp == 1) && (c != 0)) { + *(mbr + nm) = c; + nm++; + c = 0; + } + + // end of condition + if (c != 0) { + ec = 1; + } + + if (ec) { + if (!utf8) { + if (grp == 1) { + if (neg == 0) { + // set the proper bits in the condition array vals for those chars + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n); + } + } else { + // complement so set all of them and then unset indicated ones + for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n); + } + } + neg = 0; + grp = 0; + nm = 0; + } else { + // not a group so just set the proper bit for this char + // but first handle special case of . inside condition + if (c == '.') { + // wild card character so set them all + for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); + } else { + ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n); + } + } + n++; + ec = 0; + } else { // UTF-8 character set + if (grp == 1) { + ptr->conds.utf8.neg[n] = neg; + if (neg == 0) { + // set the proper bits in the condition array vals for those chars + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + if (k >> 7) { + u8_u16(wpos, 1, (char *) mbr + j); + wpos++; + if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character + } else { + ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n); + } + } + } else { // neg == 1 + // complement so set all of them and then unset indicated ones + for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + if (k >> 7) { + u8_u16(wpos, 1, (char *) mbr + j); + wpos++; + if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character + } else { + ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n); + } + } + } + neg = 0; + grp = 0; + nm = 0; + ptr->conds.utf8.wlen[n] = wpos - wmbr; + if ((wpos - wmbr) != 0) { + ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr)); + if (!ptr->conds.utf8.wchars[n]) return 1; + memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr)); + flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]); + wpos = wmbr; + } + } else { // grp == 0 + // is UTF-8 character? + if (c >> 7) { + ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char)); + if (!ptr->conds.utf8.wchars[n]) return 1; + ptr->conds.utf8.wlen[n] = 1; + u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i); + if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character + } else { + ptr->conds.utf8.wchars[n] = NULL; + // not a group so just set the proper bit for this char + // but first handle special case of . inside condition + if (c == '.') { + ptr->conds.utf8.all[n] = 1; + // wild card character so set them all + for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); + } else { + ptr->conds.utf8.all[n] = 0; + ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n); + } + } + neg = 0; + } + n++; + if (n > 8) { + HUNSPELL_WARNING(stderr, "Number of conditions is larger than 8. This" + "version of Hunspell does not support more than 8 conditions." + "Please, get rid of affentries with more than 8 conditions."); + break; + } + ec = 0; + neg = 0; + } + } + + i++; + } + ptr->numconds = n; return 0; } -// return 1 if s1 is a leading subset of s2 (dots are for infixes) + // return 1 if s1 is a leading subset of s2 +/* inline int AffixMgr::isSubset(const char * s1, const char * s2) + { + while ((*s1 == *s2) && *s1) { + s1++; + s2++; + } + return (*s1 == '\0'); + } +*/ + + // return 1 if s1 is a leading subset of s2 (dots are for infixes) inline int AffixMgr::isSubset(const char * s1, const char * s2) { while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { @@ -1326,6 +1277,7 @@ struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, return NULL; } +#ifdef HUNSPELL_EXPERIMENTAL // check word for prefixes char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, const FLAG needflag) @@ -1343,7 +1295,7 @@ char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound while (pe) { st = pe->check_morph(word,len,in_compound, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } // if (rv) return rv; @@ -1361,7 +1313,7 @@ char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound // fogemorpheme if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); pfx = (AffEntry *)pptr; } free(st); @@ -1394,7 +1346,7 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, while (pe) { st = pe->check_twosfx_morph(word,len,in_compound, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } pe = pe->getNext(); @@ -1408,7 +1360,7 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (isSubset(pptr->getKey(),word)) { st = pptr->check_twosfx_morph(word, len, in_compound, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); pfx = (AffEntry *)pptr; } @@ -1421,6 +1373,8 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE + // Is word a non compound with a REP substitution (see checkcompoundrep)? int AffixMgr::cpdrep_check(const char * word, int wl) @@ -1470,15 +1424,11 @@ int AffixMgr::cpdrep_check(const char * word, int wl) } // forbid compoundings when there are special patterns at word bound -int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2) +int AffixMgr::cpdpat_check(const char * word, int pos) { int len; for (int i = 0; i < numcheckcpd; i++) { if (isSubset(checkcpdtable[i].pattern2, word + pos) && - (!r1 || !checkcpdtable[i].cond || - (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && - (!r2 || !checkcpdtable[i].cond2 || - (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && (len = strlen(checkcpdtable[i].pattern)) && (pos > len) && (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1; } @@ -1496,8 +1446,10 @@ int AffixMgr::cpdcase_check(const char * word, int pos) u8_u16(&w, 1, p); unsigned short a = (u.h << 8) + u.l; unsigned short b = (w.h << 8) + w.l; - if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && - (a != '-') && (b != '-')) return 1; + // CHROME MODIFICATION: We add the checks for the dashes as they are used + // below in the non-UTF-8 case. This seems to be a bug in Hunspell. It + // causes some of the tests to fail since we convert everything to UTF-8. + if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && (a != '-') && (b != '-')) return 1; } else { unsigned char a = *(word + pos - 1); unsigned char b = *(word + pos); @@ -1513,35 +1465,15 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** signed short btwp[MAXWORDLEN]; // word positions for metacharacters int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions short bt = 0; - int i, j; + int i; int ok; int w = 0; - if (!*words) { w = 1; *words = def; } (*words)[wnum] = rv; - // has the last word COMPOUNDRULE flag? - if (rv->alen == 0) { - (*words)[wnum] = NULL; - if (w) *words = NULL; - return 0; - } - ok = 0; - for (i = 0; i < numdefcpd; i++) { - for (j = 0; j < defcpdtable[i].len; j++) { - if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && - TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1; - } - } - if (ok == 0) { - (*words)[wnum] = NULL; - if (w) *words = NULL; - return 0; - } - for (i = 0; i < numdefcpd; i++) { signed short pp = 0; // pattern position signed short wp = 0; // "words" position @@ -1586,18 +1518,17 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2; if (defcpdtable[i].len <= r) return 1; - } + } // backtrack if (bt) do { ok = 1; btnum[bt - 1]--; pp = btpp[bt - 1]; - wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; + wp = btwp[bt - 1] + btnum[bt - 1]; } while ((btnum[bt - 1] < 0) && --bt); } while (bt); - if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; - + if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; // check zero ending while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) && ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2; @@ -1637,7 +1568,7 @@ short AffixMgr::get_syllable(const char * word, int wlen) } else if (cpdvowels_utf16) { w_char w[MAXWORDUTF8LEN]; int i = u8_u16(w, MAXWORDUTF8LEN, word); - for (; i > 0; i--) { + for (; i; i--) { if (flag_bsearch((unsigned short *) cpdvowels_utf16, ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; } @@ -1645,29 +1576,15 @@ short AffixMgr::get_syllable(const char * word, int wlen) return num; } -void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) { - if (utf8) { - int i; - for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) { - for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++); - } - for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) { - for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--); - } - } else { - *cmin = cpdmin; - *cmax = len - cpdmin + 1; - } -} - // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) struct hentry * AffixMgr::compound_check(const char * word, int len, short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL, - char hu_mov_rule = 0, char is_sug = 0) + char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0) { int i; short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; + int oldcmpdstemnum = 0; struct hentry * rv = NULL; struct hentry * rv_first; struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking @@ -1675,17 +1592,31 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, char ch; int cmin; int cmax; - int striple = 0; - int scpd = 0; - int soldi = 0; - int oldcmin = 0; - int oldcmax = 0; - int oldlen = 0; - int checkedstriple = 0; - + int checked_prefix; - setcminmax(&cmin, &cmax, word, len); +#ifdef HUNSTEM + if (cmpdstemnum) { + if (wordnum == 0) { + *cmpdstemnum = 1; + } else { + (*cmpdstemnum)++; + } + } +#endif + if (utf8) { + for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { + cmin++; + for (; (word[cmin] & 0xc0) == 0x80; cmin++); + } + for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { + cmax--; + for (; (word[cmax] & 0xc0) == 0x80; cmax--); + } + } else { + cmin = cpdmin; + cmax = len - cpdmin + 1; + } strcpy(st, word); @@ -1701,42 +1632,20 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if (i >= cmax) return NULL; } - do { // simplified checkcompoundpattern loop - - if (scpd > 0) { - for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 || - strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++); - - if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop - strcpy(st + i, checkcpdtable[scpd-1].pattern); - soldi = i; - i += strlen(checkcpdtable[scpd-1].pattern); - strcpy(st + i, checkcpdtable[scpd-1].pattern2); - strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3)); - - oldlen = len; - len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3); - oldcmin = cmin; - oldcmax = cmax; - setcminmax(&cmin, &cmax, st, len); - - cmax = len - cpdmin + 1; - } - - + ch = st[i]; st[i] = '\0'; sfx = NULL; pfx = NULL; - + // FIRST WORD - + rv = lookup(st); // perhaps without prefix // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -1744,10 +1653,8 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, TESTAFF(rv->astr, compoundmiddle, rv->alen)) || (numdefcpd && ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || - (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) || - (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL && - !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen))) - ) { + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) + ))) { rv = rv->next_homonym; } @@ -1764,7 +1671,6 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, rv = NULL; } } - if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || @@ -1773,9 +1679,9 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) checked_prefix = 1; - // else check forbiddenwords and needaffix + // else check forbiddenwords and pseudoroot } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, needaffix, rv->alen) || + TESTAFF(rv->astr, pseudoroot, rv->alen) || (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) )) { st[i] = ch; @@ -1803,7 +1709,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((SfxEntry*)sfx)->getContLen())))) { rv = NULL; } - + // check compoundmiddle flag in suffix and prefix if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && ((pfx && ((PfxEntry*)pfx)->getCont() && @@ -1813,7 +1719,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle, ((SfxEntry*)sfx)->getContLen())))) { rv = NULL; - } + } // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || @@ -1843,20 +1749,19 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ) ) // END of LANG_hu section - ) && - ( - // test CHECKCOMPOUNDPATTERN conditions - scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL || - TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen) ) - && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters + && ! (( checkcompoundtriple && // test triple letters (word[i-1]==word[i]) && ( - ((i>1) && (word[i-1]==word[i-2])) || + ((i>1) && (word[i-1]==word[i-2])) || ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' ) ) || - ( - checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i) + ( + // test CHECKCOMPOUNDPATTERN + numcheckcpd && cpdpat_check(word, i) + ) || + ( + checkcompoundcase && cpdcase_check(word, i) )) ) // LANG_hu section: spec. Hungarian rule @@ -1864,14 +1769,15 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) || TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen()) - ) + ) ) ) - ) { // first word is ok condition +// END of LANG_hu section + ) { // LANG_hu section: spec. Hungarian rule if (langnum == LANG_hu) { - // calculate syllable number of the word + // calculate syllable number of the word numsyllable += get_syllable(st, i); // + 1 word, if syllable number of the prefix > 1 (hungarian convention) @@ -1879,35 +1785,23 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, } // END of LANG_hu section +#ifdef HUNSTEM + if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i; +#endif // NEXT WORD(S) rv_first = rv; - st[i] = ch; - - do { // striple loop - - // check simplifiedtriple - if (simplifiedtriple) { - if (striple) { - checkedstriple = 1; - i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" - } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1; - } - - rv = lookup((st+i)); // perhaps without prefix + rv = lookup((word+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || - (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) || - (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL && - !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) - )) { + (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { rv = rv->next_homonym; } - if (rv && words && words[wnum + 1]) return rv_first; + if (rv && words && words[wnum + 1]) return rv; oldnumsyllable2 = numsyllable; oldwordnum2 = wordnum; @@ -1939,27 +1833,20 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable!=0) && - (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable)) - ) && - ( - // test CHECKCOMPOUNDPATTERN - !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv) - ) && - ( + ((cpdmaxsyllable==0) || + (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) + ) + && ( (!checkcompounddup || (rv != rv_first)) ) - // test CHECKCOMPOUNDPATTERN conditions - && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || - TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) ) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word,len)) return NULL; - return rv_first; + return rv; } - numsyllable = oldnumsyllable2; + numsyllable = oldnumsyllable2 ; wordnum = oldwordnum2; // perhaps second word has prefix or/and suffix @@ -1971,20 +1858,12 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, pfx = NULL; rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END); } - + if (!rv && numdefcpd && words) { rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); - if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first; - rv = NULL; + if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv; } - // test CHECKCOMPOUNDPATTERN conditions (allowed forms) - if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || - TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL; - - // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) - if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv)) rv = NULL; - // check non_compound flag in suffix and prefix if ((rv) && ((pfx && ((PfxEntry*)pfx)->getCont() && @@ -2008,7 +1887,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if (langnum == LANG_hu) { // calculate syllable number of the word numsyllable += get_syllable(word + i, strlen(word + i)); - + // - affix syllable num. // XXX only second suffix (inflections, not derivations) if (sfxappnd) { @@ -2016,13 +1895,13 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, numsyllable -= get_syllable(tmp, strlen(tmp)); free(tmp); } - + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; // increment syllable num, if last word has a SYLLABLENUM flag // and the suffix is beginning `s' - + if (cpdsyllablenum) { switch (sfxflag) { case 'c': { numsyllable+=2; break; } @@ -2031,7 +1910,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, } } } - + // increment word number, if the second word has a compoundroot flag if ((rv) && (compoundroot) && (TESTAFF(rv->astr, compoundroot, rv->alen))) { @@ -2045,7 +1924,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if ((rv) && ( ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || - ((cpdmaxsyllable != 0) && + ((cpdmaxsyllable == 0) || (numsyllable <= cpdmaxsyllable)) ) && ( @@ -2053,61 +1932,41 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, )) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word, len)) return NULL; - return rv_first; + return rv; } numsyllable = oldnumsyllable2; wordnum = oldwordnum2; - +#ifdef HUNSTEM + if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum; +#endif // perhaps second word is a compound word (recursive call) if (wordnum < maxwordnum) { - rv = compound_check((st+i),strlen(st+i), wordnum+1, - numsyllable, maxwordnum, wnum + 1, words, 0, is_sug); - if (rv && numcheckcpd && (scpd == 0 && cpdpat_check(word, i, rv_first, rv) || - scpd != 0 && !cpdpat_check(word, i, rv_first, rv))) rv = NULL; + rv = compound_check((word+i),strlen(word+i), wordnum+1, + numsyllable, maxwordnum, wnum + 1, words, + 0, cmpdstemnum, cmpdstem, is_sug); } else { rv=NULL; } if (rv) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word, len)) return NULL; - return rv_first; + return rv; + } else { +#ifdef HUNSTEM + if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum; +#endif } - } while (striple && !checkedstriple); // end of striple loop - - if (checkedstriple) { - i++; - checkedstriple = 0; - striple = 0; - } - - } // first word is ok condition - - if (soldi != 0) { - i = soldi; - soldi = 0; - len = oldlen; - cmin = oldcmin; - cmax = oldcmax; } - scpd++; - - } while (simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop - - if (soldi != 0) { - i = soldi; - strcpy(st, word); // XXX add more optim. - soldi = 0; - } else st[i] = ch; - - scpd = 0; + st[i] = ch; wordnum = oldwordnum; numsyllable = oldnumsyllable; } - + return NULL; -} +} +#ifdef HUNSPELL_EXPERIMENTAL // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) int AffixMgr::compound_check_morph(const char * word, int len, @@ -2123,14 +1982,26 @@ int AffixMgr::compound_check_morph(const char * word, int len, struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking char st [MAXWORDUTF8LEN + 4]; char ch; - + int checked_prefix; char presult[MAXLNLEN]; int cmin; int cmax; - - setcminmax(&cmin, &cmax, word, len); + + if (utf8) { + for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { + cmin++; + for (; (word[cmin] & 0xc0) == 0x80; cmin++); + } + for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { + cmax--; + for (; (word[cmax] & 0xc0) == 0x80; cmax--); + } + } else { + cmin = cpdmin; + cmax = len - cpdmin + 1; + } strcpy(st, word); @@ -2144,7 +2015,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, for (; (st[i] & 0xc0) == 0x80; i++); if (i >= cmax) return 0; } - + ch = st[i]; st[i] = '\0'; sfx = NULL; @@ -2152,12 +2023,12 @@ int AffixMgr::compound_check_morph(const char * word, int len, // FIRST WORD *presult = '\0'; if (partresult) strcat(presult, partresult); - + rv = lookup(st); // perhaps without prefix // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -2171,16 +2042,13 @@ int AffixMgr::compound_check_morph(const char * word, int len, } if (rv) { - sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st); - if (!HENTRY_FIND(rv, MORPH_STEM)) { - sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st); + if (rv->description) { + if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) + strcat(presult, st); + strcat(presult, rv->description); } - // store the pointer of the hash entry -// sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv); - if (HENTRY_DATA(rv)) { - sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv)); - } - } + } + if (!rv) { if (compoundflag && !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { @@ -2194,7 +2062,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = NULL; } } - + if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || @@ -2203,28 +2071,35 @@ int AffixMgr::compound_check_morph(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) { - // char * p = prefix_check_morph(st, i, 0, compound); + //char * p = prefix_check_morph(st, i, 0, compound); char * p = NULL; if (compoundflag) p = affix_check_morph(st, i, compoundflag); if (!p || (*p == '\0')) { - if (p) free(p); - p = NULL; if ((wordnum == 0) && compoundbegin) { p = affix_check_morph(st, i, compoundbegin); } else if ((wordnum > 0) && compoundmiddle) { p = affix_check_morph(st, i, compoundmiddle); } } - if (p && (*p != '\0')) { - sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, - MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); + if (*p != '\0') { + line_uniq(p); + if (strchr(p, '\n')) { + strcat(presult, "("); + strcat(presult, line_join(p, '|')); + strcat(presult, ")"); + } else { + strcat(presult, p); + } + } + if (presult[strlen(presult) - 1] == '\n') { + presult[strlen(presult) - 1] = '\0'; } - if (p) free(p); checked_prefix = 1; + //strcat(presult, "+"); } // else check forbiddenwords } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, needaffix, rv->alen))) { + TESTAFF(rv->astr, pseudoroot, rv->alen))) { st[i] = ch; continue; } @@ -2287,7 +2162,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) // END of LANG_hu section ) - && ! (( checkcompoundtriple && !words && // test triple letters + && ! (( checkcompoundtriple && // test triple letters (word[i-1]==word[i]) && ( ((i>1) && (word[i-1]==word[i-2])) || ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' @@ -2295,10 +2170,10 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) || ( // test CHECKCOMPOUNDPATTERN - numcheckcpd && !words && cpdpat_check(word, i, rv, NULL) + numcheckcpd && cpdpat_check(word, i) ) || ( - checkcompoundcase && !words && cpdcase_check(word, i) + checkcompoundcase && cpdcase_check(word, i) )) ) // LANG_hu section: spec. Hungarian rule @@ -2327,7 +2202,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = lookup((word+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || + while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { @@ -2336,21 +2211,11 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && words[wnum + 1]) { strcat(*result, presult); - strcat(*result, " "); - strcat(*result, MORPH_PART); - strcat(*result, word+i); - if (complexprefixes && HENTRY_DATA(rv)) strcat(*result, HENTRY_DATA2(rv)); - if (!HENTRY_FIND(rv, MORPH_STEM)) { - strcat(*result, " "); - strcat(*result, MORPH_STEM); - strcat(*result, HENTRY_WORD(rv)); - } - // store the pointer of the hash entry -// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); - if (!complexprefixes && HENTRY_DATA(rv)) { - strcat(*result, " "); - strcat(*result, HENTRY_DATA2(rv)); - } + if (complexprefixes && rv->description) strcat(*result, rv->description); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, lemma_present, rv->alen))) + strcat(*result, rv->word); + if (!complexprefixes && rv->description) strcat(*result, rv->description); strcat(*result, "\n"); ok = 1; return 0; @@ -2375,7 +2240,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, st[i] = ch; continue; } - + // second word is acceptable, as a root? // hungarian conventions: compounding is acceptable, // when compound forms consist of 2 words, or if more, @@ -2386,8 +2251,8 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable!=0) && - (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable)) + ((cpdmaxsyllable==0) || + (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) ) && ( (!checkcompounddup || (rv != rv_first)) @@ -2396,23 +2261,12 @@ int AffixMgr::compound_check_morph(const char * word, int len, { // bad compound word strcat(*result, presult); - strcat(*result, " "); - strcat(*result, MORPH_PART); - strcat(*result, word+i); - - if (HENTRY_DATA(rv)) { - if (complexprefixes) strcat(*result, HENTRY_DATA2(rv)); - if (! HENTRY_FIND(rv, MORPH_STEM)) { - strcat(*result, " "); - strcat(*result, MORPH_STEM); - strcat(*result, HENTRY_WORD(rv)); - } - // store the pointer of the hash entry -// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); - if (!complexprefixes) { - strcat(*result, " "); - strcat(*result, HENTRY_DATA2(rv)); - } + + if (rv->description) { + if (complexprefixes) strcat(*result, rv->description); + if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) + strcat(*result, rv->word); + if (!complexprefixes) strcat(*result, rv->description); } strcat(*result, "\n"); ok = 1; @@ -2438,16 +2292,20 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) { - if (m) free(m); + if ((!m || *m == '\0') && compoundend) m = affix_check_morph((word+i),strlen(word+i), compoundend); - } strcat(*result, presult); - if (m || (*m != '\0')) { - sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, - MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); + if (m) { + line_uniq(m); + if (strchr(m, '\n')) { + strcat(*result, "("); + strcat(*result, line_join(m, '|')); + strcat(*result, ")"); + } else { + strcat(*result, m); + } + free(m); } - if (m) free(m); strcat(*result, "\n"); ok = 1; } @@ -2466,7 +2324,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen)) - && (! TESTAFF(rv->astr, needaffix, rv->alen))) { + && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) { st[i] = ch; continue; } @@ -2510,7 +2368,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, if ((rv) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable!=0) && + ((cpdmaxsyllable==0) || (numsyllable <= cpdmaxsyllable)) ) && ( @@ -2518,17 +2376,21 @@ int AffixMgr::compound_check_morph(const char * word, int len, )) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) { - if (m) free(m); + if ((!m || *m == '\0') && compoundend) m = affix_check_morph((word+i),strlen(word+i), compoundend); - } strcat(*result, presult); - if (m && (*m != '\0')) { - sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, - MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); + if (m) { + line_uniq(m); + if (strchr(m, '\n')) { + strcat(*result, "("); + strcat(*result, line_join(m, '|')); + strcat(*result, ")"); + } else { + strcat(*result, m); + } + free(m); } - if (m) free(m); - sprintf(*result + strlen(*result), "%c", MSEP_REC); + strcat(*result, "\n"); ok = 1; } @@ -2549,6 +2411,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, } return 0; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE // return 1 if s1 (reversed) is a leading subset of end of s2 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) @@ -2579,6 +2442,8 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, const FLAG cclass, const FLAG needflag, char in_compound) { struct hentry * rv = NULL; + char result[MAXLNLEN]; + PfxEntry* ep = (PfxEntry *) ppfx; // first handle the special case of 0 length suffixes @@ -2602,11 +2467,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // needaffix on prefix or first suffix + // pseudoroot on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), needaffix, + TESTAFF(ep->getCont(), pseudoroot, ep->getContLen()))) ) ) { @@ -2644,11 +2509,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // needaffix on prefix or first suffix + // pseudoroot on prefix or first suffix (cclass || - !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || + !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), needaffix, + TESTAFF(ep->getCont(), pseudoroot, ep->getContLen()))) ) ) { @@ -2658,6 +2523,17 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, sfx=(AffEntry *)sptr; // BUG: sfx not stateless sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless + if (cclass || sptr->getCont()) { + if (!derived) { + derived = mystrdup(word); + } else { + strcpy(result, derived); // XXX check size + strcat(result, "\n"); + strcat(result, word); + free(derived); + derived = mystrdup(result); + } + } return rv; } } @@ -2712,6 +2588,7 @@ struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, return NULL; } +#ifdef HUNSPELL_EXPERIMENTAL char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag) { @@ -2733,18 +2610,12 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); if (st) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) { - mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); - } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); + if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); } - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); - if (se->getMorph()) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, se->getMorph(), MAXLNLEN); - } else debugflag(result, se->getFlag()); - mystrcat(result, "\n", MAXLNLEN); + if (se->getMorph()) strcat(result, se->getMorph()); + strcat(result, "\n"); } } se = se->getNext(); @@ -2766,14 +2637,19 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, free(st); result3[0] = '\0'; - - if (sptr->getMorph()) { - mystrcat(result3, " ", MAXLNLEN); - mystrcat(result3, sptr->getMorph(), MAXLNLEN); - } else debugflag(result3, sptr->getFlag()); +#ifdef DEBUG + unsigned short flag = sptr->getFlag(); + if (flag_mode == FLAG_NUM) { + sprintf(result3, "<%d>", sptr->getKey()); + } else if (flag_mode == FLAG_LONG) { + sprintf(result3, "<%c%c>", flag >> 8, (flag << 8) >>8); + } else sprintf(result3, "<%c>", flag); + strcat(result3, ":"); +#endif + if (sptr->getMorph()) strcat(result3, sptr->getMorph()); strlinecat(result2, result3); - mystrcat(result2, "\n", MAXLNLEN); - mystrcat(result, result2, MAXLNLEN); + strcat(result2, "\n"); + strcat(result, result2); } } sptr = sptr->getNextEQ(); @@ -2781,7 +2657,7 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, sptr = sptr->getNextNE(); } } - if (*result) return mystrdup(result); + if (result) return mystrdup(result); return NULL; } @@ -2816,40 +2692,26 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // needaffix on prefix or first suffix + // pseudoroot on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), needaffix, + TESTAFF(ep->getCont(), pseudoroot, ep->getContLen()))) ) )) rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) { - mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); - } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); - } - if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); - if (! HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); - } - // store the pointer of the hash entry -// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); - - if (!complexprefixes && HENTRY_DATA(rv)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); } - if (se->getMorph()) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, se->getMorph(), MAXLNLEN); - } else debugflag(result, se->getFlag()); - mystrcat(result, "\n", MAXLNLEN); + if (complexprefixes && rv->description) strcat(result, rv->description); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, lemma_present, rv->alen))) + strcat(result, rv->word); + if (!complexprefixes && rv->description) strcat(result, rv->description); + if (se->getMorph()) strcat(result, se->getMorph()); + strcat(result, "\n"); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } } @@ -2879,36 +2741,30 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // needaffix on first suffix + // pseudoroot on first suffix (cclass || !(sptr->getCont() && - TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) + TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen()))) )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) { - mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); - mystrcat(result, " ", MAXLNLEN); - } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); + if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); } - if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); - if (! HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); - } - // store the pointer of the hash entry -// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); - - if (!complexprefixes && HENTRY_DATA(rv)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); - } + if (complexprefixes && rv->description) strcat(result, rv->description); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word); + if (!complexprefixes && rv->description) strcat(result, rv->description); +#ifdef DEBUG + unsigned short flag = sptr->getFlag(); + if (flag_mode == FLAG_NUM) { + sprintf(result, "<%d>", sptr->getKey()); + } else if (flag_mode == FLAG_LONG) { + sprintf(result, "<%c%c>", flag >> 8, (flag << 8) >>8); + } else sprintf(result, "<%c>", flag); + strcat(result, ":"); +#endif - if (sptr->getMorph()) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, sptr->getMorph(), MAXLNLEN); - } else debugflag(result, sptr->getFlag()); - mystrcat(result, "\n", MAXLNLEN); + if (sptr->getMorph()) strcat(result, sptr->getMorph()); + strcat(result, "\n"); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } sptr = sptr->getNextEQ(); @@ -2920,11 +2776,15 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE + // check if word with affixes is correctly spelled struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) { struct hentry * rv= NULL; + if (derived) free(derived); + derived = NULL; // check all prefixes (also crossed with suffixes if allowed) rv = prefix_check(word, len, in_compound, needflag); @@ -2946,6 +2806,7 @@ struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG ne return rv; } +#ifdef HUNSPELL_EXPERIMENTAL // check if word with affixes is correctly spelled char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) { @@ -2957,14 +2818,14 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl // check all prefixes (also crossed with suffixes if allowed) st = prefix_check_morph(word, len, in_compound); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } // if still not found check all suffixes st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } @@ -2974,133 +2835,54 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl // if still not found check all two-level suffixes st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } // if still not found check all two-level suffixes st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } } - + return mystrdup(result); } - -char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, - unsigned short al, char * morph, char * targetmorph, int level) -{ - // handle suffixes - char * stemmorph; - char * stemmorphcatpos; - char mymorph[MAXLNLEN]; - - if (!morph && !targetmorph) return NULL; - - // check substandard flag - if (TESTAFF(ap, substandard, al)) return NULL; - - if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); - -// int targetcount = get_sfxcount(targetmorph); - - // use input suffix fields, if exist - if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { - stemmorph = mymorph; - strcpy(stemmorph, morph); - strcat(stemmorph, " "); - stemmorphcatpos = stemmorph + strlen(stemmorph); - } else { - stemmorph = morph; - stemmorphcatpos = NULL; - } - - for (int i = 0; i < al; i++) { - const unsigned char c = (unsigned char) (ap[i] & 0x00FF); - SfxEntry * sptr = (SfxEntry *)sFlag[c]; - while (sptr) { - if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) || - // don't generate forms with substandard affixes - !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { - - if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); - else stemmorph = (char *) sptr->getMorph(); - - int cmp = morphcmp(stemmorph, targetmorph); - - if (cmp == 0) { - char * newword = sptr->add(ts, wl); - if (newword) { - hentry * check = pHMgr->lookup(newword); // XXX extra dic - if (!check || !check->astr || - !TESTAFF(check->astr, forbiddenword, check->alen)) { - return newword; - } - free(newword); - } - } - - // recursive call for secondary suffixes - if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && -// (get_sfxcount(stemmorph) < targetcount) && - !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { - char * newword = sptr->add(ts, wl); - if (newword) { - char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(), - sptr->getContLen(), stemmorph, targetmorph, 1); - - if (newword2) { - free(newword); - return newword2; - } - free(newword); - newword = NULL; - } - } - } - sptr = (SfxEntry *)sptr ->getFlgNxt(); - } - } - return NULL; -} +#endif // END OF HUNSPELL_EXPERIMENTAL CODE int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, - char * phon) + int wl, const unsigned short * ap, unsigned short al, char * bad, int badl) { + int nh=0; + // first add root word to list - if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || + if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) || (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { wlst[nh].word = mystrdup(ts); - if (!wlst[nh].word) return 0; wlst[nh].allow = (1 == 0); - wlst[nh].orig = NULL; nh++; - // add special phonetic version - if (phon && (nh < maxn)) { - wlst[nh].word = mystrdup(phon); - if (!wlst[nh].word) return nh - 1; - wlst[nh].allow = (1 == 0); - wlst[nh].orig = mystrdup(ts); - if (!wlst[nh].orig) return nh - 1; - nh++; - } } // handle suffixes for (int i = 0; i < al; i++) { +#ifdef HUNSPELL_CHROME_CLIENT + // This change is taken from a future version of Hunspell. In other + // places, the index is clamped to a byte, so I think this is correct. + // Our array is only 256 entries anyway, so it is required. const unsigned char c = (unsigned char) (ap[i] & 0x00FF); +#else + unsigned short c = (unsigned short) ap[i]; +#endif SfxEntry * sptr = (SfxEntry *)sFlag[c]; while (sptr) { - if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && - (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && - // check needaffix flag - !(sptr->getCont() && ((needaffix && - TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || + if (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && + (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0)) && + // check pseudoroot flag + !(sptr->getCont() && ((pseudoroot && + TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || (circumfix && TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || (onlyincompound && @@ -3110,22 +2892,8 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (newword) { if (nh < maxn) { wlst[nh].word = newword; - wlst[nh].allow = sptr->allowCross(); - wlst[nh].orig = NULL; - nh++; - // add special phonetic version - if (phon && (nh < maxn)) { - char st[MAXWORDUTF8LEN]; - strcpy(st, phon); - strcat(st, sptr->getKey()); - reverseword(st + strlen(phon)); - wlst[nh].word = mystrdup(st); - if (!wlst[nh].word) return nh - 1; - wlst[nh].allow = (1 == 0); - wlst[nh].orig = mystrdup(newword); - if (!wlst[nh].orig) return nh - 1; - nh++; - } + wlst[nh].allow = sptr->allowCross(); + nh++; } else { free(newword); } @@ -3141,10 +2909,15 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts for (int j=1;j<n ;j++) if (wlst[j].allow) { for (int k = 0; k < al; k++) { +#ifdef HUNSPELL_CHROME_CLIENT + // See similar change above. const unsigned char c = (unsigned char) (ap[k] & 0x00FF); +#else + unsigned short c = (unsigned short) ap[k]; +#endif PfxEntry * cptr = (PfxEntry *) pFlag[c]; while (cptr) { - if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && + if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { int l1 = strlen(wlst[j].word); char * newword = cptr->add(wlst[j].word, l1); @@ -3152,7 +2925,6 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (nh < maxn) { wlst[nh].word = newword; wlst[nh].allow = cptr->allowCross(); - wlst[nh].orig = NULL; nh++; } else { free(newword); @@ -3167,14 +2939,19 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts // now handle pure prefixes for (int m = 0; m < al; m ++) { +#ifdef HUNSPELL_CHROME_CLIENT + // See similar change above. const unsigned char c = (unsigned char) (ap[m] & 0x00FF); +#else + unsigned short c = (unsigned short) ap[m]; +#endif PfxEntry * ptr = (PfxEntry *) pFlag[c]; while (ptr) { - if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && - (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && - // check needaffix flag - !(ptr->getCont() && ((needaffix && - TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || + if (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && + (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0)) && + // check pseudoroot flag + !(ptr->getCont() && ((pseudoroot && + TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) || (circumfix && TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || (onlyincompound && @@ -3185,7 +2962,6 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (nh < maxn) { wlst[nh].word = newword; wlst[nh].allow = ptr->allowCross(); - wlst[nh].orig = NULL; nh++; } else { free(newword); @@ -3199,6 +2975,8 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts return nh; } + + // return length of replacing table int AffixMgr::get_numrep() { @@ -3212,27 +2990,6 @@ struct replentry * AffixMgr::get_reptable() return reptable; } -// return iconv table -RepList * AffixMgr::get_iconvtable() -{ - if (! iconvtable ) return NULL; - return iconvtable; -} - -// return oconv table -RepList * AffixMgr::get_oconvtable() -{ - if (! oconvtable ) return NULL; - return oconvtable; -} - -// return replacing table -struct phonetable * AffixMgr::get_phonetable() -{ - if (! phone ) return NULL; - return phone; -} - // return length of character map table int AffixMgr::get_nummap() { @@ -3262,7 +3019,9 @@ char ** AffixMgr::get_breaktable() // return text encoding of dictionary char * AffixMgr::get_encoding() { - if (! encoding ) encoding = mystrdup(SPELL_ENCODING); + if (! encoding ) { + encoding = mystrdup("ISO8859-1"); + } return mystrdup(encoding); } @@ -3278,12 +3037,6 @@ int AffixMgr::get_complexprefixes() return complexprefixes; } -// return FULLSTRIP option -int AffixMgr::get_fullstrip() -{ - return fullstrip; -} - FLAG AffixMgr::get_keepcase() { return keepcase; @@ -3294,17 +3047,11 @@ int AffixMgr::get_checksharps() return checksharps; } -char * AffixMgr::encode_flag(unsigned short aflag) -{ - return pHMgr->encode_flag(aflag); -} - - // return the preferred ignore string for suggestions char * AffixMgr::get_ignore() { if (!ignorechars) return NULL; - return ignorechars; + return mystrdup(ignorechars); } // return the preferred ignore string for suggestions @@ -3314,13 +3061,6 @@ unsigned short * AffixMgr::get_ignore_utf16(int * len) return ignorechars_utf16; } -// return the keyboard string for suggestions -char * AffixMgr::get_key_string() -{ - if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING); - return mystrdup(keystring); -} - // return the preferred try string for suggestions char * AffixMgr::get_try_string() { @@ -3365,9 +3105,9 @@ FLAG AffixMgr::get_nosuggest() } // return the forbidden words flag modify flag -FLAG AffixMgr::get_needaffix() +FLAG AffixMgr::get_pseudoroot() { - return needaffix; + return pseudoroot; } // return the onlyincompound flag @@ -3407,6 +3147,12 @@ const char * AffixMgr::get_suffix() return sfxappnd; } +// return the value of derived form (base word with first suffix). +const char * AffixMgr::get_derived() +{ + return derived; +} + // return the value of suffix const char * AffixMgr::get_version() { @@ -3422,12 +3168,8 @@ FLAG AffixMgr::get_lemma_present() // utility method to look up root words in hash table struct hentry * AffixMgr::lookup(const char * word) { - int i; - struct hentry * he = NULL; - for (i = 0; i < *maxdic && !he; i++) { - he = (alldic[i])->lookup(word); - } - return he; + if (! pHMgr) return NULL; + return pHMgr->lookup(word); } // return the value of suffix @@ -3461,47 +3203,33 @@ int AffixMgr::get_sugswithdots(void) } /* parse flag */ -#ifdef HUNSPELL_CHROME_CLIENT -int AffixMgr::parse_flag(char * line, unsigned short * out) -#else -int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) -#endif -{ +int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) { char * s = NULL; - if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { - HUNSPELL_WARNING(stderr, "error:multiple definitions of an affix file parameter\n"); + if (*out != FLAG_NULL) { + HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); return 1; } - if (parse_string(line, &s, 0)) return 1; + if (parse_string(line, &s, name)) return 1; *out = pHMgr->decode_flag(s); free(s); return 0; } /* parse num */ -#ifdef HUNSPELL_CHROME_CLIENT -int AffixMgr::parse_num(char * line, int * out) -#else -int AffixMgr::parse_num(char * line, int * out, FileMgr * af) -#endif -{ +int AffixMgr::parse_num(char * line, int * out, const char * name) { char * s = NULL; if (*out != -1) { - HUNSPELL_WARNING(stderr, "error: multiple definitions of an affix file parameter\n"); + HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); return 1; } - if (parse_string(line, &s, 0)) return 1; + if (parse_string(line, &s, name)) return 1; *out = atoi(s); free(s); return 0; } /* parse in the max syllablecount of compound words and */ -#ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_cpdsyllable(char * line) -#else -int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) -#endif { char * tp = line; char * piece; @@ -3534,6 +3262,7 @@ int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np < 2) { @@ -3544,12 +3273,12 @@ int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) return 0; } -#ifndef HUNSPELL_CHROME_CLIENT /* parse in the typical fault correcting table */ -int AffixMgr::parse_reptable(char * line, FileMgr * af) +#ifndef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_reptable(char * line, FILE * af) { if (numrep != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n"); return 1; } char * tp = line; @@ -3564,7 +3293,8 @@ int AffixMgr::parse_reptable(char * line, FileMgr * af) case 1: { numrep = atoi(piece); if (numrep < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n"); + free(piece); return 1; } reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); @@ -3576,17 +3306,18 @@ int AffixMgr::parse_reptable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "error: missing replacement table information\n"); return 1; } /* now parse the numrep lines to read in the remainder of the table */ - char * nl; + char * nl = line; for (int j=0; j < numrep; j++) { - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXLNLEN,af)) return 1; mychomp(nl); tp = nl; i = 0; @@ -3598,8 +3329,8 @@ int AffixMgr::parse_reptable(char * line, FileMgr * af) switch(i) { case 0: { if (strncmp(piece,"REP",3) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - numrep = 0; + HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); + free(piece); return 1; } break; @@ -3610,210 +3341,27 @@ int AffixMgr::parse_reptable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); - numrep = 0; - return 1; - } - } - return 0; -} -#endif - -/* parse in the typical fault correcting table */ -#ifdef HUNSPELL_CHROME_CLIENT -int AffixMgr::parse_convtable(char * line, hunspell::LineIterator* iterator, RepList ** rl, const char * keyword) -#else -int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword) -#endif -{ - if (*rl) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); - return 1; - } - char * tp = line; - char * piece; - int i = 0; - int np = 0; - int numrl = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch(i) { - case 0: { np++; break; } - case 1: { - numrl = atoi(piece); - if (numrl < 1) { - HUNSPELL_WARNING(stderr, "error: incorrect entry number\n"); - return 1; - } - *rl = new RepList(numrl); - if (!rl) return 1; - np++; - break; - } - default: break; - } - i++; - } - piece = mystrsep(&tp, 0); - } - if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing data\n"); - return 1; - } - - /* now parse the num lines to read in the remainder of the table */ - char * nl = line; - for (int j=0; j < numrl; j++) { -#ifdef HUNSPELL_CHROME_CLIENT - if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) - return 1; -#else - if (!(nl = af->getline())) return 1; -#endif - mychomp(nl); - tp = nl; - i = 0; - char * pattern = NULL; - char * pattern2 = NULL; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch(i) { - case 0: { - if (strncmp(piece, keyword, sizeof(keyword)) != 0) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - delete *rl; - *rl = NULL; - return 1; - } - break; - } - case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; } - case 2: { - pattern2 = mystrrep(mystrdup(piece),"_"," "); - break; - } - default: break; - } - i++; - } - piece = mystrsep(&tp, 0); - } - if (!pattern || !pattern2) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); return 1; } - (*rl)->add(pattern, pattern2); } return 0; } - - -/* parse in the typical fault correcting table */ -#ifdef HUNSPELL_CHROME_CLIENT -int AffixMgr::parse_phonetable(char * line, hunspell::LineIterator* iterator) -#else -int AffixMgr::parse_phonetable(char * line, FileMgr * af) #endif -{ - if (phone) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); - return 1; - } - char * tp = line; - char * piece; - int i = 0; - int np = 0; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch(i) { - case 0: { np++; break; } - case 1: { - phone = (phonetable *) malloc(sizeof(struct phonetable)); - phone->num = atoi(piece); - phone->rules = NULL; - phone->utf8 = (char) utf8; - if (!phone) return 1; - if (phone->num < 1) { - HUNSPELL_WARNING(stderr, "error: line bad entry number\n"); - return 1; - } - phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *)); - if (!phone->rules) return 1; - np++; - break; - } - default: break; - } - i++; - } - piece = mystrsep(&tp, 0); - } - if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing data\n"); - return 1; - } - - /* now parse the phone->num lines to read in the remainder of the table */ - char * nl = line; - for (int j=0; j < phone->num; j++) { -#ifdef HUNSPELL_CHROME_CLIENT - if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) - return 1; -#else - if (!(nl = af->getline())) return 1; -#endif - mychomp(nl); - tp = nl; - i = 0; - phone->rules[j * 2] = NULL; - phone->rules[j * 2 + 1] = NULL; - piece = mystrsep(&tp, 0); - while (piece) { - if (*piece != '\0') { - switch(i) { - case 0: { - if (strncmp(piece,"PHONE",5) != 0) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - phone->num = 0; - return 1; - } - break; - } - case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; } - case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; } - default: break; - } - i++; - } - piece = mystrsep(&tp, 0); - } - if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - phone->num = 0; - return 1; - } - } - phone->rules[phone->num * 2] = mystrdup(""); - phone->rules[phone->num * 2 + 1] = mystrdup(""); - init_phonet_hash(*phone); - return 0; -} /* parse in the checkcompoundpattern table */ #if HUNSPELL_CHROME_CLIENT int AffixMgr::parse_checkcpdtable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) +int AffixMgr::parse_checkcpdtable(char * line, FILE * af) #endif { if (numcheckcpd != 0) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n"); return 1; } char * tp = line; @@ -3828,10 +3376,11 @@ int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) case 1: { numcheckcpd = atoi(piece); if (numcheckcpd < 1) { - HUNSPELL_WARNING(stderr, "error: bad entry number\n"); + HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n"); + free(piece); return 1; } - checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry)); + checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry)); if (!checkcpdtable) return 1; np++; break; @@ -3840,13 +3389,14 @@ int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing data\n"); + HUNSPELL_WARNING(stderr, "error: missing compound pattern table information\n"); return 1; - } - + } + /* now parse the numcheckcpd lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numcheckcpd; j++) { @@ -3854,55 +3404,36 @@ int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXLNLEN,af)) return 1; #endif mychomp(nl); tp = nl; i = 0; checkcpdtable[j].pattern = NULL; checkcpdtable[j].pattern2 = NULL; - checkcpdtable[j].pattern3 = NULL; - checkcpdtable[j].cond = FLAG_NULL; - checkcpdtable[j].cond2 = FLAG_NULL; piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - numcheckcpd = 0; + HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); + free(piece); return 1; } break; } - case 1: { - checkcpdtable[j].pattern = mystrdup(piece); - char * p = strchr(checkcpdtable[j].pattern, '/'); - if (p) { - *p = '\0'; - checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); - } - break; } - case 2: { - checkcpdtable[j].pattern2 = mystrdup(piece); - char * p = strchr(checkcpdtable[j].pattern2, '/'); - if (p) { - *p = '\0'; - checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); - } - break; - } - case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; } + case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; } + case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; } default: break; } i++; } + free(piece); piece = mystrsep(&tp, 0); } if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - numcheckcpd = 0; + HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); return 1; } } @@ -3913,11 +3444,11 @@ int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_defcpdtable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) +int AffixMgr::parse_defcpdtable(char * line, FILE * af) #endif { if (numdefcpd != 0) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n"); return 1; } char * tp = line; @@ -3932,7 +3463,8 @@ int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) case 1: { numdefcpd = atoi(piece); if (numdefcpd < 1) { - HUNSPELL_WARNING(stderr, "error: bad entry number\n"); + HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n"); + free(piece); return 1; } defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); @@ -3944,10 +3476,11 @@ int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing data\n"); + HUNSPELL_WARNING(stderr, "error: missing compound rule table information\n"); return 1; } @@ -3958,7 +3491,7 @@ int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXLNLEN,af)) return 1; #endif mychomp(nl); tp = nl; @@ -3970,46 +3503,26 @@ int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) switch(i) { case 0: { if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - numdefcpd = 0; + HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); + free(piece); return 1; } break; } - case 1: { // handle parenthesized flags - if (strchr(piece, '(')) { - defcpdtable[j].def = (FLAG *) malloc(sizeof(piece) * sizeof(FLAG)); - defcpdtable[j].len = 0; - int end = 0; - FLAG * conv; - while (!end) { - char * par = piece + 1; - while (*par != '(' && *par != ')' && *par != '\0') par++; - if (*par == '\0') end = 1; else *par = '\0'; - if (*piece == '(') piece++; - if (*piece == '*' || *piece == '?') { - defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece; - } else if (*piece != '\0') { - int l = pHMgr->decode_flags(&conv, piece); - for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; - free(conv); - } - piece = par + 1; - } - } else { - defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece); - } + case 1: { + defcpdtable[j].len = + pHMgr->decode_flags(&(defcpdtable[j].def), piece); break; } default: break; } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (!defcpdtable[j].len) { - HUNSPELL_WARNING(stderr, "error: line table is corrupt\n"); - numdefcpd = 0; + HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); return 1; } } @@ -4021,11 +3534,11 @@ int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_maptable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_maptable(char * line, FileMgr * af) +int AffixMgr::parse_maptable(char * line, FILE * af) #endif { if (nummap != 0) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n"); return 1; } char * tp = line; @@ -4040,7 +3553,8 @@ int AffixMgr::parse_maptable(char * line, FileMgr * af) case 1: { nummap = atoi(piece); if (nummap < 1) { - HUNSPELL_WARNING(stderr, "error: bad entry number\n"); + HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n"); + free(piece); return 1; } maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); @@ -4052,10 +3566,11 @@ int AffixMgr::parse_maptable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: line missing data\n"); + HUNSPELL_WARNING(stderr, "error: missing map table information\n"); return 1; } @@ -4066,7 +3581,7 @@ int AffixMgr::parse_maptable(char * line, FileMgr * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXLNLEN,af)) return 1; #endif mychomp(nl); tp = nl; @@ -4079,8 +3594,8 @@ int AffixMgr::parse_maptable(char * line, FileMgr * af) switch(i) { case 0: { if (strncmp(piece,"MAP",3) != 0) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - nummap = 0; + HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); + free(piece); return 1; } break; @@ -4108,11 +3623,11 @@ int AffixMgr::parse_maptable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - nummap = 0; + HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); return 1; } } @@ -4123,11 +3638,11 @@ int AffixMgr::parse_maptable(char * line, FileMgr * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_breaktable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_breaktable(char * line, FileMgr * af) +int AffixMgr::parse_breaktable(char * line, FILE * af) #endif { if (numbreak != 0) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n"); return 1; } char * tp = line; @@ -4142,7 +3657,8 @@ int AffixMgr::parse_breaktable(char * line, FileMgr * af) case 1: { numbreak = atoi(piece); if (numbreak < 1) { - HUNSPELL_WARNING(stderr, "error: bad entry number\n"); + HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n"); + free(piece); return 1; } breaktable = (char **) malloc(numbreak * sizeof(char *)); @@ -4154,10 +3670,11 @@ int AffixMgr::parse_breaktable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing data\n"); + HUNSPELL_WARNING(stderr, "error: missing word breakpoint table information\n"); return 1; } @@ -4168,7 +3685,7 @@ int AffixMgr::parse_breaktable(char * line, FileMgr * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXLNLEN,af)) return 1; #endif mychomp(nl); tp = nl; @@ -4179,8 +3696,8 @@ int AffixMgr::parse_breaktable(char * line, FileMgr * af) switch(i) { case 0: { if (strncmp(piece,"BREAK",5) != 0) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - numbreak = 0; + HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); + free(piece); return 1; } break; @@ -4193,45 +3710,21 @@ int AffixMgr::parse_breaktable(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (!breaktable) { - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); - numbreak = 0; + HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); return 1; } } return 0; } -void AffixMgr::reverse_condition(char * piece) { - int neg = 0; - for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { - switch(*k) { - case '[': { - if (neg) *(k+1) = '['; else *k = ']'; - break; - } - case ']': { - *k = '['; - if (neg) *(k+1) = '^'; - neg = 0; - break; - } - case '^': { - if (*(k+1) == ']') neg = 1; else *(k+1) = *k; - break; - } - default: { - if (neg) *(k+1) = *k; - } - } - } -} #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_affix(char * line, const char at, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags) +int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags) #endif { int numents = 0; // number of affentry structures to parse @@ -4255,7 +3748,6 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf // split affix header line into pieces int np = 0; - piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { @@ -4270,11 +3762,10 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf #ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates. if (((at == 'S') && (dupflags[aflag] & dupSFX)) || ((at == 'P') && (dupflags[aflag] & dupPFX))) { - HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n", - af->getlinenum()); + HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl); // return 1; XXX permissive mode for bad dictionaries } - dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); + dupflags[aflag] += ((at == 'S') ? dupSFX : dupPFX); #endif break; } @@ -4287,18 +3778,19 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf numents = atoi(piece); if (numents == 0) { char * err = pHMgr->encode_flag(aflag); - if (err) { - HUNSPELL_WARNING(stderr, "error: line bad entry number\n"); - free(err); - } + HUNSPELL_WARNING(stderr, "error: affix %s header has incorrect entry count in line %s\n", + err, nl); + free(err); return 1; } - ptr = (struct affentry *) malloc(numents * sizeof(struct affentry)); + ptr = (struct affentry *) calloc(numents, sizeof(struct affentry)); if (!ptr) return 1; ptr->opts = ff; if (utf8) ptr->opts += aeUTF8; if (pHMgr->is_aliasf()) ptr->opts += aeALIASF; +#ifdef HUNSPELL_EXPERIMENTAL if (pHMgr->is_aliasm()) ptr->opts += aeALIASM; +#endif ptr->aflag = aflag; } @@ -4306,15 +3798,14 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf } i++; } + free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces if (np != 4) { - char * err = pHMgr->encode_flag(aflag); - if (err) { - HUNSPELL_WARNING(stderr, "error: missing data\n"); - free(err); - } + char * err = pHMgr->encode_flag(aflag); + HUNSPELL_WARNING(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl); + free(err); free(ptr); return 1; } @@ -4328,7 +3819,7 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXLNLEN,af)) return 1; #endif mychomp(nl); tp = nl; @@ -4343,8 +3834,7 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf // piece 1 - is type case 0: { np++; - if (nptr != ptr) nptr->opts = ptr->opts & - (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); + if (nptr != ptr) nptr->opts = ptr->opts; break; } @@ -4353,10 +3843,10 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf np++; if (pHMgr->decode_flag(piece) != aflag) { char * err = pHMgr->encode_flag(aflag); - if (err) { - HUNSPELL_WARNING(stderr, "error: affix %s is corrupt\n", err); - free(err); - } + HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); + HUNSPELL_WARNING(stderr, "error: possible incorrect count\n"); + free(err); + free(piece); return 1; } @@ -4383,7 +3873,9 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf // piece 4 - is affix string or 0 for null case 3: { char * dash; +#ifdef HUNSPELL_EXPERIMENTAL nptr->morphcode = NULL; +#endif nptr->contclass = NULL; nptr->contclasslen = 0; np++; @@ -4398,16 +3890,15 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf remove_ignored_chars(piece,ignorechars); } } - + if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } nptr->appnd = mystrdup(piece); - + if (pHMgr->is_aliasf()) { int index = atoi(dash + 1); nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass)); - if (!nptr->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1); } else { nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1); flag_qsort(nptr->contclass, 0, nptr->contclasslen); @@ -4430,9 +3921,9 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } - nptr->appnd = mystrdup(piece); + nptr->appnd = mystrdup(piece); } - + nptr->appndl = (unsigned char) strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); @@ -4446,66 +3937,82 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf case 4: { np++; if (complexprefixes) { + int neg = 0; if (utf8) reverseword_utf(piece); else reverseword(piece); - reverse_condition(piece); + // reverse condition + for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { + switch(*k) { + case '[': { + if (neg) *(k+1) = '['; else *k = ']'; + break; + } + case ']': { + *k = '['; + if (neg) *(k+1) = '^'; + neg = 0; + break; + } + case '^': { + if (*(k+1) == ']') neg = 1; else *(k+1) = *k; + break; + } + default: { + if (neg) *(k+1) = *k; + } + } + } } if (nptr->stripl && (strcmp(piece, ".") != 0) && - redundant_condition(at, nptr->strip, nptr->stripl, piece, 0)) + redundant_condition(at, nptr->strip, nptr->stripl, piece, nl)) strcpy(piece, "."); - if (at == 'S') { - reverseword(piece); - reverse_condition(piece); - } - if (encodeit(nptr, piece)) return 1; + if (encodeit(nptr,piece)) return 1; break; } - + +#ifdef HUNSPELL_EXPERIMENTAL case 5: { np++; if (pHMgr->is_aliasm()) { int index = atoi(piece); nptr->morphcode = pHMgr->get_aliasm(index); } else { - if (complexprefixes) { // XXX - fix me for morph. gen. + if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } - // add the remaining of the line - if (*tp) { - *(tp - 1) = ' '; - tp = tp + strlen(tp); - } nptr->morphcode = mystrdup(piece); - if (!nptr->morphcode) return 1; } break; } +#endif + default: break; } i++; } + free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces if (np < 4) { char * err = pHMgr->encode_flag(aflag); - if (err) { - HUNSPELL_WARNING(stderr, "error: affix %s is corrupt\n", err); - free(err); - } + HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); + free(err); free(ptr); return 1; } #ifdef DEBUG +#ifdef HUNSPELL_EXPERIMENTAL // detect unnecessary fields, excepting comments if (basefieldnum) { int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); if (fieldnum != basefieldnum) - HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "warning: bad field number:\n%s\n", nl); } else { basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); } #endif +#endif nptr++; } @@ -4521,12 +4028,12 @@ int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupf build_sfxtree((AffEntry *)sfxptr); } nptr++; - } + } free(ptr); return 0; } -int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) { +int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) { int condl = strlen(cond); int i; int j; @@ -4539,8 +4046,7 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { if (cond[j] != '[') { if (cond[j] != strip[i]) { - HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); - return 0; + HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); } } else { neg = (cond[j+1] == '^') ? 1 : 0; @@ -4550,12 +4056,12 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char if (strip[i] == cond[j]) in = 1; } while ((j < (condl - 1)) && (cond[j] != ']')); if (j == (condl - 1) && (cond[j] != ']')) { - HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum); + HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line); return 0; } if ((!neg && !in) || (neg && in)) { - HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); - return 0; + HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); + return 0; } } } @@ -4568,8 +4074,7 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { if (cond[j] != ']') { if (cond[j] != strip[i]) { - HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); - return 0; + HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); } } else { in = 0; @@ -4578,18 +4083,18 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char if (strip[i] == cond[j]) in = 1; } while ((j > 0) && (cond[j] != '[')); if ((j == 0) && (cond[j] != '[')) { - HUNSPELL_WARNING(stderr, "error: error: %d: missing ] in condition:\n%s\n", linenum); + HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line); return 0; } neg = (cond[j+1] == '^') ? 1 : 0; if ((!neg && !in) || (neg && in)) { - HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); - return 0; + HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); + return 0; } } } if (j < 0) return 1; - } + } } return 0; } diff --git a/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx b/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx index fa4e217..e960068 100644 --- a/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx @@ -13,8 +13,6 @@ using namespace std; #include "atypes.hxx" #include "baseaffix.hxx" #include "hashmgr.hxx" -#include "phonet.hxx" -#include "replist.hxx" // check flag duplication #define dupSFX (1 << 0) @@ -68,15 +66,12 @@ class AffixMgr AffEntry * sFlag[CONTSIZE]; #endif HashMgr * pHMgr; - HashMgr ** alldic; - int * maxdic; - char * keystring; char * trystring; char * encoding; struct cs_info * csconv; int utf8; int complexprefixes; - FLAG compoundflag; + FLAG compoundflag; FLAG compoundbegin; FLAG compoundmiddle; FLAG compoundend; @@ -87,25 +82,20 @@ class AffixMgr int checkcompoundrep; int checkcompoundcase; int checkcompoundtriple; - int simplifiedtriple; FLAG forbiddenword; FLAG nosuggest; - FLAG needaffix; + FLAG pseudoroot; int cpdmin; int numrep; replentry * reptable; - RepList * iconvtable; - RepList * oconvtable; int nummap; mapentry * maptable; int numbreak; char ** breaktable; int numcheckcpd; - patentry * checkcpdtable; - int simplifiedcpd; + replentry * checkcpdtable; int numdefcpd; flagentry * defcpdtable; - phonetable * phone; int maxngramsugs; int nosplitsugs; int sugswithdots; @@ -135,9 +125,7 @@ class AffixMgr FLAG circumfix; FLAG onlyincompound; FLAG keepcase; - FLAG substandard; int checksharps; - int fullstrip; int havecontclass; // boolean variable #ifdef HUNSPELL_CHROME_CLIENT @@ -145,81 +133,68 @@ class AffixMgr #else char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix) #endif - + flag flag_mode; + public: - #ifdef HUNSPELL_CHROME_CLIENT - AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md); + AffixMgr(hunspell::BDictReader* reader, HashMgr* ptr); #else - AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) + AffixMgr(FILE* aff_handle, HashMgr * ptr); #endif ~AffixMgr(); struct hentry * affix_check(const char * word, int len, - const unsigned short needflag = (unsigned short) 0, - char in_compound = IN_CPD_NOT); + const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); struct hentry * prefix_check(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isSubset(const char * s1, const char * s2); struct hentry * prefix_check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isRevSubset(const char * s1, const char * end_of_s2, int len); - struct hentry * suffix_check(const char * word, int len, int sfxopts, - AffEntry* ppfx, char ** wlst, int maxSug, int * ns, - const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, - char in_compound = IN_CPD_NOT); + struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx, + char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); struct hentry * suffix_check_twosfx(const char * word, int len, int sfxopts, AffEntry* ppfx, const FLAG needflag = FLAG_NULL); char * affix_check_morph(const char * word, int len, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_morph(const char * word, int len, - char in_compound, const FLAG needflag = FLAG_NULL); - char * suffix_check_morph (const char * word, int len, int sfxopts, - AffEntry * ppfx, const FLAG cclass = FLAG_NULL, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + char in_compound, const FLAG needflag = FLAG_NULL); + char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx, + const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); char * suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL); - char * morphgen(char * ts, int wl, const unsigned short * ap, - unsigned short al, char * morph, char * targetmorph, int level); + int expand_rootword(struct guessword * wlst, int maxn, const char * ts, + int wl, const unsigned short * ap, unsigned short al, char * bad, int); - int expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, - int, char *); + short get_syllable (const char * word, int wlen); + int cpdrep_check(const char * word, int len); + int cpdpat_check(const char * word, int len); + int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all); + int cpdcase_check(const char * word, int len); + inline int candidate_check(const char * word, int len); + struct hentry * compound_check(const char * word, int len, + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug); - short get_syllable (const char * word, int wlen); - int cpdrep_check(const char * word, int len); - int cpdpat_check(const char * word, int len, hentry * r1, hentry * r2); - int defcpd_check(hentry *** words, short wnum, hentry * rv, - hentry ** rwords, char all); - int cpdcase_check(const char * word, int len); - inline int candidate_check(const char * word, int len); - void setcminmax(int * cmin, int * cmax, const char * word, int len); - struct hentry * compound_check(const char * word, int len, short wordnum, - short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, char is_sug); + int compound_check_morph(const char * word, int len, + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char ** result, char * partresult); - int compound_check_morph(const char * word, int len, short wordnum, - short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, char ** result, char * partresult); - - struct hentry * lookup(const char * word); + struct hentry * lookup(const char * word); int get_numrep(); struct replentry * get_reptable(); - RepList * get_iconvtable(); - RepList * get_oconvtable(); - struct phonetable * get_phonetable(); int get_nummap(); struct mapentry * get_maptable(); int get_numbreak(); char ** get_breaktable(); char * get_encoding(); int get_langnum(); - char * get_key_string(); char * get_try_string(); const char * get_wordchars(); unsigned short * get_wordchars_utf16(int * len); @@ -230,7 +205,8 @@ public: FLAG get_compoundbegin(); FLAG get_forbiddenword(); FLAG get_nosuggest(); - FLAG get_needaffix(); +// FLAG get_circumfix(); + FLAG get_pseudoroot(); FLAG get_onlyincompound(); FLAG get_compoundroot(); FLAG get_lemma_present(); @@ -249,8 +225,6 @@ public: int get_sugswithdots(void); FLAG get_keepcase(void); int get_checksharps(void); - char * encode_flag(unsigned short aflag); - int get_fullstrip(); private: #ifdef HUNSPELL_CHROME_CLIENT @@ -258,37 +232,31 @@ private: hunspell::BDictReader* bdict_reader; int parse_file(); - int parse_flag(char * line, unsigned short * out); - int parse_num(char * line, int * out); - int parse_cpdsyllable(char * line); - - int parse_reptable(char * line, hunspell::LineIterator* iterator); - int parse_convtable(char * line, hunspell::LineIterator* iterator, RepList ** rl, const char * keyword); - int parse_phonetable(char * line, hunspell::LineIterator* iterator); +#else + int parse_file(FILE* aff_handle); +#endif +// int parse_string(char * line, char ** out, const char * name); + int parse_flag(char * line, unsigned short * out, const char * name); + int parse_num(char * line, int * out, const char * name); +// int parse_array(char * line, char ** out, unsigned short ** out_utf16, +// int * out_utf16_len, const char * name); + int parse_cpdsyllable(char * linfe); +#ifdef HUNSPELL_CHROME_CLIENT + // We just change the FILE* to be an iterator. int parse_maptable(char * line, hunspell::LineIterator* iterator); - int parse_breaktable(char * line, hunspell::LineIterator* iterator); int parse_checkcpdtable(char * line, hunspell::LineIterator* iterator); + int parse_breaktable(char * line, hunspell::LineIterator* iterator); int parse_defcpdtable(char * line, hunspell::LineIterator* iterator); int parse_affix(char * line, const char at, hunspell::LineIterator* iterator); #else - int parse_file(FILE* aff_handle, const char * key); - int parse_flag(char * line, unsigned short * out, FileMgr * af); - int parse_num(char * line, int * out, FileMgr * af); - int parse_cpdsyllable(char * line, FileMgr * af); - - int parse_reptable(char * line, FileMgr * af); - int parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword); - int parse_phonetable(char * line, FileMgr * af); - int parse_maptable(char * line, FileMgr * af); - int parse_breaktable(char * line, FileMgr * af); - int parse_checkcpdtable(char * line, FileMgr * af); - int parse_defcpdtable(char * line, FileMgr * af); - int parse_affix(char * line, const char at, FileMgr * af, char * dupflags); + int parse_reptable(char * line, FILE * af); + int parse_maptable(char * line, FILE * af); + int parse_breaktable(char * line, FILE * af); + int parse_checkcpdtable(char * line, FILE * af); + int parse_defcpdtable(char * line, FILE * af); + int parse_affix(char * line, const char at, FILE * af, char * dupflags); #endif - void reverse_condition(char *); - void debugflag(char * result, unsigned short flag); - int condlen(char *); int encodeit(struct affentry * ptr, char * cs); int build_pfxtree(AffEntry* pfxptr); int build_sfxtree(AffEntry* sfxptr); @@ -298,8 +266,7 @@ private: AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr); int process_pfx_tree_to_list(); int process_sfx_tree_to_list(); - int redundant_condition(char, char * strip, int stripl, - const char * cond, int); + int redundant_condition(char, char * strip, int stripl, const char * cond, char *); }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/atypes.hxx b/chrome/third_party/hunspell/src/hunspell/atypes.hxx index 4753f9c..4f6c1ea 100644 --- a/chrome/third_party/hunspell/src/hunspell/atypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/atypes.hxx @@ -5,28 +5,27 @@ #ifdef HUNSPELL_WARNING_ON #define HUNSPELL_WARNING fprintf #else -// empty inline function to switch off warnings (instead of the C99 standard variadic macros) -static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} +#define HUNSPELL_WARNING #endif #endif // HUNSTEM def. #define HUNSTEM +#include "csutil.hxx" #include "hashmgr.hxx" -#include "w_char.hxx" #define SETSIZE 256 #define CONTSIZE 65536 #define MAXWORDLEN 100 -#define MAXWORDUTF8LEN 256 +#define MAXWORDUTF8LEN (MAXWORDLEN * 4) // affentry options #define aeXPRODUCT (1 << 0) #define aeUTF8 (1 << 1) #define aeALIASF (1 << 2) #define aeALIASM (1 << 3) -#define aeLONGCOND (1 << 4) +#define aeINFIX (1 << 4) // compound options #define IN_CPD_NOT 0 @@ -34,12 +33,10 @@ static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} #define IN_CPD_END 2 #define IN_CPD_OTHER 3 -#define MAXLNLEN 8192 +#define MAXLNLEN 8192 * 4 #define MINCPDLEN 3 #define MAXCOMPOUND 10 -#define MAXCONDLEN 20 -#define MAXCONDLEN_1 (MAXCONDLEN - sizeof(char *)) #define MAXACC 1000 @@ -58,22 +55,26 @@ struct affentry char numconds; char opts; unsigned short aflag; - unsigned short * contclass; - short contclasslen; union { - char conds[MAXCONDLEN]; - struct { - char conds1[MAXCONDLEN_1]; - char * conds2; - } l; - } c; + char base[SETSIZE]; + struct { + char ascii[SETSIZE/2]; + char neg[8]; + char all[8]; + w_char * wchars[8]; + int wlen[8]; + } utf8; + } conds; +#ifdef HUNSPELL_EXPERIMENTAL char * morphcode; +#endif + unsigned short * contclass; + short contclasslen; }; -struct guessword { - char * word; - bool allow; - char * orig; +struct replentry { + char * pattern; + char * pattern2; }; struct mapentry { @@ -87,12 +88,14 @@ struct flagentry { int len; }; -struct patentry { - char * pattern; - char * pattern2; - char * pattern3; - FLAG cond; - FLAG cond2; +struct guessword { + char * word; + bool allow; }; #endif + + + + + diff --git a/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx b/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx index 03a876d..d6a5cd6 100644 --- a/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx +++ b/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx @@ -6,23 +6,26 @@ class AffEntry public: protected: - char * appnd; - char * strip; - unsigned char appndl; - unsigned char stripl; - char numconds; - char opts; - unsigned short aflag; - union { - char conds[MAXCONDLEN]; - struct { - char conds1[MAXCONDLEN_1]; - char * conds2; - } l; - } c; - char * morphcode; - unsigned short * contclass; - short contclasslen; + char * appnd; + char * strip; + unsigned char appndl; + unsigned char stripl; + char numconds; + char opts; + unsigned short aflag; + union { + char base[SETSIZE]; + struct { + char ascii[SETSIZE/2]; + char neg[8]; + char all[8]; + w_char * wchars[8]; + int wlen[8]; + } utf8; + } conds; + char * morphcode; + unsigned short * contclass; + short contclasslen; }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/csutil.cxx b/chrome/third_party/hunspell/src/hunspell/csutil.cxx index c07e34d..4424b98 100644 --- a/chrome/third_party/hunspell/src/hunspell/csutil.cxx +++ b/chrome/third_party/hunspell/src/hunspell/csutil.cxx @@ -5,12 +5,10 @@ #include <cstdlib> #include <cstring> #include <cstdio> -#include <cctype> #else #include <stdlib.h> #include <string.h> #include <stdio.h> -#include <ctype.h> #endif #include "csutil.hxx" @@ -45,18 +43,17 @@ static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID); using namespace std; #endif #else -#ifndef WIN32 +#ifndef W32 using namespace std; #endif #endif -static struct unicode_info2 * utf_tbl = NULL; -static int utf_tbl_count = 0; // utf_tbl can be used by multiple Hunspell instances +struct unicode_info2 * utf_tbl = NULL; /* only UTF-16 (BMP) implementation */ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { - signed char * u8 = (signed char *)dest; - signed char * u8_max = (signed char *)(u8 + size); + char * u8 = dest; + char * u8_max = u8 + size; const w_char * u2 = src; const w_char * u2_max = src + srclen; while ((u2 < u2_max) && (u8 < u8_max)) { @@ -103,12 +100,12 @@ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { /* only UTF-16 (BMP) implementation */ int u8_u16(w_char * dest, int size, const char * src) { - const signed char * u8 = (const signed char *)src; + const char * u8 = src; w_char * u2 = dest; w_char * u2_max = u2 + size; while ((u2 < u2_max) && *u8) { - switch ((*u8) & 0xf0) { + switch ((*u8) & 0xf0) { case 0x00: case 0x10: case 0x20: @@ -125,7 +122,7 @@ int u8_u16(w_char * dest, int size, const char * src) { case 0x90: case 0xa0: case 0xb0: { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - (signed char *)src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", u8 - src, src); u2->h = 0xff; u2->l = 0xfd; break; @@ -137,7 +134,7 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); u2->h = 0xff; u2->l = 0xfd; } @@ -151,12 +148,12 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); u2->h = 0xff; u2->l = 0xfd; } } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); u2->h = 0xff; u2->l = 0xfd; } @@ -221,11 +218,13 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { char * mystrsep(char ** stringp, const char delim) { + char * rv = NULL; char * mp = *stringp; - if (*mp != '\0') { + int n = strlen(mp); + if (n > 0) { char * dp; if (delim) { - dp = strchr(mp, delim); + dp = (char *)memchr(mp,(int)((unsigned char)delim),n); } else { // don't use isspace() here, the string can be in some random charset // that's way different than the locale's @@ -235,16 +234,22 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (dp) { *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); - *(mp+nc) = '\0'; - return mp; + rv = (char *) malloc(nc+1); + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + return rv; } else { - *stringp = mp + strlen(mp); - return mp; + rv = (char *) malloc(n+1); + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + return rv; } } return NULL; } + // replaces strdup with ansi version char * mystrdup(const char * s) { @@ -252,27 +257,12 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (s) { int sl = strlen(s); d = (char *) malloc(((sl+1) * sizeof(char))); - if (d) { - memcpy(d,s,((sl+1)*sizeof(char))); - return d; - } - HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); + if (d) memcpy(d,s,((sl+1)*sizeof(char))); } return d; } - - // strcat for limited length destination string - char * mystrcat(char * dest, const char * st, int max) { - int len; - int len2; - if (dest == NULL || st == NULL) return dest; - len = strlen(dest); - len2 = strlen(st); - if (len + len2 + 1 > max) return dest; - strcpy(dest + len, st); - return dest; - } - + + // remove cross-platform text line end characters void mychomp(char * s) { @@ -299,258 +289,112 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { return d; } +#ifdef HUNSPELL_EXPERIMENTAL + // append s to ends of every lines in text + void strlinecat(char * dest, const char * s) + { + char * dup = mystrdup(dest); + char * source = dup; + int len = strlen(s); + while (*source) { + if (*source == '\n') { + strncpy(dest, s, len); + dest += len; + } + *dest = *source; + source++; dest++; + } + strcpy(dest, s); + free(dup); + } + // break text to lines // return number of lines -int line_tok(const char * text, char *** lines, char breakchar) { +int line_tok(const char * text, char *** lines) { int linenum = 0; char * dup = mystrdup(text); - char * p = strchr(dup, breakchar); + char * p = strchr(dup, '\n'); while (p) { linenum++; *p = '\0'; p++; - p = strchr(p, breakchar); - } - linenum++; -// fprintf(stderr, "LINEN:%d %p %p\n", linenum, lines, *lines); - *lines = (char **) malloc(linenum * sizeof(char *)); -// fprintf(stderr, "hello\n"); - if (!(*lines)) { - free(dup); - return 0; + p = strchr(p, '\n'); } + *lines = (char **) calloc(linenum + 1, sizeof(char *)); + if (!(*lines)) return -1; - p = dup; - int l = 0; - for (int i = 0; i < linenum; i++) { - if (*p != '\0') { - (*lines)[l] = mystrdup(p); - if (!(*lines)[l]) { - for (i = 0; i < l; i++) free((*lines)[i]); - free(dup); - return 0; - } - l++; - } + p = dup; + for (int i = 0; i < linenum + 1; i++) { + (*lines)[i] = mystrdup(p); p += strlen(p) + 1; } free(dup); - if (!l) free(*lines); - return l; + return linenum; } // uniq line in place -char * line_uniq(char * text, char breakchar) { +char * line_uniq(char * text) { char ** lines; - int linenum = line_tok(text, &lines, breakchar); + int linenum = line_tok(text, &lines); int i; strcpy(text, lines[0]); - for ( i = 1; i < linenum; i++ ) { + for ( i = 1; i<=linenum; i++ ) { int dup = 0; for (int j = 0; j < i; j++) { if (strcmp(lines[i], lines[j]) == 0) dup = 1; } if (!dup) { - if ((i > 1) || (*(lines[0]) != '\0')) { - sprintf(text + strlen(text), "%c", breakchar); - } + if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n"); strcat(text, lines[i]); } } - for ( i = 0; i < linenum; i++ ) { + for ( i = 0; i<=linenum; i++ ) { if (lines[i]) free(lines[i]); } if (lines) free(lines); return text; } -// uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) " -char * line_uniq_app(char ** text, char breakchar) { - if (!strchr(*text, breakchar)) { - return *text; - } - - char ** lines; - int i; - int linenum = line_tok(*text, &lines, breakchar); - int dup = 0; - for (i = 0; i < linenum; i++) { - for (int j = 0; j < (i - 1); j++) { - if (strcmp(lines[i], lines[j]) == 0) { - *(lines[i]) = '\0'; - dup++; - break; - } - } - } - if ((linenum - dup) == 1) { - strcpy(*text, lines[0]); - freelist(&lines, linenum); - return *text; - } - char * newtext = (char *) malloc(strlen(*text) + 2 * linenum + 3 + 1); - if (newtext) { - free(*text); - *text = newtext; - } else { - freelist(&lines, linenum); - return *text; - } - strcpy(*text," ( "); - for (i = 0; i < linenum; i++) if (*(lines[i])) { - sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); - } - (*text)[strlen(*text) - 2] = ')'; // " ) " - freelist(&lines, linenum); - return *text; -} - - // append s to ends of every lines in text - void strlinecat(char * dest, const char * s) - { - char * dup = mystrdup(dest); - char * source = dup; - int len = strlen(s); - if (dup) { - while (*source) { - if (*source == '\n') { - strncpy(dest, s, len); - dest += len; - } - *dest = *source; - source++; dest++; - } - strcpy(dest, s); - free(dup); - } - } - // change \n to char c -char * tr(char * text, char oldc, char newc) { +char * line_join(char * text, char c) { char * p; - for (p = text; *p; p++) if (*p == oldc) *p = newc; + for (p = text; *p; p++) if (*p == '\n') *p = c; return text; } -// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields -// in the first line of the inputs -// return 0, if inputs equal -// return 1, if inputs may equal with a secondary suffix -// otherwise return -1 -int morphcmp(const char * s, const char * t) -{ - int se = 0; - int te = 0; - const char * sl; - const char * tl; - const char * olds; - const char * oldt; - if (!s || !t) return 1; - olds = s; - sl = strchr(s, '\n'); - s = strstr(s, MORPH_DERI_SFX); - if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); - if (!s || (sl && sl < s)) { - s= strstr(olds, MORPH_TERM_SFX); - olds = NULL; - } - oldt = t; - tl = strchr(t, '\n'); - t = strstr(t, MORPH_DERI_SFX); - if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); - if (!t || (tl && tl < t)) { - t = strstr(oldt, MORPH_TERM_SFX); - oldt = NULL; - } - while (s && t && (!sl || sl > s) && (!tl || tl > t)) { - s += MORPH_TAG_LEN; - t += MORPH_TAG_LEN; - se = 0; - te = 0; - while ((*s == *t) && !se && !te) { - s++; - t++; - switch(*s) { - case ' ': - case '\n': - case '\t': - case '\0': se = 1; - } - switch(*t) { - case ' ': - case '\n': - case '\t': - case '\0': te = 1; - } - } - if (!se || !te) { - // not terminal suffix difference - if (olds) return -1; - return 1; - } - olds = s; - s = strstr(s, MORPH_DERI_SFX); - if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); - if (!s || (sl && sl < s)) { - s = strstr(olds, MORPH_TERM_SFX); - olds = NULL; - } - oldt = t; - t = strstr(t, MORPH_DERI_SFX); - if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); - if (!t || (tl && tl < t)) { - t = strstr(oldt, MORPH_TERM_SFX); - oldt = NULL; +// leave only last {[^}]*} substring for handling zero morphemes +char * delete_zeros(char * morphout) { + char * p = morphout; + char * q = p; + char * q2 = NULL; + int suffix = 0; + + for (;*p && *(p+1);) { + switch (*p) { + case '{': + q2 = q; + q--; + break; + case '}': + if (q2) { + suffix = 1; + q--; + } + break; + default: + if (suffix) { + q = q2; + } + suffix = 0; + *q = *p; } + p++; + q++; } - if (!s && !t && se && te) return 0; - return 1; -} - -int get_sfxcount(const char * morph) -{ - if (!morph || !*morph) return 0; - int n = 0; - const char * old = morph; - morph = strstr(morph, MORPH_DERI_SFX); - if (!morph) morph = strstr(old, MORPH_INFL_SFX); - if (!morph) morph = strstr(old, MORPH_TERM_SFX); - while (morph) { - n++; - old = morph; - morph = strstr(morph + 1, MORPH_DERI_SFX); - if (!morph) morph = strstr(old + 1, MORPH_INFL_SFX); - if (!morph) morph = strstr(old + 1, MORPH_TERM_SFX); - } - return n; -} - - -int fieldlen(const char * r) -{ - int n = 0; - while (r && *r != '\t' && *r != '\0' && *r != '\n' && *r != ' ') { - r++; - n++; - } - return n; -} - -char * copy_field(char * dest, const char * morph, const char * var) -{ - if (!morph) return NULL; - const char * beg = strstr(morph, var); - if (beg) { - char * d = dest; - for (beg += MORPH_TAG_LEN; *beg != ' ' && *beg != '\t' && - *beg != '\n' && *beg != '\0'; d++, beg++) { - *d = *beg; - } - *d = '\0'; - return dest; - } - return NULL; + *q = '\0'; + return morphout; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * mystrrep(char * word, const char * pat, const char * rep) { char * pos = strstr(word, pat); @@ -601,34 +445,6 @@ char * mystrrep(char * word, const char * pat, const char * rep) { u16_u8(word, MAXWORDUTF8LEN, w, l); return 0; } - - int uniqlist(char ** list, int n) { - int i; - if (n < 2) return n; - for (i = 0; i < n; i++) { - for (int j = 0; j < i; j++) { - if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { - free(list[i]); - list[i] = NULL; - break; - } - } - } - int m = 1; - for (i = 1; i < n; i++) if (list[i]) { - list[m] = list[i]; - m++; - } - return m; - } - - void freelist(char *** list, int n) { - if (list && *list && n > 0) { - for (int i = 0; i < n; i++) if ((*list)[i]) free((*list)[i]); - free(*list); - *list = NULL; - } - } // convert null terminated string to all caps void mkallcap(char * p, const struct cs_info * csconv) @@ -662,8 +478,8 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { for (int i = 0; i < nc; i++) { unsigned short idx = (u[i].h << 8) + u[i].l; if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); + u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); + u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); } } } @@ -674,20 +490,6 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; } - // conversion function for protected memory - void store_pointer(char * dest, char * source) - { - memcpy(dest, &source, sizeof(char *)); - } - - // conversion function for protected memory - char * get_stored_pointer(char * s) - { - char * p; - memcpy(&p, s, sizeof(char *)); - return p; - } - #ifndef MOZILLA_CLIENT // convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding) @@ -980,7 +782,7 @@ struct cs_info iso1_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; @@ -1240,7 +1042,7 @@ struct cs_info iso2_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; @@ -1500,7 +1302,7 @@ struct cs_info iso3_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso4_tbl[] = { @@ -1759,7 +1561,7 @@ struct cs_info iso4_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso5_tbl[] = { @@ -2018,7 +1820,7 @@ struct cs_info iso5_tbl[] = { { 0x00, 0xfc, 0xac }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xae }, -{ 0x00, 0xff, 0xaf } +{ 0x00, 0xff, 0xaf }, }; struct cs_info iso6_tbl[] = { @@ -2277,7 +2079,7 @@ struct cs_info iso6_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso7_tbl[] = { @@ -2536,7 +2338,7 @@ struct cs_info iso7_tbl[] = { { 0x00, 0xfc, 0xbc }, { 0x00, 0xfd, 0xbe }, { 0x00, 0xfe, 0xbf }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso8_tbl[] = { @@ -2795,7 +2597,7 @@ struct cs_info iso8_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso9_tbl[] = { @@ -3054,7 +2856,7 @@ struct cs_info iso9_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0x49 }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso10_tbl[] = { @@ -3313,7 +3115,7 @@ struct cs_info iso10_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info koi8r_tbl[] = { @@ -3572,7 +3374,7 @@ struct cs_info koi8r_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff } +{ 0x01, 0xdf, 0xff }, }; struct cs_info koi8u_tbl[] = { @@ -3831,7 +3633,7 @@ struct cs_info koi8u_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff } +{ 0x01, 0xdf, 0xff }, }; struct cs_info cp1251_tbl[] = { @@ -4090,7 +3892,7 @@ struct cs_info cp1251_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xdf } +{ 0x00, 0xff, 0xdf }, }; struct cs_info iso13_tbl[] = { @@ -4349,7 +4151,7 @@ struct cs_info iso13_tbl[] = { { 0x00, 0xFC, 0xDC }, { 0x00, 0xFD, 0xDD }, { 0x00, 0xFE, 0xDE }, -{ 0x00, 0xFF, 0xFF } +{ 0x00, 0xFF, 0xFF }, }; @@ -4609,7 +4411,7 @@ struct cs_info iso14_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; struct cs_info iso15_tbl[] = { @@ -4868,7 +4670,7 @@ struct cs_info iso15_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xbe } +{ 0x00, 0xff, 0xbe }, }; struct cs_info iscii_devanagari_tbl[] = { @@ -5127,10 +4929,10 @@ struct cs_info iscii_devanagari_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff } +{ 0x00, 0xff, 0xff }, }; -static struct enc_entry encds[] = { +struct enc_entry encds[] = { {"ISO8859-1",iso1_tbl}, {"ISO8859-2",iso2_tbl}, {"ISO8859-3",iso3_tbl}, @@ -5147,7 +4949,7 @@ static struct enc_entry encds[] = { {"ISO8859-13", iso13_tbl}, {"ISO8859-14", iso14_tbl}, {"ISO8859-15", iso15_tbl}, -{"ISCII-DEVANAGARI", iscii_devanagari_tbl} +{"ISCII-DEVANAGARI", iscii_devanagari_tbl}, }; struct cs_info * get_current_cs(const char * es) { @@ -5156,7 +4958,6 @@ struct cs_info * get_current_cs(const char * es) { for (int i = 0; i < n; i++) { if (strcmp(es,encds[i].enc_name) == 0) { ccs = encds[i].cs_table; - break; } } return ccs; @@ -5237,26 +5038,6 @@ struct cs_info * get_current_cs(const char * es) { } #endif -// primitive isalpha() replacement for tokenization -char * get_casechars(const char * enc) { - struct cs_info * csconv = get_current_cs(enc); - char expw[MAXLNLEN]; - char * p = expw; - for (int i = 0; i <= 255; i++) { - if ((csconv[i].cupper != csconv[i].clower)) { - *p = (char) i; - p++; - } - } - *p = '\0'; -#ifdef MOZILLA_CLIENT - delete csconv; -#endif - return mystrdup(expw); -} - - - struct lang_map lang2enc[] = { {"ar", "UTF-8", LANG_ar}, {"az", "UTF-8", LANG_az}, @@ -5309,8 +5090,6 @@ int get_lang_num(const char * lang) { #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT int initialize_utf_tbl() { - utf_tbl_count++; - if (utf_tbl) return 0; utf_tbl = (unicode_info2 *) malloc(CONTSIZE * sizeof(unicode_info2)); if (utf_tbl) { int j; @@ -5331,11 +5110,7 @@ int initialize_utf_tbl() { #endif void free_utf_tbl() { - if (utf_tbl_count > 0) utf_tbl_count--; - if (utf_tbl && (utf_tbl_count == 0)) { - free(utf_tbl); - utf_tbl = NULL; - } + if (utf_tbl) free(utf_tbl); } #ifdef MOZILLA_CLIENT @@ -5358,11 +5133,11 @@ unsigned short unicodetoupper(unsigned short c, int langnum) return u_toupper(c); #else #ifdef MOZILLA_CLIENT - PRUnichar ch2; - getcaseConv()->ToUpper((PRUnichar) c, &ch2); - return ch2; + unsigned short ret(c); + getcaseConv()->ToUpper(c, &ret); + return ret; #else - return (utf_tbl) ? utf_tbl[c].cupper : c; + return utf_tbl[c].cupper; #endif #endif } @@ -5378,11 +5153,11 @@ unsigned short unicodetolower(unsigned short c, int langnum) return u_tolower(c); #else #ifdef MOZILLA_CLIENT - PRUnichar ch2; - getcaseConv()->ToLower((PRUnichar) c, &ch2); - return ch2; + unsigned short ret(c); + getcaseConv()->ToLower(c, &ret); + return ret; #else - return (utf_tbl) ? utf_tbl[c].clower : c; + return utf_tbl[c].clower; #endif #endif } @@ -5392,72 +5167,10 @@ int unicodeisalpha(unsigned short c) #ifdef OPENOFFICEORG return u_isalpha(c); #else - return (utf_tbl) ? utf_tbl[c].cletter : 0; + return utf_tbl[c].cletter; #endif } -/* get type of capitalization */ -int get_captype(char * word, int nl, cs_info * csconv) { - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int firstcap = 0; - if (csconv == NULL) return NOCAP; - for (char * q = word; *q != '\0'; q++) { - if (csconv[*((unsigned char *)q)].ccase) ncap++; - if (csconv[*((unsigned char *)q)].cupper == csconv[*((unsigned char *)q)].clower) nneutral++; - } - if (ncap) { - firstcap = csconv[*((unsigned char *) word)].ccase; - } - - // now finally set the captype - if (ncap == 0) { - return NOCAP; - } else if ((ncap == 1) && firstcap) { - return INITCAP; - } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { - return ALLCAP; - } else if ((ncap > 1) && firstcap) { - return HUHINITCAP; - } - return HUHCAP; -} - -int get_captype_utf8(w_char * word, int nl, int langnum) { - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int firstcap = 0; - unsigned short idx; - // don't check too long words - if (nl >= MAXWORDLEN) return 0; - // big Unicode character (non BMP area) - if (nl == -1) return NOCAP; - for (int i = 0; i < nl; i++) { - idx = (word[i].h << 8) + word[i].l; - if (idx != unicodetolower(idx, langnum)) ncap++; - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; - } - if (ncap) { - idx = (word[0].h << 8) + word[0].l; - firstcap = (idx != unicodetolower(idx, langnum)); - } - - // now finally set the captype - if (ncap == 0) { - return NOCAP; - } else if ((ncap == 1) && firstcap) { - return INITCAP; - } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { - return ALLCAP; - } else if ((ncap > 1) && firstcap) { - return HUHINITCAP; - } - return HUHCAP; -} - - // strip all ignored characters in the string void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len) { @@ -5487,14 +5200,14 @@ void remove_ignored_chars(char * word, char * ignored_chars) *word = '\0'; } -int parse_string(char * line, char ** out, int ln) +int parse_string(char * line, char ** out, const char * name) { char * tp = line; char * piece; int i = 0; int np = 0; if (*out) { - HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions\n", ln); + HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); return 1; } piece = mystrsep(&tp, 0); @@ -5504,7 +5217,6 @@ int parse_string(char * line, char ** out, int ln) case 0: { np++; break; } case 1: { *out = mystrdup(piece); - if (!*out) return 1; np++; break; } @@ -5512,19 +5224,19 @@ int parse_string(char * line, char ** out, int ln) } i++; } - // free(piece); + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", ln); + HUNSPELL_WARNING(stderr, "error: missing %s information\n", name); return 1; } return 0; } -int parse_array(char * line, char ** out, unsigned short ** out_utf16, - int * out_utf16_len, int utf8, int ln) { - if (parse_string(line, out, ln)) return 1; +int parse_array(char * line, char ** out, + unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8) { + if (parse_string(line, out, name)) return 1; if (utf8) { w_char w[MAXWORDLEN]; int n = u8_u16(w, MAXWORDLEN, *out); diff --git a/chrome/third_party/hunspell/src/hunspell/csutil.hxx b/chrome/third_party/hunspell/src/hunspell/csutil.hxx index 0e6192b..7fc6732 100644 --- a/chrome/third_party/hunspell/src/hunspell/csutil.hxx +++ b/chrome/third_party/hunspell/src/hunspell/csutil.hxx @@ -3,56 +3,10 @@ // First some base level utility routines -#include "w_char.hxx" - -// casing -#define NOCAP 0 -#define INITCAP 1 -#define ALLCAP 2 -#define HUHCAP 3 -#define HUHINITCAP 4 - -// default encoding and keystring -#define SPELL_ENCODING "ISO8859-1" -#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm" - -// default morphological fields -#define MORPH_STEM "st:" -#define MORPH_ALLOMORPH "al:" -#define MORPH_POS "po:" -#define MORPH_DERI_PFX "dp:" -#define MORPH_INFL_PFX "ip:" -#define MORPH_TERM_PFX "tp:" -#define MORPH_DERI_SFX "ds:" -#define MORPH_INFL_SFX "is:" -#define MORPH_TERM_SFX "ts:" -#define MORPH_SURF_PFX "sp:" -#define MORPH_FREQ "fr:" -#define MORPH_PHON "ph:" -#define MORPH_HYPH "hy:" -#define MORPH_PART "pa:" -#define MORPH_FLAG "fl:" -#define MORPH_HENTRY "_H:" -#define MORPH_TAG_LEN strlen(MORPH_STEM) - -#define MSEP_FLD ' ' -#define MSEP_REC '\n' -#define MSEP_ALT '\v' - -// default flags -#define DEFAULTFLAGS 65510 -#define FORBIDDENWORD 65510 -#define ONLYUPCASEFLAG 65511 - -// hash entry macros -#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ - get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : NULL) -// NULL-free version for warning-free OOo build -#define HENTRY_DATA2(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ - get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : "") -#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL) - -#define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h)) +typedef struct { + unsigned char l; + unsigned char h; +} w_char; // convert UTF-16 characters to UTF-8 char * u16_u8(char * dest, int size, const w_char * src, int srclen); @@ -72,9 +26,6 @@ void mychomp(char * s); // duplicate string char * mystrdup(const char * s); -// strcat for limited length destination string -char * mystrcat(char * dest, const char * st, int max); - // duplicate reverse of string char * myrevstrdup(const char * s); @@ -90,14 +41,16 @@ char * mystrrep(char *, const char *, const char *); void strlinecat(char * lines, const char * s); // tokenize into lines with new line - int line_tok(const char * text, char *** lines, char breakchar); + int line_tok(const char * text, char *** lines); // tokenize into lines with new line and uniq in place - char * line_uniq(char * text, char breakchar); - char * line_uniq_app(char ** text, char breakchar); + char * line_uniq(char * text); + +// change \n to c in place + char * line_join(char * text, char c); -// change oldchar to newchar in place - char * tr(char * text, char oldc, char newc); +// leave only last {[^}]*} pattern in string + char * delete_zeros(char * morphout); // reverse word int reverseword(char *); @@ -105,12 +58,6 @@ void strlinecat(char * lines, const char * s); // reverse word int reverseword_utf(char *); -// remove duplicates - int uniqlist(char ** list, int n); - -// free character array list - void freelist(char *** list, int n); - // character encoding information struct cs_info { unsigned char ccase; @@ -154,12 +101,8 @@ struct cs_info * get_current_cs(const char * es); const char * get_default_enc(const char * lang); -// get language identifiers of language codes int get_lang_num(const char * lang); -// get characters of the given 8bit encoding with lower- and uppercase forms -char * get_casechars(const char * enc); - // convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding); @@ -184,34 +127,15 @@ void mkallsmall_utf(w_char * u, int nc, int langnum); // convert first nc characters of UTF-8 string to capital void mkallcap_utf(w_char * u, int nc, int langnum); -// get type of capitalization -int get_captype(char * q, int nl, cs_info *); - -// get type of capitalization (UTF-8) -int get_captype_utf8(w_char * q, int nl, int langnum); - // strip all ignored characters in the string void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len); // strip all ignored characters in the string void remove_ignored_chars(char * word, char * ignored_chars); -int parse_string(char * line, char ** out, int ln); - -int parse_array(char * line, char ** out, unsigned short ** out_utf16, - int * out_utf16_len, int utf8, int ln); - -int fieldlen(const char * r); -char * copy_field(char * dest, const char * morph, const char * var); - -int morphcmp(const char * s, const char * t); - -int get_sfxcount(const char * morph); - -// conversion function for protected memory -void store_pointer(char * dest, char * source); +int parse_string(char * line, char ** out, const char * name); -// conversion function for protected memory -char * get_stored_pointer(char * s); +int parse_array(char * line, char ** out, + unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8); #endif diff --git a/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx b/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx index 5594582..34736a6 100644 --- a/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx @@ -135,19 +135,15 @@ char * DictMgr::mystrsep(char ** stringp, const char delim) *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); rv = (char *) malloc(nc+1); - if (rv) { - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; - } + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + return rv; } else { rv = (char *) malloc(n+1); - if (rv) { - memcpy(rv, mp, n); - *(rv+n) = '\0'; - *stringp = mp + n; - return rv; - } + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + return rv; } } return NULL; diff --git a/chrome/third_party/hunspell/src/hunspell/filemgr.cxx b/chrome/third_party/hunspell/src/hunspell/filemgr.cxx deleted file mode 100644 index 4150ce6..0000000 --- a/chrome/third_party/hunspell/src/hunspell/filemgr.cxx +++ /dev/null @@ -1,54 +0,0 @@ -#include "license.hunspell" -#include "license.myspell" - -#ifndef MOZILLA_CLIENT -#include <cstdlib> -#include <cstring> -#include <cstdio> -#else -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#endif - -#include "filemgr.hxx" - -int FileMgr::fail(const char * err, const char * par) { - fprintf(stderr, err, par); - return -1; -} - -FileMgr::FileMgr(const char * file, const char * key) { - linenum = 0; - hin = NULL; - fin = fopen(file, "r"); - if (!fin) { - // check hzipped file - char * st = (char *) malloc(strlen(file) + strlen(HZIP_EXTENSION)); - if (st) { - strcpy(st, file); - strcat(st, HZIP_EXTENSION); - hin = new Hunzip(st, key); - } - } - if (!fin && !hin) fail(MSG_OPEN, file); -} - -FileMgr::~FileMgr() -{ - if (fin) fclose(fin); - if (hin) delete hin; -} - -char * FileMgr::getline() { - const char * l; - linenum++; - if (fin) return fgets(in, BUFSIZE - 1, fin); - if (hin && (l = hin->getline())) return strcpy(in, l); - linenum--; - return NULL; -} - -int FileMgr::getlinenum() { - return linenum; -} diff --git a/chrome/third_party/hunspell/src/hunspell/filemgr.hxx b/chrome/third_party/hunspell/src/hunspell/filemgr.hxx deleted file mode 100644 index fb4d52b..0000000 --- a/chrome/third_party/hunspell/src/hunspell/filemgr.hxx +++ /dev/null @@ -1,21 +0,0 @@ -/* file manager class - read lines of files [filename] OR [filename.hz] */ -#ifndef _FILEMGR_HXX_ -#define _FILEMGR_HXX_ -#include "hunzip.hxx" - -class FileMgr -{ -protected: - FILE * fin; - Hunzip * hin; - char in[BUFSIZE + 50]; // input buffer - int fail(const char * err, const char * par); - int linenum; - -public: - FileMgr(const char * filename, const char * key = NULL); - ~FileMgr(); - char * getline(); - int getlinenum(); -}; -#endif diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx index 49ea117..ec6f4f3 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx @@ -22,19 +22,18 @@ using namespace std; #endif #else -#ifndef WIN32 +#ifndef W32 using namespace std; #endif #endif // build a hash table from a munched word list - #ifdef HUNSPELL_CHROME_CLIENT HashMgr::HashMgr(hunspell::BDictReader* reader) { bdict_reader = reader; #else -HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle, const char * key) +HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) { #endif tablesize = 0; @@ -42,10 +41,6 @@ HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle, const char * key) flag_mode = FLAG_CHAR; complexprefixes = 0; utf8 = 0; - langnum = 0; - lang = NULL; - enc = NULL; - csconv = 0; ignorechars = NULL; ignorechars_utf16 = NULL; ignorechars_utf16_len = 0; @@ -53,13 +48,12 @@ HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle, const char * key) aliasf = NULL; numaliasm = 0; aliasm = NULL; - forbiddenword = FORBIDDENWORD; // forbidden word signing flag #ifdef HUNSPELL_CHROME_CLIENT // No tables to load, just the AF config. int ec = load_config(); #else load_config(aff_handle); - int ec = load_tables(dic_handle, key); + int ec = load_tables(dic_handle); #endif if (ec) { /* error condition - what should we do here */ @@ -79,16 +73,29 @@ HashMgr::~HashMgr() // now pass through hash table freeing up everything // go through column by column of the table for (int i=0; i < tablesize; i++) { - struct hentry * pt = tableptr[i]; + struct hentry * pt = &tableptr[i]; struct hentry * nt = NULL; + if (pt) { + if (pt->astr && !aliasf) free(pt->astr); + if (pt->word) free(pt->word); +#ifdef HUNSPELL_EXPERIMENTAL + if (pt->description && !aliasm) free(pt->description); +#endif + pt = pt->next; + } while(pt) { nt = pt->next; - if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr); + if (pt->astr && !aliasf) free(pt->astr); + if (pt->word) free(pt->word); +#ifdef HUNSPELL_EXPERIMENTAL + if (pt->description && !aliasm) free(pt->description); +#endif free(pt); pt = nt; } } free(tableptr); + tableptr = NULL; } tablesize = 0; @@ -106,15 +113,6 @@ HashMgr::~HashMgr() free(aliasm); aliasm = NULL; } - -#ifndef OPENOFFICEORG -#ifndef MOZILLA_CLIENT - if (utf8) free_utf_tbl(); -#endif -#endif - - if (enc) free(enc); - if (lang) free(lang); if (ignorechars) free(ignorechars); if (ignorechars_utf16) free(ignorechars_utf16); @@ -146,6 +144,7 @@ void HashMgr::EmptyHentryCache() { #endif // lookup a root word in the hashtable + struct hentry * HashMgr::lookup(const char *word) const { #ifdef HUNSPELL_CHROME_CLIENT @@ -168,10 +167,10 @@ struct hentry * HashMgr::lookup(const char *word) const #else struct hentry * dp; if (tableptr) { - dp = tableptr[hash(word)]; - if (!dp) return NULL; + dp = &tableptr[hash(word)]; + if (dp->word == NULL) return NULL; for ( ; dp != NULL; dp = dp->next) { - if (strcmp(word,&(dp->word)) == 0) return dp; + if (strcmp(word,dp->word) == 0) return dp; } } return NULL; @@ -179,101 +178,69 @@ struct hentry * HashMgr::lookup(const char *word) const } // add a word to the hash table (private) -int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, - int al, const char * desc, bool onlyupcase) + +int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc) { #ifndef HUNSPELL_CHROME_CLIENT - bool upcasehomonym = false; - int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0; - // variable-length hash record with word and optional fields - struct hentry* hp = - (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); - if (!hp) return 1; - char * hpw = &(hp->word); - strcpy(hpw, word); + char * st = mystrdup(word); + if (wl && !st) return 1; if (ignorechars != NULL) { if (utf8) { - remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len); + remove_ignored_chars_utf(st, ignorechars_utf16, ignorechars_utf16_len); } else { - remove_ignored_chars(hpw, ignorechars); + remove_ignored_chars(st, ignorechars); } } if (complexprefixes) { - if (utf8) reverseword_utf(hpw); else reverseword(hpw); + if (utf8) reverseword_utf(st); else reverseword(st); } - - int i = hash(hpw); - - hp->blen = (unsigned char) wbl; - hp->clen = (unsigned char) wcl; - hp->alen = (short) al; - hp->astr = aff; - hp->next = NULL; - hp->next_homonym = NULL; - - // store the description string or its pointer - if (desc) { - hp->var = H_OPT; - if (aliasm) { - hp->var += H_OPT_ALIASM; - store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); - } else { - strcpy(hpw + wbl + 1, desc); - if (complexprefixes) { - if (utf8) reverseword_utf(HENTRY_DATA(hp)); - else reverseword(HENTRY_DATA(hp)); - } - } - if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; - } else hp->var = 0; - - struct hentry * dp = tableptr[i]; - if (!dp) { - tableptr[i] = hp; - return 0; - } - while (dp->next != NULL) { - if ((!dp->next_homonym) && (strcmp(&(hp->word), &(dp->word)) == 0)) { - // remove hidden onlyupcase homonym - if (!onlyupcase) { - if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { - free(dp->astr); - dp->astr = hp->astr; - dp->alen = hp->alen; - free(hp); - return 0; - } else { - dp->next_homonym = hp; - } - } else { - upcasehomonym = true; + int i = hash(st); + struct hentry * dp = &tableptr[i]; + if (dp->word == NULL) { + dp->wlen = (short) wl; + dp->alen = (short) al; + dp->word = st; + dp->astr = aff; + dp->next = NULL; + dp->next_homonym = NULL; +#ifdef HUNSPELL_EXPERIMENTAL + if (aliasm) { + dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); + } else { + dp->description = mystrdup(desc); + if (desc && !dp->description) return 1; + if (dp->description && complexprefixes) { + if (utf8) reverseword_utf(dp->description); else reverseword(dp->description); } - } - dp=dp->next; } - if (strcmp(&(hp->word), &(dp->word)) == 0) { - // remove hidden onlyupcase homonym - if (!onlyupcase) { - if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { - free(dp->astr); - dp->astr = hp->astr; - dp->alen = hp->alen; - free(hp); - return 0; - } else { - dp->next_homonym = hp; - } - } else { - upcasehomonym = true; +#endif + } else { + struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry)); + if (!hp) return 1; + hp->wlen = (short) wl; + hp->alen = (short) al; + hp->word = st; + hp->astr = aff; + hp->next = NULL; + hp->next_homonym = NULL; +#ifdef HUNSPELL_EXPERIMENTAL + if (aliasm) { + hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); + } else { + hp->description = mystrdup(desc); + if (desc && !hp->description) return 1; + if (dp->description && complexprefixes) { + if (utf8) reverseword_utf(hp->description); else reverseword(hp->description); } } - if (!upcasehomonym) { - dp->next = hp; - } else { - // remove hidden onlyupcase homonym - if (hp->astr) free(hp->astr); - free(hp); +#endif + while (dp->next != NULL) { + if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; + dp=dp->next; } + if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; + dp->next = hp; + } #endif // HUNSPELL_CHROME_CLIENT std::map<StringPiece, int>::iterator iter = custom_word_to_affix_id_map_.find(word); @@ -288,134 +255,33 @@ int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, return 0; } -int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, - unsigned short * flags, int al, char * dp, int captype) -{ - // add inner capitalized forms to handle the following allcap forms: - // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG - // Allcaps with suffixes: CIA's -> CIA'S - if (((captype == HUHCAP) || (captype == HUHINITCAP) || - ((captype == ALLCAP) && (flags != NULL))) && - !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) { - unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1)); - if (!flags2) return 1; - if (al) memcpy(flags2, flags, al * sizeof(unsigned short)); - flags2[al] = ONLYUPCASEFLAG; - if (utf8) { - char st[BUFSIZE]; - w_char w[BUFSIZE]; - int wlen = u8_u16(w, BUFSIZE, word); - mkallsmall_utf(w, wlen, langnum); - mkallcap_utf(w, 1, langnum); - u16_u8(st, BUFSIZE, w, wlen); - return add_word(st,wbl,wcl,flags2,al+1,dp, true); - } else { - mkallsmall(word, csconv); - mkinitcap(word, csconv); - return add_word(word,wbl,wcl,flags2,al+1,dp, true); - } - } - return 0; -} - -// detect captype and modify word length for UTF-8 encoding -int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { - int len; - if (utf8) { - w_char dest_utf[BUFSIZE]; - len = u8_u16(dest_utf, BUFSIZE, word); - *captype = get_captype_utf8(dest_utf, len, langnum); - } else { - len = wbl; - *captype = get_captype((char *) word, len, csconv); - } - return len; -} - -// remove word (personal dictionary function for standalone applications) -int HashMgr::remove(const char * word) -{ - struct hentry * dp = lookup(word); - while (dp) { - if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { - unsigned short * flags = - (unsigned short *) malloc(sizeof(short *) * (dp->alen + 1)); - if (!flags) return 1; - for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; - flags[dp->alen] = forbiddenword; - dp->astr = flags; - dp->alen++; - flag_qsort(flags, 0, dp->alen); - } - dp = dp->next_homonym; - } - return 0; -} - -/* remove forbidden flag to add a personal word to the hash */ -int HashMgr::remove_forbidden_flag(const char * word) { - struct hentry * dp = lookup(word); - if (!dp) return 1; - while (dp) { - if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { - if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic. - else { - unsigned short * flags2 = - (unsigned short *) malloc(sizeof(short *) * (dp->alen - 1)); - if (!flags2) return 1; - int i, j = 0; - for (i = 0; i < dp->alen; i++) { - if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i]; - } - dp->alen--; - dp->astr = flags2; // XXX allowed forbidden words - } - } - dp = dp->next_homonym; - } - return 0; -} - // add a custom dic. word to the hash table (public) -int HashMgr::add(const char * word) +int HashMgr::put_word(const char * word, int wl, char * aff) { - unsigned short * flags = NULL; + unsigned short * flags; int al = 0; - if (remove_forbidden_flag(word)) { - int captype; - int wbl = strlen(word); - int wcl = get_clen_and_captype(word, wbl, &captype); - add_word(word, wbl, wcl, flags, al, NULL, false); - return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); + if (aff) { + al = decode_flags(&flags, aff); + flag_qsort(flags, 0, al); + } else { + flags = NULL; } + add_word(word, wl, flags, al, NULL); return 0; } -int HashMgr::add_with_affix(const char * word, const char * example) +int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern) { - // detect captype and modify word length for UTF-8 encoding - struct hentry * dp = lookup(example); - remove_forbidden_flag(word); - if (dp && dp->astr) { - int captype; - int wbl = strlen(word); - int wcl = get_clen_and_captype(word, wbl, &captype); - if (aliasf) { - add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); - } else { - unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short)); - if (flags) { - memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); - add_word(word, wbl, wcl, flags, dp->alen, NULL, false); - } else return 1; - } - return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype); - } - return 1; + unsigned short * flags; + struct hentry * dp = lookup(pattern); + if (!dp || !dp->astr) return 1; + flags = (unsigned short *) malloc (dp->alen * sizeof(short)); + memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); + add_word(word, wl, flags, dp->alen, NULL); + return 0; } // walk the hash table entry by entry - null at end -// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const { #ifdef HUNSPELL_CHROME_CLIENT @@ -446,99 +312,88 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const // lists for the extra affixes. If hp is NULL, create it here. if (!hp) hp = new hentry; - hp->word = *word; - hp->blen = word_len; + hp->word = word; + hp->wlen = word_len; hp->alen = (short)const_cast<HashMgr*>(this)->get_aliasf(affix_ids[0], &hp->astr); hp->next = NULL; hp->next_homonym = NULL; - hp->var = 0; - hp->clen = 0; + return hp; #else - - if (hp && hp->next != NULL) return hp->next; - for (col++; col < tablesize; col++) { - if (tableptr[col]) return tableptr[col]; + //reset to start + if ((col < 0) || (hp == NULL)) { + col = -1; + hp = NULL; } - // null at end and reset to start - col = -1; - return NULL; + + if (hp && hp->next != NULL) { + hp = hp->next; + } else { + col++; + hp = (col < tablesize) ? &tableptr[col] : NULL; + // search for next non-blank column entry + while (hp && (hp->word == NULL)) { + col ++; + hp = (col < tablesize) ? &tableptr[col] : NULL; + } + if (col < tablesize) return hp; + hp = NULL; + col = -1; + } + return hp; #endif } // load a munched word list and build a hash table on the fly -int HashMgr::load_tables(FILE* t_handle, const char * key) +int HashMgr::load_tables(FILE* t_handle) { #ifndef HUNSPELL_CHROME_CLIENT - int al; + int wl, al; char * ap; char * dp; - char * dp2; unsigned short * flags; - char * ts; - // open dictionary file - FileMgr * dict = new FileMgr(tpath, key); - if (dict == NULL) return 1; + // raw dictionary - munched file + FILE * rawdict = _fdopen(_dup(_fileno(t_handle)), "r"); + if (rawdict == NULL) return 1; + fseek(rawdict, 0, SEEK_SET); // first read the first line of file to get hash table size */ - if (!(ts = dict->getline())) { - HUNSPELL_WARNING(stderr, "error: empty dic file\n"); - delete dict; - return 2; - } + char ts[MAXDELEN]; + if (! fgets(ts, MAXDELEN-1,rawdict)) return 2; mychomp(ts); - + /* remove byte order mark */ - if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) { + if (strncmp(ts,"\xef\xbb\xbf",3) == 0) { memmove(ts, ts+3, strlen(ts+3)+1); HUNSPELL_WARNING(stderr, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n"); } - + + if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n"); tablesize = atoi(ts); - if (tablesize == 0) { - HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n"); - delete dict; - return 4; - } + if (!tablesize) return 4; tablesize = tablesize + 5 + USERWORD; if ((tablesize %2) == 0) tablesize++; // allocate the hash table - tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *)); - if (! tableptr) { - delete dict; - return 3; - } - for (int i=0; i<tablesize; i++) tableptr[i] = NULL; + tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry)); + if (! tableptr) return 3; + for (int i=0; i<tablesize; i++) tableptr[i].word = NULL; // loop through all words on much list and add to hash // table and create word and affix strings - while ((ts = dict->getline())) { + while (fgets(ts,MAXDELEN-1,rawdict)) { mychomp(ts); // split each line into word and morphological description - dp = ts; - while ((dp = strchr(dp, ':'))) { - if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { - for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--); - if (dp < ts) { // missing word - dp = NULL; - } else { - *(dp + 1) = '\0'; - dp = dp + 2; - } - break; - } - dp++; - } + dp = strchr(ts,'\t'); - // tabulator is the old morphological field separator - dp2 = strchr(ts, '\t'); - if (dp2 && (!dp || dp2 < dp)) { - *dp2 = '\0'; - dp = dp2 + 1; + if (dp) { + *dp = '\0'; + dp++; + } else { + dp = NULL; } // split each line into word and affix char strings @@ -559,13 +414,13 @@ int HashMgr::load_tables(FILE* t_handle, const char * key) *ap = '\0'; if (aliasf) { int index = atoi(ap + 1); - al = get_aliasf(index, &flags, dict); + al = get_aliasf(index, &flags); if (!al) { - HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum()); + HUNSPELL_WARNING(stderr, "error - bad flag vector alias: %s\n", ts); *ap = '\0'; } } else { - al = decode_flags(&flags, ap + 1, dict); + al = decode_flags(&flags, ap + 1); flag_qsort(flags, 0, al); } } else { @@ -574,22 +429,19 @@ int HashMgr::load_tables(FILE* t_handle, const char * key) flags = NULL; } - int captype; - int wbl = strlen(ts); - int wcl = get_clen_and_captype(ts, wbl, &captype); - // add the word and its index plus its capitalized form optionally - if (add_word(ts,wbl,wcl,flags,al,dp, false) || - add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) { - delete dict; - return 5; - } - } + wl = strlen(ts); - delete dict; + // add the word and its index + if (add_word(ts,wl,flags,al,dp)) return 5; + + } + + fclose(rawdict); #endif return 0; } + // the hash function is a simple load and rotate // algorithm borrowed @@ -614,17 +466,15 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { switch (flag_mode) { case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) len = strlen(flags); - if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: bad flagvector\n"); - len /= 2; + if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: length of FLAG_LONG flagvector is odd: %s\n", flags); + len = len/2; *result = (unsigned short *) malloc(len * sizeof(short)); - if (!*result) return -1; for (int i = 0; i < len; i++) { (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; } break; } case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233) - int i; len = 1; char * src = flags; unsigned short * dest; @@ -633,29 +483,23 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { if (*p == ',') len++; } *result = (unsigned short *) malloc(len * sizeof(short)); - if (!*result) return -1; dest = *result; for (p = flags; *p; p++) { if (*p == ',') { - i = atoi(src); - if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); - *dest = (unsigned short) i; + *dest = (unsigned short) atoi(src); if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); src = p + 1; dest++; } } - i = atoi(src); - if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); - *dest = (unsigned short) i; + *dest = (unsigned short) atoi(src); if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); break; } case FLAG_UNI: { // UTF-8 characters - w_char w[BUFSIZE/2]; - len = u8_u16(w, BUFSIZE/2, flags); + w_char w[MAXDELEN/2]; + len = u8_u16(w, MAXDELEN/2, flags); *result = (unsigned short *) malloc(len * sizeof(short)); - if (!*result) return -1; memcpy(*result, w, len * sizeof(short)); break; } @@ -663,28 +507,24 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { unsigned short * dest; len = strlen(flags); *result = (unsigned short *) malloc(len * sizeof(short)); - if (!*result) return -1; dest = *result; for (unsigned char * p = (unsigned char *) flags; *p; p++) { *dest = (unsigned short) *p; dest++; } } - } + } return len; } unsigned short HashMgr::decode_flag(const char * f) { unsigned short s = 0; - int i; switch (flag_mode) { case FLAG_LONG: s = ((unsigned short) f[0] << 8) + (unsigned short) f[1]; break; case FLAG_NUM: - i = atoi(f); - if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); - s = (unsigned short) i; + s = (unsigned short) atoi(f); break; case FLAG_UNI: u8_u16((w_char *) &s, 1, f); @@ -692,7 +532,7 @@ unsigned short HashMgr::decode_flag(const char * f) { default: s = (unsigned short) *((unsigned char *)f); } - if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); + if (!s) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); return s; } @@ -729,7 +569,7 @@ int HashMgr::load_config() // diacritics characters. if (strncmp(line,"IGNORE",6) == 0) { parse_array(line, &ignorechars, &ignorechars_utf16, - &ignorechars_utf16_len, utf8, 0); + &ignorechars_utf16_len, "IGNORE", utf8); } // Retrieve the format of an AF line. if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { @@ -751,101 +591,75 @@ int HashMgr::load_config() } #else // read in aff file and set flag mode -int HashMgr::load_config(FILE* aff_handle, const char * key) +int HashMgr::load_config(FILE* aff_handle) { - char * line; // io buffers int firstline = 1; + + // io buffers + char line[MAXDELEN+1]; // open the affix file - FileMgr * afflst = new FileMgr(affpath, key); + FILE * afflst; + afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); if (!afflst) { HUNSPELL_WARNING(stderr, "Error - could not open affix description file\n"); return 1; } + fseek(afflst, 0, SEEK_SET); // read in each line ignoring any that do not // start with a known line type indicator - while ((line = afflst->getline())) { + while (fgets(line,MAXDELEN,afflst)) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1); + if (strncmp(line,"\xef\xbb\xbf",3) == 0) memmove(line, line+3, strlen(line+3)+1); } /* parse in the try string */ if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { if (flag_mode != FLAG_CHAR) { - HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum()); + HUNSPELL_WARNING(stderr, "error: duplicate FLAG parameter\n"); } if (strstr(line, "long")) flag_mode = FLAG_LONG; if (strstr(line, "num")) flag_mode = FLAG_NUM; if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; if (flag_mode == FLAG_CHAR) { - HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum()); + HUNSPELL_WARNING(stderr, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line); } } - if (strncmp(line,"FORBIDDENWORD",13) == 0) { - char * st = NULL; - if (parse_string(line, &st, afflst->getlinenum())) { - delete afflst; - return 1; - } - forbiddenword = decode_flag(st); - free(st); - } - if (strncmp(line, "SET", 3) == 0) { - if (parse_string(line, &enc, afflst->getlinenum())) { - delete afflst; - return 1; - } - if (strcmp(enc, "UTF-8") == 0) { - utf8 = 1; -#ifndef OPENOFFICEORG -#ifndef MOZILLA_CLIENT - initialize_utf_tbl(); -#endif -#endif - } else csconv = get_current_cs(enc); - } - if (strncmp(line, "LANG", 4) == 0) { - if (parse_string(line, &lang, afflst->getlinenum())) { - delete afflst; - return 1; - } - langnum = get_lang_num(lang); - } + if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1; /* parse in the ignored characters (for example, Arabic optional diacritics characters */ if (strncmp(line,"IGNORE",6) == 0) { - if (parse_array(line, &ignorechars, &ignorechars_utf16, - &ignorechars_utf16_len, utf8, afflst->getlinenum())) { - delete afflst; + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { + fclose(afflst); return 1; } } if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { if (parse_aliasf(line, afflst)) { - delete afflst; + fclose(afflst); return 1; } } +#ifdef HUNSPELL_EXPERIMENTAL if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { if (parse_aliasm(line, afflst)) { - delete afflst; + fclose(afflst); return 1; } } - - if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; - if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; +#endif + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; + if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; } - if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING); - delete afflst; + fclose(afflst); return 0; } #endif // HUNSPELL_CHROME_CLIENT @@ -855,11 +669,11 @@ int HashMgr::load_config(FILE* aff_handle, const char * key) int HashMgr::parse_aliasf(char* line, hunspell::LineIterator* iterator) { #else -int HashMgr::parse_aliasf(char * line, FileMgr * af) +int HashMgr::parse_aliasf(char * line, FILE * af) { #endif if (numaliasf != 0) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n"); return 1; } char * tp = line; @@ -877,7 +691,8 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) numaliasf = 0; aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: bad entry number\n"); + HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n"); + free(piece); return 1; } aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *)); @@ -897,6 +712,7 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -905,7 +721,7 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) free(aliasflen); aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: missing data\n"); + HUNSPELL_WARNING(stderr, "error: missing AF table information\n"); return 1; } @@ -916,9 +732,9 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) if (!iterator->AdvanceAndCopy(nl, MAXDELEN)) return 1; #else - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXDELEN,af)) return 1; #endif - mychomp(nl); + mychomp(nl); tp = nl; i = 0; aliasf[j] = NULL; @@ -934,7 +750,8 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) free(aliasflen); aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); + free(piece); return 1; } break; @@ -948,6 +765,7 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (!aliasf[j]) { @@ -956,7 +774,7 @@ int HashMgr::parse_aliasf(char * line, FileMgr * af) aliasf = NULL; aliasflen = NULL; numaliasf = 0; - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); return 1; } } @@ -992,8 +810,8 @@ hentry* HashMgr::AffixIDsToHentry(char* word, struct hentry* he = new hentry; if (i == 0) first_he = he; - he->word = *word; - he->blen = word_len; + he->word = word; + he->wlen = word_len; he->alen = (short)const_cast<HashMgr*>(this)->get_aliasf(affix_ids[i], &he->astr); he->next = NULL; @@ -1036,11 +854,12 @@ int HashMgr::get_aliasf(int index, unsigned short ** fvec) { return 0; } +#ifdef HUNSPELL_EXPERIMENTAL /* parse morph alias definitions */ -int HashMgr::parse_aliasm(char * line, FileMgr * af) +int HashMgr::parse_aliasm(char * line, FILE * af) { if (numaliasm != 0) { - HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n"); return 1; } char * tp = line; @@ -1055,7 +874,8 @@ int HashMgr::parse_aliasm(char * line, FileMgr * af) case 1: { numaliasm = atoi(piece); if (numaliasm < 1) { - HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n"); + free(piece); return 1; } aliasm = (char **) malloc(numaliasm * sizeof(char *)); @@ -1070,31 +890,33 @@ int HashMgr::parse_aliasm(char * line, FileMgr * af) } i++; } + free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { numaliasm = 0; free(aliasm); aliasm = NULL; - HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "error: missing AM alias information\n"); return 1; } /* now parse the numaliasm lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numaliasm; j++) { - if (!(nl = af->getline())) return 1; + if (!fgets(nl,MAXDELEN,af)) return 1; mychomp(nl); tp = nl; i = 0; aliasm[j] = NULL; - piece = mystrsep(&tp, ' '); + piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"AM",2) != 0) { - HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); + HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n"); + free(piece); numaliasm = 0; free(aliasm); aliasm = NULL; @@ -1103,34 +925,24 @@ int HashMgr::parse_aliasm(char * line, FileMgr * af) break; } case 1: { - // add the remaining of the line - if (*tp) { - *(tp - 1) = ' '; - tp = tp + strlen(tp); - } if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } aliasm[j] = mystrdup(piece); - if (!aliasm[j]) { - numaliasm = 0; - free(aliasm); - aliasm = NULL; - return 1; - } break; } default: break; } i++; } - piece = mystrsep(&tp, ' '); + free(piece); + piece = mystrsep(&tp, 0); } if (!aliasm[j]) { numaliasm = 0; free(aliasm); aliasm = NULL; - HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); return 1; } } @@ -1146,3 +958,4 @@ char * HashMgr::get_aliasm(int index) { HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); return NULL; } +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx index acfbbce..781175e5 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx @@ -1,14 +1,8 @@ #ifndef _HASHMGR_HXX_ #define _HASHMGR_HXX_ -#ifndef MOZILLA_CLIENT #include <cstdio> -#else -#include <stdio.h> -#endif - #include "htypes.hxx" -#include "filemgr.hxx" #ifdef HUNSPELL_CHROME_CLIENT #include <string> @@ -29,25 +23,20 @@ class HashMgr std::map<StringPiece, int> custom_word_to_affix_id_map_; std::vector<std::string*> pointer_to_strings_; #endif - int tablesize; - struct hentry ** tableptr; - int userword; - flag flag_mode; - int complexprefixes; - int utf8; - unsigned short forbiddenword; - int langnum; - char * enc; - char * lang; - struct cs_info * csconv; - char * ignorechars; - unsigned short * ignorechars_utf16; - int ignorechars_utf16_len; - int numaliasf; // flag vector `compression' with aliases - unsigned short ** aliasf; - unsigned short * aliasflen; - int numaliasm; // morphological desciption `compression' with aliases - char ** aliasm; + int tablesize; + struct hentry * tableptr; + int userword; + flag flag_mode; + int complexprefixes; + int utf8; + char * ignorechars; + unsigned short * ignorechars_utf16; + int ignorechars_utf16_len; + int numaliasf; // flag vector `compression' with aliases + unsigned short ** aliasf; + unsigned short * aliasflen; + int numaliasm; // morphological desciption `compression' with aliases + char ** aliasm; public: @@ -66,7 +55,7 @@ public: // This function allows that cache to be emptied and not grow infinitely. void EmptyHentryCache(); #else - HashMgr(FILE* t_handle, FILE* a_handle, const char * key); + HashMgr(FILE* t_handle, FILE* a_handle); #endif ~HashMgr(); @@ -74,22 +63,22 @@ public: int hash(const char *) const; struct hentry * walk_hashtable(int & col, struct hentry * hp) const; - int add(const char * word); - int add_with_affix(const char * word, const char * pattern); - int remove(const char * word); + int put_word(const char * word, int wl, char * ap); + int put_word_pattern(const char * word, int wl, const char * pattern); int decode_flags(unsigned short ** result, char * flags); unsigned short decode_flag(const char * flag); char * encode_flag(unsigned short flag); int is_aliasf(); int get_aliasf(int index, unsigned short ** fvec); +#ifdef HUNSPELL_EXPERIMENTAL int is_aliasm(); char * get_aliasm(int index); +#endif + private: - int get_clen_and_captype(const char * word, int wbl, int * captype); - int load_tables(FILE* t_handle, const char * key); - int add_word(const char * word, int wbl, int wcl, unsigned short * ap, - int al, const char * desc, bool onlyupcase); + int load_tables(FILE* t_handle); + int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); #ifdef HUNSPELL_CHROME_CLIENT int load_config(); @@ -107,14 +96,13 @@ private: HEntryCache hentry_cache; #else - int load_config(FILE* aff_handle, const char * key); + int load_config(FILE* aff_handle); int parse_aliasf(char * line, FILE * af); #endif - int add_hidden_capitalized_word(char * word, int wbl, int wcl, - unsigned short * flags, int al, char * dp, int captype); - int parse_aliasm(char * line, FileMgr * af); - int remove_forbidden_flag(const char * word); +#ifdef HUNSPELL_EXPERIMENTAL + int parse_aliasm(char * line, FILE * af); +#endif }; diff --git a/chrome/third_party/hunspell/src/hunspell/htypes.hxx b/chrome/third_party/hunspell/src/hunspell/htypes.hxx index 75d9542..f8d685a 100644 --- a/chrome/third_party/hunspell/src/hunspell/htypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/htypes.hxx @@ -15,28 +15,25 @@ #define ROTATE(v,q) \ (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); - -// hentry options -#define H_OPT (1 << 0) -#define H_OPT_ALIASM (1 << 1) -#define H_OPT_PHON (1 << 2) - -// see also csutil.hxx -#define HENTRY_WORD(h) &(h->word) - // approx. number of user defined words #define USERWORD 1000 struct hentry { - unsigned char blen; // word length in bytes - unsigned char clen; // word length in characters (different for UTF-8 enc.) - short alen; // length of affix flag vector - unsigned short * astr; // affix flag vector - struct hentry * next; // next word with same hash code - struct hentry * next_homonym; // next homonym word (with same hash code) - char var; // variable fields (only for special pronounciation yet) - char word; // variable-length word (8-bit or UTF-8 encoding) + short wlen; + short alen; + /* NOTE: Removed by mbelshe since this is not used. + * The english dictionary is 63K in size, so removing this + * itty bitty field saves us ~250KB of RAM. + char wbeg[2]; + */ + char * word; + unsigned short * astr; + struct hentry * next; + struct hentry * next_homonym; +#ifdef HUNSPELL_EXPERIMENTAL + char * description; +#endif }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.cxx b/chrome/third_party/hunspell/src/hunspell/hunspell.cxx index 131ad50..42b0603 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.cxx @@ -6,17 +6,16 @@ #include <cstring> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #endif #include "hunspell.hxx" #include "hunspell.h" -#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef WIN32 +#ifndef W32 using namespace std; #endif #endif @@ -24,34 +23,27 @@ using namespace std; #ifdef HUNSPELL_CHROME_CLIENT Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length) #else -Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL) +Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) #endif { encoding = NULL; csconv = NULL; utf8 = 0; complexprefixes = 0; -#ifndef HUNSPELL_CHROME_CLIENT - affixpath = mystrdup(affpath); -#endif - maxdic = 0; #ifdef HUNSPELL_CHROME_CLIENT bdict_reader = new hunspell::BDictReader; bdict_reader->Init(bdict_data, bdict_length); - pHMgr[0] = new HashMgr(bdict_reader); - if (pHMgr[0]) maxdic = 1; - - pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic); + pHMgr = new HashMgr(bdict_reader); + pAMgr = new AffixMgr(bdict_reader, pHMgr); #else /* first set up the hash manager */ - pHMgr[0] = new HashMgr(dic_handle, aff_handle, key); - if (pHMgr[0]) maxdic = 1; + pHMgr = new HashMgr(dic_handle, aff_handle); /* next set up the affix manager */ /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(aff_handle, pHMgr, &maxdic, key); + pAMgr = new AffixMgr(aff_handle, pHMgr); #endif /* get the preferred try string and the dictionary */ @@ -73,13 +65,10 @@ Hunspell::~Hunspell() { if (pSMgr) delete pSMgr; if (pAMgr) delete pAMgr; - for (int i = 0; i < maxdic; i++) delete pHMgr[i]; - maxdic = 0; + if (pHMgr) delete pHMgr; pSMgr = NULL; pAMgr = NULL; -#ifdef MOZILLA_CLIENT - free(csconv); -#endif + pHMgr = NULL; csconv= NULL; if (encoding) free(encoding); encoding = NULL; @@ -87,38 +76,27 @@ Hunspell::~Hunspell() #ifdef HUNSPELL_CHROME_CLIENT if (bdict_reader) delete bdict_reader; bdict_reader = NULL; -#else - if (affixpath) free(affixpath); - affixpath = NULL; #endif } -#ifndef HUNSPELL_CHROME_CLIENT -// load extra dictionaries -int Hunspell::add_dic(const char * dpath, const char * key) { - if (maxdic == MAXDIC || !affixpath) return 1; - pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); - if (pHMgr[maxdic]) maxdic++; else return 1; - return 0; -} -#endif // make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording // their presence with the abbreviation flag -// also since already going through character by character, +// also since already going through character by character, // set the capitalization type // return the length of the "cleaned" (and UTF-8 encoded) word -int Hunspell::cleanword2(char * dest, const char * src, +int Hunspell::cleanword2(char * dest, const char * src, w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) -{ +{ unsigned char * p = (unsigned char *) dest; const unsigned char * q = (const unsigned char * ) src; + int firstcap = 0; // first skip over any leading blanks while ((*q != '\0') && (*q == ' ')) q++; - + // now strip off any trailing periods (recording their presence) *pabbrev = 0; int nl = strlen((const char *)q); @@ -126,43 +104,80 @@ int Hunspell::cleanword2(char * dest, const char * src, nl--; (*pabbrev)++; } - + // if no characters are left it can't be capitalized - if (nl <= 0) { + if (nl <= 0) { *pcaptype = NOCAP; *p = '\0'; return 0; } - strncpy(dest, (char *) q, nl); - *(dest + nl) = '\0'; - nl = strlen(dest); - if (utf8) { - *nc = u8_u16(dest_utf, MAXWORDLEN, dest); + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + *nc = 0; + + if (!utf8) { + while (nl > 0) { + (*nc)++; + if (csconv[(*q)].ccase) ncap++; + if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; + *p++ = *q++; + nl--; + } + // remember to terminate the destination string + *p = '\0'; + if (ncap) { + firstcap = csconv[(unsigned char)(*dest)].ccase; + } + } else { + unsigned short idx; + *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q); // don't check too long words if (*nc >= MAXWORDLEN) return 0; if (*nc == -1) { // big Unicode character (non BMP area) *pcaptype = NOCAP; - return nl; + strcpy((char *) p, (char *) q); + return strlen(dest); } - *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); + *nc -= *pabbrev; + for (int i = 0; i < *nc; i++) { + idx = (dest_utf[i].h << 8) + dest_utf[i].l; + if (idx != unicodetolower(idx, langnum)) ncap++; + if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; + } + u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc); + if (ncap) { + idx = (dest_utf[0].h << 8) + dest_utf[0].l; + firstcap = (idx != unicodetolower(idx, langnum)); + } + } + + // now finally set the captype + if (ncap == 0) { + *pcaptype = NOCAP; + } else if ((ncap == 1) && firstcap) { + *pcaptype = INITCAP; + } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) { + *pcaptype = ALLCAP; + } else if ((ncap > 1) && firstcap) { + *pcaptype = HUHINITCAP; } else { - *pcaptype = get_captype(dest, nl, csconv); - *nc = nl; + *pcaptype = HUHCAP; } - return nl; -} + return strlen(dest); +} -int Hunspell::cleanword(char * dest, const char * src, +int Hunspell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev) -{ +{ unsigned char * p = (unsigned char *) dest; const unsigned char * q = (const unsigned char * ) src; int firstcap = 0; // first skip over any leading blanks while ((*q != '\0') && (*q == ' ')) q++; - + // now strip off any trailing periods (recording their presence) *pabbrev = 0; int nl = strlen((const char *)q); @@ -170,9 +185,9 @@ int Hunspell::cleanword(char * dest, const char * src, nl--; (*pabbrev)++; } - + // if no characters are left it can't be capitalized - if (nl <= 0) { + if (nl <= 0) { *pcaptype = NOCAP; *p = '\0'; return 0; @@ -200,9 +215,8 @@ int Hunspell::cleanword(char * dest, const char * src, nc = u8_u16(t, MAXWORDLEN, src); for (int i = 0; i < nc; i++) { idx = (t[i].h << 8) + t[i].l; - unsigned short low = unicodetolower(idx, langnum); - if (idx != low) ncap++; - if (unicodetoupper(idx, langnum) == low) nneutral++; + if (idx != unicodetolower(idx, langnum)) ncap++; + if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; } u16_u8(dest, MAXWORDUTF8LEN, t, nc); if (ncap) { @@ -224,7 +238,8 @@ int Hunspell::cleanword(char * dest, const char * src, *pcaptype = HUHCAP; } return strlen(dest); -} +} + void Hunspell::mkallcap(char * p) { @@ -241,7 +256,7 @@ void Hunspell::mkallcap(char * p) } u16_u8(p, MAXWORDUTF8LEN, u, nc); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].cupper; p++; } @@ -254,16 +269,15 @@ int Hunspell::mkallcap2(char * p, w_char * u, int nc) unsigned short idx; for (int i = 0; i < nc; i++) { idx = (u[i].h << 8) + u[i].l; - unsigned short up = unicodetoupper(idx, langnum); - if (idx != up) { - u[i].h = (unsigned char) (up >> 8); - u[i].l = (unsigned char) (up & 0x00FF); + if (idx != unicodetoupper(idx, langnum)) { + u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); + u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); } } u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); + return strlen(p); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].cupper; p++; } @@ -274,7 +288,7 @@ int Hunspell::mkallcap2(char * p, w_char * u, int nc) void Hunspell::mkallsmall(char * p) { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].clower; p++; } @@ -286,16 +300,15 @@ int Hunspell::mkallsmall2(char * p, w_char * u, int nc) unsigned short idx; for (int i = 0; i < nc; i++) { idx = (u[i].h << 8) + u[i].l; - unsigned short low = unicodetolower(idx, langnum); - if (idx != low) { - u[i].h = (unsigned char) (low >> 8); - u[i].l = (unsigned char) (low & 0x00FF); + if (idx != unicodetolower(idx, langnum)) { + u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); + u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); } } u16_u8(p, MAXWORDUTF8LEN, u, nc); return strlen(p); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].clower; p++; } @@ -309,18 +322,18 @@ char * Hunspell::sharps_u8_l1(char * dest, char * source) { *p = *source; for (p++, source++; *(source - 1); p++, source++) { *p = *source; - if (*source == '\x9F') *--p = '\xDF'; + if (*source == '\x9f') *--p = '\xdf'; } return dest; } -// recursive search for right ss - sharp s permutations +// recursive search for right ss-\xdf permutations hentry * Hunspell::spellsharps(char * base, char * pos, int n, int repnum, char * tmp, int * info, char **root) { pos = strstr(pos, "ss"); if (pos && (n < MAXSHARPS)) { - *pos = '\xC3'; - *(pos + 1) = '\x9F'; + *pos = '\xc3'; + *(pos + 1) = '\x9f'; hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); if (h) return h; *pos = 's'; @@ -339,32 +352,31 @@ int Hunspell::is_keepcase(const hentry * rv) { TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); } -/* insert a word to the beginning of the suggestion array and return ns */ -int Hunspell::insert_sug(char ***slst, char * word, int ns) { - char * dup = mystrdup(word); - if (!dup) return ns; - if (ns == MAXSUGGESTION) { - ns--; - free((*slst)[ns]); +/* check and insert a word to beginning of the suggestion array */ +int Hunspell::insert_sug(char ***slst, char * word, int *ns) { + if (spell(word)) { + if (*ns == MAXSUGGESTION) { + (*ns)--; + free((*slst)[*ns]); + } + for (int k = *ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = mystrdup(word); + (*ns)++; } - for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; - (*slst)[0] = dup; - return ns + 1; + return 0; } int Hunspell::spell(const char * word, int * info, char ** root) { #ifdef HUNSPELL_CHROME_CLIENT - if (pHMgr) pHMgr[0]->EmptyHentryCache(); + if (pHMgr) pHMgr->EmptyHentryCache(); #endif struct hentry * rv=NULL; // need larger vector. For example, Turkish capital letter I converted a // 2-byte UTF-8 character (dotless i) by mkallsmall. - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - w_char unicw[MAXWORDLEN]; - // Hunspell supports XML input of the simplified API (see manual) - if (strcmp(word, SPELL_XML) == 0) return 1; + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + w_char unicw[MAXWORDLEN + 1]; int nc = strlen(word); int wl2 = 0; if (utf8) { @@ -374,18 +386,14 @@ int Hunspell::spell(const char * word, int * info, char ** root) } int captype = 0; int abbv = 0; - int wl = 0; + int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - // input conversion - RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + if (wl == 0) return 1; - int info2 = 0; - if (wl == 0 || maxdic == 0) return 1; + if (info) *info = 0; if (root) *root = NULL; - // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) + // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.) enum { NBEGIN, NNUM, NSEP }; int nstate = NBEGIN; int i; @@ -399,179 +407,173 @@ int Hunspell::spell(const char * word, int * info, char ** root) } else break; } if ((i == wl) && (nstate == NNUM)) return 1; - if (!info) info = &info2; else *info = 0; + + // LANG_hu section: number(s) + (percent or degree) with suffixes + if (langnum == LANG_hu) { + if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '\xb0')) + && checkword(cw + i, info, root)) return 1; + } + // END of LANG_hu section switch(captype) { - case HUHCAP: - case HUHINITCAP: - case NOCAP: { - rv = checkword(cw, info, root); - if ((abbv) && !(rv)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - } - break; - } + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + rv = checkword(cw, info, root); + if ((abbv) && !(rv)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + } + break; + } case ALLCAP: { - rv = checkword(cw, info, root); - if (rv) break; - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - if (rv) break; - } - // Spec. prefix handling for Catalan, French, Italian: - // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). - if (pAMgr && strchr(cw, '\'')) { - wl = mkallsmall2(cw, unicw, nc); - char * apostrophe = strchr(cw, '\''); - if (utf8) { - w_char tmpword[MAXWORDLEN]; - *apostrophe = '\0'; - wl2 = u8_u16(tmpword, MAXWORDLEN, cw); - *apostrophe = '\''; - if (wl2 < nc) { - mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); - rv = checkword(cw, info, root); - if (rv) break; - } - } else { - mkinitcap2(apostrophe + 1, unicw, nc); - rv = checkword(cw, info, root); - if (rv) break; - } - mkinitcap2(cw, unicw, nc); - rv = checkword(cw, info, root); - if (rv) break; - } - if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { - char tmpword[MAXWORDUTF8LEN]; - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - wl2 = mkinitcap2(cw, unicw, nc); - rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); - } - if ((abbv) && !(rv)) { - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; + rv = checkword(cw, info, root); + if (rv) break; + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (rv) break; + } + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { + char tmpword[MAXWORDUTF8LEN]; + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); + } + if ((abbv) && !(rv)) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + } + } + if (rv) break; } } - if (rv) break; - } - } - case INITCAP: { - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); - wl2 = mkinitcap2(cw, unicw, nc); - if (captype == INITCAP) *info += SPELL_INITCAP; - rv = checkword(cw, info, root); - if (captype == INITCAP) *info -= SPELL_INITCAP; - // forbid bad capitalization - // (for example, ijs -> Ijs instead of IJs in Dutch) - // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) - if (*info & SPELL_FORBIDDEN) { - rv = NULL; - break; - } - if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; - if (rv) break; - - rv = checkword(wspace, info, root); - if (abbv && !rv) { - - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - if (!rv) { - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - if (captype == INITCAP) *info += SPELL_INITCAP; - rv = checkword(wspace, info, root); - if (captype == INITCAP) *info -= SPELL_INITCAP; - if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; - break; - } - } - if (rv && is_keepcase(rv) && - ((captype == ALLCAP) || - // if CHECKSHARPS: KEEPCASE words with \xDF are allowed - // in INITCAP form, too. - !(pAMgr->get_checksharps() && - ((utf8 && strstr(wspace, "\xC3\x9F")) || - (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; - break; - } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + rv = checkword(wspace, info, root); + if (!rv || (is_keepcase(rv) && !((captype == INITCAP) && + // if CHECKSHARPS: KEEPCASE words with \xdf are allowed + // in INITCAP form, too. + pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "\xc3\x9f")) || + (!utf8 && strchr(wspace, '\xdf')))))) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = checkword(cw, info, root); + if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL; + } + if (abbv && !rv) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (!rv || is_keepcase(rv)) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + rv = checkword(wspace, info, root); + if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL; + } + } + break; + } } - + if (rv) return 1; - // recursive breaking at break points + // recursive breaking at break points (not good for morphological analysis) if (wordbreak) { char * s; char r; - int corr = 0; - wl = strlen(cw); - int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; - // check boundary patterns (^begin and end$) - for (int j = 0; j < numbreak; j++) { - int plen = strlen(wordbreak[j]); - if (plen == 1 || plen > wl) continue; - if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 - && spell(cw + plen - 1)) return 1; - if (wordbreak[j][plen - 1] == '$' && - strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { - r = cw[wl - plen + 1]; - cw[wl - plen + 1] = '\0'; - if (spell(cw)) return 1; - cw[wl - plen + 1] = r; - } - } - // other patterns - for (int j = 0; j < numbreak; j++) { - int result = 0; - int plen = strlen(wordbreak[j]); + for (int j = 0; j < pAMgr->get_numbreak(); j++) { s=(char *) strstr(cw, wordbreak[j]); - if (s && (s > cw) && (s < cw + wl - plen)) { - if (!spell(s + plen)) continue; + if (s) { r = *s; *s = '\0'; // examine 2 sides of the break point - if (spell(cw)) return 1; + if (spell(cw) && spell(s + strlen(wordbreak[j]))) { + *s = r; + return 1; + } *s = r; - - // LANG_hu: spec. dash rule - if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { - r = s[1]; - s[1] = '\0'; - if (spell(cw)) return 1; // check the first part with dash - s[1] = r; - } - // end of LANG speficic region - } } } + // LANG_hu: compoundings with dashes and n-dashes XXX deprecated! + if (langnum == LANG_hu) { + int n; + // compound word with dash (HU) I18n + char * dash; + int result = 0; + // n-dash + dash = (char *) strstr(cw,"\xe2\x80\x93"); + if (dash && !wordbreak) { + *dash = '\0'; + // examine 2 sides of the dash + if (spell(cw) && spell(dash + 3)) { + *dash = '\xe2'; + return 1; + } + *dash = '\xe2'; + } + dash = (char *) strchr(cw,'-'); + if (dash) { + *dash='\0'; + // examine 2 sides of the dash + if (dash[1] == '\0') { // base word ending with dash + if (spell(cw)) return 1; + } else { + // first word ending with dash: word- + char r2 = *(dash + 1); + dash[0]='-'; + dash[1]='\0'; + result = spell(cw); + dash[1] = r2; + dash[0]='\0'; + if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') || + ((dash[1] > '0') && (dash[1] < '9')))) return 1; + } + // affixed number in correct word + if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) { + *dash='-'; + n = 1; + if (*(dash - n) == '.') n++; + // search first not a number character to left from dash + while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { + n++; + } + if ((dash - n) < cw) n--; + // numbers: deprecated + for(; n >= 1; n--) { + if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && + checkword(dash - n, info, root)) return 1; + } + } + } + } return 0; } +//int Hunspell::spell(const char * word) { +// return spell(word, NULL, NULL); +//} + struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) { struct hentry * he = NULL; - int len, i; + int len; char w2[MAXWORDUTF8LEN]; const char * word; @@ -598,29 +600,26 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) } // look word in hash table - for (i = 0; (i < maxdic) && !he; i ++) { - he = (pHMgr[i])->lookup(word); + if (pHMgr) he = pHMgr->lookup(word); // check forbidden and onlyincompound words if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - if (info) *info += SPELL_FORBIDDEN; + info += SPELL_FORBIDDEN; // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { if (pAMgr->get_compoundflag() && TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { - if (info) *info += SPELL_COMPOUND; + info += SPELL_COMPOUND; } } return NULL; } - // he = next not needaffix, onlyincompound homonym or onlyupcase word + // he = next not pseudoroot and not onlyincompound homonym or NULL while (he && (he->astr) && - ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || - (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || - (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) + ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) || + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) )) he = he->next_homonym; - } // check with affixes if (!he && pAMgr) { @@ -628,42 +627,38 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) len = strlen(word); he = pAMgr->affix_check(word, len, 0); - // check compound restriction and onlyupcase - if (he && he->astr && ( - (pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || - (info && (*info & SPELL_INITCAP) && - TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { - he = NULL; - } + // check compound restriction + if (he && he->astr && pAMgr->get_onlyincompound() && + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL; if (he) { if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - if (info) *info += SPELL_FORBIDDEN; + info += SPELL_FORBIDDEN; return NULL; } if (root) { - *root = mystrdup(&(he->word)); - if (*root && complexprefixes) { + *root = mystrdup(he->word); + if (complexprefixes) { if (utf8) reverseword_utf(*root); else reverseword(*root); } } // try check compound word } else if (pAMgr->get_compound()) { - he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0); + he = pAMgr->compound_check(word, len, + 0,0,100,0,NULL,0,NULL,NULL,0); // LANG_hu section: `moving rule' with last dash - if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { + if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) { char * dup = mystrdup(word); - if (!dup) return NULL; dup[len-1] = '\0'; - he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0); + he = pAMgr->compound_check(dup, len-1, + -5,0,100,0,NULL,1,NULL,NULL,0); free(dup); } - // end of LANG speficic region + // end of LANG speficic region if (he) { if (root) { - *root = mystrdup(&(he->word)); - if (*root && complexprefixes) { + *root = mystrdup(he->word); + if (complexprefixes) { if (utf8) reverseword_utf(*root); else reverseword(*root); } } @@ -679,18 +674,12 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) int Hunspell::suggest(char*** slst, const char * word) { #ifdef HUNSPELL_CHROME_CLIENT - if (pHMgr) pHMgr[0]->EmptyHentryCache(); + if (pHMgr) pHMgr->EmptyHentryCache(); #endif - int onlycmpdsug = 0; - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (!pSMgr || maxdic == 0) return 0; - w_char unicw[MAXWORDLEN]; - *slst = NULL; - // process XML input of the simplified API (see manual) - if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { - return spellml(slst, word); - } + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; + w_char unicw[MAXWORDLEN + 1]; int nc = strlen(word); if (utf8) { if (nc >= MAXWORDUTF8LEN) return 0; @@ -699,73 +688,49 @@ int Hunspell::suggest(char*** slst, const char * word) } int captype = 0; int abbv = 0; - int wl = 0; - - // input conversion - RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - + int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); if (wl == 0) return 0; int ns = 0; + *slst = NULL; int capwords = 0; + int ngramsugs = 0; switch(captype) { - case NOCAP: { - ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); + case NOCAP: { + ns = pSMgr->suggest(slst, cw, ns); break; } - case INITCAP: { + case INITCAP: { capwords = 1; - ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); + ns = pSMgr->suggest(slst, cw, ns); if (ns == -1) break; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + ns = pSMgr->suggest(slst, wspace, ns); break; } case HUHINITCAP: capwords = 1; - case HUHCAP: { - ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); + case HUHCAP: { + ns = pSMgr->suggest(slst, cw, ns); if (ns != -1) { int prevns; - // something.The -> something. The - char * dot = strchr(cw, '.'); - if (dot && (dot > cw)) { - int captype_; - if (utf8) { - w_char w_[MAXWORDLEN]; - int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); - captype_ = get_captype_utf8(w_, wl_, langnum); - } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); - if (captype_ == INITCAP) { - char * st = mystrdup(cw); - if (st) st = (char *) realloc(st, wl + 2); - if (st) { - st[(dot - cw) + 1] = ' '; - strcpy(st + (dot - cw) + 2, dot + 1); - ns = insert_sug(slst, st, ns); - free(st); - } - } - } if (captype == HUHINITCAP) { // TheOpenOffice.org -> The OpenOffice.org memcpy(wspace,cw,(wl+1)); mkinitsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + ns = pSMgr->suggest(slst, wspace, ns); } memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - if (spell(wspace)) ns = insert_sug(slst, wspace, ns); + insert_sug(slst, wspace, &ns); prevns = ns; - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + ns = pSMgr->suggest(slst, wspace, ns); if (captype == HUHINITCAP) { mkinitcap2(wspace, unicw, nc); - if (spell(wspace)) ns = insert_sug(slst, wspace, ns); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + insert_sug(slst, wspace, &ns); + ns = pSMgr->suggest(slst, wspace, ns); } // aNew -> "a New" (instead of "a new") for (int j = prevns; j < ns; j++) { @@ -774,7 +739,7 @@ int Hunspell::suggest(char*** slst, const char * word) int slen = strlen(space + 1); // different case after space (need capitalisation) if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { - w_char w[MAXWORDLEN]; + w_char w[MAXWORDLEN + 1]; int wc = 0; char * r = (*slst)[j]; if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); @@ -789,32 +754,31 @@ int Hunspell::suggest(char*** slst, const char * word) break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace, cw, (wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + ns = pSMgr->suggest(slst, wspace, ns); if (ns == -1) break; - if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) - ns = insert_sug(slst, wspace, ns); + if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns); mkinitcap2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); + ns = pSMgr->suggest(slst, wspace, ns); for (int j=0; j < ns; j++) { mkallcap((*slst)[j]); if (pAMgr && pAMgr->get_checksharps()) { char * pos; if (utf8) { - pos = strstr((*slst)[j], "\xC3\x9F"); + pos = strstr((*slst)[j], "\xc3\x9f"); while (pos) { *pos = 'S'; *(pos+1) = 'S'; - pos = strstr(pos+2, "\xC3\x9F"); + pos = strstr(pos+2, "\xc3\x9f"); } } else { - pos = strchr((*slst)[j], '\xDF'); + pos = strchr((*slst)[j], '\xdf'); while (pos) { (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); - mystrrep((*slst)[j], "\xDF", "SS"); - pos = strchr((*slst)[j], '\xDF'); + mystrrep((*slst)[j], "\xdf", "SS"); + pos = strchr((*slst)[j], '\xdf'); } } } @@ -843,76 +807,37 @@ int Hunspell::suggest(char*** slst, const char * word) // END OF LANG_hu section // try ngram approach since found nothing - if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { + if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { + ngramsugs = 1; switch(captype) { case NOCAP: { - ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); + ns = pSMgr->ngsuggest(*slst, cw, pHMgr); break; } - case HUHINITCAP: - capwords = 1; case HUHCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); - break; + ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + break; } - case INITCAP: { + case INITCAP: { capwords = 1; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); break; } case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - int oldns = ns; - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); - for (int j = oldns; j < ns; j++) + ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + for (int j=0; j < ns; j++) mkallcap((*slst)[j]); break; } } } - // try dash suggestion (Afo-American -> Afro-American) - if (strchr(cw, '-')) { - char * pos = strchr(cw, '-'); - char * ppos = cw; - int nodashsug = 1; - char ** nlst = NULL; - int nn = 0; - int last = 0; - for (int j = 0; j < ns && nodashsug == 1; j++) { - if (strchr((*slst)[j], '-')) nodashsug = 0; - } - while (nodashsug && !last) { - if (*pos == '\0') last = 1; else *pos = '\0'; - if (!spell(ppos)) { - nn = suggest(&nlst, ppos); - for (int j = nn - 1; j >= 0; j--) { - strncpy(wspace, cw, ppos - cw); - strcpy(wspace + (ppos - cw), nlst[j]); - if (!last) { - strcat(wspace, "-"); - strcat(wspace, pos + 1); - } - ns = insert_sug(slst, wspace, ns); - free(nlst[j]); - } - if (nlst != NULL) free(nlst); - nodashsug = 0; - } - if (!last) { - *pos = '-'; - ppos = pos + 1; - pos = strchr(ppos, '-'); - } - if (!pos) pos = cw + strlen(cw); - } - } - // word reversing wrapper for complex prefixes if (complexprefixes) { for (int j = 0; j < ns; j++) { @@ -933,14 +858,14 @@ int Hunspell::suggest(char*** slst, const char * word) } } - // remove bad capitalized and forbidden forms - if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { + // suggest keepcase + if (pAMgr->get_keepcase()) { switch (captype) { case INITCAP: case ALLCAP: { int l = 0; for (int j=0; j < ns; j++) { - if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { + if (!spell((*slst)[j])) { char s[MAXSWUTF8L]; w_char w[MAXSWL]; int len; @@ -951,21 +876,21 @@ int Hunspell::suggest(char*** slst, const char * word) len = strlen(s); } mkallsmall2(s, w, len); - free((*slst)[j]); + free((*slst)[j]); if (spell(s)) { (*slst)[l] = mystrdup(s); - if ((*slst)[l]) l++; + l++; } else { mkinitcap2(s, w, len); if (spell(s)) { (*slst)[l] = mystrdup(s); - if ((*slst)[l]) l++; + l++; } } } else { (*slst)[l] = (*slst)[j]; l++; - } + } } ns = l; } @@ -984,28 +909,9 @@ int Hunspell::suggest(char*** slst, const char * word) } l++; } - - // output conversion - rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; - for (int j = 0; rl && j < ns; j++) { - if (rl->conv((*slst)[j], wspace)) { - free((*slst)[j]); - (*slst)[j] = mystrdup(wspace); - } - } - - // if suggestions removed by nosuggest, onlyincompound parameters - if (l == 0 && *slst) { - free(*slst); - *slst = NULL; - } return l; } -void Hunspell::free_list(char *** slst, int n) { - freelist(slst, n); -} - char * Hunspell::get_dic_encoding() { return encoding; @@ -1015,9 +921,9 @@ char * Hunspell::get_dic_encoding() // XXX need UTF-8 support int Hunspell::suggest_auto(char*** slst, const char * word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (!pSMgr || maxdic == 0) return 0; + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1030,15 +936,15 @@ int Hunspell::suggest_auto(char*** slst, const char * word) if (wl == 0) return 0; int ns = 0; *slst = NULL; // HU, nsug in pSMgr->suggest - + switch(captype) { - case NOCAP: { + case NOCAP: { ns = pSMgr->suggest_auto(slst, cw, ns); if (ns>0) break; break; } - case INITCAP: { + case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_auto(slst, wspace, ns); @@ -1046,11 +952,10 @@ int Hunspell::suggest_auto(char*** slst, const char * word) mkinitcap((*slst)[j]); ns = pSMgr->suggest_auto(slst, cw, ns); break; - + } - case HUHINITCAP: - case HUHCAP: { + case HUHCAP: { ns = pSMgr->suggest_auto(slst, cw, ns); if (ns == 0) { memcpy(wspace,cw,(wl+1)); @@ -1060,7 +965,7 @@ int Hunspell::suggest_auto(char*** slst, const char * word) break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_auto(slst, wspace, ns); @@ -1106,89 +1011,103 @@ int Hunspell::suggest_auto(char*** slst, const char * word) } } } - // END OF LANG_hu section + // END OF LANG_hu section return ns; } -#endif -int Hunspell::stem(char*** slst, char ** desc, int n) +// XXX need UTF-8 support +int Hunspell::stem(char*** slst, const char * word) { - char result[MAXLNLEN]; - char result2[MAXLNLEN]; - *slst = NULL; - if (n == 0) return 0; - *result2 = '\0'; - for (int i = 0; i < n; i++) { - *result = '\0'; - // add compound word parts (except the last one) - char * s = (char *) desc[i]; - char * part = strstr(s, MORPH_PART); - if (part) { - char * nextpart = strstr(part + 1, MORPH_PART); - while (nextpart) { - copy_field(result + strlen(result), part, MORPH_PART); - part = nextpart; - nextpart = strstr(part + 1, MORPH_PART); - } - s = part; - } - - char **pl; - char tok[MAXLNLEN]; - strcpy(tok, s); - char * alt = strstr(tok, " | "); - while (alt) { - alt[1] = MSEP_ALT; - alt = strstr(alt, " | "); - } - int pln = line_tok(tok, &pl, MSEP_ALT); - for (int k = 0; k < pln; k++) { - // add derivational suffixes - if (strstr(pl[k], MORPH_DERI_SFX)) { - // remove inflectional suffixes - char * is = strstr(pl[k], MORPH_INFL_SFX); - if (is) *is = '\0'; - char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); - if (sg) { - char ** gen; - int genl = line_tok(sg, &gen, MSEP_REC); - free(sg); - for (int j = 0; j < genl; j++) { - sprintf(result2 + strlen(result2), "%c%s%s", - MSEP_REC, result, gen[j]); - } - freelist(&gen, genl); - } - } else { - sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); - if (strstr(pl[k], MORPH_SURF_PFX)) { - copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); - } - copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); - } - } - freelist(&pl, pln); + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; } - int sln = line_tok(result2, slst, MSEP_REC); - return uniqlist(*slst, sln); + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + + int ns = 0; -} + *slst = NULL; // HU, nsug in pSMgr->suggest + + switch(captype) { + case HUHCAP: + case NOCAP: { + ns = pSMgr->suggest_stems(slst, cw, ns); -int Hunspell::stem(char*** slst, const char * word) -{ - char ** pl; - int pln = analyze(&pl, word); - int pln2 = stem(slst, pl, pln); - freelist(&pl, pln); - return pln2; + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + break; + } + + case INITCAP: { + + ns = pSMgr->suggest_stems(slst, cw, ns); + + if (ns == 0) { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_stems(slst, wspace, ns); + + } + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + mkallsmall(wspace); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + break; + + } + + case ALLCAP: { + ns = pSMgr->suggest_stems(slst, cw, ns); + if (ns != 0) break; + + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_stems(slst, wspace, ns); + + if (ns == 0) { + mkinitcap(wspace); + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + mkallsmall(wspace); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + + break; + } + } + + return ns; } -#ifdef HUNSPELL_EXPERIMENTAL int Hunspell::suggest_pos_stems(char*** slst, const char * word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (! pSMgr || maxdic == 0) return 0; + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1199,14 +1118,14 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); if (wl == 0) return 0; - + int ns = 0; // ns=0 = normalized input *slst = NULL; // HU, nsug in pSMgr->suggest - + switch(captype) { case HUHCAP: - case NOCAP: { + case NOCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); if ((abbv) && (ns == 0)) { @@ -1219,7 +1138,7 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) break; } - case INITCAP: { + case INITCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); @@ -1228,15 +1147,15 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) mkallsmall(wspace); ns = pSMgr->suggest_pos_stems(slst, wspace, ns); } - + break; - + } - case ALLCAP: { + case ALLCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); if (ns != 0) break; - + memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_pos_stems(slst, wspace, ns); @@ -1306,21 +1225,19 @@ int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) return nc; } -int Hunspell::add(const char * word) +int Hunspell::put_word(const char * word) { - if (pHMgr[0]) return (pHMgr[0])->add(word); - return 0; -} - -int Hunspell::add_with_affix(const char * word, const char * example) -{ - if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); + if (pHMgr) { + return pHMgr->put_word(word, strlen(word), NULL); + } return 0; } -int Hunspell::remove(const char * word) +int Hunspell::put_word_pattern(const char * word, const char * pattern) { - if (pHMgr[0]) return (pHMgr[0])->remove(word); + if (pHMgr) { + return pHMgr->put_word_pattern(word, strlen(word), pattern); + } return 0; } @@ -1334,38 +1251,22 @@ struct cs_info * Hunspell::get_csconv() return csconv; } -void Hunspell::cat_result(char * result, char * st) -{ - if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); - free(st); - } -} - -int Hunspell::analyze(char*** slst, const char * word) +#ifdef HUNSPELL_EXPERIMENTAL +// XXX need UTF-8 support +char * Hunspell::morph(const char * word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - w_char unicw[MAXWORDLEN]; - int wl2 = 0; - *slst = NULL; - if (! pSMgr || maxdic == 0) return 0; - int nc = strlen(word); + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; + int wl = strlen(word); if (utf8) { - if (nc >= MAXWORDUTF8LEN) return 0; + if (wl >= MAXWORDUTF8LEN) return 0; } else { - if (nc >= MAXWORDLEN) return 0; + if (wl >= MAXWORDLEN) return 0; } int captype = 0; int abbv = 0; - int wl = 0; - - // input conversion - RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; - if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); - else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - + wl = cleanword(cw, word, &captype, &abbv); if (wl == 0) { if (abbv) { for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; @@ -1376,7 +1277,7 @@ int Hunspell::analyze(char*** slst, const char * word) char result[MAXLNLEN]; char * st = NULL; - + *result = '\0'; int n = 0; @@ -1386,103 +1287,177 @@ int Hunspell::analyze(char*** slst, const char * word) // test numbers // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { - while ((n < wl) && + while ((n < wl) && (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { n++; if ((cw[n] == '.') || (cw[n] == ',')) { - if (((n2 == 0) && (n > 3)) || + if (((n2 == 0) && (n > 3)) || ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; n2++; n3 = n; } } - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; - if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { - mystrcat(result, cw, MAXLNLEN); + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL; + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xb0')) && checkword(cw+n, NULL, NULL))) { + strcat(result, cw); result[n - 1] = '\0'; - if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); - else { + if (n == wl) { + st = pSMgr->suggest_morph(cw + n - 1); + if (st) { + strcat(result, st); + free(st); + } + } else { char sign = cw[n]; cw[n] = '\0'; - cat_result(result, pSMgr->suggest_morph(cw + n - 1)); - mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE + st = pSMgr->suggest_morph(cw + n - 1); + if (st) { + strcat(result, st); + free(st); + } + strcat(result, "+"); // XXX SPEC. MORPHCODE cw[n] = sign; - cat_result(result, pSMgr->suggest_morph(cw + n)); + st = pSMgr->suggest_morph(cw + n); + if (st) { + strcat(result, st); + free(st); + } } - return line_tok(result, slst, MSEP_REC); + return mystrdup(result); } } // END OF LANG_hu section - + switch(captype) { - case HUHCAP: - case HUHINITCAP: - case NOCAP: { - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - cat_result(result, pSMgr->suggest_morph(wspace)); - } - break; - } - case INITCAP: { - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); - wl2 = mkinitcap2(cw, unicw, nc); - cat_result(result, pSMgr->suggest_morph(wspace)); - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { + case NOCAP: { + st = pSMgr->suggest_morph(cw); + if (st) { + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - cat_result(result, pSMgr->suggest_morph(wspace)); - - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - - cat_result(result, pSMgr->suggest_morph(wspace)); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } } - break; + break; } - case ALLCAP: { - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - memcpy(wspace,cw,wl); + case INITCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + strcat(result, st); + free(st); + } + st = pSMgr->suggest_morph(cw); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - cat_result(result, pSMgr->suggest_morph(cw)); + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } } - wl = mkallsmall2(cw, unicw, nc); + break; + } + case HUHCAP: { + st = pSMgr->suggest_morph(cw); + if (st) { + strcat(result, st); + free(st); + } +#if 0 memcpy(wspace,cw,(wl+1)); - wl2 = mkinitcap2(cw, unicw, nc); - - cat_result(result, pSMgr->suggest_morph(wspace)); - cat_result(result, pSMgr->suggest_morph(cw)); - if (abbv) { - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - cat_result(result, pSMgr->suggest_morph(wspace)); - - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - - cat_result(result, pSMgr->suggest_morph(wspace)); + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); } +#endif + break; + } + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + st = pSMgr->suggest_morph(wspace); + if (st) { + strcat(result, st); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,(wl+1)); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + if (*result) strcat(result, "\n"); + st = pSMgr->suggest_morph(wspace); + if (st) { + strcat(result, st); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } break; } } - if (*result) { + if (result && (*result)) { // word reversing wrapper for complex prefixes if (complexprefixes) { if (utf8) reverseword_utf(result); else reverseword(result); } - return line_tok(result, slst, MSEP_REC); - + return mystrdup(result); } // compound word with dash (HU) I18n @@ -1491,24 +1466,24 @@ int Hunspell::analyze(char*** slst, const char * word) // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); if ((langnum == LANG_hu) && dash) { - *dash='\0'; + *dash='\0'; // examine 2 sides of the dash if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC); + if (spell(cw)) return pSMgr->suggest_morph(cw); } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. if (spell(cw) && (spell("-e"))) { st = pSMgr->suggest_morph(cw); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } - mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE + strcat(result,"+"); // XXX spec. separator in MORPHCODE st = pSMgr->suggest_morph("-e"); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } - return line_tok(result, slst, MSEP_REC); + return mystrdup(result); } } else { // first word ending with dash: word- XXX ??? @@ -1520,22 +1495,22 @@ int Hunspell::analyze(char*** slst, const char * word) dash[0]='\0'; if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) { - st = pSMgr->suggest_morph(cw); + st = morph(cw); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); - mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE + strcat(result,"+"); // XXX spec. separator in MORPHCODE } - st = pSMgr->suggest_morph(dash+1); + st = morph(dash+1); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } - return line_tok(result, slst, MSEP_REC); + return mystrdup(result); } } // affixed number in correct word - if (nresult && (dash > cw) && (((*(dash-1)<='9') && + if (nresult && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)=='.'))) { *dash='-'; n = 1; @@ -1550,338 +1525,195 @@ int Hunspell::analyze(char*** slst, const char * word) // 56-hoz, 6-hoz for(; n >= 1; n--) { if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { - mystrcat(result, cw, MAXLNLEN); + strcat(result, cw); result[dash - cw - n] = '\0'; st = pSMgr->suggest_morph(dash - n); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } - return line_tok(result, slst, MSEP_REC); + return mystrdup(result); } } } } - return 0; -} - -int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) -{ - *slst = NULL; - if (!pSMgr || !pln) return 0; - char **pl2; - int pl2n = analyze(&pl2, word); - int captype = 0; - int abbv = 0; - char cw[MAXWORDUTF8LEN]; - cleanword(cw, word, &captype, &abbv); - char result[MAXLNLEN]; - *result = '\0'; - - for (int i = 0; i < pln; i++) { - cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); - } - freelist(&pl2, pl2n); - - if (*result) { - // allcap - if (captype == ALLCAP) mkallcap(result); - - // line split - int linenum = line_tok(result, slst, MSEP_REC); - - // capitalize - if (captype == INITCAP || captype == HUHINITCAP) { - for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); - } - - // temporary filtering of prefix related errors (eg. - // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") - - int r = 0; - for (int j=0; j < linenum; j++) { - if (!spell((*slst)[j])) { - free((*slst)[j]); - (*slst)[j] = NULL; - } else { - if (r < j) (*slst)[r] = (*slst)[j]; - r++; - } - } - if (r > 0) return r; - free(*slst); - *slst = NULL; - } - return 0; -} - -int Hunspell::generate(char*** slst, const char * word, const char * pattern) -{ - char **pl; - int pln = analyze(&pl, pattern); - int n = generate(slst, word, pl, pln); - freelist(&pl, pln); - return uniqlist(*slst, n); -} - -// minimal XML parser functions -int Hunspell::get_xml_par(char * dest, const char * par, int max) -{ - char * d = dest; - if (!par) return 0; - char end = *par; - char * dmax = dest + max; - if (end == '>') end = '<'; - else if (end != '\'' && end != '"') return 0; // bad XML - for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; - *d = '\0'; - mystrrep(dest, "<", "<"); - mystrrep(dest, "&", "&"); - return d - dest; -} - -// return the beginning of the element (attr == NULL) or the attribute -const char * Hunspell::get_xml_pos(const char * s, const char * attr) -{ - const char * end = strchr(s, '>'); - const char * p = s; - if (attr == NULL) return end; - do { - p = strstr(p, attr); - if (!p || p >= end) return 0; - } while (*(p-1) != ' ' && *(p-1) != '\n'); - return p + strlen(attr); -} - -int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { - char cw[MAXWORDUTF8LEN]; - if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && - strcmp(cw, value) == 0) return 1; - return 0; -} - -int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { - int n = 0; - char * p; - if (!list) return 0; - for (p = list; (p = strstr(p, tag)); p++) n++; - if (n == 0) return 0; - *slst = (char **) malloc(sizeof(char *) * n); - if (!*slst) return 0; - for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { - int l = strlen(p); - (*slst)[n] = (char *) malloc(l); - if (!(*slst)[n]) return (n > 0 ? n - 1 : 0); - get_xml_par((*slst)[n], p + strlen(tag) - 1, l); - } - return n; -} - -int Hunspell::spellml(char*** slst, const char * word) -{ - char *q, *q2; - char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; - q = (char *) strstr(word, "<query"); - if (!q) return 0; // bad XML input - q2 = strchr(q, '>'); - if (!q2) return 0; // bad XML input - q2 = strstr(q2, "<word"); - if (!q2) return 0; // bad XML input - if (check_xml_par(q, "type=", "analyze")) { - int n = 0, s = 0; - if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) n = analyze(slst, cw); - if (n == 0) return 0; - // convert the result to <code><a>ana1</a><a>ana2</a></code> format - for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); - char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->& - if (!r) return 0; - strcpy(r, "<code>"); - for (int i = 0; i < n; i++) { - int l = strlen(r); - strcpy(r + l, "<a>"); - strcpy(r + l + 3, (*slst)[i]); - mystrrep(r + l + 3, "\t", " "); - mystrrep(r + l + 3, "<", "<"); - mystrrep(r + l + 3, "&", "&"); - strcat(r, "</a>"); - free((*slst)[i]); - } - strcat(r, "</code>"); - (*slst)[0] = r; - return 1; - } else if (check_xml_par(q, "type=", "stem")) { - if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) return stem(slst, cw); - } else if (check_xml_par(q, "type=", "generate")) { - int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN); - if (n == 0) return 0; - char * q3 = strstr(q2 + 1, "<word"); - if (q3) { - if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN)) { - return generate(slst, cw, cw2); - } - } else { - char ** slst2; - if ((q2 = strstr(q2 + 1, "<code")) && - (n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { - int n2 = generate(slst, cw, slst2, n); - freelist(&slst2, n); - return uniqlist(*slst, n2); - } - } - } - return 0; + return NULL; } - -#ifdef HUNSPELL_EXPERIMENTAL // XXX need UTF-8 support char * Hunspell::morph_with_correction(const char * word) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (! pSMgr || maxdic == 0) return NULL; + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; int wl = strlen(word); if (utf8) { - if (wl >= MAXWORDUTF8LEN) return NULL; + if (wl >= MAXWORDUTF8LEN) return 0; } else { - if (wl >= MAXWORDLEN) return NULL; + if (wl >= MAXWORDLEN) return 0; } int captype = 0; int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return NULL; + if (wl == 0) return 0; char result[MAXLNLEN]; char * st = NULL; - + *result = '\0'; - - + + switch(captype) { - case NOCAP: { + case NOCAP: { st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); } } break; } - case INITCAP: { + case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); - } - st = pSMgr->suggest_morph_for_spelling_error(cw); + } + st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); - } + } mkinitcap(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); - } + } } break; } - case HUHCAP: { + case HUHCAP: { st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); - } + } break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace,cw,(wl+1)); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); - } + } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); } - if (abbv) { + if (abbv) { memcpy(wspace,cw,(wl+1)); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - if (*result) mystrcat(result, "\n", MAXLNLEN); + if (*result) strcat(result, "\n"); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - mystrcat(result, st, MAXLNLEN); - free(st); - } + strcat(result, st); + free(st); + } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) mystrcat(result, "\n", MAXLNLEN); - mystrcat(result, st, MAXLNLEN); + if (*result) strcat(result, "\n"); + strcat(result, st); free(st); } - } + } break; } } - if (*result) return mystrdup(result); + if (result) return mystrdup(result); return NULL; } +/* analyze word + * return line count + * XXX need a better data structure for morphological analysis */ +int Hunspell::analyze(char ***out, const char *word) { + int n = 0; + if (!word) return 0; + char * m = morph(word); + if(!m) return 0; + if (!out) return line_tok(m, out); + + // without memory allocation + /* BUG missing buffer size checking */ + int i, p; + for(p = 0, i = 0; m[i]; i++) { + if(m[i] == '\n' || !m[i+1]) { + n++; + strncpy((*out)[n++], m + p, i - p + 1); + if (m[i] == '\n') (*out)[n++][i - p] = '\0'; + if(!m[i+1]) break; + p = i + 1; + } + } + free(m); + return n; +} + #endif // END OF HUNSPELL_EXPERIMENTAL CODE Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) @@ -1893,17 +1725,6 @@ Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) #endif } - -Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, - const char * key) -{ -#ifdef HUNSPELL_CHROME_CLIENT - return NULL; -#else - return (Hunhandle*)(new Hunspell(affpath, dpath, key)); -#endif -} - void Hunspell_destroy(Hunhandle *pHunspell) { delete (Hunspell*)(pHunspell); @@ -1924,57 +1745,3 @@ int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) return ((Hunspell*)pHunspell)->suggest(slst, word); } -int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) -{ - return ((Hunspell*)pHunspell)->analyze(slst, word); -} - -int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) -{ - return ((Hunspell*)pHunspell)->stem(slst, word); -} - -int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int n) -{ - return ((Hunspell*)pHunspell)->stem(slst, desc, n); -} - -int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, - const char * word2) -{ - return ((Hunspell*)pHunspell)->generate(slst, word, word2); -} - -int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, - char** desc, int n) -{ - return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); -} - - /* functions for run-time modification of the dictionary */ - - /* add word to the run-time dictionary */ - -int Hunspell_add(Hunhandle *pHunspell, const char * word) { - return ((Hunspell*)pHunspell)->add(word); -} - - /* add word to the run-time dictionary with affix flags of - * the example (a dictionary word): Hunspell will recognize - * affixed forms of the new word, too. - */ - -int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, - const char * example) { - return ((Hunspell*)pHunspell)->add_with_affix(word, example); -} - - /* remove word from the run-time dictionary */ - -int Hunspell_remove(Hunhandle *pHunspell, const char * word) { - return ((Hunspell*)pHunspell)->remove(word); -} - -void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n) { - freelist(slst, n); -} diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.h b/chrome/third_party/hunspell/src/hunspell/hunspell.h index f926052..b04b83a 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.h +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.h @@ -7,25 +7,15 @@ extern "C" { typedef struct Hunhandle Hunhandle; -#ifdef _MSC_VER -#define DLL __declspec ( dllexport ) -#else -#define DLL -#endif - -DLL Hunhandle *Hunspell_create(const char * affpath, const char * dpath); - -DLL Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, - const char * key); - -DLL void Hunspell_destroy(Hunhandle *pHunspell); +Hunhandle *Hunspell_create(const char * affpath, const char * dpath); +void Hunspell_destroy(Hunhandle *pHunspell); /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word */ -DLL int Hunspell_spell(Hunhandle *pHunspell, const char *); +int Hunspell_spell(Hunhandle *pHunspell, const char *); -DLL char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); +char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); /* suggest(suggestions, word) - search suggestions * input: pointer to an array of strings pointer and the (bad) word @@ -34,63 +24,7 @@ DLL char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); * a newly allocated array of strings (*slts will be NULL when number * of suggestion equals 0.) */ -DLL int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); - - /* morphological functions */ - - /* analyze(result, word) - morphological analysis of the word */ - -DLL int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word); - - /* stem(result, word) - stemmer function */ - -DLL int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); - - /* stem(result, analysis, n) - get stems from a morph. analysis - * example: - * char ** result, result2; - * int n1 = Hunspell_analyze(result, "words"); - * int n2 = Hunspell_stem2(result2, result, n1); - */ - -DLL int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n); - - /* generate(result, word, word2) - morphological generation by example(s) */ - -DLL int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, - const char * word2); - - /* generate(result, word, desc, n) - generation by morph. description(s) - * example: - * char ** result; - * char * affix = "is:plural"; // description depends from dictionaries, too - * int n = Hunspell_generate2(result, "word", &affix, 1); - * for (int i = 0; i < n; i++) printf("%s\n", result[i]); - */ - -DLL int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, - char** desc, int n); - - /* functions for run-time modification of the dictionary */ - - /* add word to the run-time dictionary */ - -DLL int Hunspell_add(Hunhandle *pHunspell, const char * word); - - /* add word to the run-time dictionary with affix flags of - * the example (a dictionary word): Hunspell will recognize - * affixed forms of the new word, too. - */ - -DLL int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, const char * example); - - /* remove word from the run-time dictionary */ - -DLL int Hunspell_remove(Hunhandle *pHunspell, const char * word); - - /* free suggestion lists */ - -DLL void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n); +int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); #ifdef __cplusplus } diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.hxx b/chrome/third_party/hunspell/src/hunspell/hunspell.hxx index bb26b5b..bc6f7d5 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.hxx +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.hxx @@ -1,20 +1,30 @@ +#include "license.hunspell" +#include "license.myspell" + #include "hashmgr.hxx" #include "affixmgr.hxx" #include "suggestmgr.hxx" +#include "csutil.hxx" #include "langnum.hxx" #define SPELL_COMPOUND (1 << 0) #define SPELL_FORBIDDEN (1 << 1) -#define SPELL_ALLCAP (1 << 2) -#define SPELL_NOCAP (1 << 3) -#define SPELL_INITCAP (1 << 4) -#define SPELL_XML "<?xml?>" +#define NOCAP 0 +#define INITCAP 1 +#define ALLCAP 2 +#define HUHCAP 3 +#define HUHINITCAP 4 -#define MAXDIC 20 #define MAXSUGGESTION 15 #define MAXSHARPS 5 +#if defined(W32) && defined(LIBRARY) +#define DLLTEST2_API __declspec(dllexport) +#else +#define DLLTEST2_API +#endif + #ifndef _MYSPELLMGR_HXX_ #define _MYSPELLMGR_HXX_ @@ -22,27 +32,15 @@ #include "chrome/third_party/hunspell/google/bdict_reader.h" #endif -#ifdef HUNSPELL_STATIC - #define DLLEXPORT -#else - #ifdef HUNSPELL_EXPORTS - #define DLLEXPORT __declspec( dllexport ) - #else - #define DLLEXPORT __declspec( dllimport ) - #endif -#endif - -#ifdef WIN32 -class DLLEXPORT Hunspell +#ifdef W32 +class DLLTEST2_API Hunspell #else class Hunspell #endif { AffixMgr* pAMgr; - HashMgr* pHMgr[MAXDIC]; - int maxdic; + HashMgr* pHMgr; SuggestMgr* pSMgr; - char * affixpath; char * encoding; struct cs_info * csconv; int langnum; @@ -63,16 +61,11 @@ public: #ifdef HUNSPELL_CHROME_CLIENT Hunspell(const unsigned char* bdict_data, size_t bdict_length); #else - Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL); + Hunspell(FILE* aff_handle, FILE* dic_handle); #endif ~Hunspell(); -#ifndef HUNSPELL_CHROME_CLIENT - /* load extra dictionaries (only dic files) */ - int add_dic(const char * dpath, const char * key = NULL); -#endif - /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word * @@ -94,62 +87,17 @@ public: */ int suggest(char*** slst, const char * word); - - /* deallocate suggestion lists */ - - void free_list(char *** slst, int n); - char * get_dic_encoding(); - /* morphological functions */ - - /* analyze(result, word) - morphological analysis of the word */ - - int analyze(char*** slst, const char * word); - - /* stem(result, word) - stemmer function */ + /* handling custom dictionary */ - int stem(char*** slst, const char * word); - - /* stem(result, analysis, n) - get stems from a morph. analysis - * example: - * char ** result, result2; - * int n1 = analyze(&result, "words"); - * int n2 = stem(&result2, result, n1); - */ - - int stem(char*** slst, char ** morph, int n); - - /* generate(result, word, word2) - morphological generation by example(s) */ - - int generate(char*** slst, const char * word, const char * word2); + int put_word(const char * word); - /* generate(result, word, desc, n) - generation by morph. description(s) - * example: - * char ** result; - * char * affix = "is:plural"; // description depends from dictionaries, too - * int n = generate(&result, "word", &affix, 1); - * for (int i = 0; i < n; i++) printf("%s\n", result[i]); - */ - - int generate(char*** slst, const char * word, char ** desc, int n); - - /* functions for run-time modification of the dictionary */ - - /* add word to the run-time dictionary */ - - int add(const char * word); - - /* add word to the run-time dictionary with affix flags of - * the example (a dictionary word): Hunspell will recognize - * affixed forms of the new word, too. + /* pattern is a sample dictionary word + * put word into custom dictionary with affix flags of pattern word */ - int add_with_affix(const char * word, const char * example); - - /* remove word from the run-time dictionary */ - - int remove(const char * word); + int put_word_pattern(const char * word, const char * pattern); /* other */ @@ -159,14 +107,25 @@ public: struct cs_info * get_csconv(); const char * get_version(); - - /* experimental and deprecated functions */ + + /* experimental functions */ #ifdef HUNSPELL_EXPERIMENTAL - /* suffix is an affix flag string, similarly in dictionary files */ + /* suffix is an affix flag string, similarly in dictionary files */ + int put_word_suffix(const char * word, const char * suffix); + + /* morphological analysis */ + + char * morph(const char * word); + int analyze(char*** out, const char *word); + char * morph_with_correction(const char * word); + /* stemmer function */ + + int stem(char*** slst, const char * word); + /* spec. suggestions */ int suggest_auto(char*** slst, const char * word); int suggest_pos_stems(char*** slst, const char * word); @@ -187,14 +146,8 @@ private: char * sharps_u8_l1(char * dest, char * source); hentry * spellsharps(char * base, char *, int, int, char * tmp, int * info, char **root); int is_keepcase(const hentry * rv); - int insert_sug(char ***slst, char * word, int ns); - void cat_result(char * result, char * st); - char * stem_description(const char * desc); - int spellml(char*** slst, const char * word); - int get_xml_par(char * dest, const char * par, int maxl); - const char * get_xml_pos(const char * s, const char * attr); - int get_xml_list(char ***slst, char * list, const char * tag); - int check_xml_par(const char * q, const char * attr, const char * value); + int insert_sug(char ***slst, char * word, int *ns); + }; diff --git a/chrome/third_party/hunspell/src/hunspell/hunzip.cxx b/chrome/third_party/hunspell/src/hunspell/hunzip.cxx deleted file mode 100644 index f9091b8..0000000 --- a/chrome/third_party/hunspell/src/hunspell/hunzip.cxx +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef MOZILLA_CLIENT -#include <cstdlib> -#include <cstring> -#include <cstdio> -#else -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#endif - -#include "hunzip.hxx" - -#define CODELEN 65536 -#define BASEBITREC 5000 - -#define UNCOMPRESSED '\002' -#define MAGIC "hz0" -#define MAGIC_ENCRYPT "hz1" -#define MAGICLEN (sizeof(MAGIC) - 1) - -int Hunzip::fail(const char * err, const char * par) { - fprintf(stderr, err, par); - return -1; -} - -Hunzip::Hunzip(const char * file, const char * key) { - bufsiz = 0; - lastbit = 0; - inc = 0; - outc = 0; - dec = NULL; - filename = (char *) malloc(strlen(file) + 1); - if (filename) strcpy(filename, file); - if (getcode(key) == -1) bufsiz = -1; - else bufsiz = getbuf(); -} - -int Hunzip::getcode(const char * key) { - unsigned char c[2]; - int i, j, n, p; - int allocatedbit = BASEBITREC; - const char * enc = key; - - fin = fopen(filename, "rb"); - if (!fin) return -1; - - // read magic number - if ((fread(in, 1, 3, fin) < MAGICLEN) - || !(strncmp(MAGIC, in, MAGICLEN) == 0 || - strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) { - return fail(MSG_FORMAT, filename); - } - - // check encryption - if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) { - unsigned char cs; - if (!key) return fail(MSG_KEY, filename); - if (fread(&c, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); - for (cs = 0; *enc; enc++) cs ^= *enc; - if (cs != c[0]) return fail(MSG_KEY, filename); - enc = key; - } else key = NULL; - - // read record count - if (fread(&c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); - - if (key) { - c[0] ^= *enc; - if (*(++enc) == '\0') enc = key; - c[1] ^= *enc; - } - - n = ((int) c[0] << 8) + c[1]; - dec = (struct bit *) malloc(BASEBITREC * sizeof(struct bit)); - if (!dec) return fail(MSG_MEMORY, filename); - dec[0].v[0] = 0; - dec[0].v[1] = 0; - - // read codes - for (i = 0; i < n; i++) { - unsigned char l; - if (fread(c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); - if (key) { - if (*(++enc) == '\0') enc = key; - c[0] ^= *enc; - if (*(++enc) == '\0') enc = key; - c[1] ^= *enc; - } - if (fread(&l, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); - if (key) { - if (*(++enc) == '\0') enc = key; - l ^= *enc; - } - if (fread(in, 1, l/8+1, fin) < (size_t) l/8+1) return fail(MSG_FORMAT, filename); - if (key) for (j = 0; j <= l/8; j++) { - if (*(++enc) == '\0') enc = key; - in[j] ^= *enc; - } - p = 0; - for (j = 0; j < l; j++) { - int b = (in[j/8] & (1 << (7 - (j % 8)))) ? 1 : 0; - int oldp = p; - p = dec[p].v[b]; - if (p == 0) { - lastbit++; - if (lastbit == allocatedbit) { - allocatedbit += BASEBITREC; - dec = (struct bit *) realloc(dec, allocatedbit * sizeof(struct bit)); - } - dec[lastbit].v[0] = 0; - dec[lastbit].v[1] = 0; - dec[oldp].v[b] = lastbit; - p = lastbit; - } - } - dec[p].c[0] = c[0]; - dec[p].c[1] = c[1]; - } - return 0; -} - -Hunzip::~Hunzip() -{ - if (dec) free(dec); - if (fin) fclose(fin); - if (filename) free(filename); -} - -int Hunzip::getbuf() { - int p = 0; - int o = 0; - do { - if (inc == 0) inbits = fread(in, 1, BUFSIZE, fin) * 8; - for (; inc < inbits; inc++) { - int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0; - int oldp = p; - p = dec[p].v[b]; - if (p == 0) { - if (oldp == lastbit) { - fclose(fin); - fin = NULL; - // add last odd byte - if (dec[lastbit].c[0]) out[o++] = dec[lastbit].c[1]; - return o; - } - out[o++] = dec[oldp].c[0]; - out[o++] = dec[oldp].c[1]; - if (o == BUFSIZE) return o; - p = dec[p].v[b]; - } - } - inc = 0; - } while (inbits == BUFSIZE * 8); - return fail(MSG_FORMAT, filename); -} - -const char * Hunzip::getline() { - char linebuf[BUFSIZE]; - int l = 0, eol = 0, left = 0, right = 0; - if (bufsiz == -1) return NULL; - while (l < bufsiz && !eol) { - linebuf[l++] = out[outc]; - switch (out[outc]) { - case '\t': break; - case 31: { // escape - if (++outc == bufsiz) { - bufsiz = getbuf(); - outc = 0; - } - linebuf[l - 1] = out[outc]; - break; - } - case ' ': break; - default: if (((unsigned char) out[outc]) < 47) { - if (out[outc] > 32) { - right = out[outc] - 31; - if (++outc == bufsiz) { - bufsiz = getbuf(); - outc = 0; - } - } - if (out[outc] == 30) left = 9; else left = out[outc]; - linebuf[l-1] = '\n'; - eol = 1; - } - } - if (++outc == bufsiz) { - outc = 0; - bufsiz = fin ? getbuf(): -1; - } - } - if (right) strcpy(linebuf + l - 1, line + strlen(line) - right - 1); - else linebuf[l] = '\0'; - strcpy(line + left, linebuf); - return line; -} diff --git a/chrome/third_party/hunspell/src/hunspell/hunzip.hxx b/chrome/third_party/hunspell/src/hunspell/hunzip.hxx deleted file mode 100644 index 52109d1..0000000 --- a/chrome/third_party/hunspell/src/hunspell/hunzip.hxx +++ /dev/null @@ -1,41 +0,0 @@ -/* hunzip: file decompression for sorted dictionaries with optional encryption, - * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */ - -#ifndef _HUNZIP_HXX_ -#define _HUNZIP_HXX_ - -#define BUFSIZE 65536 -#define HZIP_EXTENSION ".hz" - -#define MSG_OPEN "error: %s: cannot open\n" -#define MSG_FORMAT "error: %s: not in hzip format\n" -#define MSG_MEMORY "error: %s: missing memory\n" -#define MSG_KEY "error: %s: missing or bad password\n" - -struct bit { - unsigned char c[2]; - int v[2]; -}; - -class Hunzip -{ - -protected: - char * filename; - FILE * fin; - int bufsiz, lastbit, inc, inbits, outc; - struct bit * dec; // code table - char in[BUFSIZE]; // input buffer - char out[BUFSIZE + 1]; // Huffman-decoded buffer - char line[BUFSIZE + 50]; // decoded line - int getcode(const char * key); - int getbuf(); - int fail(const char * err, const char * par); - -public: - Hunzip(const char * filename, const char * key = NULL); - ~Hunzip(); - const char * getline(); -}; - -#endif diff --git a/chrome/third_party/hunspell/src/hunspell/license.hunspell b/chrome/third_party/hunspell/src/hunspell/license.hunspell index 3afee61..f1cf161 100644 --- a/chrome/third_party/hunspell/src/hunspell/license.hunspell +++ b/chrome/third_party/hunspell/src/hunspell/license.hunspell @@ -14,7 +14,7 @@ * The Original Code is Hunspell, based on MySpell. * * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Laszlo Nemeth (Hunspell). + * Kevin Hendricks (MySpell) and Németh László (Hunspell). * Portions created by the Initial Developers are Copyright (C) 2002-2005 * the Initial Developers. All Rights Reserved. * @@ -24,22 +24,22 @@ * Giuseppe Modugno * Gianluca Turconi * Simon Brouwer - * Noll Janos - * Biro Arpad - * Goldman Eleonora - * Sarlos Tamas - * Bencsath Boldizsar - * Halacsy Peter - * Dvornik Laszlo - * Gefferth Andras + * Noll János + * Bíró Árpád + * Goldman Eleonóra + * Sarlós Tamás + * Bencsáth Boldizsár + * Halácsy Péter + * Dvornik László + * Gefferth András * Nagy Viktor - * Varga Daniel + * Varga Dániel * Chris Halls * Rene Engelhard * Bram Moolenaar * Dafydd Jones - * Harri Pitkanen - * Andras Timar + * Harri Pitkänen + * András Tímár * Tor Lillqvist * * Alternatively, the contents of this file may be used under the terms of diff --git a/chrome/third_party/hunspell/src/hunspell/phonet.cxx b/chrome/third_party/hunspell/src/hunspell/phonet.cxx deleted file mode 100644 index ca20796..0000000 --- a/chrome/third_party/hunspell/src/hunspell/phonet.cxx +++ /dev/null @@ -1,299 +0,0 @@ -/* phonetic.c - generic replacement aglogithms for phonetic transformation - Copyright (C) 2000 Bjoern Jacke - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License version 2.1 as published by the Free Software Foundation; - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; If not, see - <http://www.gnu.org/licenses/>. - - Changelog: - - 2000-01-05 Bjoern Jacke <bjoern at j3e.de> - Initial Release insprired by the article about phonetic - transformations out of c't 25/1999 - - 2007-07-26 Bjoern Jacke <bjoern at j3e.de> - Released under MPL/GPL/LGPL tri-license for Hunspell - - 2007-08-23 Laszlo Nemeth <nemeth at OOo> - Porting from Aspell to Hunspell using C-like structs -*/ - -#ifndef MOZILLA_CLIENT -#include <cstdlib> -#include <cstring> -#include <cstdio> -#include <cctype> -#else -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#include <ctype.h> -#endif - -#include "csutil.hxx" -#include "phonet.hxx" - -void init_phonet_hash(phonetable & parms) - { - int i, k; - - for (i = 0; i < HASHSIZE; i++) { - parms.hash[i] = -1; - } - - for (i = 0; parms.rules[i][0] != '\0'; i += 2) { - /** set hash value **/ - k = (unsigned char) parms.rules[i][0]; - - if (parms.hash[k] < 0) { - parms.hash[k] = i; - } - } - } - - // like strcpy but safe if the strings overlap - // but only if dest < src - static inline void strmove(char * dest, char * src) { - while (*src) - *dest++ = *src++; - *dest = '\0'; - } - -int myisalpha(char ch) { - if ((unsigned char) ch < 128) return isalpha(ch); - return 1; -} - -/* phonetic transcription algorithm */ -/* see: http://aspell.net/man-html/Phonetic-Code.html */ -/* convert string to uppercase before this call */ -int phonet (const char * inword, char * target, - int len, - phonetable & parms) - { - /** Do phonetic transformation. **/ - /** "len" = length of "inword" incl. '\0'. **/ - - /** result: >= 0: length of "target" **/ - /** otherwise: error **/ - - int i,j,k=0,n,p,z; - int k0,n0,p0=-333,z0; - char c, c0; - const char * s; - typedef unsigned char uchar; - char word[MAXPHONETUTF8LEN + 1]; - if (len == -1) len = strlen(inword); - if (len > MAXPHONETUTF8LEN) return 0; - strcpy(word, inword); - - /** check word **/ - i = j = z = 0; - while ((c = word[i]) != '\0') { - n = parms.hash[(uchar) c]; - z0 = 0; - - if (n >= 0) { - /** check all rules for the same letter **/ - while (parms.rules[n][0] == c) { - - /** check whole string **/ - k = 1; /** number of found letters **/ - p = 5; /** default priority **/ - s = parms.rules[n]; - s++; /** important for (see below) "*(s-1)" **/ - - while (*s != '\0' && word[i+k] == *s - && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) { - k++; - s++; - } - if (*s == '(') { - /** check letters in "(..)" **/ - if (myisalpha(word[i+k]) // ...could be implied? - && strchr(s+1, word[i+k]) != NULL) { - k++; - while (*s != ')') - s++; - s++; - } - } - p0 = (int) *s; - k0 = k; - while (*s == '-' && k > 1) { - k--; - s++; - } - if (*s == '<') - s++; - if (isdigit ((unsigned char) *s)) { - /** determine priority **/ - p = *s - '0'; - s++; - } - if (*s == '^' && *(s+1) == '^') - s++; - - if (*s == '\0' - || (*s == '^' - && (i == 0 || ! myisalpha(word[i-1])) - && (*(s+1) != '$' - || (! myisalpha(word[i+k0]) ))) - || (*s == '$' && i > 0 - && myisalpha(word[i-1]) - && (! myisalpha(word[i+k0]) ))) - { - /** search for followup rules, if: **/ - /** parms.followup and k > 1 and NO '-' in searchstring **/ - c0 = word[i+k-1]; - n0 = parms.hash[(uchar) c0]; - -// if (parms.followup && k > 1 && n0 >= 0 - if (k > 1 && n0 >= 0 - && p0 != (int) '-' && word[i+k] != '\0') { - /** test follow-up rule for "word[i+k]" **/ - while (parms.rules[n0][0] == c0) { - - /** check whole string **/ - k0 = k; - p0 = 5; - s = parms.rules[n0]; - s++; - while (*s != '\0' && word[i+k0] == *s - && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) { - k0++; - s++; - } - if (*s == '(') { - /** check letters **/ - if (myisalpha(word[i+k0]) - && strchr (s+1, word[i+k0]) != NULL) { - k0++; - while (*s != ')' && *s != '\0') - s++; - if (*s == ')') - s++; - } - } - while (*s == '-') { - /** "k0" gets NOT reduced **/ - /** because "if (k0 == k)" **/ - s++; - } - if (*s == '<') - s++; - if (isdigit ((unsigned char) *s)) { - p0 = *s - '0'; - s++; - } - - if (*s == '\0' - /** *s == '^' cuts **/ - || (*s == '$' && ! myisalpha(word[i+k0]))) - { - if (k0 == k) { - /** this is just a piece of the string **/ - n0 += 2; - continue; - } - - if (p0 < p) { - /** priority too low **/ - n0 += 2; - continue; - } - /** rule fits; stop search **/ - break; - } - n0 += 2; - } /** End of "while (parms.rules[n0][0] == c0)" **/ - - if (p0 >= p && parms.rules[n0][0] == c0) { - n += 2; - continue; - } - } /** end of follow-up stuff **/ - - /** replace string **/ - s = parms.rules[n+1]; - p0 = (parms.rules[n][0] != '\0' - && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0; - if (p0 == 1 && z == 0) { - /** rule with '<' is used **/ - if (j > 0 && *s != '\0' - && (target[j-1] == c || target[j-1] == *s)) { - j--; - } - z0 = 1; - z = 1; - k0 = 0; - while (*s != '\0' && word[i+k0] != '\0') { - word[i+k0] = *s; - k0++; - s++; - } - if (k > k0) - strmove (&word[0]+i+k0, &word[0]+i+k); - - /** new "actual letter" **/ - c = word[i]; - } - else { /** no '<' rule used **/ - i += k - 1; - z = 0; - while (*s != '\0' - && *(s+1) != '\0' && j < len) { - if (j == 0 || target[j-1] != *s) { - target[j] = *s; - j++; - } - s++; - } - /** new "actual letter" **/ - c = *s; - if (parms.rules[n][0] != '\0' - && strstr (parms.rules[n]+1, "^^") != NULL) { - if (c != '\0') { - target[j] = c; - j++; - } - strmove (&word[0], &word[0]+i+1); - i = 0; - z0 = 1; - } - } - break; - } /** end of follow-up stuff **/ - n += 2; - } /** end of while (parms.rules[n][0] == c) **/ - } /** end of if (n >= 0) **/ - if (z0 == 0) { -// if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' -// && (!parms.collapse_result || j == 0 || target[j-1] != c)){ - if (k && !p0 && j < len && c != '\0' - && (1 || j == 0 || target[j-1] != c)){ - /** condense only double letters **/ - target[j] = c; - ///printf("\n setting \n"); - j++; - } - - i++; - z = 0; - k=0; - } - } /** end of while ((c = word[i]) != '\0') **/ - - target[j] = '\0'; - return (j); - - } /** end of function "phonet" **/ diff --git a/chrome/third_party/hunspell/src/hunspell/phonet.hxx b/chrome/third_party/hunspell/src/hunspell/phonet.hxx deleted file mode 100644 index d1cf995..0000000 --- a/chrome/third_party/hunspell/src/hunspell/phonet.hxx +++ /dev/null @@ -1,50 +0,0 @@ -/* phonetic.c - generic replacement aglogithms for phonetic transformation - Copyright (C) 2000 Bjoern Jacke - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License version 2.1 as published by the Free Software Foundation; - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; If not, see - <http://www.gnu.org/licenses/>. - - Changelog: - - 2000-01-05 Bjoern Jacke <bjoern at j3e.de> - Initial Release insprired by the article about phonetic - transformations out of c't 25/1999 - - 2007-07-26 Bjoern Jacke <bjoern at j3e.de> - Released under MPL/GPL/LGPL tri-license for Hunspell - - 2007-08-23 Laszlo Nemeth <nemeth at OOo> - Porting from Aspell to Hunspell using C-like structs -*/ - -#ifndef __PHONETHXX__ -#define __PHONETHXX__ - -#define HASHSIZE 256 -#define MAXPHONETLEN 256 -#define MAXPHONETUTF8LEN (MAXPHONETLEN * 4) - -struct phonetable { - char utf8; - cs_info * lang; - int num; - char * * rules; - int hash[HASHSIZE]; -}; - -void init_phonet_hash(phonetable & parms); - -int phonet (const char * inword, char * target, - int len, phonetable & phone); - -#endif diff --git a/chrome/third_party/hunspell/src/hunspell/replist.cxx b/chrome/third_party/hunspell/src/hunspell/replist.cxx deleted file mode 100644 index 7846470..0000000 --- a/chrome/third_party/hunspell/src/hunspell/replist.cxx +++ /dev/null @@ -1,95 +0,0 @@ -#include "license.hunspell" -#include "license.myspell" - -#ifndef MOZILLA_CLIENT -#include <cstdlib> -#include <cstring> -#include <cstdio> -#else -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#endif - -#include "replist.hxx" -#include "csutil.hxx" - -RepList::RepList(int n) { - dat = (replentry **) malloc(sizeof(replentry *) * n); - if (dat == 0) size = 0; else size = n; - pos = 0; -} - -RepList::~RepList() -{ - for (int i = 0; i < pos; i++) { - free(dat[i]->pattern); - free(dat[i]->pattern2); - free(dat[i]); - } - free(dat); -} - -int RepList::get_pos() { - return pos; -} - -replentry * RepList::item(int n) { - return dat[n]; -} - -int RepList::near(const char * word) { - int p1 = 0; - int p2 = pos; - while ((p2 - p1) > 1) { - int m = (p1 + p2) / 2; -// fprintf(stderr, "m: %d p1: %d p2: %d dat: %s\n", m, p1, p2, dat[m]->pattern); - int c = strcmp(word, dat[m]->pattern); - if (c <= 0) { - if (c < 0) p2 = m; else p1 = p2 = m; - } else p1 = m; - } -// fprintf(stderr, "NEAR: %s (word: %s)\n", dat[p1]->pattern, word); - return p1; -} - -int RepList::match(const char * word, int n) { - if (strncmp(word, dat[n]->pattern, strlen(dat[n]->pattern)) == 0) return strlen(dat[n]->pattern); - return 0; -} - -int RepList::add(char * pat1, char * pat2) { - if (pos >= size || pat1 == NULL || pat2 == NULL) return 1; - replentry * r = (replentry *) malloc(sizeof(replentry)); - if (r == NULL) return 1; - r->pattern = mystrrep(pat1, "_", " "); - r->pattern2 = mystrrep(pat2, "_", " "); - dat[pos++] = r; - for (int i = pos - 1; i > 0; i--) { - r = dat[i]; - if (strcmp(r->pattern, dat[i - 1]->pattern) < 0) { - dat[i] = dat[i - 1]; - dat[i - 1] = r; - } else break; - } - return 0; -} - -int RepList::conv(const char * word, char * dest) { - int stl = 0; - int change = 0; -// for (int i = 0; i < pos; i++) fprintf(stderr, "%d. %s\n", i, dat[i]->pattern); - for (int i = 0; i < strlen(word); i++) { - int n = near(word + i); - int l = match(word + i, n); - if (l) { - strcpy(dest + stl, dat[n]->pattern2); - stl += strlen(dat[n]->pattern2); - i += l - 1; - change = 1; - } else dest[stl++] = word[i]; - } - dest[stl] = '\0'; -// fprintf(stderr, "i: %s o: %s change: %d\n", word, dest, change); - return change; -} diff --git a/chrome/third_party/hunspell/src/hunspell/replist.hxx b/chrome/third_party/hunspell/src/hunspell/replist.hxx deleted file mode 100644 index 16da313..0000000 --- a/chrome/third_party/hunspell/src/hunspell/replist.hxx +++ /dev/null @@ -1,29 +0,0 @@ -/* string replacement list class */ -#ifndef _REPLIST_HXX_ -#define _REPLIST_HXX_ -#ifdef HUNSPELL_CHROME_CLIENT -// Compilation issues in spellchecker.cc think near is a macro, therefore -// removing it here solves that problem. -#undef near -#endif -#include "w_char.hxx" - -class RepList -{ -protected: - replentry ** dat; - int size; - int pos; - -public: - RepList(int n); - ~RepList(); - - int get_pos(); - int add(char * pat1, char * pat2); - replentry * item(int n); - int near(const char * word); - int match(const char * word, int n); - int conv(const char * word, char * dest); -}; -#endif diff --git a/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx b/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx index 5914b6a..222701b 100644 --- a/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx @@ -14,16 +14,13 @@ #endif #include "suggestmgr.hxx" -#include "htypes.hxx" -#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef WIN32 +#ifndef W32 using namespace std; #endif #endif -const w_char W_VLINE = { '\0', '|' }; SuggestMgr::SuggestMgr(const char * tryme, int maxn, AffixMgr * aptr) @@ -33,54 +30,36 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, // try when building candidate suggestions pAMgr = aptr; - ckeyl = 0; - ckey = NULL; - ckey_utf = NULL; - ctryl = 0; ctry = NULL; ctry_utf = NULL; - utf8 = 0; - langnum = 0; - complexprefixes = 0; - maxSug = maxn; nosplitsugs = 0; maxngramsugs = MAXNGRAMSUGS; + utf8 = 0; + complexprefixes = 0; + if (pAMgr) { char * enc = pAMgr->get_encoding(); csconv = get_current_cs(enc); free(enc); - langnum = pAMgr->get_langnum(); - ckey = pAMgr->get_key_string(); nosplitsugs = pAMgr->get_nosplitsugs(); if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs(); utf8 = pAMgr->get_utf8(); complexprefixes = pAMgr->get_complexprefixes(); } - if (ckey) { - if (utf8) { - w_char t[MAXSWL]; - ckeyl = u8_u16(t, MAXSWL, ckey); - ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char)); - if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char)); - } else { - ckeyl = strlen(ckey); - } - } - if (tryme) { - ctry = mystrdup(tryme); - if (ctry) ctryl = strlen(ctry); - if (ctry && utf8) { + if (utf8) { w_char t[MAXSWL]; ctryl = u8_u16(t, MAXSWL, tryme); ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char)); - if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char)); - else ctryl = 0; + memcpy(ctry_utf, t, ctryl * sizeof(w_char)); + } else { + ctry = mystrdup(tryme); + ctryl = strlen(ctry); } } } @@ -89,11 +68,6 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, SuggestMgr::~SuggestMgr() { pAMgr = NULL; - if (ckey) free(ckey); - ckey = NULL; - if (ckey_utf) free(ckey_utf); - ckey_utf = NULL; - ckeyl = 0; if (ctry) free(ctry); ctry = NULL; if (ctry_utf) free(ctry_utf); @@ -103,7 +77,7 @@ SuggestMgr::~SuggestMgr() } int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, - int * timer, clock_t * timelimit) { + int * timer, time_t * timelimit) { int cwrd = 1; if (ns == maxSug) return maxSug; for (int k=0; k < ns; k++) { @@ -122,15 +96,13 @@ int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int // generate suggestions for a mispelled word // pass in address of array of char * pointers -// onlycompoundsug: probably bad suggestions (need for ngram sugs, too) -int SuggestMgr::suggest(char*** slst, const char * w, int nsug, - int * onlycompoundsug) +int SuggestMgr::suggest(char*** slst, const char * w, int nsug) { - int nocompoundtwowords = 0; - char ** wlst; - w_char word_utf[MAXSWL]; - int wl = 0; + int nocompoundtwowords = 0; + char ** wlst; + w_char word_utf[MAXSWL]; + int wl = 0; char w2[MAXWORDUTF8LEN]; const char * word = w; @@ -169,8 +141,8 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug, nsug = replchars(wlst, word, nsug, cpdsuggest); // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1)) { - nsug = mapchars(wlst, word, nsug, cpdsuggest); + if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) { + nsug = mapchars(wlst, word, nsug); } // did we swap the order of chars by mistake @@ -185,22 +157,6 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug, longswapchar(wlst, word, nsug, cpdsuggest); } - // did we just hit the wrong key in place of a good char (case and keyboard) - if ((nsug < maxSug) && (nsug > -1)) { - nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) : - badcharkey(wlst, word, nsug, cpdsuggest); - } - - // only suggest compound words when no other suggestion - if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1; - - // did we add a char that should not be there - if ((nsug < maxSug) && (nsug > -1)) { - nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : - extrachar(wlst, word, nsug, cpdsuggest); - } - - // did we forgot a char if ((nsug < maxSug) && (nsug > -1)) { nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : @@ -213,6 +169,12 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug, movechar(wlst, word, nsug, cpdsuggest); } + // did we add a char that should not be there + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + extrachar(wlst, word, nsug, cpdsuggest); + } + // did we just hit the wrong key in place of a good char if ((nsug < maxSug) && (nsug > -1)) { nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : @@ -225,6 +187,10 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug, doubletwochars(wlst, word, nsug, cpdsuggest); } + + // only suggest compound words when no other suggestion + if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; + // perhaps we forgot to hit space and two words ran together if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) { nsug = twowords(wlst, word, nsug, cpdsuggest); @@ -239,8 +205,6 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug, free(wlst); wlst = NULL; } - - if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1; *slst = wlst; return nsug; @@ -278,8 +242,8 @@ int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) nsug = replchars(wlst, word, nsug, cpdsuggest); // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1)) - nsug = mapchars(wlst, word, nsug, cpdsuggest); + if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) + nsug = mapchars(wlst, word, nsug); if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; @@ -309,7 +273,7 @@ int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, char candidate[MAXSWUTF8L]; w_char candidate_utf[MAXSWL]; memcpy(candidate_utf, word, wl * sizeof(w_char)); - mkallcap_utf(candidate_utf, wl, langnum); + mkallcap_utf(candidate_utf, wl, pAMgr->get_langnum()); u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); } @@ -324,9 +288,9 @@ int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest) } // suggestions for when chose the wrong char out of a related set -int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) +int SuggestMgr::mapchars(char** wlst, const char * word, int ns) { - clock_t timelimit; + time_t timelimit; int timer; int wl = strlen(word); @@ -336,19 +300,18 @@ int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) struct mapentry* maptable = pAMgr->get_maptable(); if (maptable==NULL) return ns; - timelimit = clock(); + timelimit = time(NULL); timer = MINTIMER; if (utf8) { w_char w[MAXSWL]; int len = u8_u16(w, MAXSWL, word); - ns = map_related_utf(w, len, 0, cpdsuggest, wlst, ns, maptable, nummap, &timer, &timelimit); - } else ns = map_related(word, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit); + ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit); + } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit); return ns; } -int SuggestMgr::map_related(const char * word, int i, char** wlst, - int cpdsuggest, int ns, - const mapentry* maptable, int nummap, int * timer, clock_t * timelimit) +int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, + const mapentry* maptable, int nummap, int * timer, time_t * timelimit) { char c = *(word + i); if (c == 0) { @@ -356,7 +319,8 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int wl = strlen(word); for (int m=0; m < ns; m++) if (strcmp(word,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && checkword(word, wl, cpdsuggest, timer, timelimit)) { + if ((cwrd) && (checkword(word, wl, 0, timer, timelimit) || + checkword(word, wl, 1, timer, timelimit))) { if (ns < maxSug) { wlst[ns] = mystrdup(word); if (wlst[ns] == NULL) return -1; @@ -370,27 +334,23 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, if (strchr(maptable[j].set,c) != 0) { in_map = 1; char * newword = mystrdup(word); - if (!newword) return -1; for (int k = 0; k < maptable[j].len; k++) { *(newword + i) = *(maptable[j].set + k); - ns = map_related(newword, (i+1), wlst, cpdsuggest, - ns, maptable, nummap, timer, timelimit); - if (!(*timer)) return ns; + ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit); + if (!(*timelimit)) return ns; } free(newword); } } if (!in_map) { i++; - ns = map_related(word, i, wlst, cpdsuggest, - ns, maptable, nummap, timer, timelimit); + ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit); } return ns; } -int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest, - char** wlst, int ns, const mapentry* maptable, int nummap, - int * timer, clock_t * timelimit) +int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns, + const mapentry* maptable, int nummap, int * timer, time_t * timelimit) { if (i == len) { int cwrd = 1; @@ -400,7 +360,8 @@ int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest, wl = strlen(s); for (int m=0; m < ns; m++) if (strcmp(s,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && checkword(s, wl, cpdsuggest, timer, timelimit)) { + if ((cwrd) && (checkword(s, wl, 0, timer, timelimit) || + checkword(s, wl, 1, timer, timelimit))) { if (ns < maxSug) { wlst[ns] = mystrdup(s); if (wlst[ns] == NULL) return -1; @@ -416,17 +377,15 @@ int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest, in_map = 1; for (int k = 0; k < maptable[j].len; k++) { *(word + i) = *(maptable[j].set_utf16 + k); - ns = map_related_utf(word, len, i + 1, cpdsuggest, - wlst, ns, maptable, nummap, timer, timelimit); - if (!(*timer)) return ns; + ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit); + if (!(*timelimit)) return ns; } *((unsigned short *) word + i) = c; } } if (!in_map) { i++; - ns = map_related_utf(word, len, i, cpdsuggest, - wlst, ns, maptable, nummap, timer, timelimit); + ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit); } return ns; } @@ -457,23 +416,6 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest strcpy(candidate+(r-word)+lenr, r+lenp); ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - // check REP suggestions with space - char * sp = strchr(candidate, ' '); - if (sp) { - *sp = '\0'; - if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) { - int oldns = ns; - *sp = ' '; - ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - if (oldns < ns) { - free(wlst[ns - 1]); - wlst[ns - 1] = mystrdup(candidate); - if (!wlst[ns - 1]) return -1; - } - } - *sp = ' '; - } r++; // search for the next letter } } @@ -512,7 +454,7 @@ int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in int state=0; if (wl < 5 || ! pAMgr) return ns; for (int i=2; i < wl; i++) { - if (w_char_eq(word[i], word[i-2])) { + if ((word[i].l==word[i-2].l) && (word[i].h==word[i-2].h)) { state++; if (state==3) { memcpy(candidate_utf, word, (i - 1) * sizeof(w_char)); @@ -529,108 +471,25 @@ int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in return ns; } -// error is wrong char in place of correct one (case and keyboard related version) -int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest) -{ - char tmpc; - char candidate[MAXSWUTF8L]; - int wl = strlen(word); - strcpy(candidate, word); - // swap out each char one by one and try uppercase and neighbor - // keyboard chars in its place to see if that makes a good word - - for (int i=0; i < wl; i++) { - tmpc = candidate[i]; - // check with uppercase letters - candidate[i] = csconv[((unsigned char)tmpc)].cupper; - if (tmpc != candidate[i]) { - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - candidate[i] = tmpc; - } - // check neighbor characters in keyboard string - if (!ckey) continue; - char * loc = strchr(ckey, tmpc); - while (loc) { - if ((loc > ckey) && (*(loc - 1) != '|')) { - candidate[i] = *(loc - 1); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - } - if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { - candidate[i] = *(loc + 1); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - } - loc = strchr(loc + 1, tmpc); - } - candidate[i] = tmpc; - } - return ns; -} - -// error is wrong char in place of correct one (case and keyboard related version) -int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) -{ - w_char tmpc; - w_char candidate_utf[MAXSWL]; - char candidate[MAXSWUTF8L]; - memcpy(candidate_utf, word, wl * sizeof(w_char)); - // swap out each char one by one and try all the tryme - // chars in its place to see if that makes a good word - for (int i=0; i < wl; i++) { - tmpc = candidate_utf[i]; - // check with uppercase letters - mkallcap_utf(candidate_utf + i, 1, langnum); - if (!w_char_eq(tmpc, candidate_utf[i])) { - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - candidate_utf[i] = tmpc; - } - // check neighbor characters in keyboard string - if (!ckey) continue; - w_char * loc = ckey_utf; - while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++; - while (loc < (ckey_utf + ckeyl)) { - if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) { - candidate_utf[i] = *(loc - 1); - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - } - if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) { - candidate_utf[i] = *(loc + 1); - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - } - do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)); - } - candidate_utf[i] = tmpc; - } - return ns; -} - // error is wrong char in place of correct one int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char tmpc; char candidate[MAXSWUTF8L]; - clock_t timelimit = clock(); + time_t timelimit = time(NULL); int timer = MINTIMER; int wl = strlen(word); strcpy(candidate, word); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int j=0; j < ctryl; j++) { - for (int i=wl-1; i >= 0; i--) { - tmpc = candidate[i]; + for (int i=0; i < wl; i++) { + tmpc = candidate[i]; + for (int j=0; j < ctryl; j++) { if (ctry[j] == tmpc) continue; candidate[i] = ctry[j]; ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timer) return ns; + if (!timelimit) return ns; candidate[i] = tmpc; } } @@ -643,20 +502,20 @@ int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i w_char tmpc; w_char candidate_utf[MAXSWL]; char candidate[MAXSWUTF8L]; - clock_t timelimit = clock(); + time_t timelimit = time(NULL); int timer = MINTIMER; memcpy(candidate_utf, word, wl * sizeof(w_char)); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int j=0; j < ctryl; j++) { - for (int i=wl-1; i >= 0; i--) { - tmpc = candidate_utf[i]; - if (w_char_eq(tmpc, ctry_utf[j])) continue; + for (int i=0; i < wl; i++) { + tmpc = candidate_utf[i]; + for (int j=0; j < ctryl; j++) { + if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue; candidate_utf[i] = ctry_utf[j]; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timer) return ns; + if (!timelimit) return ns; candidate_utf[i] = tmpc; } } @@ -666,20 +525,18 @@ int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i // error is word has an extra letter it does not need int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest) { - char candidate[MAXSWUTF8L]; + char candidate[MAXSWUTF8L]; w_char candidate_utf[MAXSWL]; - w_char * p; - w_char tmpc = W_VLINE; // not used value, only for VCC warning message + const w_char * p; + w_char * r; if (wl < 2) return ns; // try omitting one char of word at a time - memcpy(candidate_utf, word, wl * sizeof(w_char)); - for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) { - w_char tmpc2 = *p; - if (p < candidate_utf + wl - 1) *p = tmpc; - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); + memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char)); + for (p = word, r = candidate_utf; p < word + wl; ) { + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - tmpc = tmpc2; + *r++ = *p++; } return ns; } @@ -687,41 +544,47 @@ int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, // error is word has an extra letter it does not need int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest) { - char tmpc = '\0'; char candidate[MAXSWUTF8L]; - char * p; + const char * p; + char * r; int wl = strlen(word); if (wl < 2) return ns; // try omitting one char of word at a time - strcpy (candidate, word); - for (p = candidate + wl - 1; p >=candidate; p--) { - char tmpc2 = *p; - *p = tmpc; + strcpy (candidate, word + 1); + for (p = word, r = candidate; *p != 0; ) { ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - tmpc = tmpc2; + *r++ = *p++; } return ns; } + // error is missing a letter it needs int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char candidate[MAXSWUTF8L]; - char * p; - clock_t timelimit = clock(); + const char * p; + char * q; + time_t timelimit = time(NULL); int timer = MINTIMER; int wl = strlen(word); - // try inserting a tryme character before every letter (and the null terminator) - for (int i = 0; i < ctryl; i++) { - strcpy(candidate, word); - for (p = candidate + wl; p >= candidate; p--) { - *(p+1) = *p; - *p = ctry[i]; + // try inserting a tryme character before every letter + strcpy(candidate + 1, word); + for (p = word, q = candidate; *p != 0; ) { + for (int i = 0; i < ctryl; i++) { + *q = ctry[i]; ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timer) return ns; + if (!timelimit) return ns; } + *q++ = *p++; + } + // now try adding one to end */ + for (int i = 0; i < ctryl; i++) { + *q = ctry[i]; + ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; } return ns; } @@ -731,20 +594,31 @@ int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns { w_char candidate_utf[MAXSWL]; char candidate[MAXSWUTF8L]; - w_char * p; - clock_t timelimit = clock(); + const w_char * p; + w_char * q; + int cwrd; + time_t timelimit = time(NULL); int timer = MINTIMER; - // try inserting a tryme character at the end of the word and before every letter - for (int i = 0; i < ctryl; i++) { - memcpy (candidate_utf, word, wl * sizeof(w_char)); - for (p = candidate_utf + wl; p >= candidate_utf; p--) { - *(p + 1) = *p; - *p = ctry_utf[i]; + // try inserting a tryme character before every letter + memcpy (candidate_utf + 1, word, wl * sizeof(w_char)); + for (p = word, q = candidate_utf; p < (word + wl); ) { + for (int i = 0; i < ctryl; i++) { + *q = ctry_utf[i]; + cwrd = 1; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timer) return ns; - } + if (!timelimit) return ns; + } + *q++ = *p++; + } + // now try adding one to end */ + for (int i = 0; i < ctryl; i++) { + *q = ctry_utf[i]; + cwrd = 1; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; } return ns; } @@ -762,19 +636,19 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest int wl=strlen(word); if (wl < 3) return ns; - if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); + if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl); strcpy(candidate + 1, word); + // split the string into two pieces after every char // if both pieces are good words make them a suggestion for (p = candidate + 1; p[1] != '\0'; p++) { p[-1] = *p; // go to end of the UTF-8 character while (utf8 && ((p[1] & 0xc0) == 0x80)) { - *p = p[1]; p++; + p[-1] = *p; } - if (utf8 && p[1] == '\0') break; // last UTF-8 character *p = '\0'; c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL); if (c1) { @@ -783,7 +657,7 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest *p = ' '; // spec. Hungarian code (need a better compound word support) - if ((langnum == LANG_hu) && !forbidden && + if ((pAMgr->get_langnum() == LANG_hu) && !forbidden && // if 3 repeating letter, use - instead of space (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || // or multiple compounding, with more, than 6 syllables @@ -799,23 +673,6 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest ns++; } } else return ns; - // add two word suggestion with dash, if TRY string contains - // "a" or "-" - // NOTE: cwrd doesn't modified for REP twoword sugg. - if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && - mystrlen(p + 1) > 1 && - mystrlen(candidate) - mystrlen(p) > 1) { - *p = '-'; - for (int k=0; k < ns; k++) - if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if (ns < maxSug) { - if (cwrd) { - wlst[ns] = mystrdup(candidate); - if (wlst[ns] == NULL) return -1; - ns++; - } - } else return ns; - } } } } @@ -841,24 +698,6 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest p[1] = *p; *p = tmpc; } - // try double swaps for short words - // ahev -> have, owudl -> would - if (wl == 4 || wl == 5) { - candidate[0] = word[1]; - candidate[1] = word[0]; - candidate[2] = word[2]; - candidate[wl - 2] = word[wl - 1]; - candidate[wl - 1] = word[wl - 2]; - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - if (wl == 5) { - candidate[0] = word[0]; - candidate[1] = word[2]; - candidate[2] = word[1]; - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - } - } return ns; } @@ -869,7 +708,6 @@ int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, char candidate[MAXSWUTF8L]; w_char * p; w_char tmpc; - int len = 0; // try swapping adjacent chars one by one memcpy (candidate_utf, word, wl * sizeof(w_char)); for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) { @@ -877,32 +715,11 @@ int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, *p = p[1]; p[1] = tmpc; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - if (len == 0) len = strlen(candidate); - ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; p[1] = *p; *p = tmpc; } - // try double swaps for short words - // ahev -> have, owudl -> would, suodn -> sound - if (wl == 4 || wl == 5) { - candidate_utf[0] = word[1]; - candidate_utf[1] = word[0]; - candidate_utf[2] = word[2]; - candidate_utf[wl - 2] = word[wl - 1]; - candidate_utf[wl - 1] = word[wl - 2]; - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - if (wl == 5) { - candidate_utf[0] = word[0]; - candidate_utf[1] = word[2]; - candidate_utf[2] = word[1]; - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; - } - } return ns; } @@ -977,7 +794,7 @@ int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest *(q-1) = *q; *q = tmpc; if ((q-p) < 2) continue; // omit swap char - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } strcpy(candidate, word); @@ -988,7 +805,7 @@ int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest *(q+1) = *q; *q = tmpc; if ((p-q) < 2) continue; // omit swap char - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } strcpy(candidate, word); @@ -1013,7 +830,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, *q = tmpc; if ((q-p) < 2) continue; // omit swap char u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } memcpy (candidate_utf, word, wl * sizeof(w_char)); @@ -1025,7 +842,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, *q = tmpc; if ((p-q) < 2) continue; // omit swap char u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } memcpy (candidate_utf, word, wl * sizeof(w_char)); @@ -1034,33 +851,28 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, } // generate a set of suggestions for very poorly spelled words -int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md) +int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) { int i, j; int lval; - int sc, scphon; - int lp, lpphon; + int sc; + int lp; int nonbmp = 0; + if (!pHMgr) return 0; + // exhaustively search through all root words // keeping track of the MAX_ROOTS most similar root words struct hentry * roots[MAX_ROOTS]; - char * rootsphon[MAX_ROOTS]; int scores[MAX_ROOTS]; - int scoresphon[MAX_ROOTS]; for (i = 0; i < MAX_ROOTS; i++) { roots[i] = NULL; scores[i] = -100 * i; - rootsphon[i] = NULL; - scoresphon[i] = -100 * i; } lp = MAX_ROOTS - 1; - lpphon = MAX_ROOTS - 1; - scphon = scoresphon[MAX_ROOTS-1]; - + char w2[MAXWORDUTF8LEN]; - char f[MAXSWUTF8L]; char * word = w; // word reversing wrapper for complex prefixes @@ -1084,8 +896,8 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md struct hentry* hp = NULL; int col = -1; - -#ifdef HUNSPELL_CHROME_CLIENT + + #ifdef HUNSPELL_CHROME_CLIENT // A static array of hentries required for walking the hash table. struct hentry static_hentry[MAX_ROOTS]; @@ -1094,61 +906,31 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md static const int kMaxWordLen = 128; char hentry_word[MAX_ROOTS][kMaxWordLen]; unsigned short hentry_astr[MAX_ROOTS]; -#endif + #endif - phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; - char target[MAXSWUTF8L]; - char candidate[MAXSWUTF8L]; - if (ph) { - strcpy(candidate, word); - mkallcap(candidate, csconv); - phonet(candidate, target, n, *ph); - } - - for (i = 0; i < md; i++) { - while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { + while ((hp = pHMgr->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) || - TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) || TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue; - - sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) + - leftcommonsubstring(word, HENTRY_WORD(hp)); - - // check special pronounciation - if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { - int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) + - leftcommonsubstring(word, f); - if (sc2 > sc) sc = sc2; - } - - if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) { - char target2[MAXSWUTF8L]; - strcpy(candidate, HENTRY_WORD(hp)); - mkallcap(candidate, csconv); - phonet(candidate, target2, -1, *ph); - scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); - } - + sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE); if (sc > scores[lp]) { scores[lp] = sc; -#ifdef HUNSPELL_CHROME_CLIENT + #ifdef HUNSPELL_CHROME_CLIENT roots[lp] = &static_hentry[lp]; roots[lp]->alen = hp->alen; if (hp->astr) hentry_astr[lp] = *hp->astr; roots[lp]->astr = &hentry_astr[lp]; - roots[lp]->blen = hp->blen; - strcpy(&hentry_word[lp][0], &hp->word); - roots[lp]->word = hentry_word[lp][0]; + roots[lp]->wlen = hp->wlen; + strcpy(&hentry_word[lp][0], hp->word); + roots[lp]->word = &hentry_word[lp][0]; roots[lp]->next = NULL; roots[lp]->next_homonym = NULL; - roots[lp]->var = 0; - roots[lp]->clen = 0; -#else + #else roots[lp] = hp; -#endif + #endif + lval = sc; for (j=0; j < MAX_ROOTS; j++) if (scores[j] < lval) { @@ -1156,18 +938,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md lval = scores[j]; } } - - if (scphon > scoresphon[lpphon]) { - scoresphon[lpphon] = scphon; - rootsphon[lpphon] = HENTRY_WORD(hp); - lval = scphon; - for (j=0; j < MAX_ROOTS; j++) - if (scoresphon[j] < lval) { - lpphon = j; - lval = scoresphon[j]; - } - } - }} + } // find minimum threshhold for a passable suggestion // mangle original word three differnt ways @@ -1177,11 +948,11 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md if (utf8) { for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; u16_u8(mw, MAXSWUTF8L, u8, n); - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); } else { strcpy(mw, word); for (int k=sp; k < n; k+=4) *(mw + k) = '*'; - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); } } thresh = thresh / 3; @@ -1191,11 +962,9 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md // and use length adjusted ngram scores to select // possible suggestions char * guess[MAX_GUESS]; - char * guessorig[MAX_GUESS]; int gscore[MAX_GUESS]; for(i=0;i<MAX_GUESS;i++) { guess[i] = NULL; - guessorig[i] = NULL; gscore[i] = -100 * i; } @@ -1205,46 +974,31 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword)); if (! glst) { if (nonbmp) utf8 = 1; - return ns; + return 0; } for (i = 0; i < MAX_ROOTS; i++) { + if (roots[i]) { struct hentry * rp = roots[i]; - int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, - rp->astr, rp->alen, word, nc, - ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL)); + int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen, + rp->astr, rp->alen, word, nc); for (int k = 0; k < nw ; k++) { - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) + - leftcommonsubstring(word, glst[k].word); - + sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH); if ((sc > thresh)) { if (sc > gscore[lp]) { - if (guess[lp]) { - free (guess[lp]); - if (guessorig[lp]) { - free(guessorig[lp]); - guessorig[lp] = NULL; - } - } + if (guess[lp]) free (guess[lp]); gscore[lp] = sc; guess[lp] = glst[k].word; - guessorig[lp] = glst[k].orig; lval = sc; for (j=0; j < MAX_GUESS; j++) if (gscore[j] < lval) { lp = j; lval = gscore[j]; } - } else { - free(glst[k].word); - if (glst[k].orig) free(glst[k].orig); - } - } else { - free(glst[k].word); - if (glst[k].orig) free(glst[k].orig); - } + } else free (glst[k].word); + } else free(glst[k].word); } } } @@ -1253,9 +1007,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md // now we are done generating guesses // sort in order of decreasing score - - bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); - if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); + bubblesort(&guess[0], &gscore[0], MAX_GUESS); // weight suggestions with a similarity index, based on // the longest common subsequent algorithm and resort @@ -1269,7 +1021,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md if (utf8) { w_char _w[MAXSWL]; len = u8_u16(_w, MAXSWL, guess[i]); - mkallsmall_utf(_w, len, langnum); + mkallsmall_utf(_w, len, pAMgr->get_langnum()); u16_u8(gl, MAXSWUTF8L, _w, len); } else { strcpy(gl, guess[i]); @@ -1287,10 +1039,10 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md // heuristic weigthing of ngram scores gscore[i] += - // length of longest common subsequent minus length difference + // length of longest common subsequent minus lenght difference 2 * _lcs - abs((int) (n - len)) + - // weight length of the left common substring - leftcommonsubstring(word, gl) + + // weight equal first letter + equalfirstletter(word, gl) + // weight equal character positions ((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) + // swap character (not neighboring) @@ -1298,84 +1050,25 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md } } - bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); - -// phonetic version - if (ph) for (i=0; i < MAX_ROOTS; i++) { - if (rootsphon[i]) { - // lowering rootphon[i] - char gl[MAXSWUTF8L]; - int len; - if (utf8) { - w_char _w[MAXSWL]; - len = u8_u16(_w, MAXSWL, rootsphon[i]); - mkallsmall_utf(_w, len, langnum); - u16_u8(gl, MAXSWUTF8L, _w, len); - } else { - strcpy(gl, rootsphon[i]); - mkallsmall(gl, csconv); - len = strlen(rootsphon[i]); - } - - // heuristic weigthing of ngram scores - scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) + - // weight length of the left common substring - leftcommonsubstring(word, gl); - } - } - - if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); + bubblesort(&guess[0], &gscore[0], MAX_GUESS); // copy over - int oldns = ns; + int ns = 0; int same = 0; for (i=0; i < MAX_GUESS; i++) { if (guess[i]) { - if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { + if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { int unique = 1; - // leave only excellent suggestions, if exists + // we have excellent suggestion(s) if (gscore[i] > 1000) same = 1; - for (j = 0; j < ns; j++) { + for (j=0; j < ns; j++) // don't suggest previous suggestions or a previous suggestion with prefixes or affixes - if ((!guessorig[i] && strstr(guess[i], wlst[j])) || - (guessorig[i] && strstr(guessorig[i], wlst[j])) || + if (strstr(guess[i], wlst[j]) || // check forbidden words !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; - } - if (unique) { - wlst[ns++] = guess[i]; - if (guessorig[i]) { - free(guess[i]); - wlst[ns-1] = guessorig[i]; - } - } else { - free(guess[i]); - if (guessorig[i]) free(guessorig[i]); - } - } else { - free(guess[i]); - if (guessorig[i]) free(guessorig[i]); - } - } - } - - oldns = ns; - if (ph) for (i=0; i < MAX_ROOTS; i++) { - if (rootsphon[i]) { - if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) { - int unique = 1; - for (j = 0; j < ns; j++) { - // don't suggest previous suggestions or a previous suggestion with prefixes or affixes - if (strstr(rootsphon[i], wlst[j]) || - // check forbidden words - !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0; - } - if (unique) { - wlst[ns++] = mystrdup(rootsphon[i]); - if (!wlst[ns - 1]) return ns - 1; - } - } + if (unique) wlst[ns++] = guess[i]; else free(guess[i]); + } else free(guess[i]); } } @@ -1390,16 +1083,19 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md // obsolote MySpell-HU modifications: // return value 2 and 3 marks compounding with hyphen (-) // `3' marks roots without suffix -int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit) +int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit) { struct hentry * rv=NULL; int nosuffix = 0; - + // check time limit if (timer) { (*timer)--; if (!(*timer) && timelimit) { - if ((clock() - *timelimit) > TIMELIMIT) return 0; + if (time(NULL) > *timelimit) { + *timelimit = 0; + return 0; + } *timer = MAXPLUSTIMER; } } @@ -1407,7 +1103,7 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if (pAMgr) { if (cpdsuggest==1) { if (pAMgr->get_compound()) { - rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT + rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1); if (rv) return 3; // XXX obsolote categorisation } return 0; @@ -1418,15 +1114,10 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if (rv) { if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; - while (rv) { - if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || - TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || - TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { - rv = rv->next_homonym; - } else break; - } + if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX - + if (rv) { nosuffix=1; } else { @@ -1439,9 +1130,8 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time } // check forbidden words - if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || - TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || - TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) + || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; if (rv) { // XXX obsolote @@ -1459,7 +1149,7 @@ int SuggestMgr::check_forbidden(const char * word, int len) if (pAMgr) { rv = pAMgr->lookup(word); - if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || + if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; if (!(pAMgr->prefix_check(word,len,1))) rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix @@ -1470,6 +1160,184 @@ int SuggestMgr::check_forbidden(const char * word, int len) } #ifdef HUNSPELL_EXPERIMENTAL +// suggest stems, XXX experimental code +int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug) +{ + char buf[MAXSWUTF8L]; + char ** wlst; + int prevnsug = nsug; + + char w2[MAXWORDUTF8LEN]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + if (*slst) { + wlst = *slst; + } else { + wlst = (char **) calloc(maxSug, sizeof(char *)); + if (wlst == NULL) return -1; + } + // perhaps there are a fix stem in the dictionary + if ((nsug < maxSug) && (nsug > -1)) { + + nsug = fixstems(wlst, word, nsug); + if (nsug == prevnsug) { + char * s = mystrdup(word); + char * p = s + strlen(s); + while ((*p != '-') && (p != s)) p--; + if (*p == '-') { + *p = '\0'; + nsug = fixstems(wlst, s, nsug); + if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) { + char * t; + buf[0] = '\0'; + for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number? + if (*t != '\0') strcpy(buf, "# "); + strcat(buf, s); + wlst[nsug] = mystrdup(buf); + if (wlst[nsug] == NULL) return -1; + nsug++; + } + p++; + nsug = fixstems(wlst, p, nsug); + } + + free(s); + } + } + + if (nsug < 0) { + for (int i=0;i<maxSug; i++) + if (wlst[i] != NULL) free(wlst[i]); + free(wlst); + return -1; + } + + *slst = wlst; + return nsug; +} + + +// there are fix stems in dictionary +int SuggestMgr::fixstems(char ** wlst, const char * word, int ns) +{ + char buf[MAXSWUTF8L]; + char prefix[MAXSWUTF8L] = ""; + + int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound + int cpdindex = 0; + struct hentry * rv = NULL; + + int wl = strlen(word); + int cmpdstemnum; + int cmpdstem[MAXCOMPOUND]; + + if (pAMgr) { + rv = pAMgr->lookup(word); + if (rv) { + dicstem = 0; + } else { + // try stripping off affixes + rv = pAMgr->affix_check(word, wl); + + // else try check compound word + if (!rv && pAMgr->get_compound()) { + rv = pAMgr->compound_check(word, wl, + 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1); + + if (rv) { + dicstem = 2; + for (int j = 0; j < cmpdstemnum; j++) { + cpdindex += cmpdstem[j]; + } + if(! (pAMgr->lookup(word + cpdindex))) + pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix + } + } + + + if (pAMgr->get_prefix()) { + strcpy(prefix, pAMgr->get_prefix()); + } + + // XXX obsolete, will be a general solution for stemming + if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU) + } + + } + + + + if ((rv) && (ns < maxSug)) { + + // check fixstem flag and not_valid_stem flag + // first word + if ((ns < maxSug) && (dicstem < 2)) { + strcpy(buf, prefix); + if ((dicstem > 0) && pAMgr->get_derived()) { + // XXX obsolote + if (strlen(prefix) == 1) { + strcat(buf, (pAMgr->get_derived()) + 1); + } else { + strcat(buf, pAMgr->get_derived()); + } + } else { + // special stem in affix description + const char * wordchars = pAMgr->get_wordchars(); + if (rv->description && + (strchr(wordchars, *(rv->description)))) { + char * desc = (rv->description) + 1; + while (strchr(wordchars, *desc)) desc++; + strncat(buf, rv->description, desc - (rv->description)); + } else { + strcat(buf, rv->word); + } + } + wlst[ns] = mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + + if (dicstem == 2) { + + // compound stem + +// if (rv->astr && (strchr(rv->astr, '0') == NULL)) { + if (rv->astr) { + strcpy(buf, word); + buf[cpdindex] = '\0'; + if (prefix) strcat(buf, prefix); + if (pAMgr->get_derived()) { + strcat(buf, pAMgr->get_derived()); + } else { + // special stem in affix description + const char * wordchars = pAMgr->get_wordchars(); + if (rv->description && + (strchr(wordchars, *(rv->description)))) { + char * desc = (rv->description) + 1; + while (strchr(wordchars, *desc)) desc++; + strncat(buf, rv->description, desc - (rv->description)); + } else { + strcat(buf, rv->word); + } + } + if (ns < maxSug) { + wlst[ns] = mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + } + } + } + return ns; +} + // suggest possible stems int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) { @@ -1509,7 +1377,6 @@ int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) *slst = wlst; return nsug; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * SuggestMgr::suggest_morph(const char * w) @@ -1538,25 +1405,20 @@ char * SuggestMgr::suggest_morph(const char * w) while (rv) { if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || - TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { - if (!HENTRY_FIND(rv, MORPH_STEM)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, MORPH_STEM, MAXLNLEN); - mystrcat(result, word, MAXLNLEN); - } - if (HENTRY_DATA(rv)) { - mystrcat(result, " ", MAXLNLEN); - mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); - } - mystrcat(result, "\n", MAXLNLEN); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen))) + strcat(result, word); + if (rv->description) strcat(result, rv->description); + strcat(result, "\n"); } rv = rv->next_homonym; } st = pAMgr->affix_check_morph(word,strlen(word)); if (st) { - mystrcat(result, st, MAXLNLEN); + strcat(result, st); free(st); } @@ -1564,177 +1426,28 @@ char * SuggestMgr::suggest_morph(const char * w) pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0,NULL, 0, &r, NULL); - return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL; + return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL; } -#ifdef HUNSPELL_EXPERIMENTAL char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) { char * p = NULL; char ** wlst = (char **) calloc(maxSug, sizeof(char *)); - if (!**wlst) return NULL; // we will use only the first suggestion for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; - int ns = suggest(&wlst, word, maxSug - 1, NULL); + int ns = suggest(&wlst, word, maxSug - 1); if (ns == maxSug) { p = suggest_morph(wlst[maxSug - 1]); free(wlst[maxSug - 1]); } if (wlst) free(wlst); - return p; + return p; } #endif // END OF HUNSPELL_EXPERIMENTAL CODE -/* affixation */ -char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) -{ - char result[MAXLNLEN]; - *result = '\0'; - int sfxcount = get_sfxcount(pattern); - - if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; - - if (HENTRY_DATA(rv)) { - char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, - HENTRY_DATA(rv), pattern, 0); - if (aff) { - mystrcat(result, aff, MAXLNLEN); - mystrcat(result, "\n", MAXLNLEN); - free(aff); - } - } - - // check all allomorphs - char allomorph[MAXLNLEN]; - char * p = NULL; - if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); - while (p) { - struct hentry * rv2 = NULL; - p += MORPH_TAG_LEN; - int plen = fieldlen(p); - strncpy(allomorph, p, plen); - allomorph[plen] = '\0'; - rv2 = pAMgr->lookup(allomorph); - while (rv2) { -// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) { - if (HENTRY_DATA(rv2)) { - char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM); - if (st && (strncmp(st + MORPH_TAG_LEN, - HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { - char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen, - HENTRY_DATA(rv2), pattern, 0); - if (aff) { - mystrcat(result, aff, MAXLNLEN); - mystrcat(result, "\n", MAXLNLEN); - free(aff); - } - } - } - rv2 = rv2->next_homonym; - } - p = strstr(p + plen, MORPH_ALLOMORPH); - } - - return (*result) ? mystrdup(result) : NULL; -} - -char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { - char result[MAXLNLEN]; - char result2[MAXLNLEN]; - char newpattern[MAXLNLEN]; - *newpattern = '\0'; - if (n == 0) return 0; - *result2 = '\0'; - struct hentry * rv = NULL; - if (!pAMgr) return NULL; - -// search affixed forms with and without derivational suffixes - while(1) { - - for (int k = 0; k < n; k++) { - *result = '\0'; - // add compound word parts (except the last one) - char * s = (char *) desc[k]; - char * part = strstr(s, MORPH_PART); - if (part) { - char * nextpart = strstr(part + 1, MORPH_PART); - while (nextpart) { - copy_field(result + strlen(result), part, MORPH_PART); - part = nextpart; - nextpart = strstr(part + 1, MORPH_PART); - } - s = part; - } - - char **pl; - char tok[MAXLNLEN]; - strcpy(tok, s); - char * alt = strstr(tok, " | "); - while (alt) { - alt[1] = MSEP_ALT; - alt = strstr(alt, " | "); - } - int pln = line_tok(tok, &pl, MSEP_ALT); - for (int i = 0; i < pln; i++) { - // remove inflectional and terminal suffixes - char * is = strstr(pl[i], MORPH_INFL_SFX); - if (is) *is = '\0'; - char * ts = strstr(pl[i], MORPH_TERM_SFX); - while (ts) { - *ts = '_'; - ts = strstr(pl[i], MORPH_TERM_SFX); - } - char * st = strstr(s, MORPH_STEM); - if (st) { - copy_field(tok, st, MORPH_STEM); - rv = pAMgr->lookup(tok); - while (rv) { - char newpat[MAXLNLEN]; - strcpy(newpat, pl[i]); - strcat(newpat, pattern); - char * sg = suggest_hentry_gen(rv, newpat); - if (!sg) sg = suggest_hentry_gen(rv, pattern); - if (sg) { - char ** gen; - int genl = line_tok(sg, &gen, MSEP_REC); - free(sg); - sg = NULL; - for (int j = 0; j < genl; j++) { - if (strstr(pl[i], MORPH_SURF_PFX)) { - int r2l = strlen(result2); - result2[r2l] = MSEP_REC; - strcpy(result2 + r2l + 1, result); - copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); - mystrcat(result2, gen[j], MAXLNLEN); - } else { - sprintf(result2 + strlen(result2), "%c%s%s", - MSEP_REC, result, gen[j]); - } - } - freelist(&gen, genl); - } - rv = rv->next_homonym; - } - } - } - freelist(&pl, pln); - } - - if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; - strcpy(newpattern, pattern); - pattern = newpattern; - char * ds = strstr(pattern, MORPH_DERI_SFX); - while (ds) { - strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); - ds = strstr(pattern, MORPH_DERI_SFX); - } - } - return (*result2 ? mystrdup(result2) : NULL); -} - // generate an n-gram score comparing s1 and s2 -int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) +int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) { int nscore = 0; int ns; @@ -1746,9 +1459,13 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) w_char su2[MAXSWL]; l1 = u8_u16(su1, MAXSWL, s1); l2 = u8_u16(su2, MAXSWL, s2); - if ((l2 <= 0) || (l1 == -1)) return 0; - // lowering dictionary word - if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum); + if (!l2 || (l1==-1) || (l2==-1)) return 0; + // decapitalize dictionary word + if (complexprefixes) { + mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum()); + } else { + mkallsmall_utf(su2, 1, pAMgr->get_langnum()); + } for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { @@ -1772,9 +1489,13 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) char t[MAXSWUTF8L]; l1 = strlen(s1); l2 = strlen(s2); - if (l2 == 0) return 0; + if (!l2) return 0; strcpy(t, s2); - if (opt & NGRAM_LOWERING) mkallsmall(t, csconv); + if (complexprefixes) { + *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; + } else { + mkallsmall(t, csconv); + } for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { @@ -1789,14 +1510,13 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) } ns = 0; - if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2; - if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; + if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2; + if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; ns = (nscore - ((ns > 0) ? ns : 0)); return ns; } -// length of the left common substring of s1 and (decapitalised) s2 -int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { +int SuggestMgr::equalfirstletter(char * s1, const char * s2) { if (utf8) { w_char su1[MAXSWL]; w_char su2[MAXSWL]; @@ -1806,17 +1526,9 @@ int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { int l2 = u8_u16(su2, MAXSWL, s2); if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; } else { - int i; u8_u16(su1, 1, s1); u8_u16(su2, 1, s2); - unsigned short idx = (su2->h << 8) + su2->l; - if (*((short *)su1) != *((short *)su2) && - (*((unsigned short *)su1) != unicodetolower(idx, langnum))) return 0; - int l1 = u8_u16(su1, MAXSWL, s1); - int l2 = u8_u16(su2, MAXSWL, s2); - for(i = 1; (i < l1) && (i < l2) && - (*((short *)(su1 + i)) == *((short *)(su2 + i))); i++); - return i; + if (*((short *)su1) == *((short *)su2)) return 1; } } else { if (complexprefixes) { @@ -1824,13 +1536,7 @@ int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { int l2 = strlen(s2); if (*(s2+l1-1) == *(s2+l2-1)) return 1; } else { - char * olds = s1; - // decapitalise dictionary word - if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0; - do { - s1++; s2++; - } while ((*s1 == *s2) && (*s1 != '\0')); - return s1 - olds; + if (*s1 == *s2) return 1; } } return 0; @@ -1848,9 +1554,9 @@ int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_sw int l2 = u8_u16(su2, MAXSWL, s2); // decapitalize dictionary word if (complexprefixes) { - mkallsmall_utf(su2+l2-1, 1, langnum); + mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum()); } else { - mkallsmall_utf(su2, 1, langnum); + mkallsmall_utf(su2, 1, pAMgr->get_langnum()); } for (int i = 0; (i < l1) && (i < l2); i++) { if (((short *) su1)[i] == ((short *) su2)[i]) { @@ -1897,7 +1603,7 @@ int SuggestMgr::mystrlen(const char * word) { } // sort in decreasing order of score -void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n ) +void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) { int m = 1; while (m < n) { @@ -1910,11 +1616,6 @@ void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n ) rword[j-1] = rword[j]; rsc[j] = sctmp; rword[j] = wdtmp; - if (rword2) { - wdtmp = rword2[j-1]; - rword2[j-1] = rword2[j]; - rword2[j] = wdtmp; - } j--; } else break; } @@ -1941,12 +1642,6 @@ void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char * } c = (char *) malloc((m + 1) * (n + 1)); b = (char *) malloc((m + 1) * (n + 1)); - if (!c || !b) { - if (c) free(c); - if (b) free(b); - *result = NULL; - return; - } for (i = 1; i <= m; i++) c[i*(n+1)] = 0; for (j = 0; j <= n; j++) c[j] = 0; for (i = 1; i <= m; i++) { @@ -1978,7 +1673,6 @@ int SuggestMgr::lcslen(const char * s, const char* s2) { char * result; int len = 0; lcs(s, s2, &m, &n, &result); - if (!result) return 0; i = m; j = n; while ((i != 0) && (j != 0)) { @@ -1990,6 +1684,6 @@ int SuggestMgr::lcslen(const char * s, const char* s2) { i--; } else j--; } - free(result); + if (result) free(result); return len; } diff --git a/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx b/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx index 0e61572..70af7f1 100644 --- a/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx @@ -5,18 +5,15 @@ #define MAXSWUTF8L (MAXSWL * 4) #define MAX_ROOTS 100 #define MAX_WORDS 100 -#define MAX_GUESS 200 -#define MAXNGRAMSUGS 4 -#define MAXPHONSUGS 2 +#define MAX_GUESS 100 +#define MAXNGRAMSUGS 5 -// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function -#define TIMELIMIT (CLOCKS_PER_SEC >> 2) -#define MINTIMER 100 -#define MAXPLUSTIMER 100 +#define MINTIMER 500 +#define MAXPLUSTIMER 500 -#define NGRAM_LONGER_WORSE (1 << 0) -#define NGRAM_ANY_MISMATCH (1 << 1) -#define NGRAM_LOWERING (1 << 2) +#define NGRAM_IGNORE_LENGTH 0 +#define NGRAM_LONGER_WORSE 1 +#define NGRAM_ANY_MISMATCH 2 #include "atypes.hxx" #include "affixmgr.hxx" @@ -28,10 +25,6 @@ enum { LCS_UP, LCS_LEFT, LCS_UPLEFT }; class SuggestMgr { - char * ckey; - int ckeyl; - w_char * ckey_utf; - char * ctry; int ctryl; w_char * ctry_utf; @@ -40,7 +33,6 @@ class SuggestMgr int maxSug; struct cs_info * csconv; int utf8; - int langnum; int nosplitsugs; int maxngramsugs; int complexprefixes; @@ -50,20 +42,19 @@ public: SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr); ~SuggestMgr(); - int suggest(char*** slst, const char * word, int nsug, int * onlycmpdsug); - int ngsuggest(char ** wlst, char * word, int ns, HashMgr** pHMgr, int md); + int suggest(char*** slst, const char * word, int nsug); + int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr); int suggest_auto(char*** slst, const char * word, int nsug); int suggest_stems(char*** slst, const char * word, int nsug); int suggest_pos_stems(char*** slst, const char * word, int nsug); char * suggest_morph(const char * word); - char * suggest_gen(char ** pl, int pln, char * pattern); char * suggest_morph_for_spelling_error(const char * word); private: int testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, - int * timer, clock_t * timelimit); - int checkword(const char *, int, int, int *, clock_t *); + int * timer, time_t * timelimit); + int checkword(const char *, int, int, int *, time_t *); int check_forbidden(const char *, int); int capchars(char **, const char *, int, int); @@ -74,7 +65,6 @@ private: int longswapchar(char **, const char *, int, int); int movechar(char **, const char *, int, int); int extrachar(char **, const char *, int, int); - int badcharkey(char **, const char *, int, int); int badchar(char **, const char *, int, int); int twowords(char **, const char *, int, int); int fixstems(char **, const char *, int); @@ -83,23 +73,21 @@ private: int doubletwochars_utf(char**, const w_char *, int wl, int, int); int forgotchar_utf(char**, const w_char *, int wl, int, int); int extrachar_utf(char**, const w_char *, int wl, int, int); - int badcharkey_utf(char **, const w_char *, int wl, int, int); int badchar_utf(char **, const w_char *, int wl, int, int); int swapchar_utf(char **, const w_char *, int wl, int, int); int longswapchar_utf(char **, const w_char *, int, int, int); int movechar_utf(char **, const w_char *, int, int, int); - int mapchars(char**, const char *, int, int); - int map_related(const char *, int, char ** wlst, int, int, const mapentry*, int, int *, clock_t *); - int map_related_utf(w_char *, int, int, int, char ** wlst, int, const mapentry*, int, int *, clock_t *); - int ngram(int n, char * s1, const char * s2, int opt); + int mapchars(char**, const char *, int); + int map_related(const char *, int, char ** wlst, int, const mapentry*, int, int *, time_t *); + int map_related_utf(w_char *, int, int, char ** wlst, int, const mapentry*, int, int *, time_t *); + int ngram(int n, char * s1, const char * s2, int uselen); int mystrlen(const char * word); - int leftcommonsubstring(char * s1, const char * s2); + int equalfirstletter(char * s1, const char * s2); int commoncharacterpositions(char * s1, const char * s2, int * is_swap); - void bubblesort( char ** rwd, char ** rwd2, int * rsc, int n); + void bubblesort( char ** rwd, int * rsc, int n); void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result); int lcslen(const char * s, const char* s2); - char * suggest_hentry_gen(hentry * rv, char * pattern); }; diff --git a/chrome/third_party/hunspell/src/hunspell/w_char.hxx b/chrome/third_party/hunspell/src/hunspell/w_char.hxx deleted file mode 100644 index 99cfe63..0000000 --- a/chrome/third_party/hunspell/src/hunspell/w_char.hxx +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __WCHARHXX__ -#define __WCHARHXX__ - -#ifndef GCC -typedef struct { -#else -typedef struct __attribute__ ((packed)) { -#endif - unsigned char l; - unsigned char h; -} w_char; - -// two character arrays -struct replentry { - char * pattern; - char * pattern2; -}; - -#endif |