diff options
32 files changed, 5233 insertions, 2777 deletions
diff --git a/chrome/browser/spellcheck_unittest.cc b/chrome/browser/spellcheck_unittest.cc index 86eadff..1d51b5b 100644 --- a/chrome/browser/spellcheck_unittest.cc +++ b/chrome/browser/spellcheck_unittest.cc @@ -607,11 +607,11 @@ TEST_F(SpellCheckTest, SpellCheckSuggestions_EN_US) { {L"wate", false, 0, 0, L"water"}, {L"wate", false, 0, 0, L"waste"}, {L"wate", false, 0, 0, L"sate"}, - {L"wate", false, 0, 0, L"rate"}, + {L"wate", false, 0, 0, L"ate"}, {L"jum", false, 0, 0, L"jump"}, - {L"jum", false, 0, 0, L"rum"}, + {L"jum", false, 0, 0, L"hum"}, {L"jum", false, 0, 0, L"sum"}, - {L"jum", false, 0, 0, L"tum"}, + {L"jum", false, 0, 0, L"um"}, #endif //!OS_MACOSX // TODO (Sidchat): add many more examples. }; diff --git a/chrome/browser/spellchecker.cc b/chrome/browser/spellchecker.cc index f3bb9d3..f7635b6 100644 --- a/chrome/browser/spellchecker.cc +++ b/chrome/browser/spellchecker.cc @@ -613,7 +613,7 @@ void SpellChecker::AddCustomWordsToHunspell() { if (hunspell_.get()) { for (std::vector<std::string>::iterator it = list_of_words.begin(); it != list_of_words.end(); ++it) { - hunspell_->put_word(it->c_str()); + hunspell_->add(it->c_str()); } } } @@ -744,7 +744,7 @@ void SpellChecker::AddWord(const std::wstring& word) { // Add the word to hunspell. std::string word_to_add = WideToUTF8(word); if (!word_to_add.empty()) - hunspell_->put_word(word_to_add.c_str()); + hunspell_->add(word_to_add.c_str()); // Now add the word to the custom dictionary file. Task* write_word_task = diff --git a/chrome/third_party/hunspell/README.chromium b/chrome/third_party/hunspell/README.chromium index 5e6f6f7..e19a1b1 100644 --- a/chrome/third_party/hunspell/README.chromium +++ b/chrome/third_party/hunspell/README.chromium @@ -6,24 +6,13 @@ This is a partial copy of Hunspell 1.1.5, with the following changes: reference in src/hunspell/csutil.cxx changed accordingly * Change the input params of the constructors to receive a FILE* instead of a file path. This is required to use hunspell in the sandbox. - The patch is in google.patch. +* Remove all HUNSPELL_WARNING parameters since we are not using HashMgr + anymore, just show the msg not the line number. +* Remove the key variable from Hunspell, HashMgr and AffixMgr since Bdict + is being used instead. The English dictionary distributed by Firefox has been checked in to the dictionaries directory. It has several additions over the default myspell/hunspell dictionary. -* Workaround for non-ASCII characters - -Visual Studio on Japanese Windows assumes the source files to be -encoded in Shift_JIS. The compiler is unhappy with non-ASCII letters -in the source files of Hunspell. The same problem happens with other -CJK Windows as well. Here is the workaround for this problem: - -Convert 8-bit bytes to hexadecimal escaped forms by - - % perl -i -De 's/([\x80-\xff])/sprintf("\\x%02x", $1)/ge' src/*.cxx - - -Note that Hunspell upstream is going to fix this problem. We'll no -longer need the workaround if the problem is fixed in the upstream. diff --git a/chrome/third_party/hunspell/google.patch b/chrome/third_party/hunspell/google.patch deleted file mode 100644 index ae7fd9d..0000000 --- a/chrome/third_party/hunspell/google.patch +++ /dev/null @@ -1,212 +0,0 @@ -Index: src/hunspell/affixmgr.cxx -=================================================================== ---- src/hunspell/affixmgr.cxx (revision 3811) -+++ src/hunspell/affixmgr.cxx (working copy) -@@ -25,7 +27,7 @@ - #endif - #endif - --AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) -+AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) - { - // register hash manager and load affix data from aff file - pHMgr = ptr; -@@ -104,8 +106,8 @@ - contclasses[j] = 0; - } - -- if (parse_file(affpath)) { -- HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); -+ if (parse_file(aff_handle)) { -+ HUNSPELL_WARNING(stderr, "Failure loading aff file\n"); - wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); - } - -@@ -232,7 +234,7 @@ - - - // read in aff file and build up prefix and suffix entry objects --int AffixMgr::parse_file(const char * affpath) -+int AffixMgr::parse_file(FILE* aff_handle) - { - - // io buffers -@@ -250,11 +252,12 @@ - - // open the affix file - FILE * afflst; -- afflst = fopen(affpath,"r"); -+ afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); - if (!afflst) { -- HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); -+ HUNSPELL_WARNING(stderr, "error: could not open affix description file\n"); - return 1; - } -+ fseek(afflst, 0, SEEK_SET); - - // step one is to parse the affix file building up the internal - // affix data structures -Index: src/hunspell/affixmgr.hxx -=================================================================== ---- src/hunspell/affixmgr.hxx (revision 3811) -+++ src/hunspell/affixmgr.hxx (working copy) -@@ -93,7 +93,7 @@ - - public: - -- AffixMgr(const char * affpath, HashMgr * ptr); -+ AffixMgr(FILE* aff_handle, HashMgr * ptr); - ~AffixMgr(); - struct hentry * affix_check(const char * word, int len, - const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); -@@ -179,7 +179,7 @@ - int get_checksharps(void); - - private: -- int parse_file(const char * affpath); -+ int parse_file(FILE* aff_handle); - // int parse_string(char * line, char ** out, const char * name); - int parse_flag(char * line, unsigned short * out, const char * name); - int parse_num(char * line, int * out, const char * name); -Index: src/hunspell/hashmgr.cxx -=================================================================== ---- src/hunspell/hashmgr.cxx (revision 3811) -+++ src/hunspell/hashmgr.cxx (working copy) -@@ -29,7 +31,7 @@ - - // build a hash table from a munched word list - --HashMgr::HashMgr(const char * tpath, const char * apath) -+HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) - { - tablesize = 0; - tableptr = NULL; -@@ -43,8 +45,8 @@ - aliasf = NULL; - numaliasm = 0; - aliasm = NULL; -- load_config(apath); -- int ec = load_tables(tpath); -+ load_config(aff_handle); -+ int ec = load_tables(dic_handle); - if (ec) { - /* error condition - what should we do here */ - HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec); -@@ -240,7 +242,7 @@ - } - - // load a munched word list and build a hash table on the fly --int HashMgr::load_tables(const char * tpath) -+int HashMgr::load_tables(FILE* t_handle) - { - int wl, al; - char * ap; -@@ -248,8 +250,9 @@ - unsigned short * flags; - - // raw dictionary - munched file -- FILE * rawdict = fopen(tpath, "r"); -+ FILE * rawdict = _fdopen(_dup(_fileno(t_handle)), "r"); - if (rawdict == NULL) return 1; -+ fseek(rawdict, 0, SEEK_SET); - - // first read the first line of file to get hash table size */ - char ts[MAXDELEN]; -@@ -442,7 +445,7 @@ - } - - // read in aff file and set flag mode --int HashMgr::load_config(const char * affpath) -+int HashMgr::load_config(FILE* aff_handle) - { - int firstline = 1; - -@@ -451,11 +454,12 @@ - - // open the affix file - FILE * afflst; -- afflst = fopen(affpath,"r"); -+ afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); - if (!afflst) { -- HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath); -+ HUNSPELL_WARNING(stderr, "Error - could not open affix description file\n"); - return 1; - } -+ fseek(afflst, 0, SEEK_SET); - - // read in each line ignoring any that do not - // start with a known line type indicator -Index: src/hunspell/hashmgr.hxx -=================================================================== ---- src/hunspell/hashmgr.hxx (revision 3811) -+++ src/hunspell/hashmgr.hxx (working copy) -@@ -25,7 +25,7 @@ - - - public: -- HashMgr(const char * tpath, const char * apath); -+ HashMgr(FILE* t_handle, FILE* a_handle); - ~HashMgr(); - - struct hentry * lookup(const char *) const; -@@ -46,9 +46,9 @@ - - - private: -- int load_tables(const char * tpath); -+ int load_tables(FILE* t_handle); - int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); -- int load_config(const char * affpath); -+ int load_config(FILE* aff_handle); - int parse_aliasf(char * line, FILE * af); - #ifdef HUNSPELL_EXPERIMENTAL - int parse_aliasm(char * line, FILE * af); -Index: src/hunspell/hunspell.cxx -=================================================================== ---- src/hunspell/hunspell.cxx (revision 3811) -+++ src/hunspell/hunspell.cxx (working copy) -@@ -20,7 +20,7 @@ - #endif - #endif - --Hunspell::Hunspell(const char * affpath, const char * dpath) -+Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) - { - encoding = NULL; - csconv = NULL; -@@ -28,11 +28,11 @@ - complexprefixes = 0; - - /* first set up the hash manager */ -- pHMgr = new HashMgr(dpath, affpath); -+ pHMgr = new HashMgr(dic_handle, aff_handle); - - /* next set up the affix manager */ - /* it needs access to the hash manager lookup methods */ -- pAMgr = new AffixMgr(affpath,pHMgr); -+ pAMgr = new AffixMgr(aff_handle, pHMgr); - - /* get the preferred try string and the dictionary */ - /* encoding from the Affix Manager for that dictionary */ -@@ -1694,9 +1694,9 @@ - - #endif // END OF HUNSPELL_EXPERIMENTAL CODE - --Hunhandle *Hunspell_create(const char * affpath, const char * dpath) -+Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) - { -- return (Hunhandle*)(new Hunspell(affpath, dpath)); -+ return (Hunhandle*)(new Hunspell(aff_handle, dic_handle)); - } - - void Hunspell_destroy(Hunhandle *pHunspell) -Index: src/hunspell/hunspell.hxx -=================================================================== ---- src/hunspell/hunspell.hxx (revision 3811) -+++ src/hunspell/hunspell.hxx (working copy) -@@ -48,7 +48,7 @@ - * input: path of affix file and dictionary file - */ - -- Hunspell(const char * affpath, const char * dpath); -+ Hunspell(FILE* aff_handle, FILE* dic_handle); diff --git a/chrome/third_party/hunspell/hunspell.gyp b/chrome/third_party/hunspell/hunspell.gyp index a19f103..217fa5e 100644 --- a/chrome/third_party/hunspell/hunspell.gyp +++ b/chrome/third_party/hunspell/hunspell.gyp @@ -16,6 +16,7 @@ '../../../third_party/icu38/icu38.gyp:icuuc', ], 'defines': [ + 'HUNSPELL_STATIC', 'HUNSPELL_CHROME_CLIENT', 'OPENOFFICEORG', ], @@ -35,21 +36,31 @@ 'src/hunspell/csutil.hxx', 'src/hunspell/dictmgr.cxx', 'src/hunspell/dictmgr.hxx', + 'src/hunspell/filemgr.cxx', + 'src/hunspell/filemgr.hxx', 'src/hunspell/hashmgr.cxx', 'src/hunspell/hashmgr.hxx', 'src/hunspell/htypes.hxx', 'src/hunspell/hunspell.cxx', 'src/hunspell/hunspell.h', 'src/hunspell/hunspell.hxx', + 'src/hunspell/hunzip.cxx', + 'src/hunspell/hunzip.hxx', 'src/hunspell/langnum.hxx', + 'src/hunspell/phonet.cxx', + 'src/hunspell/phonet.hxx', + 'src/hunspell/replist.cxx', + 'src/hunspell/replist.hxx', 'src/hunspell/suggestmgr.cxx', 'src/hunspell/suggestmgr.hxx', 'src/hunspell/utf_info.hxx', + 'src/hunspell/w_char.hxx', 'src/parsers/textparser.cxx', 'src/parsers/textparser.hxx', ], 'direct_dependent_settings': { 'defines': [ + 'HUNSPELL_STATIC', 'HUNSPELL_CHROME_CLIENT', 'USE_HUNSPELL', ], diff --git a/chrome/third_party/hunspell/src/hunspell/affentry.cxx b/chrome/third_party/hunspell/src/hunspell/affentry.cxx index 517646f..7c2dab4 100644 --- a/chrome/third_party/hunspell/src/hunspell/affentry.cxx +++ b/chrome/third_party/hunspell/src/hunspell/affentry.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -17,7 +17,7 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -29,22 +29,23 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - - aflag = dp->aflag; // flag + + aflag = dp->aflag; // flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // number of conditions to match - opts = dp->opts; // cross product flag + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); + if (opts & aeLONGCOND) { + memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); next = NULL; nextne = NULL; nexteq = NULL; -#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; -#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -58,15 +59,8 @@ PfxEntry::~PfxEntry() pmyMgr = NULL; appnd = NULL; strip = NULL; - if (opts & aeUTF8) { - for (int i = 0; i < numconds; i++) { - if (conds.utf8.wchars[i]) - free(conds.utf8.wchars[i]); - } - } -#ifdef HUNSPELL_EXPERIMENTAL + if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); -#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -75,8 +69,9 @@ char * PfxEntry::add(const char * word, int len) { char tword[MAXWORDUTF8LEN + 4]; - if ((len > stripl) && (len >= numconds) && test_condition(word) && - (!stripl || (strncmp(word, strip, stripl) == 0)) && + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word) && + (!stripl || (strncmp(word, strip, stripl) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add prefix */ char * pp = tword; @@ -87,51 +82,87 @@ char * PfxEntry::add(const char * word, int len) strcpy(pp, (word + stripl)); return mystrdup(tword); } - return NULL; + return NULL; } +inline char * PfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return *p ? p : NULL; + } + return NULL; +} inline int PfxEntry::test_condition(const char * st) { - int cond; - unsigned char * cp = (unsigned char *)st; - if (!(opts & aeUTF8)) { // 256-character codepage - for (cond = 0; cond < numconds; cond++) { - if ((conds.base[*cp++] & (1 << cond)) == 0) return 0; - } - } else { // UTF-8 encoding - unsigned short wc; - for (cond = 0; cond < numconds; cond++) { - // a simple 7-bit ASCII character in UTF-8 - if ((*cp >> 7) == 0) { - // also check limit (end of word) - if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0; - // UTF-8 multibyte character - } else { - // not dot wildcard in rule - if (!conds.utf8.all[cond]) { - if (conds.utf8.neg[cond]) { - u8_u16((w_char *) &wc, 1, (char *) cp); - if (conds.utf8.wchars[cond] && - flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short) conds.utf8.wlen[cond])) return 0; - } else { - if (!conds.utf8.wchars[cond]) return 0; - u8_u16((w_char *) &wc, 1, (char *) cp); - if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short)conds.utf8.wlen[cond])) return 0; - } + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { + neg = false; + ingroup = false; + p = nextchar(p); + pos = st; break; + } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { + if ((neg && ingroup) || (!neg && !ingroup)) return 0; + pos = NULL; + p = nextchar(p); + // skip the next character + if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0' && p) return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0' && p) return 0; // word <= condition + break; + } + default: { + if (*st == *p) { + st++; + p = nextchar(p); + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte + while (p && (*p & 0xc0) == 0x80) { // character + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + p = nextchar(p); + st++; + } + if (pos && st != pos) { + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + } + } else if (pos) { + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + } + } else if (pos) { // group + p = nextchar(p); + } else return 0; } - // jump to next UTF-8 character - for(cp++; (*cp & 0xc0) == 0x80; cp++); - } } + if (!p) return 1; } - return 1; } - -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -145,7 +176,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -166,8 +197,8 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with pseudoroot flag - ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) @@ -175,14 +206,14 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound he = he->next_homonym; // check homonyms } while (he); } - - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix //if ((opts & aeXPRODUCT) && in_compound) { if ((opts & aeXPRODUCT)) { - he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); if (he) return he; } @@ -191,7 +222,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::check_twosfx(const char * word, int len, char in_compound, const FLAG needflag) { @@ -206,7 +237,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -225,8 +257,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // cross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -238,8 +270,7 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag) { @@ -253,7 +284,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -272,8 +304,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -285,7 +317,7 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -293,7 +325,7 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const char tmpword[MAXWORDUTF8LEN + 4]; char result[MAXLNLEN]; char * st; - + *result = '\0'; // on entry prefix is 0 length or already matches the beginning of the word. @@ -303,7 +335,8 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -324,41 +357,56 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with pseudoroot flag - ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) { - if (morphcode) strcat(result, morphcode); else strcat(result,getKey()); - if (he->description) { - if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word); - strcat(result,he->description); + if (morphcode) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, morphcode, MAXLNLEN); + } else mystrcat(result,getKey(), MAXLNLEN); + if (!HENTRY_FIND(he, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(he), MAXLNLEN); + } + // store the pointer of the hash entry + if (HENTRY_DATA(he)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); + } else { + // return with debug information + char * flag = pmyMgr->encode_flag(getFlag()); + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_FLAG, MAXLNLEN); + mystrcat(result, flag, MAXLNLEN); + free(flag); } - strcat(result, "\n"); + mystrcat(result, "\n", MAXLNLEN); } he = he->next_homonym; } while (he); } - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, FLAG_NULL, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } } } } - + if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) { @@ -366,22 +414,22 @@ SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - aflag = dp->aflag; // char flag + aflag = dp->aflag; // char flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // number of conditions to match - opts = dp->opts; // cross product flag + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); + if (opts & aeLONGCOND) { + memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); rappnd = myrevstrdup(appnd); - -#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; -#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -395,15 +443,9 @@ SfxEntry::~SfxEntry() if (strip) free(strip); pmyMgr = NULL; appnd = NULL; - strip = NULL; - if (opts & aeUTF8) { - for (int i = 0; i < numconds; i++) { - if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); - } - } -#ifdef HUNSPELL_EXPERIMENTAL + strip = NULL; + if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); -#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -413,7 +455,8 @@ char * SfxEntry::add(const char * word, int len) char tword[MAXWORDUTF8LEN + 4]; /* make sure all conditions match */ - if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) && + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word + len, word) && (!stripl || (strcmp(word + len - stripl, strip) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add suffix */ @@ -428,56 +471,114 @@ char * SfxEntry::add(const char * word, int len) return NULL; } +inline char * SfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return *p ? p : NULL; + } + return NULL; +} inline int SfxEntry::test_condition(const char * st, const char * beg) { - int cond; - unsigned char * cp = (unsigned char *) st; - if (!(opts & aeUTF8)) { // 256-character codepage - // D\xf6m\xf6lki affix algorithm - for (cond = numconds; --cond >= 0; ) { - if ((conds.base[*--cp] & (1 << cond)) == 0) return 0; - } - } else { // UTF-8 encoding - unsigned short wc; - for (cond = numconds; --cond >= 0; ) { - // go to next character position and check limit - if ((char *) --cp < beg) return 0; - // a simple 7-bit ASCII character in UTF-8 - if ((*cp >> 7) == 0) { - if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0; - // UTF-8 multibyte character - } else { - // go to first character of UTF-8 multibyte character - for (; (*cp & 0xc0) == 0x80; cp--); - // not dot wildcard in rule - if (!conds.utf8.all[cond]) { - if (conds.utf8.neg[cond]) { - u8_u16((w_char *) &wc, 1, (char *) cp); - if (conds.utf8.wchars[cond] && - flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short) conds.utf8.wlen[cond])) return 0; - } else { - if (!conds.utf8.wchars[cond]) return 0; - u8_u16((w_char *) &wc, 1, (char *) cp); - if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short)conds.utf8.wlen[cond])) return 0; + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + st--; + int i = 1; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { p = nextchar(p); pos = st; break; } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { if (!neg && !ingroup) return 0; + i++; + // skip the next character + if (!ingroup) { + for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + st--; + } + pos = NULL; + neg = false; + ingroup = false; + p = nextchar(p); + if (st < beg && p) return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + if (st < beg) { // word <= condition + if (p) return 0; else return 1; + } + if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character + st--; + if (st < beg) { // word <= condition + if (p) return 0; else return 1; + } } + break; + } + default: { + if (*st == *p) { + p = nextchar(p); + if ((opts & aeUTF8) && (*st & 0x80)) { + st--; + while (p && (st >= beg)) { + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + // first byte of the UTF-8 multibyte character + if ((*p & 0xc0) != 0x80) break; + p = nextchar(p); + st--; + } + if (pos && st != pos) { + if (neg) return 0; + else if (i == numconds) return 1; + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + st--; + } + if (p && *p != ']') p = nextchar(p); + } else if (pos) { + if (neg) return 0; + else if (i == numconds) return 1; + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); +// if (p && *p != ']') p = nextchar(p); + st--; + } + if (!pos) { + i++; + st--; + } + if (st < beg && p && *p != ']') return 0; // word <= condition + } else if (pos) { // group + p = nextchar(p); + } else return 0; } - } } + if (!p) return 1; } - return 1; } - - -// see if this suffix is present in the word +// see if this suffix is present in the word struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, const FLAG badflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -497,8 +598,9 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, tmpl = len - appndl; // the second condition is not enough for UTF-8 strings // it checked in test_condition() - - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -514,7 +616,8 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, // now make sure all of the conditions on characters // are met. Please see the appendix at the end of - // this file for more info on exactly what is being // tested + // this file for more info on exactly what is being + // tested // if all conditions are met then check if resulting // root word in the dictionary @@ -528,21 +631,21 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, do { // check conditional suffix (enabled by prefix) if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && - TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - (((optflags & aeXPRODUCT) == 0) || + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || TESTAFF(he->astr, ep->getFlag(), he->alen) || // enabled by prefix ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) ) && // handle cont. class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // check only in compound homonyms (bad flags) (!badflag || !TESTAFF(he->astr, badflag, he->alen) - ) && + ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) @@ -550,12 +653,12 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, he = he->next_homonym; // check homonyms } while (he); - // obsolote stemming code (used only by the + // obsolote stemming code (used only by the // experimental SuffixMgr:suggest_pos_stems) // store resulting root in wlst } else if (wlst && (*ns < maxSug)) { int cwrd = 1; - for (int k=0; k < *ns; k++) + for (int k=0; k < *ns; k++) if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; if (cwrd) { wlst[*ns] = mystrdup(tmpword); @@ -572,11 +675,11 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, return NULL; } -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -596,7 +699,8 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -620,7 +724,7 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, if (test_condition((char *) cp, (char *) tmpword)) { if (ppfx) { // handle conditional suffix - if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); else he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); @@ -633,19 +737,18 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; PfxEntry* ep = (PfxEntry *) ppfx; char * st; char result[MAXLNLEN]; - + *result = '\0'; // if this suffix is being cross checked with a prefix @@ -661,7 +764,8 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -689,16 +793,17 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { if (((PfxEntry *) ppfx)->getMorph()) { - strcat(result, ((PfxEntry *) ppfx)->getMorph()); + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); } - strcat(result,st); + mystrcat(result,st, MAXLNLEN); free(st); mychomp(result); } } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); mychomp(result); } @@ -706,7 +811,7 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); mychomp(result); } @@ -716,28 +821,28 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE // get next homonym with same affix -struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, const FLAG cclass, const FLAG needflag) { PfxEntry* ep = (PfxEntry *) ppfx; + FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; while (he->next_homonym) { he = he->next_homonym; - if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - ((optflags & aeXPRODUCT) == 0 || - TESTAFF(he->astr, ep->getFlag(), he->alen) || + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || + TESTAFF(he->astr, eFlag, he->alen) || // handle conditional suffix - ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) ) && // handle cont. class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) diff --git a/chrome/third_party/hunspell/src/hunspell/affentry.hxx b/chrome/third_party/hunspell/src/hunspell/affentry.hxx index bb21773..ef1f86d 100644 --- a/chrome/third_party/hunspell/src/hunspell/affentry.hxx +++ b/chrome/third_party/hunspell/src/hunspell/affentry.hxx @@ -54,6 +54,7 @@ public: inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; } + inline char * nextchar(char * p); inline int test_condition(const char * st); }; @@ -123,7 +124,9 @@ public: inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; } + inline char * nextchar(char * p); inline int test_condition(const char * st, const char * begin); + }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx b/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx index 9f53a67..29bc9f7 100644 --- a/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -20,21 +20,24 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif #ifdef HUNSPELL_CHROME_CLIENT -AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr* ptr) +AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md) { bdict_reader = reader; #else -AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) +AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) { #endif // register hash manager and load affix data from aff file - pHMgr = ptr; + pHMgr = ptr[0]; + alldic = ptr; + maxdic = md; + keystring = NULL; trystring = NULL; encoding=NULL; utf8 = 0; @@ -45,10 +48,15 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) numbreak = 0; reptable = NULL; numrep = 0; + iconvtable = NULL; + oconvtable = NULL; checkcpdtable = NULL; + // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) + simplifiedcpd = 0; numcheckcpd = 0; defcpdtable = NULL; numdefcpd = 0; + phone = NULL; compoundflag = FLAG_NULL; // permits word in compound forms compoundbegin = FLAG_NULL; // may be first word in compound forms compoundmiddle = FLAG_NULL; // may be middle word in compound forms @@ -60,11 +68,12 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds checkcompoundtriple = 0; // forbid compounds with triple letters - forbiddenword = FLAG_NULL; // forbidden word signing flag + simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt) + forbiddenword = FORBIDDENWORD; // forbidden word signing flag nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag lang = NULL; // language langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) - pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes + needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes cpdwordmax = -1; // default: unlimited wordcount in compound words cpdmin = -1; // undefined cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words @@ -88,14 +97,14 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) lemma_present = FLAG_NULL; circumfix = FLAG_NULL; onlyincompound = FLAG_NULL; - flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file maxngramsugs = -1; // undefined nosplitsugs = 0; sugswithdots = 0; keepcase = 0; checksharps = 0; + substandard = FLAG_NULL; + fullstrip = 0; - derived = NULL; // XXX not threadsafe variable for experimental stemming sfx = NULL; pfx = NULL; @@ -109,14 +118,14 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) #ifdef HUNSPELL_CHROME_CLIENT if (parse_file()) { #else + for (int j=0; j < CONTSIZE; j++) { contclasses[j] = 0; } - if (parse_file(aff_handle)) { + if (parse_file(affpath, key)) { #endif HUNSPELL_WARNING(stderr, "Failure loading aff file\n"); - wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); } if (cpdmin == -1) cpdmin = MINCPDLEN; @@ -154,6 +163,8 @@ AffixMgr::~AffixMgr() sStart[j] = NULL; } + if (keystring) free(keystring); + keystring=NULL; if (trystring) free(trystring); trystring=NULL; if (encoding) free(encoding); @@ -178,16 +189,26 @@ AffixMgr::~AffixMgr() breaktable = NULL; } numbreak = 0; - if (reptable) { + if (reptable) { for (int j=0; j < numrep; j++) { free(reptable[j].pattern); free(reptable[j].pattern2); - reptable[j].pattern = NULL; - reptable[j].pattern2 = NULL; } free(reptable); reptable = NULL; } + if (iconvtable) delete iconvtable; + if (oconvtable) delete oconvtable; + if (phone && phone->rules) { + for (int j=0; j < phone->num + 1; j++) { + free(phone->rules[j * 2]); + free(phone->rules[j * 2 + 1]); + } + free(phone->rules); + free(phone); + phone = NULL; + } + if (defcpdtable) { for (int j=0; j < numdefcpd; j++) { free(defcpdtable[j].def); @@ -201,8 +222,10 @@ AffixMgr::~AffixMgr() for (int j=0; j < numcheckcpd; j++) { free(checkcpdtable[j].pattern); free(checkcpdtable[j].pattern2); + free(checkcpdtable[j].pattern3); checkcpdtable[j].pattern = NULL; checkcpdtable[j].pattern2 = NULL; + checkcpdtable[j].pattern3 = NULL; } free(checkcpdtable); checkcpdtable = NULL; @@ -217,7 +240,7 @@ AffixMgr::~AffixMgr() FREE_FLAG(compoundroot); FREE_FLAG(forbiddenword); FREE_FLAG(nosuggest); - FREE_FLAG(pseudoroot); + FREE_FLAG(needaffix); FREE_FLAG(lemma_present); FREE_FLAG(circumfix); FREE_FLAG(onlyincompound); @@ -236,30 +259,20 @@ AffixMgr::~AffixMgr() if (ignorechars) free(ignorechars); if (ignorechars_utf16) free(ignorechars_utf16); if (version) free(version); - if (derived) free(derived); checknum=0; } // read in aff file and build up prefix and suffix entry objects #ifdef HUNSPELL_CHROME_CLIENT - // Hack to avoid having to comment out all the fclose calls below on errors. - #define fclose(a); - int AffixMgr::parse_file() #else -int AffixMgr::parse_file(FILE* aff_handle) +int AffixMgr::parse_file(FILE* aff_handle, const char * key) #endif { - // io buffers - char line[MAXLNLEN+1]; - - // affix type - char ft; - - // first line indicator for removing byte order mark - int firstline = 1; - + char * line = new char[MAXLNLEN+1]; // io buffers + char ft; // affix type + // open the affix file #ifdef HUNSPELL_CHROME_CLIENT // We're always UTF-8 @@ -286,44 +299,65 @@ int AffixMgr::parse_file(FILE* aff_handle) char dupflags[CONTSIZE]; char dupflags_ini = 1; - FILE * afflst; - afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); + // first line indicator for removing byte order mark + int firstline = 1; + + // open the affix file + FileMgr * afflst = new FileMgr(affpath, key); if (!afflst) { - HUNSPELL_WARNING(stderr, "error: could not open affix description file\n"); + HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); return 1; } - fseek(afflst, 0, SEEK_SET); // step one is to parse the affix file building up the internal // affix data structures - // read in each line ignoring any that do not // start with a known line type indicator - while (fgets(line,MAXLNLEN,afflst)) { + while ((line = afflst->getline())) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line,"\xef\xbb\xbf",3) == 0) { + if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { memmove(line, line+3, strlen(line+3)+1); HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n"); } } #endif + /* parse in the keyboard string */ + if (strncmp(line,"KEY",3) == 0) { +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &keystring, 0)) { +#else + if (parse_string(line, &keystring, afflst->getlinenum())) { + delete afflst; +#endif + return 1; + } + } + /* parse in the try string */ if (strncmp(line,"TRY",3) == 0) { - if (parse_string(line, &trystring, "TRY")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &trystring, 0)) { +#else + if (parse_string(line, &trystring, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } /* parse in the name of the character set used by the .dict and .aff */ if (strncmp(line,"SET",3) == 0) { - if (parse_string(line, &encoding, "SET")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &encoding, 0)) { +#else + if (parse_string(line, &encoding, afflst->getlinenum())) { + delete afflst; +#endif return 1; } if (strcmp(encoding, "UTF-8") == 0) { @@ -342,8 +376,12 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the flag used by the controlled compound words */ if (strncmp(line,"COMPOUNDFLAG",12) == 0) { - if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundflag)) { +#else + if (parse_flag(line, &compoundflag, afflst)) { + delete afflst; +#endif return 1; } } @@ -351,13 +389,21 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { if (complexprefixes) { - if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundend)) { +#else + if (parse_flag(line, &compoundend, afflst)) { + delete afflst; +#endif return 1; } } else { - if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundbegin)) { +#else + if (parse_flag(line, &compoundbegin, afflst)) { + delete afflst; +#endif return 1; } } @@ -365,21 +411,33 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { - if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundmiddle)) { +#else + if (parse_flag(line, &compoundmiddle, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDEND",11) == 0) { if (complexprefixes) { - if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundbegin)) { +#else + if (parse_flag(line, &compoundbegin, afflst)) { + delete afflst; +#endif return 1; } } else { - if (parse_flag(line, &compoundend, "COMPOUNDEND")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundend)) { +#else + if (parse_flag(line, &compoundend, afflst)) { + delete afflst; +#endif return 1; } } @@ -387,32 +445,48 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the data used by compound_check() method */ if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { - if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_num(line, &cpdwordmax)) { +#else + if (parse_num(line, &cpdwordmax, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag sign compounds in dictionary */ if (strncmp(line,"COMPOUNDROOT",12) == 0) { - if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundroot)) { +#else + if (parse_flag(line, &compoundroot, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { - if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundpermitflag)) { +#else + if (parse_flag(line, &compoundpermitflag, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { - if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundforbidflag)) { +#else + if (parse_flag(line, &compoundforbidflag, afflst)) { + delete afflst; +#endif return 1; } } @@ -429,69 +503,105 @@ int AffixMgr::parse_file(FILE* aff_handle) checkcompoundtriple = 1; } + if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) { + simplifiedtriple = 1; + } + if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) { checkcompoundcase = 1; } if (strncmp(line,"NOSUGGEST",9) == 0) { - if (parse_flag(line, &nosuggest, "NOSUGGEST")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &nosuggest)) { +#else + if (parse_flag(line, &nosuggest, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by forbidden words */ if (strncmp(line,"FORBIDDENWORD",13) == 0) { - if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &forbiddenword)) { +#else + if (parse_flag(line, &forbiddenword, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by forbidden words */ if (strncmp(line,"LEMMA_PRESENT",13) == 0) { - if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &lemma_present)) { +#else + if (parse_flag(line, &lemma_present, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by circumfixes */ if (strncmp(line,"CIRCUMFIX",9) == 0) { - if (parse_flag(line, &circumfix, "CIRCUMFIX")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &circumfix)) { +#else + if (parse_flag(line, &circumfix, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by fogemorphemes */ if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { - if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &onlyincompound)) { +#else + if (parse_flag(line, &onlyincompound, afflst)) { + delete afflst; +#endif return 1; } } - /* parse in the flag used by `pseudoroots' */ + /* parse in the flag used by `needaffixs' */ if (strncmp(line,"PSEUDOROOT",10) == 0) { - if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &needaffix)) { +#else + if (parse_flag(line, &needaffix, afflst)) { + delete afflst; +#endif return 1; } } - /* parse in the flag used by `pseudoroots' */ + /* parse in the flag used by `needaffixs' */ if (strncmp(line,"NEEDAFFIX",9) == 0) { - if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &needaffix)) { +#else + if (parse_flag(line, &needaffix, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the minimal length for words in compounds */ if (strncmp(line,"COMPOUNDMIN",11) == 0) { - if (parse_num(line, &cpdmin, "COMPOUNDMIN")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_num(line, &cpdmin)) { +#else + if (parse_num(line, &cpdmin, afflst)) { + delete afflst; +#endif return 1; } if (cpdmin < 1) cpdmin = 1; @@ -499,16 +609,24 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the max. words and syllables in compounds */ if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { +#ifdef HUNSPELL_CHROME_CLIENT if (parse_cpdsyllable(line)) { - fclose(afflst); +#else + if (parse_cpdsyllable(line, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"SYLLABLENUM",11) == 0) { - if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &cpdsyllablenum, 0)) { +#else + if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } @@ -520,34 +638,74 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the extra word characters */ if (strncmp(line,"WORDCHARS",9) == 0) { - if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, 0)) { +#else + if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ if (strncmp(line,"IGNORE",6) == 0) { - if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, 0)) { +#else + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } - /* parse in the typical fault correcting table */ #ifndef HUNSPELL_CHROME_CLIENT + /* parse in the typical fault correcting table */ if (strncmp(line,"REP",3) == 0) { if (parse_reptable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } #endif + /* parse in the input conversion table */ + if (strncmp(line,"ICONV",5) == 0) { + if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif + return 1; + } + } + + /* parse in the input conversion table */ + if (strncmp(line,"OCONV",5) == 0) { + if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif + return 1; + } + } + + /* parse in the phonetic translation table */ + if (strncmp(line,"PHONE",5) == 0) { + if (parse_phonetable(line, afflst)) { +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif + return 1; + } + } + /* parse in the checkcompoundpattern table */ if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { if (parse_checkcpdtable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } @@ -555,7 +713,9 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the defcompound table */ if (strncmp(line,"COMPOUNDRULE",12) == 0) { if (parse_defcpdtable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } @@ -563,7 +723,9 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the related character map table */ if (strncmp(line,"MAP",3) == 0) { if (parse_maptable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } @@ -571,30 +733,38 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the word breakpoints table */ if (strncmp(line,"BREAK",5) == 0) { if (parse_breaktable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } /* parse in the language for language specific codes */ if (strncmp(line,"LANG",4) == 0) { - if (parse_string(line, &lang, "LANG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &lang, 0)) { +#else + if (parse_string(line, &lang, afflst->getlinenum())) { + delete afflst; +#endif return 1; } langnum = get_lang_num(lang); } if (strncmp(line,"VERSION",7) == 0) { - if (parse_string(line, &version, "VERSION")) { - fclose(afflst); - return 1; - } + for(line = line + 7; *line == ' ' || *line == '\t'; line++); + version = mystrdup(line); } if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { - if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_num(line, &maxngramsugs)) { +#else + if (parse_num(line, &maxngramsugs, afflst)) { + delete afflst; +#endif return 1; } } @@ -603,14 +773,34 @@ int AffixMgr::parse_file(FILE* aff_handle) nosplitsugs=1; } + if (strncmp(line,"FULLSTRIP",9) == 0) { + fullstrip=1; + } + if (strncmp(line,"SUGSWITHDOTS",12) == 0) { sugswithdots=1; } /* parse in the flag used by forbidden words */ if (strncmp(line,"KEEPCASE",8) == 0) { - if (parse_flag(line, &keepcase, "KEEPCASE")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &keepcase)) { +#else + if (parse_flag(line, &keepcase, afflst)) { + delete afflst; +#endif + return 1; + } + } + + /* parse in the flag used by the affix generator */ + if (strncmp(line,"SUBSTANDARD",11) == 0) { +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &substandard)) { +#else + if (parse_flag(line, &substandard, afflst)) { + delete afflst; +#endif return 1; } } @@ -630,7 +820,7 @@ int AffixMgr::parse_file(FILE* aff_handle) dupflags_ini = 0; } if (parse_affix(line, ft, afflst, dupflags)) { - fclose(afflst); + delete afflst; process_pfx_tree_to_list(); process_sfx_tree_to_list(); return 1; @@ -640,7 +830,7 @@ int AffixMgr::parse_file(FILE* aff_handle) } #ifndef HUNSPELL_CHROME_CLIENT - fclose(afflst); + delete afflst; #endif // convert affix trees to sorted list @@ -673,8 +863,8 @@ int AffixMgr::parse_file(FILE* aff_handle) process_pfx_order(); process_sfx_order(); - // expand wordchars string, based on csutil (for external tokenization) - + /* get encoding for CHECKCOMPOUNDCASE */ + if (!utf8) { char * enc = get_encoding(); csconv = get_current_cs(enc); free(enc); @@ -695,16 +885,20 @@ int AffixMgr::parse_file(FILE* aff_handle) } wordchars = mystrdup(expw); + } - // temporary BREAK definition for German dash handling (OOo issue 64400) - if ((langnum == LANG_de) && (!breaktable)) { - breaktable = (char **) malloc(sizeof(char *)); + // default BREAK definition + if (!breaktable) { + breaktable = (char **) malloc(sizeof(char *) * 3); if (!breaktable) return 1; breaktable[0] = mystrdup("-"); - numbreak = 1; + breaktable[1] = mystrdup("^-"); + breaktable[2] = mystrdup("-$"); + if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3; } return 0; } + #ifdef HUNSPELL_CHROME_CLIENT #undef fclose #endif @@ -977,197 +1171,52 @@ int AffixMgr::process_sfx_order() return 0; } +// add flags to the result for dictionary debugging +void AffixMgr::debugflag(char * result, unsigned short flag) { + char * st = encode_flag(flag); + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_FLAG, MAXLNLEN); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } +} - -// takes aff file condition string and creates the -// conds array - please see the appendix at the end of the -// file affentry.cxx which describes what is going on here -// in much more detail - -int AffixMgr::encodeit(struct affentry * ptr, char * cs) +// calculate the character length of the condition +int AffixMgr::condlen(char * st) { - unsigned char c; - int i, j, k; - unsigned char mbr[MAXLNLEN]; - w_char wmbr[MAXLNLEN]; - w_char * wpos = wmbr; - - // now clear the conditions array */ - for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0; - - // now parse the string to create the conds array */ - int nc = strlen(cs); - unsigned char neg = 0; // complement indicator - int grp = 0; // group indicator - unsigned char n = 0; // number of conditions - int ec = 0; // end condition indicator - int nm = 0; // number of member in group - - // if no condition just return - if (strcmp(cs,".")==0) { - ptr->numconds = 0; - return 0; + int l = 0; + bool group = false; + for(; *st; st++) { + if (*st == '[') { + group = true; + l++; + } else if (*st == ']') group = false; + else if (!group && (!utf8 || + (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; } + return l; +} - i = 0; - while (i < nc) { - c = *((unsigned char *)(cs + i)); - - // start group indicator - if (c == '[') { - grp = 1; - c = 0; - } - - // complement flag - if ((grp == 1) && (c == '^')) { - neg = 1; - c = 0; - } - - // end goup indicator - if (c == ']') { - ec = 1; - c = 0; - } - - // add character of group to list - if ((grp == 1) && (c != 0)) { - *(mbr + nm) = c; - nm++; - c = 0; - } - - // end of condition - if (c != 0) { - ec = 1; +int AffixMgr::encodeit(struct affentry * ptr, char * cs) +{ + if (strcmp(cs,".") != 0) { + ptr->numconds = (char) condlen(cs); + strncpy(ptr->c.conds, cs, MAXCONDLEN); + // long condition (end of conds padded by strncpy) + if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { + ptr->opts += aeLONGCOND; + ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); + if (!ptr->c.l.conds2) return 1; } - - if (ec) { - if (!utf8) { - if (grp == 1) { - if (neg == 0) { - // set the proper bits in the condition array vals for those chars - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n); - } - } else { - // complement so set all of them and then unset indicated ones - for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n); - } - } - neg = 0; - grp = 0; - nm = 0; - } else { - // not a group so just set the proper bit for this char - // but first handle special case of . inside condition - if (c == '.') { - // wild card character so set them all - for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); - } else { - ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n); - } - } - n++; - ec = 0; - } else { // UTF-8 character set - if (grp == 1) { - ptr->conds.utf8.neg[n] = neg; - if (neg == 0) { - // set the proper bits in the condition array vals for those chars - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - if (k >> 7) { - u8_u16(wpos, 1, (char *) mbr + j); - wpos++; - if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character - } else { - ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n); - } - } - } else { // neg == 1 - // complement so set all of them and then unset indicated ones - for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - if (k >> 7) { - u8_u16(wpos, 1, (char *) mbr + j); - wpos++; - if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character - } else { - ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n); - } - } - } - neg = 0; - grp = 0; - nm = 0; - ptr->conds.utf8.wlen[n] = wpos - wmbr; - if ((wpos - wmbr) != 0) { - ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr)); - if (!ptr->conds.utf8.wchars[n]) return 1; - memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr)); - flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]); - wpos = wmbr; - } - } else { // grp == 0 - // is UTF-8 character? - if (c >> 7) { - ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char)); - if (!ptr->conds.utf8.wchars[n]) return 1; - ptr->conds.utf8.wlen[n] = 1; - u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i); - if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character - } else { - ptr->conds.utf8.wchars[n] = NULL; - // not a group so just set the proper bit for this char - // but first handle special case of . inside condition - if (c == '.') { - ptr->conds.utf8.all[n] = 1; - // wild card character so set them all - for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); - } else { - ptr->conds.utf8.all[n] = 0; - ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n); - } - } - neg = 0; - } - n++; - if (n > 8) { - HUNSPELL_WARNING(stderr, "Number of conditions is larger than 8. This" - "version of Hunspell does not support more than 8 conditions." - "Please, get rid of affentries with more than 8 conditions."); - break; - } - ec = 0; - neg = 0; - } - } - - i++; + } else { + ptr->numconds = 0; + ptr->c.conds[0] = '\0'; } - ptr->numconds = n; return 0; } - // return 1 if s1 is a leading subset of s2 -/* inline int AffixMgr::isSubset(const char * s1, const char * s2) - { - while ((*s1 == *s2) && *s1) { - s1++; - s2++; - } - return (*s1 == '\0'); - } -*/ - - // return 1 if s1 is a leading subset of s2 (dots are for infixes) +// return 1 if s1 is a leading subset of s2 (dots are for infixes) inline int AffixMgr::isSubset(const char * s1, const char * s2) { while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { @@ -1277,7 +1326,6 @@ struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL // check word for prefixes char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, const FLAG needflag) @@ -1295,7 +1343,7 @@ char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound while (pe) { st = pe->check_morph(word,len,in_compound, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } // if (rv) return rv; @@ -1313,7 +1361,7 @@ char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound // fogemorpheme if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); pfx = (AffEntry *)pptr; } free(st); @@ -1346,7 +1394,7 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, while (pe) { st = pe->check_twosfx_morph(word,len,in_compound, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } pe = pe->getNext(); @@ -1360,7 +1408,7 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (isSubset(pptr->getKey(),word)) { st = pptr->check_twosfx_morph(word, len, in_compound, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); pfx = (AffEntry *)pptr; } @@ -1373,8 +1421,6 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - // Is word a non compound with a REP substitution (see checkcompoundrep)? int AffixMgr::cpdrep_check(const char * word, int wl) @@ -1424,11 +1470,15 @@ int AffixMgr::cpdrep_check(const char * word, int wl) } // forbid compoundings when there are special patterns at word bound -int AffixMgr::cpdpat_check(const char * word, int pos) +int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2) { int len; for (int i = 0; i < numcheckcpd; i++) { if (isSubset(checkcpdtable[i].pattern2, word + pos) && + (!r1 || !checkcpdtable[i].cond || + (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && + (!r2 || !checkcpdtable[i].cond2 || + (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && (len = strlen(checkcpdtable[i].pattern)) && (pos > len) && (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1; } @@ -1446,10 +1496,8 @@ int AffixMgr::cpdcase_check(const char * word, int pos) u8_u16(&w, 1, p); unsigned short a = (u.h << 8) + u.l; unsigned short b = (w.h << 8) + w.l; - // CHROME MODIFICATION: We add the checks for the dashes as they are used - // below in the non-UTF-8 case. This seems to be a bug in Hunspell. It - // causes some of the tests to fail since we convert everything to UTF-8. - if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && (a != '-') && (b != '-')) return 1; + if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && + (a != '-') && (b != '-')) return 1; } else { unsigned char a = *(word + pos - 1); unsigned char b = *(word + pos); @@ -1465,15 +1513,35 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** signed short btwp[MAXWORDLEN]; // word positions for metacharacters int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions short bt = 0; - int i; + int i, j; int ok; int w = 0; + if (!*words) { w = 1; *words = def; } (*words)[wnum] = rv; + // has the last word COMPOUNDRULE flag? + if (rv->alen == 0) { + (*words)[wnum] = NULL; + if (w) *words = NULL; + return 0; + } + ok = 0; + for (i = 0; i < numdefcpd; i++) { + for (j = 0; j < defcpdtable[i].len; j++) { + if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && + TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1; + } + } + if (ok == 0) { + (*words)[wnum] = NULL; + if (w) *words = NULL; + return 0; + } + for (i = 0; i < numdefcpd; i++) { signed short pp = 0; // pattern position signed short wp = 0; // "words" position @@ -1518,17 +1586,18 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2; if (defcpdtable[i].len <= r) return 1; - } + } // backtrack if (bt) do { ok = 1; btnum[bt - 1]--; pp = btpp[bt - 1]; - wp = btwp[bt - 1] + btnum[bt - 1]; + wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; } while ((btnum[bt - 1] < 0) && --bt); } while (bt); - if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; + if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; + // check zero ending while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) && ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2; @@ -1568,7 +1637,7 @@ short AffixMgr::get_syllable(const char * word, int wlen) } else if (cpdvowels_utf16) { w_char w[MAXWORDUTF8LEN]; int i = u8_u16(w, MAXWORDUTF8LEN, word); - for (; i; i--) { + for (; i > 0; i--) { if (flag_bsearch((unsigned short *) cpdvowels_utf16, ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; } @@ -1576,15 +1645,29 @@ short AffixMgr::get_syllable(const char * word, int wlen) return num; } +void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) { + if (utf8) { + int i; + for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) { + for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++); + } + for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) { + for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--); + } + } else { + *cmin = cpdmin; + *cmax = len - cpdmin + 1; + } +} + // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) struct hentry * AffixMgr::compound_check(const char * word, int len, short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL, - char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0) + char hu_mov_rule = 0, char is_sug = 0) { int i; short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; - int oldcmpdstemnum = 0; struct hentry * rv = NULL; struct hentry * rv_first; struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking @@ -1592,31 +1675,17 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, char ch; int cmin; int cmax; - + int striple = 0; + int scpd = 0; + int soldi = 0; + int oldcmin = 0; + int oldcmax = 0; + int oldlen = 0; + int checkedstriple = 0; + int checked_prefix; -#ifdef HUNSTEM - if (cmpdstemnum) { - if (wordnum == 0) { - *cmpdstemnum = 1; - } else { - (*cmpdstemnum)++; - } - } -#endif - if (utf8) { - for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { - cmin++; - for (; (word[cmin] & 0xc0) == 0x80; cmin++); - } - for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { - cmax--; - for (; (word[cmax] & 0xc0) == 0x80; cmax--); - } - } else { - cmin = cpdmin; - cmax = len - cpdmin + 1; - } + setcminmax(&cmin, &cmax, word, len); strcpy(st, word); @@ -1632,20 +1701,42 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if (i >= cmax) return NULL; } - + do { // simplified checkcompoundpattern loop + + if (scpd > 0) { + for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 || + strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++); + + if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop + strcpy(st + i, checkcpdtable[scpd-1].pattern); + soldi = i; + i += strlen(checkcpdtable[scpd-1].pattern); + strcpy(st + i, checkcpdtable[scpd-1].pattern2); + strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3)); + + oldlen = len; + len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3); + oldcmin = cmin; + oldcmax = cmax; + setcminmax(&cmin, &cmax, st, len); + + cmax = len - cpdmin + 1; + } + + ch = st[i]; st[i] = '\0'; sfx = NULL; pfx = NULL; - + // FIRST WORD - + rv = lookup(st); // perhaps without prefix // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -1653,8 +1744,10 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, TESTAFF(rv->astr, compoundmiddle, rv->alen)) || (numdefcpd && ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || - (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) - ))) { + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) || + (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL && + !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen))) + ) { rv = rv->next_homonym; } @@ -1671,6 +1764,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, rv = NULL; } } + if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || @@ -1679,9 +1773,9 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) checked_prefix = 1; - // else check forbiddenwords and pseudoroot + // else check forbiddenwords and needaffix } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, pseudoroot, rv->alen) || + TESTAFF(rv->astr, needaffix, rv->alen) || (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) )) { st[i] = ch; @@ -1709,7 +1803,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((SfxEntry*)sfx)->getContLen())))) { rv = NULL; } - + // check compoundmiddle flag in suffix and prefix if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && ((pfx && ((PfxEntry*)pfx)->getCont() && @@ -1719,7 +1813,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle, ((SfxEntry*)sfx)->getContLen())))) { rv = NULL; - } + } // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || @@ -1749,19 +1843,20 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ) ) // END of LANG_hu section + ) && + ( + // test CHECKCOMPOUNDPATTERN conditions + scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen) ) - && ! (( checkcompoundtriple && // test triple letters + && ! (( checkcompoundtriple && scpd == 0 && !words && // test triple letters (word[i-1]==word[i]) && ( - ((i>1) && (word[i-1]==word[i-2])) || + ((i>1) && (word[i-1]==word[i-2])) || ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' ) ) || - ( - // test CHECKCOMPOUNDPATTERN - numcheckcpd && cpdpat_check(word, i) - ) || - ( - checkcompoundcase && cpdcase_check(word, i) + ( + checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i) )) ) // LANG_hu section: spec. Hungarian rule @@ -1769,15 +1864,14 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) || TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen()) - ) + ) ) ) -// END of LANG_hu section - ) { + ) { // first word is ok condition // LANG_hu section: spec. Hungarian rule if (langnum == LANG_hu) { - // calculate syllable number of the word + // calculate syllable number of the word numsyllable += get_syllable(st, i); // + 1 word, if syllable number of the prefix > 1 (hungarian convention) @@ -1785,23 +1879,35 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, } // END of LANG_hu section -#ifdef HUNSTEM - if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i; -#endif // NEXT WORD(S) rv_first = rv; - rv = lookup((word+i)); // perhaps without prefix + st[i] = ch; + + do { // striple loop + + // check simplifiedtriple + if (simplifiedtriple) { + if (striple) { + checkedstriple = 1; + i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" + } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1; + } + + rv = lookup((st+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || - (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { + (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) || + (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL && + !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) + )) { rv = rv->next_homonym; } - if (rv && words && words[wnum + 1]) return rv; + if (rv && words && words[wnum + 1]) return rv_first; oldnumsyllable2 = numsyllable; oldwordnum2 = wordnum; @@ -1833,20 +1939,27 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable==0) || - (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) - ) - && ( + ((cpdmaxsyllable!=0) && + (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable)) + ) && + ( + // test CHECKCOMPOUNDPATTERN + !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv) + ) && + ( (!checkcompounddup || (rv != rv_first)) ) + // test CHECKCOMPOUNDPATTERN conditions + && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) ) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word,len)) return NULL; - return rv; + return rv_first; } - numsyllable = oldnumsyllable2 ; + numsyllable = oldnumsyllable2; wordnum = oldwordnum2; // perhaps second word has prefix or/and suffix @@ -1858,12 +1971,20 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, pfx = NULL; rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END); } - + if (!rv && numdefcpd && words) { rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); - if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv; + if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first; + rv = NULL; } + // test CHECKCOMPOUNDPATTERN conditions (allowed forms) + if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL; + + // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) + if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv)) rv = NULL; + // check non_compound flag in suffix and prefix if ((rv) && ((pfx && ((PfxEntry*)pfx)->getCont() && @@ -1887,7 +2008,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if (langnum == LANG_hu) { // calculate syllable number of the word numsyllable += get_syllable(word + i, strlen(word + i)); - + // - affix syllable num. // XXX only second suffix (inflections, not derivations) if (sfxappnd) { @@ -1895,13 +2016,13 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, numsyllable -= get_syllable(tmp, strlen(tmp)); free(tmp); } - + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; // increment syllable num, if last word has a SYLLABLENUM flag // and the suffix is beginning `s' - + if (cpdsyllablenum) { switch (sfxflag) { case 'c': { numsyllable+=2; break; } @@ -1910,7 +2031,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, } } } - + // increment word number, if the second word has a compoundroot flag if ((rv) && (compoundroot) && (TESTAFF(rv->astr, compoundroot, rv->alen))) { @@ -1924,7 +2045,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if ((rv) && ( ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || - ((cpdmaxsyllable == 0) || + ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable)) ) && ( @@ -1932,41 +2053,61 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, )) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word, len)) return NULL; - return rv; + return rv_first; } numsyllable = oldnumsyllable2; wordnum = oldwordnum2; -#ifdef HUNSTEM - if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum; -#endif + // perhaps second word is a compound word (recursive call) if (wordnum < maxwordnum) { - rv = compound_check((word+i),strlen(word+i), wordnum+1, - numsyllable, maxwordnum, wnum + 1, words, - 0, cmpdstemnum, cmpdstem, is_sug); + rv = compound_check((st+i),strlen(st+i), wordnum+1, + numsyllable, maxwordnum, wnum + 1, words, 0, is_sug); + if (rv && numcheckcpd && (scpd == 0 && cpdpat_check(word, i, rv_first, rv) || + scpd != 0 && !cpdpat_check(word, i, rv_first, rv))) rv = NULL; } else { rv=NULL; } if (rv) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word, len)) return NULL; - return rv; - } else { -#ifdef HUNSTEM - if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum; -#endif + return rv_first; } + } while (striple && !checkedstriple); // end of striple loop + + if (checkedstriple) { + i++; + checkedstriple = 0; + striple = 0; + } + + } // first word is ok condition + + if (soldi != 0) { + i = soldi; + soldi = 0; + len = oldlen; + cmin = oldcmin; + cmax = oldcmax; } - st[i] = ch; + scpd++; + + } while (simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop + + if (soldi != 0) { + i = soldi; + strcpy(st, word); // XXX add more optim. + soldi = 0; + } else st[i] = ch; + + scpd = 0; wordnum = oldwordnum; numsyllable = oldnumsyllable; } - + return NULL; -} +} -#ifdef HUNSPELL_EXPERIMENTAL // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) int AffixMgr::compound_check_morph(const char * word, int len, @@ -1982,26 +2123,14 @@ int AffixMgr::compound_check_morph(const char * word, int len, struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking char st [MAXWORDUTF8LEN + 4]; char ch; - + int checked_prefix; char presult[MAXLNLEN]; int cmin; int cmax; - - if (utf8) { - for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { - cmin++; - for (; (word[cmin] & 0xc0) == 0x80; cmin++); - } - for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { - cmax--; - for (; (word[cmax] & 0xc0) == 0x80; cmax--); - } - } else { - cmin = cpdmin; - cmax = len - cpdmin + 1; - } + + setcminmax(&cmin, &cmax, word, len); strcpy(st, word); @@ -2015,7 +2144,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, for (; (st[i] & 0xc0) == 0x80; i++); if (i >= cmax) return 0; } - + ch = st[i]; st[i] = '\0'; sfx = NULL; @@ -2023,12 +2152,12 @@ int AffixMgr::compound_check_morph(const char * word, int len, // FIRST WORD *presult = '\0'; if (partresult) strcat(presult, partresult); - + rv = lookup(st); // perhaps without prefix // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -2042,13 +2171,16 @@ int AffixMgr::compound_check_morph(const char * word, int len, } if (rv) { - if (rv->description) { - if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) - strcat(presult, st); - strcat(presult, rv->description); + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st); } - } - + // store the pointer of the hash entry +// sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv); + if (HENTRY_DATA(rv)) { + sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv)); + } + } if (!rv) { if (compoundflag && !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { @@ -2062,7 +2194,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = NULL; } } - + if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || @@ -2071,35 +2203,28 @@ int AffixMgr::compound_check_morph(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) { - //char * p = prefix_check_morph(st, i, 0, compound); + // char * p = prefix_check_morph(st, i, 0, compound); char * p = NULL; if (compoundflag) p = affix_check_morph(st, i, compoundflag); if (!p || (*p == '\0')) { + if (p) free(p); + p = NULL; if ((wordnum == 0) && compoundbegin) { p = affix_check_morph(st, i, compoundbegin); } else if ((wordnum > 0) && compoundmiddle) { p = affix_check_morph(st, i, compoundmiddle); } } - if (*p != '\0') { - line_uniq(p); - if (strchr(p, '\n')) { - strcat(presult, "("); - strcat(presult, line_join(p, '|')); - strcat(presult, ")"); - } else { - strcat(presult, p); - } - } - if (presult[strlen(presult) - 1] == '\n') { - presult[strlen(presult) - 1] = '\0'; + if (p && (*p != '\0')) { + sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, + MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); } + if (p) free(p); checked_prefix = 1; - //strcat(presult, "+"); } // else check forbiddenwords } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, pseudoroot, rv->alen))) { + TESTAFF(rv->astr, needaffix, rv->alen))) { st[i] = ch; continue; } @@ -2162,7 +2287,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) // END of LANG_hu section ) - && ! (( checkcompoundtriple && // test triple letters + && ! (( checkcompoundtriple && !words && // test triple letters (word[i-1]==word[i]) && ( ((i>1) && (word[i-1]==word[i-2])) || ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' @@ -2170,10 +2295,10 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) || ( // test CHECKCOMPOUNDPATTERN - numcheckcpd && cpdpat_check(word, i) + numcheckcpd && !words && cpdpat_check(word, i, rv, NULL) ) || ( - checkcompoundcase && cpdcase_check(word, i) + checkcompoundcase && !words && cpdcase_check(word, i) )) ) // LANG_hu section: spec. Hungarian rule @@ -2202,7 +2327,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = lookup((word+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { @@ -2211,11 +2336,21 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && words[wnum + 1]) { strcat(*result, presult); - if (complexprefixes && rv->description) strcat(*result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) - strcat(*result, rv->word); - if (!complexprefixes && rv->description) strcat(*result, rv->description); + strcat(*result, " "); + strcat(*result, MORPH_PART); + strcat(*result, word+i); + if (complexprefixes && HENTRY_DATA(rv)) strcat(*result, HENTRY_DATA2(rv)); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + strcat(*result, " "); + strcat(*result, MORPH_STEM); + strcat(*result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); + if (!complexprefixes && HENTRY_DATA(rv)) { + strcat(*result, " "); + strcat(*result, HENTRY_DATA2(rv)); + } strcat(*result, "\n"); ok = 1; return 0; @@ -2240,7 +2375,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, st[i] = ch; continue; } - + // second word is acceptable, as a root? // hungarian conventions: compounding is acceptable, // when compound forms consist of 2 words, or if more, @@ -2251,8 +2386,8 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable==0) || - (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) + ((cpdmaxsyllable!=0) && + (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable)) ) && ( (!checkcompounddup || (rv != rv_first)) @@ -2261,12 +2396,23 @@ int AffixMgr::compound_check_morph(const char * word, int len, { // bad compound word strcat(*result, presult); - - if (rv->description) { - if (complexprefixes) strcat(*result, rv->description); - if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) - strcat(*result, rv->word); - if (!complexprefixes) strcat(*result, rv->description); + strcat(*result, " "); + strcat(*result, MORPH_PART); + strcat(*result, word+i); + + if (HENTRY_DATA(rv)) { + if (complexprefixes) strcat(*result, HENTRY_DATA2(rv)); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + strcat(*result, " "); + strcat(*result, MORPH_STEM); + strcat(*result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); + if (!complexprefixes) { + strcat(*result, " "); + strcat(*result, HENTRY_DATA2(rv)); + } } strcat(*result, "\n"); ok = 1; @@ -2292,20 +2438,16 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) + if ((!m || *m == '\0') && compoundend) { + if (m) free(m); m = affix_check_morph((word+i),strlen(word+i), compoundend); + } strcat(*result, presult); - if (m) { - line_uniq(m); - if (strchr(m, '\n')) { - strcat(*result, "("); - strcat(*result, line_join(m, '|')); - strcat(*result, ")"); - } else { - strcat(*result, m); - } - free(m); + if (m || (*m != '\0')) { + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); } + if (m) free(m); strcat(*result, "\n"); ok = 1; } @@ -2324,7 +2466,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen)) - && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) { + && (! TESTAFF(rv->astr, needaffix, rv->alen))) { st[i] = ch; continue; } @@ -2368,7 +2510,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, if ((rv) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable==0) || + ((cpdmaxsyllable!=0) && (numsyllable <= cpdmaxsyllable)) ) && ( @@ -2376,21 +2518,17 @@ int AffixMgr::compound_check_morph(const char * word, int len, )) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) + if ((!m || *m == '\0') && compoundend) { + if (m) free(m); m = affix_check_morph((word+i),strlen(word+i), compoundend); + } strcat(*result, presult); - if (m) { - line_uniq(m); - if (strchr(m, '\n')) { - strcat(*result, "("); - strcat(*result, line_join(m, '|')); - strcat(*result, ")"); - } else { - strcat(*result, m); - } - free(m); + if (m && (*m != '\0')) { + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); } - strcat(*result, "\n"); + if (m) free(m); + sprintf(*result + strlen(*result), "%c", MSEP_REC); ok = 1; } @@ -2411,7 +2549,6 @@ int AffixMgr::compound_check_morph(const char * word, int len, } return 0; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE // return 1 if s1 (reversed) is a leading subset of end of s2 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) @@ -2442,8 +2579,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, const FLAG cclass, const FLAG needflag, char in_compound) { struct hentry * rv = NULL; - char result[MAXLNLEN]; - PfxEntry* ep = (PfxEntry *) ppfx; // first handle the special case of 0 length suffixes @@ -2467,11 +2602,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) ) { @@ -2509,11 +2644,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) ) { @@ -2523,17 +2658,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, sfx=(AffEntry *)sptr; // BUG: sfx not stateless sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless - if (cclass || sptr->getCont()) { - if (!derived) { - derived = mystrdup(word); - } else { - strcpy(result, derived); // XXX check size - strcat(result, "\n"); - strcat(result, word); - free(derived); - derived = mystrdup(result); - } - } return rv; } } @@ -2588,7 +2712,6 @@ struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag) { @@ -2610,12 +2733,18 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); if (st) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - if (se->getMorph()) strcat(result, se->getMorph()); - strcat(result, "\n"); + if (se->getMorph()) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, se->getMorph(), MAXLNLEN); + } else debugflag(result, se->getFlag()); + mystrcat(result, "\n", MAXLNLEN); } } se = se->getNext(); @@ -2637,19 +2766,14 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, free(st); result3[0] = '\0'; -#ifdef DEBUG - unsigned short flag = sptr->getFlag(); - if (flag_mode == FLAG_NUM) { - sprintf(result3, "<%d>", sptr->getKey()); - } else if (flag_mode == FLAG_LONG) { - sprintf(result3, "<%c%c>", flag >> 8, (flag << 8) >>8); - } else sprintf(result3, "<%c>", flag); - strcat(result3, ":"); -#endif - if (sptr->getMorph()) strcat(result3, sptr->getMorph()); + + if (sptr->getMorph()) { + mystrcat(result3, " ", MAXLNLEN); + mystrcat(result3, sptr->getMorph(), MAXLNLEN); + } else debugflag(result3, sptr->getFlag()); strlinecat(result2, result3); - strcat(result2, "\n"); - strcat(result, result2); + mystrcat(result2, "\n", MAXLNLEN); + mystrcat(result, result2, MAXLNLEN); } } sptr = sptr->getNextEQ(); @@ -2657,7 +2781,7 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, sptr = sptr->getNextNE(); } } - if (result) return mystrdup(result); + if (*result) return mystrdup(result); return NULL; } @@ -2692,26 +2816,40 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) )) rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); + } + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); + + if (!complexprefixes && HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); } - if (complexprefixes && rv->description) strcat(result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) - strcat(result, rv->word); - if (!complexprefixes && rv->description) strcat(result, rv->description); - if (se->getMorph()) strcat(result, se->getMorph()); - strcat(result, "\n"); + if (se->getMorph()) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, se->getMorph(), MAXLNLEN); + } else debugflag(result, se->getFlag()); + mystrcat(result, "\n", MAXLNLEN); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } } @@ -2741,30 +2879,36 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // pseudoroot on first suffix + // needaffix on first suffix (cclass || !(sptr->getCont() && - TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen()))) + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } - if (complexprefixes && rv->description) strcat(result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word); - if (!complexprefixes && rv->description) strcat(result, rv->description); -#ifdef DEBUG - unsigned short flag = sptr->getFlag(); - if (flag_mode == FLAG_NUM) { - sprintf(result, "<%d>", sptr->getKey()); - } else if (flag_mode == FLAG_LONG) { - sprintf(result, "<%c%c>", flag >> 8, (flag << 8) >>8); - } else sprintf(result, "<%c>", flag); - strcat(result, ":"); -#endif + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); - if (sptr->getMorph()) strcat(result, sptr->getMorph()); - strcat(result, "\n"); + if (!complexprefixes && HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + } + + if (sptr->getMorph()) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, sptr->getMorph(), MAXLNLEN); + } else debugflag(result, sptr->getFlag()); + mystrcat(result, "\n", MAXLNLEN); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } sptr = sptr->getNextEQ(); @@ -2776,15 +2920,11 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - // check if word with affixes is correctly spelled struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) { struct hentry * rv= NULL; - if (derived) free(derived); - derived = NULL; // check all prefixes (also crossed with suffixes if allowed) rv = prefix_check(word, len, in_compound, needflag); @@ -2806,7 +2946,6 @@ struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG ne return rv; } -#ifdef HUNSPELL_EXPERIMENTAL // check if word with affixes is correctly spelled char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) { @@ -2818,14 +2957,14 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl // check all prefixes (also crossed with suffixes if allowed) st = prefix_check_morph(word, len, in_compound); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } // if still not found check all suffixes st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } @@ -2835,54 +2974,133 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl // if still not found check all two-level suffixes st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } // if still not found check all two-level suffixes st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } } - + return mystrdup(result); } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE + +char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, + unsigned short al, char * morph, char * targetmorph, int level) +{ + // handle suffixes + char * stemmorph; + char * stemmorphcatpos; + char mymorph[MAXLNLEN]; + + if (!morph && !targetmorph) return NULL; + + // check substandard flag + if (TESTAFF(ap, substandard, al)) return NULL; + + if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); + +// int targetcount = get_sfxcount(targetmorph); + + // use input suffix fields, if exist + if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { + stemmorph = mymorph; + strcpy(stemmorph, morph); + strcat(stemmorph, " "); + stemmorphcatpos = stemmorph + strlen(stemmorph); + } else { + stemmorph = morph; + stemmorphcatpos = NULL; + } + + for (int i = 0; i < al; i++) { + const unsigned char c = (unsigned char) (ap[i] & 0x00FF); + SfxEntry * sptr = (SfxEntry *)sFlag[c]; + while (sptr) { + if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) || + // don't generate forms with substandard affixes + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { + + if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); + else stemmorph = (char *) sptr->getMorph(); + + int cmp = morphcmp(stemmorph, targetmorph); + + if (cmp == 0) { + char * newword = sptr->add(ts, wl); + if (newword) { + hentry * check = pHMgr->lookup(newword); // XXX extra dic + if (!check || !check->astr || + !TESTAFF(check->astr, forbiddenword, check->alen)) { + return newword; + } + free(newword); + } + } + + // recursive call for secondary suffixes + if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && +// (get_sfxcount(stemmorph) < targetcount) && + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { + char * newword = sptr->add(ts, wl); + if (newword) { + char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(), + sptr->getContLen(), stemmorph, targetmorph, 1); + + if (newword2) { + free(newword); + return newword2; + } + free(newword); + newword = NULL; + } + } + } + sptr = (SfxEntry *)sptr ->getFlgNxt(); + } + } + return NULL; +} int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, int badl) + int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, + char * phon) { - int nh=0; - // first add root word to list - if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) || + if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { wlst[nh].word = mystrdup(ts); + if (!wlst[nh].word) return 0; wlst[nh].allow = (1 == 0); + wlst[nh].orig = NULL; nh++; + // add special phonetic version + if (phon && (nh < maxn)) { + wlst[nh].word = mystrdup(phon); + if (!wlst[nh].word) return nh - 1; + wlst[nh].allow = (1 == 0); + wlst[nh].orig = mystrdup(ts); + if (!wlst[nh].orig) return nh - 1; + nh++; + } } // handle suffixes for (int i = 0; i < al; i++) { -#ifdef HUNSPELL_CHROME_CLIENT - // This change is taken from a future version of Hunspell. In other - // places, the index is clamped to a byte, so I think this is correct. - // Our array is only 256 entries anyway, so it is required. const unsigned char c = (unsigned char) (ap[i] & 0x00FF); -#else - unsigned short c = (unsigned short) ap[i]; -#endif SfxEntry * sptr = (SfxEntry *)sFlag[c]; while (sptr) { - if (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && - (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0)) && - // check pseudoroot flag - !(sptr->getCont() && ((pseudoroot && - TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && + (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && + // check needaffix flag + !(sptr->getCont() && ((needaffix && + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || (circumfix && TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || (onlyincompound && @@ -2892,8 +3110,22 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (newword) { if (nh < maxn) { wlst[nh].word = newword; - wlst[nh].allow = sptr->allowCross(); - nh++; + wlst[nh].allow = sptr->allowCross(); + wlst[nh].orig = NULL; + nh++; + // add special phonetic version + if (phon && (nh < maxn)) { + char st[MAXWORDUTF8LEN]; + strcpy(st, phon); + strcat(st, sptr->getKey()); + reverseword(st + strlen(phon)); + wlst[nh].word = mystrdup(st); + if (!wlst[nh].word) return nh - 1; + wlst[nh].allow = (1 == 0); + wlst[nh].orig = mystrdup(newword); + if (!wlst[nh].orig) return nh - 1; + nh++; + } } else { free(newword); } @@ -2909,15 +3141,10 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts for (int j=1;j<n ;j++) if (wlst[j].allow) { for (int k = 0; k < al; k++) { -#ifdef HUNSPELL_CHROME_CLIENT - // See similar change above. const unsigned char c = (unsigned char) (ap[k] & 0x00FF); -#else - unsigned short c = (unsigned short) ap[k]; -#endif PfxEntry * cptr = (PfxEntry *) pFlag[c]; while (cptr) { - if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && + if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { int l1 = strlen(wlst[j].word); char * newword = cptr->add(wlst[j].word, l1); @@ -2925,6 +3152,7 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (nh < maxn) { wlst[nh].word = newword; wlst[nh].allow = cptr->allowCross(); + wlst[nh].orig = NULL; nh++; } else { free(newword); @@ -2939,19 +3167,14 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts // now handle pure prefixes for (int m = 0; m < al; m ++) { -#ifdef HUNSPELL_CHROME_CLIENT - // See similar change above. const unsigned char c = (unsigned char) (ap[m] & 0x00FF); -#else - unsigned short c = (unsigned short) ap[m]; -#endif PfxEntry * ptr = (PfxEntry *) pFlag[c]; while (ptr) { - if (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && - (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0)) && - // check pseudoroot flag - !(ptr->getCont() && ((pseudoroot && - TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) || + if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && + (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && + // check needaffix flag + !(ptr->getCont() && ((needaffix && + TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || (circumfix && TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || (onlyincompound && @@ -2962,6 +3185,7 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (nh < maxn) { wlst[nh].word = newword; wlst[nh].allow = ptr->allowCross(); + wlst[nh].orig = NULL; nh++; } else { free(newword); @@ -2975,8 +3199,6 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts return nh; } - - // return length of replacing table int AffixMgr::get_numrep() { @@ -2990,6 +3212,27 @@ struct replentry * AffixMgr::get_reptable() return reptable; } +// return iconv table +RepList * AffixMgr::get_iconvtable() +{ + if (! iconvtable ) return NULL; + return iconvtable; +} + +// return oconv table +RepList * AffixMgr::get_oconvtable() +{ + if (! oconvtable ) return NULL; + return oconvtable; +} + +// return replacing table +struct phonetable * AffixMgr::get_phonetable() +{ + if (! phone ) return NULL; + return phone; +} + // return length of character map table int AffixMgr::get_nummap() { @@ -3019,9 +3262,7 @@ char ** AffixMgr::get_breaktable() // return text encoding of dictionary char * AffixMgr::get_encoding() { - if (! encoding ) { - encoding = mystrdup("ISO8859-1"); - } + if (! encoding ) encoding = mystrdup(SPELL_ENCODING); return mystrdup(encoding); } @@ -3037,6 +3278,12 @@ int AffixMgr::get_complexprefixes() return complexprefixes; } +// return FULLSTRIP option +int AffixMgr::get_fullstrip() +{ + return fullstrip; +} + FLAG AffixMgr::get_keepcase() { return keepcase; @@ -3047,11 +3294,17 @@ int AffixMgr::get_checksharps() return checksharps; } +char * AffixMgr::encode_flag(unsigned short aflag) +{ + return pHMgr->encode_flag(aflag); +} + + // return the preferred ignore string for suggestions char * AffixMgr::get_ignore() { if (!ignorechars) return NULL; - return mystrdup(ignorechars); + return ignorechars; } // return the preferred ignore string for suggestions @@ -3061,6 +3314,13 @@ unsigned short * AffixMgr::get_ignore_utf16(int * len) return ignorechars_utf16; } +// return the keyboard string for suggestions +char * AffixMgr::get_key_string() +{ + if (! keystring ) keystring = mystrdup(SPELL_KEYSTRING); + return mystrdup(keystring); +} + // return the preferred try string for suggestions char * AffixMgr::get_try_string() { @@ -3105,9 +3365,9 @@ FLAG AffixMgr::get_nosuggest() } // return the forbidden words flag modify flag -FLAG AffixMgr::get_pseudoroot() +FLAG AffixMgr::get_needaffix() { - return pseudoroot; + return needaffix; } // return the onlyincompound flag @@ -3147,12 +3407,6 @@ const char * AffixMgr::get_suffix() return sfxappnd; } -// return the value of derived form (base word with first suffix). -const char * AffixMgr::get_derived() -{ - return derived; -} - // return the value of suffix const char * AffixMgr::get_version() { @@ -3168,8 +3422,12 @@ FLAG AffixMgr::get_lemma_present() // utility method to look up root words in hash table struct hentry * AffixMgr::lookup(const char * word) { - if (! pHMgr) return NULL; - return pHMgr->lookup(word); + int i; + struct hentry * he = NULL; + for (i = 0; i < *maxdic && !he; i++) { + he = (alldic[i])->lookup(word); + } + return he; } // return the value of suffix @@ -3203,33 +3461,47 @@ int AffixMgr::get_sugswithdots(void) } /* parse flag */ -int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) { +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_flag(char * line, unsigned short * out) +#else +int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) +#endif +{ char * s = NULL; - if (*out != FLAG_NULL) { - HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); + if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { + HUNSPELL_WARNING(stderr, "error:multiple definitions of an affix file parameter\n"); return 1; } - if (parse_string(line, &s, name)) return 1; + if (parse_string(line, &s, 0)) return 1; *out = pHMgr->decode_flag(s); free(s); return 0; } /* parse num */ -int AffixMgr::parse_num(char * line, int * out, const char * name) { +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_num(char * line, int * out) +#else +int AffixMgr::parse_num(char * line, int * out, FileMgr * af) +#endif +{ char * s = NULL; if (*out != -1) { - HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); + HUNSPELL_WARNING(stderr, "error: multiple definitions of an affix file parameter\n"); return 1; } - if (parse_string(line, &s, name)) return 1; + if (parse_string(line, &s, 0)) return 1; *out = atoi(s); free(s); return 0; } /* parse in the max syllablecount of compound words and */ +#ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_cpdsyllable(char * line) +#else +int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) +#endif { char * tp = line; char * piece; @@ -3262,7 +3534,6 @@ int AffixMgr::parse_cpdsyllable(char * line) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np < 2) { @@ -3273,12 +3544,12 @@ int AffixMgr::parse_cpdsyllable(char * line) return 0; } -/* parse in the typical fault correcting table */ #ifndef HUNSPELL_CHROME_CLIENT -int AffixMgr::parse_reptable(char * line, FILE * af) +/* parse in the typical fault correcting table */ +int AffixMgr::parse_reptable(char * line, FileMgr * af) { if (numrep != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n"); + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); return 1; } char * tp = line; @@ -3293,8 +3564,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) case 1: { numrep = atoi(piece); if (numrep < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); return 1; } reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); @@ -3306,18 +3576,17 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing replacement table information\n"); + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); return 1; } /* now parse the numrep lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numrep; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3329,8 +3598,8 @@ int AffixMgr::parse_reptable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece,"REP",3) != 0) { - HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); + numrep = 0; return 1; } break; @@ -3341,27 +3610,210 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); + numrep = 0; + return 1; + } + } + return 0; +} +#endif + +/* parse in the typical fault correcting table */ +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_convtable(char * line, hunspell::LineIterator* iterator, RepList ** rl, const char * keyword) +#else +int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword) +#endif +{ + if (*rl) { + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + int numrl = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numrl = atoi(piece); + if (numrl < 1) { + HUNSPELL_WARNING(stderr, "error: incorrect entry number\n"); + return 1; + } + *rl = new RepList(numrl); + if (!rl) return 1; + np++; + break; + } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: missing data\n"); + return 1; + } + + /* now parse the num lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numrl; j++) { +#ifdef HUNSPELL_CHROME_CLIENT + if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) + return 1; +#else + if (!(nl = af->getline())) return 1; +#endif + mychomp(nl); + tp = nl; + i = 0; + char * pattern = NULL; + char * pattern2 = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece, keyword, sizeof(keyword)) != 0) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + delete *rl; + *rl = NULL; + return 1; + } + break; + } + case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; } + case 2: { + pattern2 = mystrrep(mystrdup(piece),"_"," "); + break; + } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!pattern || !pattern2) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } + (*rl)->add(pattern, pattern2); } return 0; } + + +/* parse in the typical fault correcting table */ +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_phonetable(char * line, hunspell::LineIterator* iterator) +#else +int AffixMgr::parse_phonetable(char * line, FileMgr * af) #endif +{ + if (phone) { + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + phone = (phonetable *) malloc(sizeof(struct phonetable)); + phone->num = atoi(piece); + phone->rules = NULL; + phone->utf8 = (char) utf8; + if (!phone) return 1; + if (phone->num < 1) { + HUNSPELL_WARNING(stderr, "error: line bad entry number\n"); + return 1; + } + phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *)); + if (!phone->rules) return 1; + np++; + break; + } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: missing data\n"); + return 1; + } + + /* now parse the phone->num lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < phone->num; j++) { +#ifdef HUNSPELL_CHROME_CLIENT + if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) + return 1; +#else + if (!(nl = af->getline())) return 1; +#endif + mychomp(nl); + tp = nl; + i = 0; + phone->rules[j * 2] = NULL; + phone->rules[j * 2 + 1] = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"PHONE",5) != 0) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + phone->num = 0; + return 1; + } + break; + } + case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; } + case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + phone->num = 0; + return 1; + } + } + phone->rules[phone->num * 2] = mystrdup(""); + phone->rules[phone->num * 2 + 1] = mystrdup(""); + init_phonet_hash(*phone); + return 0; +} /* parse in the checkcompoundpattern table */ #if HUNSPELL_CHROME_CLIENT int AffixMgr::parse_checkcpdtable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_checkcpdtable(char * line, FILE * af) +int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) #endif { if (numcheckcpd != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3376,11 +3828,10 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) case 1: { numcheckcpd = atoi(piece); if (numcheckcpd < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } - checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry)); + checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry)); if (!checkcpdtable) return 1; np++; break; @@ -3389,14 +3840,13 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing compound pattern table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; - } - + } + /* now parse the numcheckcpd lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numcheckcpd; j++) { @@ -3404,36 +3854,55 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; i = 0; checkcpdtable[j].pattern = NULL; checkcpdtable[j].pattern2 = NULL; + checkcpdtable[j].pattern3 = NULL; + checkcpdtable[j].cond = FLAG_NULL; + checkcpdtable[j].cond2 = FLAG_NULL; piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { - HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numcheckcpd = 0; return 1; } break; } - case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; } - case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; } + case 1: { + checkcpdtable[j].pattern = mystrdup(piece); + char * p = strchr(checkcpdtable[j].pattern, '/'); + if (p) { + *p = '\0'; + checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); + } + break; } + case 2: { + checkcpdtable[j].pattern2 = mystrdup(piece); + char * p = strchr(checkcpdtable[j].pattern2, '/'); + if (p) { + *p = '\0'; + checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); + } + break; + } + case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; } default: break; } i++; } - free(piece); piece = mystrsep(&tp, 0); } if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numcheckcpd = 0; return 1; } } @@ -3444,11 +3913,11 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_defcpdtable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_defcpdtable(char * line, FILE * af) +int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) #endif { if (numdefcpd != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3463,8 +3932,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) case 1: { numdefcpd = atoi(piece); if (numdefcpd < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); @@ -3476,11 +3944,10 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing compound rule table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; } @@ -3491,7 +3958,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3503,26 +3970,46 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { - HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numdefcpd = 0; return 1; } break; } - case 1: { - defcpdtable[j].len = - pHMgr->decode_flags(&(defcpdtable[j].def), piece); + case 1: { // handle parenthesized flags + if (strchr(piece, '(')) { + defcpdtable[j].def = (FLAG *) malloc(sizeof(piece) * sizeof(FLAG)); + defcpdtable[j].len = 0; + int end = 0; + FLAG * conv; + while (!end) { + char * par = piece + 1; + while (*par != '(' && *par != ')' && *par != '\0') par++; + if (*par == '\0') end = 1; else *par = '\0'; + if (*piece == '(') piece++; + if (*piece == '*' || *piece == '?') { + defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece; + } else if (*piece != '\0') { + int l = pHMgr->decode_flags(&conv, piece); + for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; + free(conv); + } + piece = par + 1; + } + } else { + defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece); + } break; } default: break; } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (!defcpdtable[j].len) { - HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: line table is corrupt\n"); + numdefcpd = 0; return 1; } } @@ -3534,11 +4021,11 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_maptable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_maptable(char * line, FILE * af) +int AffixMgr::parse_maptable(char * line, FileMgr * af) #endif { if (nummap != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3553,8 +4040,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) case 1: { nummap = atoi(piece); if (nummap < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); @@ -3566,11 +4052,10 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing map table information\n"); + HUNSPELL_WARNING(stderr, "error: line missing data\n"); return 1; } @@ -3581,7 +4066,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3594,8 +4079,8 @@ int AffixMgr::parse_maptable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece,"MAP",3) != 0) { - HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + nummap = 0; return 1; } break; @@ -3623,11 +4108,11 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) { - HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + nummap = 0; return 1; } } @@ -3638,11 +4123,11 @@ int AffixMgr::parse_maptable(char * line, FILE * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_breaktable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_breaktable(char * line, FILE * af) +int AffixMgr::parse_breaktable(char * line, FileMgr * af) #endif { if (numbreak != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3657,8 +4142,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) case 1: { numbreak = atoi(piece); if (numbreak < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } breaktable = (char **) malloc(numbreak * sizeof(char *)); @@ -3670,11 +4154,10 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing word breakpoint table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; } @@ -3685,7 +4168,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3696,8 +4179,8 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece,"BREAK",5) != 0) { - HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numbreak = 0; return 1; } break; @@ -3710,21 +4193,45 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (!breaktable) { - HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numbreak = 0; return 1; } } return 0; } +void AffixMgr::reverse_condition(char * piece) { + int neg = 0; + for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { + switch(*k) { + case '[': { + if (neg) *(k+1) = '['; else *k = ']'; + break; + } + case ']': { + *k = '['; + if (neg) *(k+1) = '^'; + neg = 0; + break; + } + case '^': { + if (*(k+1) == ']') neg = 1; else *(k+1) = *k; + break; + } + default: { + if (neg) *(k+1) = *k; + } + } + } +} #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_affix(char * line, const char at, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags) +int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags) #endif { int numents = 0; // number of affentry structures to parse @@ -3748,6 +4255,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // split affix header line into pieces int np = 0; + piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { @@ -3762,10 +4270,11 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag #ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates. if (((at == 'S') && (dupflags[aflag] & dupSFX)) || ((at == 'P') && (dupflags[aflag] & dupPFX))) { - HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl); + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n", + af->getlinenum()); // return 1; XXX permissive mode for bad dictionaries } - dupflags[aflag] += ((at == 'S') ? dupSFX : dupPFX); + dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); #endif break; } @@ -3778,19 +4287,18 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag numents = atoi(piece); if (numents == 0) { char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s header has incorrect entry count in line %s\n", - err, nl); - free(err); + if (err) { + HUNSPELL_WARNING(stderr, "error: line bad entry number\n"); + free(err); + } return 1; } - ptr = (struct affentry *) calloc(numents, sizeof(struct affentry)); + ptr = (struct affentry *) malloc(numents * sizeof(struct affentry)); if (!ptr) return 1; ptr->opts = ff; if (utf8) ptr->opts += aeUTF8; if (pHMgr->is_aliasf()) ptr->opts += aeALIASF; -#ifdef HUNSPELL_EXPERIMENTAL if (pHMgr->is_aliasm()) ptr->opts += aeALIASM; -#endif ptr->aflag = aflag; } @@ -3798,14 +4306,15 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag } i++; } - free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces if (np != 4) { - char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl); - free(err); + char * err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, "error: missing data\n"); + free(err); + } free(ptr); return 1; } @@ -3819,7 +4328,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3834,7 +4343,8 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // piece 1 - is type case 0: { np++; - if (nptr != ptr) nptr->opts = ptr->opts; + if (nptr != ptr) nptr->opts = ptr->opts & + (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); break; } @@ -3843,10 +4353,10 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag np++; if (pHMgr->decode_flag(piece) != aflag) { char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); - HUNSPELL_WARNING(stderr, "error: possible incorrect count\n"); - free(err); - free(piece); + if (err) { + HUNSPELL_WARNING(stderr, "error: affix %s is corrupt\n", err); + free(err); + } return 1; } @@ -3873,9 +4383,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // piece 4 - is affix string or 0 for null case 3: { char * dash; -#ifdef HUNSPELL_EXPERIMENTAL nptr->morphcode = NULL; -#endif nptr->contclass = NULL; nptr->contclasslen = 0; np++; @@ -3890,15 +4398,16 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag remove_ignored_chars(piece,ignorechars); } } - + if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } nptr->appnd = mystrdup(piece); - + if (pHMgr->is_aliasf()) { int index = atoi(dash + 1); nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass)); + if (!nptr->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1); } else { nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1); flag_qsort(nptr->contclass, 0, nptr->contclasslen); @@ -3921,9 +4430,9 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } - nptr->appnd = mystrdup(piece); + nptr->appnd = mystrdup(piece); } - + nptr->appndl = (unsigned char) strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); @@ -3937,82 +4446,66 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag case 4: { np++; if (complexprefixes) { - int neg = 0; if (utf8) reverseword_utf(piece); else reverseword(piece); - // reverse condition - for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { - switch(*k) { - case '[': { - if (neg) *(k+1) = '['; else *k = ']'; - break; - } - case ']': { - *k = '['; - if (neg) *(k+1) = '^'; - neg = 0; - break; - } - case '^': { - if (*(k+1) == ']') neg = 1; else *(k+1) = *k; - break; - } - default: { - if (neg) *(k+1) = *k; - } - } - } + reverse_condition(piece); } if (nptr->stripl && (strcmp(piece, ".") != 0) && - redundant_condition(at, nptr->strip, nptr->stripl, piece, nl)) + redundant_condition(at, nptr->strip, nptr->stripl, piece, 0)) strcpy(piece, "."); - if (encodeit(nptr,piece)) return 1; + if (at == 'S') { + reverseword(piece); + reverse_condition(piece); + } + if (encodeit(nptr, piece)) return 1; break; } - -#ifdef HUNSPELL_EXPERIMENTAL + case 5: { np++; if (pHMgr->is_aliasm()) { int index = atoi(piece); nptr->morphcode = pHMgr->get_aliasm(index); } else { - if (complexprefixes) { + if (complexprefixes) { // XXX - fix me for morph. gen. if (utf8) reverseword_utf(piece); else reverseword(piece); } + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } nptr->morphcode = mystrdup(piece); + if (!nptr->morphcode) return 1; } break; } -#endif - default: break; } i++; } - free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces if (np < 4) { char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); - free(err); + if (err) { + HUNSPELL_WARNING(stderr, "error: affix %s is corrupt\n", err); + free(err); + } free(ptr); return 1; } #ifdef DEBUG -#ifdef HUNSPELL_EXPERIMENTAL // detect unnecessary fields, excepting comments if (basefieldnum) { int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); if (fieldnum != basefieldnum) - HUNSPELL_WARNING(stderr, "warning: bad field number:\n%s\n", nl); + HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum()); } else { basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); } #endif -#endif nptr++; } @@ -4028,12 +4521,12 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag build_sfxtree((AffEntry *)sfxptr); } nptr++; - } + } free(ptr); return 0; } -int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) { +int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) { int condl = strlen(cond); int i; int j; @@ -4046,7 +4539,8 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { if (cond[j] != '[') { if (cond[j] != strip[i]) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } else { neg = (cond[j+1] == '^') ? 1 : 0; @@ -4056,12 +4550,12 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char if (strip[i] == cond[j]) in = 1; } while ((j < (condl - 1)) && (cond[j] != ']')); if (j == (condl - 1) && (cond[j] != ']')) { - HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum); return 0; } if ((!neg && !in) || (neg && in)) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); - return 0; + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } } @@ -4074,7 +4568,8 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { if (cond[j] != ']') { if (cond[j] != strip[i]) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } else { in = 0; @@ -4083,18 +4578,18 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char if (strip[i] == cond[j]) in = 1; } while ((j > 0) && (cond[j] != '[')); if ((j == 0) && (cond[j] != '[')) { - HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "error: error: %d: missing ] in condition:\n%s\n", linenum); return 0; } neg = (cond[j+1] == '^') ? 1 : 0; if ((!neg && !in) || (neg && in)) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); - return 0; + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } } if (j < 0) return 1; - } + } } return 0; } diff --git a/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx b/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx index e960068..fa4e217 100644 --- a/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx @@ -13,6 +13,8 @@ using namespace std; #include "atypes.hxx" #include "baseaffix.hxx" #include "hashmgr.hxx" +#include "phonet.hxx" +#include "replist.hxx" // check flag duplication #define dupSFX (1 << 0) @@ -66,12 +68,15 @@ class AffixMgr AffEntry * sFlag[CONTSIZE]; #endif HashMgr * pHMgr; + HashMgr ** alldic; + int * maxdic; + char * keystring; char * trystring; char * encoding; struct cs_info * csconv; int utf8; int complexprefixes; - FLAG compoundflag; + FLAG compoundflag; FLAG compoundbegin; FLAG compoundmiddle; FLAG compoundend; @@ -82,20 +87,25 @@ class AffixMgr int checkcompoundrep; int checkcompoundcase; int checkcompoundtriple; + int simplifiedtriple; FLAG forbiddenword; FLAG nosuggest; - FLAG pseudoroot; + FLAG needaffix; int cpdmin; int numrep; replentry * reptable; + RepList * iconvtable; + RepList * oconvtable; int nummap; mapentry * maptable; int numbreak; char ** breaktable; int numcheckcpd; - replentry * checkcpdtable; + patentry * checkcpdtable; + int simplifiedcpd; int numdefcpd; flagentry * defcpdtable; + phonetable * phone; int maxngramsugs; int nosplitsugs; int sugswithdots; @@ -125,7 +135,9 @@ class AffixMgr FLAG circumfix; FLAG onlyincompound; FLAG keepcase; + FLAG substandard; int checksharps; + int fullstrip; int havecontclass; // boolean variable #ifdef HUNSPELL_CHROME_CLIENT @@ -133,68 +145,81 @@ class AffixMgr #else char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix) #endif - flag flag_mode; - + public: + #ifdef HUNSPELL_CHROME_CLIENT - AffixMgr(hunspell::BDictReader* reader, HashMgr* ptr); + AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md); #else - AffixMgr(FILE* aff_handle, HashMgr * ptr); + AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) #endif ~AffixMgr(); struct hentry * affix_check(const char * word, int len, - const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); + const unsigned short needflag = (unsigned short) 0, + char in_compound = IN_CPD_NOT); struct hentry * prefix_check(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isSubset(const char * s1, const char * s2); struct hentry * prefix_check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isRevSubset(const char * s1, const char * end_of_s2, int len); - struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx, - char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + struct hentry * suffix_check(const char * word, int len, int sfxopts, + AffEntry* ppfx, char ** wlst, int maxSug, int * ns, + const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); struct hentry * suffix_check_twosfx(const char * word, int len, int sfxopts, AffEntry* ppfx, const FLAG needflag = FLAG_NULL); char * affix_check_morph(const char * word, int len, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_morph(const char * word, int len, - char in_compound, const FLAG needflag = FLAG_NULL); - char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx, - const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + char in_compound, const FLAG needflag = FLAG_NULL); + char * suffix_check_morph (const char * word, int len, int sfxopts, + AffEntry * ppfx, const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); char * suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL); - int expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, int); + char * morphgen(char * ts, int wl, const unsigned short * ap, + unsigned short al, char * morph, char * targetmorph, int level); - short get_syllable (const char * word, int wlen); - int cpdrep_check(const char * word, int len); - int cpdpat_check(const char * word, int len); - int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all); - int cpdcase_check(const char * word, int len); - inline int candidate_check(const char * word, int len); - struct hentry * compound_check(const char * word, int len, - short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug); + int expand_rootword(struct guessword * wlst, int maxn, const char * ts, + int wl, const unsigned short * ap, unsigned short al, char * bad, + int, char *); - int compound_check_morph(const char * word, int len, - short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, char ** result, char * partresult); + short get_syllable (const char * word, int wlen); + int cpdrep_check(const char * word, int len); + int cpdpat_check(const char * word, int len, hentry * r1, hentry * r2); + int defcpd_check(hentry *** words, short wnum, hentry * rv, + hentry ** rwords, char all); + int cpdcase_check(const char * word, int len); + inline int candidate_check(const char * word, int len); + void setcminmax(int * cmin, int * cmax, const char * word, int len); + struct hentry * compound_check(const char * word, int len, short wordnum, + short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char is_sug); - struct hentry * lookup(const char * word); + int compound_check_morph(const char * word, int len, short wordnum, + short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char ** result, char * partresult); + + struct hentry * lookup(const char * word); int get_numrep(); struct replentry * get_reptable(); + RepList * get_iconvtable(); + RepList * get_oconvtable(); + struct phonetable * get_phonetable(); int get_nummap(); struct mapentry * get_maptable(); int get_numbreak(); char ** get_breaktable(); char * get_encoding(); int get_langnum(); + char * get_key_string(); char * get_try_string(); const char * get_wordchars(); unsigned short * get_wordchars_utf16(int * len); @@ -205,8 +230,7 @@ public: FLAG get_compoundbegin(); FLAG get_forbiddenword(); FLAG get_nosuggest(); -// FLAG get_circumfix(); - FLAG get_pseudoroot(); + FLAG get_needaffix(); FLAG get_onlyincompound(); FLAG get_compoundroot(); FLAG get_lemma_present(); @@ -225,6 +249,8 @@ public: int get_sugswithdots(void); FLAG get_keepcase(void); int get_checksharps(void); + char * encode_flag(unsigned short aflag); + int get_fullstrip(); private: #ifdef HUNSPELL_CHROME_CLIENT @@ -232,31 +258,37 @@ private: hunspell::BDictReader* bdict_reader; int parse_file(); -#else - int parse_file(FILE* aff_handle); -#endif -// int parse_string(char * line, char ** out, const char * name); - int parse_flag(char * line, unsigned short * out, const char * name); - int parse_num(char * line, int * out, const char * name); -// int parse_array(char * line, char ** out, unsigned short ** out_utf16, -// int * out_utf16_len, const char * name); - int parse_cpdsyllable(char * linfe); -#ifdef HUNSPELL_CHROME_CLIENT - // We just change the FILE* to be an iterator. + int parse_flag(char * line, unsigned short * out); + int parse_num(char * line, int * out); + int parse_cpdsyllable(char * line); + + int parse_reptable(char * line, hunspell::LineIterator* iterator); + int parse_convtable(char * line, hunspell::LineIterator* iterator, RepList ** rl, const char * keyword); + int parse_phonetable(char * line, hunspell::LineIterator* iterator); int parse_maptable(char * line, hunspell::LineIterator* iterator); - int parse_checkcpdtable(char * line, hunspell::LineIterator* iterator); int parse_breaktable(char * line, hunspell::LineIterator* iterator); + int parse_checkcpdtable(char * line, hunspell::LineIterator* iterator); int parse_defcpdtable(char * line, hunspell::LineIterator* iterator); int parse_affix(char * line, const char at, hunspell::LineIterator* iterator); #else - int parse_reptable(char * line, FILE * af); - int parse_maptable(char * line, FILE * af); - int parse_breaktable(char * line, FILE * af); - int parse_checkcpdtable(char * line, FILE * af); - int parse_defcpdtable(char * line, FILE * af); - int parse_affix(char * line, const char at, FILE * af, char * dupflags); + int parse_file(FILE* aff_handle, const char * key); + int parse_flag(char * line, unsigned short * out, FileMgr * af); + int parse_num(char * line, int * out, FileMgr * af); + int parse_cpdsyllable(char * line, FileMgr * af); + + int parse_reptable(char * line, FileMgr * af); + int parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword); + int parse_phonetable(char * line, FileMgr * af); + int parse_maptable(char * line, FileMgr * af); + int parse_breaktable(char * line, FileMgr * af); + int parse_checkcpdtable(char * line, FileMgr * af); + int parse_defcpdtable(char * line, FileMgr * af); + int parse_affix(char * line, const char at, FileMgr * af, char * dupflags); #endif + void reverse_condition(char *); + void debugflag(char * result, unsigned short flag); + int condlen(char *); int encodeit(struct affentry * ptr, char * cs); int build_pfxtree(AffEntry* pfxptr); int build_sfxtree(AffEntry* sfxptr); @@ -266,7 +298,8 @@ private: AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr); int process_pfx_tree_to_list(); int process_sfx_tree_to_list(); - int redundant_condition(char, char * strip, int stripl, const char * cond, char *); + int redundant_condition(char, char * strip, int stripl, + const char * cond, int); }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/atypes.hxx b/chrome/third_party/hunspell/src/hunspell/atypes.hxx index 4f6c1ea..4753f9c 100644 --- a/chrome/third_party/hunspell/src/hunspell/atypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/atypes.hxx @@ -5,27 +5,28 @@ #ifdef HUNSPELL_WARNING_ON #define HUNSPELL_WARNING fprintf #else -#define HUNSPELL_WARNING +// empty inline function to switch off warnings (instead of the C99 standard variadic macros) +static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} #endif #endif // HUNSTEM def. #define HUNSTEM -#include "csutil.hxx" #include "hashmgr.hxx" +#include "w_char.hxx" #define SETSIZE 256 #define CONTSIZE 65536 #define MAXWORDLEN 100 -#define MAXWORDUTF8LEN (MAXWORDLEN * 4) +#define MAXWORDUTF8LEN 256 // affentry options #define aeXPRODUCT (1 << 0) #define aeUTF8 (1 << 1) #define aeALIASF (1 << 2) #define aeALIASM (1 << 3) -#define aeINFIX (1 << 4) +#define aeLONGCOND (1 << 4) // compound options #define IN_CPD_NOT 0 @@ -33,10 +34,12 @@ #define IN_CPD_END 2 #define IN_CPD_OTHER 3 -#define MAXLNLEN 8192 * 4 +#define MAXLNLEN 8192 #define MINCPDLEN 3 #define MAXCOMPOUND 10 +#define MAXCONDLEN 20 +#define MAXCONDLEN_1 (MAXCONDLEN - sizeof(char *)) #define MAXACC 1000 @@ -55,26 +58,22 @@ struct affentry char numconds; char opts; unsigned short aflag; - union { - char base[SETSIZE]; - struct { - char ascii[SETSIZE/2]; - char neg[8]; - char all[8]; - w_char * wchars[8]; - int wlen[8]; - } utf8; - } conds; -#ifdef HUNSPELL_EXPERIMENTAL - char * morphcode; -#endif unsigned short * contclass; short contclasslen; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char * conds2; + } l; + } c; + char * morphcode; }; -struct replentry { - char * pattern; - char * pattern2; +struct guessword { + char * word; + bool allow; + char * orig; }; struct mapentry { @@ -88,14 +87,12 @@ struct flagentry { int len; }; -struct guessword { - char * word; - bool allow; +struct patentry { + char * pattern; + char * pattern2; + char * pattern3; + FLAG cond; + FLAG cond2; }; #endif - - - - - diff --git a/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx b/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx index d6a5cd6..03a876d 100644 --- a/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx +++ b/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx @@ -6,26 +6,23 @@ class AffEntry public: protected: - char * appnd; - char * strip; - unsigned char appndl; - unsigned char stripl; - char numconds; - char opts; - unsigned short aflag; - union { - char base[SETSIZE]; - struct { - char ascii[SETSIZE/2]; - char neg[8]; - char all[8]; - w_char * wchars[8]; - int wlen[8]; - } utf8; - } conds; - char * morphcode; - unsigned short * contclass; - short contclasslen; + char * appnd; + char * strip; + unsigned char appndl; + unsigned char stripl; + char numconds; + char opts; + unsigned short aflag; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char * conds2; + } l; + } c; + char * morphcode; + unsigned short * contclass; + short contclasslen; }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/csutil.cxx b/chrome/third_party/hunspell/src/hunspell/csutil.cxx index 4424b98..c07e34d 100644 --- a/chrome/third_party/hunspell/src/hunspell/csutil.cxx +++ b/chrome/third_party/hunspell/src/hunspell/csutil.cxx @@ -5,10 +5,12 @@ #include <cstdlib> #include <cstring> #include <cstdio> +#include <cctype> #else #include <stdlib.h> #include <string.h> #include <stdio.h> +#include <ctype.h> #endif #include "csutil.hxx" @@ -43,17 +45,18 @@ static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID); using namespace std; #endif #else -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif -struct unicode_info2 * utf_tbl = NULL; +static struct unicode_info2 * utf_tbl = NULL; +static int utf_tbl_count = 0; // utf_tbl can be used by multiple Hunspell instances /* only UTF-16 (BMP) implementation */ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { - char * u8 = dest; - char * u8_max = u8 + size; + signed char * u8 = (signed char *)dest; + signed char * u8_max = (signed char *)(u8 + size); const w_char * u2 = src; const w_char * u2_max = src + srclen; while ((u2 < u2_max) && (u8 < u8_max)) { @@ -100,12 +103,12 @@ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { /* only UTF-16 (BMP) implementation */ int u8_u16(w_char * dest, int size, const char * src) { - const char * u8 = src; + const signed char * u8 = (const signed char *)src; w_char * u2 = dest; w_char * u2_max = u2 + size; while ((u2 < u2_max) && *u8) { - switch ((*u8) & 0xf0) { + switch ((*u8) & 0xf0) { case 0x00: case 0x10: case 0x20: @@ -122,7 +125,7 @@ int u8_u16(w_char * dest, int size, const char * src) { case 0x90: case 0xa0: case 0xb0: { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; break; @@ -134,7 +137,7 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -148,12 +151,12 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -218,13 +221,11 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { char * mystrsep(char ** stringp, const char delim) { - char * rv = NULL; char * mp = *stringp; - int n = strlen(mp); - if (n > 0) { + if (*mp != '\0') { char * dp; if (delim) { - dp = (char *)memchr(mp,(int)((unsigned char)delim),n); + dp = strchr(mp, delim); } else { // don't use isspace() here, the string can be in some random charset // that's way different than the locale's @@ -234,22 +235,16 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (dp) { *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); - rv = (char *) malloc(nc+1); - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; + *(mp+nc) = '\0'; + return mp; } else { - rv = (char *) malloc(n+1); - memcpy(rv, mp, n); - *(rv+n) = '\0'; - *stringp = mp + n; - return rv; + *stringp = mp + strlen(mp); + return mp; } } return NULL; } - // replaces strdup with ansi version char * mystrdup(const char * s) { @@ -257,12 +252,27 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (s) { int sl = strlen(s); d = (char *) malloc(((sl+1) * sizeof(char))); - if (d) memcpy(d,s,((sl+1)*sizeof(char))); + if (d) { + memcpy(d,s,((sl+1)*sizeof(char))); + return d; + } + HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); } return d; } - - + + // strcat for limited length destination string + char * mystrcat(char * dest, const char * st, int max) { + int len; + int len2; + if (dest == NULL || st == NULL) return dest; + len = strlen(dest); + len2 = strlen(st); + if (len + len2 + 1 > max) return dest; + strcpy(dest + len, st); + return dest; + } + // remove cross-platform text line end characters void mychomp(char * s) { @@ -289,112 +299,258 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { return d; } -#ifdef HUNSPELL_EXPERIMENTAL - // append s to ends of every lines in text - void strlinecat(char * dest, const char * s) - { - char * dup = mystrdup(dest); - char * source = dup; - int len = strlen(s); - while (*source) { - if (*source == '\n') { - strncpy(dest, s, len); - dest += len; - } - *dest = *source; - source++; dest++; - } - strcpy(dest, s); - free(dup); - } - // break text to lines // return number of lines -int line_tok(const char * text, char *** lines) { +int line_tok(const char * text, char *** lines, char breakchar) { int linenum = 0; char * dup = mystrdup(text); - char * p = strchr(dup, '\n'); + char * p = strchr(dup, breakchar); while (p) { linenum++; *p = '\0'; p++; - p = strchr(p, '\n'); + p = strchr(p, breakchar); + } + linenum++; +// fprintf(stderr, "LINEN:%d %p %p\n", linenum, lines, *lines); + *lines = (char **) malloc(linenum * sizeof(char *)); +// fprintf(stderr, "hello\n"); + if (!(*lines)) { + free(dup); + return 0; } - *lines = (char **) calloc(linenum + 1, sizeof(char *)); - if (!(*lines)) return -1; - p = dup; - for (int i = 0; i < linenum + 1; i++) { - (*lines)[i] = mystrdup(p); + p = dup; + int l = 0; + for (int i = 0; i < linenum; i++) { + if (*p != '\0') { + (*lines)[l] = mystrdup(p); + if (!(*lines)[l]) { + for (i = 0; i < l; i++) free((*lines)[i]); + free(dup); + return 0; + } + l++; + } p += strlen(p) + 1; } free(dup); - return linenum; + if (!l) free(*lines); + return l; } // uniq line in place -char * line_uniq(char * text) { +char * line_uniq(char * text, char breakchar) { char ** lines; - int linenum = line_tok(text, &lines); + int linenum = line_tok(text, &lines, breakchar); int i; strcpy(text, lines[0]); - for ( i = 1; i<=linenum; i++ ) { + for ( i = 1; i < linenum; i++ ) { int dup = 0; for (int j = 0; j < i; j++) { if (strcmp(lines[i], lines[j]) == 0) dup = 1; } if (!dup) { - if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n"); + if ((i > 1) || (*(lines[0]) != '\0')) { + sprintf(text + strlen(text), "%c", breakchar); + } strcat(text, lines[i]); } } - for ( i = 0; i<=linenum; i++ ) { + for ( i = 0; i < linenum; i++ ) { if (lines[i]) free(lines[i]); } if (lines) free(lines); return text; } +// uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) " +char * line_uniq_app(char ** text, char breakchar) { + if (!strchr(*text, breakchar)) { + return *text; + } + + char ** lines; + int i; + int linenum = line_tok(*text, &lines, breakchar); + int dup = 0; + for (i = 0; i < linenum; i++) { + for (int j = 0; j < (i - 1); j++) { + if (strcmp(lines[i], lines[j]) == 0) { + *(lines[i]) = '\0'; + dup++; + break; + } + } + } + if ((linenum - dup) == 1) { + strcpy(*text, lines[0]); + freelist(&lines, linenum); + return *text; + } + char * newtext = (char *) malloc(strlen(*text) + 2 * linenum + 3 + 1); + if (newtext) { + free(*text); + *text = newtext; + } else { + freelist(&lines, linenum); + return *text; + } + strcpy(*text," ( "); + for (i = 0; i < linenum; i++) if (*(lines[i])) { + sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); + } + (*text)[strlen(*text) - 2] = ')'; // " ) " + freelist(&lines, linenum); + return *text; +} + + // append s to ends of every lines in text + void strlinecat(char * dest, const char * s) + { + char * dup = mystrdup(dest); + char * source = dup; + int len = strlen(s); + if (dup) { + while (*source) { + if (*source == '\n') { + strncpy(dest, s, len); + dest += len; + } + *dest = *source; + source++; dest++; + } + strcpy(dest, s); + free(dup); + } + } + // change \n to char c -char * line_join(char * text, char c) { +char * tr(char * text, char oldc, char newc) { char * p; - for (p = text; *p; p++) if (*p == '\n') *p = c; + for (p = text; *p; p++) if (*p == oldc) *p = newc; return text; } -// leave only last {[^}]*} substring for handling zero morphemes -char * delete_zeros(char * morphout) { - char * p = morphout; - char * q = p; - char * q2 = NULL; - int suffix = 0; - - for (;*p && *(p+1);) { - switch (*p) { - case '{': - q2 = q; - q--; - break; - case '}': - if (q2) { - suffix = 1; - q--; - } - break; - default: - if (suffix) { - q = q2; - } - suffix = 0; - *q = *p; +// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields +// in the first line of the inputs +// return 0, if inputs equal +// return 1, if inputs may equal with a secondary suffix +// otherwise return -1 +int morphcmp(const char * s, const char * t) +{ + int se = 0; + int te = 0; + const char * sl; + const char * tl; + const char * olds; + const char * oldt; + if (!s || !t) return 1; + olds = s; + sl = strchr(s, '\n'); + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s= strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + tl = strchr(t, '\n'); + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + while (s && t && (!sl || sl > s) && (!tl || tl > t)) { + s += MORPH_TAG_LEN; + t += MORPH_TAG_LEN; + se = 0; + te = 0; + while ((*s == *t) && !se && !te) { + s++; + t++; + switch(*s) { + case ' ': + case '\n': + case '\t': + case '\0': se = 1; + } + switch(*t) { + case ' ': + case '\n': + case '\t': + case '\0': te = 1; + } + } + if (!se || !te) { + // not terminal suffix difference + if (olds) return -1; + return 1; + } + olds = s; + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; } - p++; - q++; } - *q = '\0'; - return morphout; + if (!s && !t && se && te) return 0; + return 1; +} + +int get_sfxcount(const char * morph) +{ + if (!morph || !*morph) return 0; + int n = 0; + const char * old = morph; + morph = strstr(morph, MORPH_DERI_SFX); + if (!morph) morph = strstr(old, MORPH_INFL_SFX); + if (!morph) morph = strstr(old, MORPH_TERM_SFX); + while (morph) { + n++; + old = morph; + morph = strstr(morph + 1, MORPH_DERI_SFX); + if (!morph) morph = strstr(old + 1, MORPH_INFL_SFX); + if (!morph) morph = strstr(old + 1, MORPH_TERM_SFX); + } + return n; +} + + +int fieldlen(const char * r) +{ + int n = 0; + while (r && *r != '\t' && *r != '\0' && *r != '\n' && *r != ' ') { + r++; + n++; + } + return n; +} + +char * copy_field(char * dest, const char * morph, const char * var) +{ + if (!morph) return NULL; + const char * beg = strstr(morph, var); + if (beg) { + char * d = dest; + for (beg += MORPH_TAG_LEN; *beg != ' ' && *beg != '\t' && + *beg != '\n' && *beg != '\0'; d++, beg++) { + *d = *beg; + } + *d = '\0'; + return dest; + } + return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * mystrrep(char * word, const char * pat, const char * rep) { char * pos = strstr(word, pat); @@ -445,6 +601,34 @@ char * mystrrep(char * word, const char * pat, const char * rep) { u16_u8(word, MAXWORDUTF8LEN, w, l); return 0; } + + int uniqlist(char ** list, int n) { + int i; + if (n < 2) return n; + for (i = 0; i < n; i++) { + for (int j = 0; j < i; j++) { + if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { + free(list[i]); + list[i] = NULL; + break; + } + } + } + int m = 1; + for (i = 1; i < n; i++) if (list[i]) { + list[m] = list[i]; + m++; + } + return m; + } + + void freelist(char *** list, int n) { + if (list && *list && n > 0) { + for (int i = 0; i < n; i++) if ((*list)[i]) free((*list)[i]); + free(*list); + *list = NULL; + } + } // convert null terminated string to all caps void mkallcap(char * p, const struct cs_info * csconv) @@ -478,8 +662,8 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { for (int i = 0; i < nc; i++) { unsigned short idx = (u[i].h << 8) + u[i].l; if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); + u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); + u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); } } } @@ -490,6 +674,20 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; } + // conversion function for protected memory + void store_pointer(char * dest, char * source) + { + memcpy(dest, &source, sizeof(char *)); + } + + // conversion function for protected memory + char * get_stored_pointer(char * s) + { + char * p; + memcpy(&p, s, sizeof(char *)); + return p; + } + #ifndef MOZILLA_CLIENT // convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding) @@ -782,7 +980,7 @@ struct cs_info iso1_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; @@ -1042,7 +1240,7 @@ struct cs_info iso2_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; @@ -1302,7 +1500,7 @@ struct cs_info iso3_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso4_tbl[] = { @@ -1561,7 +1759,7 @@ struct cs_info iso4_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso5_tbl[] = { @@ -1820,7 +2018,7 @@ struct cs_info iso5_tbl[] = { { 0x00, 0xfc, 0xac }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xae }, -{ 0x00, 0xff, 0xaf }, +{ 0x00, 0xff, 0xaf } }; struct cs_info iso6_tbl[] = { @@ -2079,7 +2277,7 @@ struct cs_info iso6_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso7_tbl[] = { @@ -2338,7 +2536,7 @@ struct cs_info iso7_tbl[] = { { 0x00, 0xfc, 0xbc }, { 0x00, 0xfd, 0xbe }, { 0x00, 0xfe, 0xbf }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso8_tbl[] = { @@ -2597,7 +2795,7 @@ struct cs_info iso8_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso9_tbl[] = { @@ -2856,7 +3054,7 @@ struct cs_info iso9_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0x49 }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso10_tbl[] = { @@ -3115,7 +3313,7 @@ struct cs_info iso10_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info koi8r_tbl[] = { @@ -3374,7 +3572,7 @@ struct cs_info koi8r_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff }, +{ 0x01, 0xdf, 0xff } }; struct cs_info koi8u_tbl[] = { @@ -3633,7 +3831,7 @@ struct cs_info koi8u_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff }, +{ 0x01, 0xdf, 0xff } }; struct cs_info cp1251_tbl[] = { @@ -3892,7 +4090,7 @@ struct cs_info cp1251_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xdf }, +{ 0x00, 0xff, 0xdf } }; struct cs_info iso13_tbl[] = { @@ -4151,7 +4349,7 @@ struct cs_info iso13_tbl[] = { { 0x00, 0xFC, 0xDC }, { 0x00, 0xFD, 0xDD }, { 0x00, 0xFE, 0xDE }, -{ 0x00, 0xFF, 0xFF }, +{ 0x00, 0xFF, 0xFF } }; @@ -4411,7 +4609,7 @@ struct cs_info iso14_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso15_tbl[] = { @@ -4670,7 +4868,7 @@ struct cs_info iso15_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xbe }, +{ 0x00, 0xff, 0xbe } }; struct cs_info iscii_devanagari_tbl[] = { @@ -4929,10 +5127,10 @@ struct cs_info iscii_devanagari_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; -struct enc_entry encds[] = { +static struct enc_entry encds[] = { {"ISO8859-1",iso1_tbl}, {"ISO8859-2",iso2_tbl}, {"ISO8859-3",iso3_tbl}, @@ -4949,7 +5147,7 @@ struct enc_entry encds[] = { {"ISO8859-13", iso13_tbl}, {"ISO8859-14", iso14_tbl}, {"ISO8859-15", iso15_tbl}, -{"ISCII-DEVANAGARI", iscii_devanagari_tbl}, +{"ISCII-DEVANAGARI", iscii_devanagari_tbl} }; struct cs_info * get_current_cs(const char * es) { @@ -4958,6 +5156,7 @@ struct cs_info * get_current_cs(const char * es) { for (int i = 0; i < n; i++) { if (strcmp(es,encds[i].enc_name) == 0) { ccs = encds[i].cs_table; + break; } } return ccs; @@ -5038,6 +5237,26 @@ struct cs_info * get_current_cs(const char * es) { } #endif +// primitive isalpha() replacement for tokenization +char * get_casechars(const char * enc) { + struct cs_info * csconv = get_current_cs(enc); + char expw[MAXLNLEN]; + char * p = expw; + for (int i = 0; i <= 255; i++) { + if ((csconv[i].cupper != csconv[i].clower)) { + *p = (char) i; + p++; + } + } + *p = '\0'; +#ifdef MOZILLA_CLIENT + delete csconv; +#endif + return mystrdup(expw); +} + + + struct lang_map lang2enc[] = { {"ar", "UTF-8", LANG_ar}, {"az", "UTF-8", LANG_az}, @@ -5090,6 +5309,8 @@ int get_lang_num(const char * lang) { #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT int initialize_utf_tbl() { + utf_tbl_count++; + if (utf_tbl) return 0; utf_tbl = (unicode_info2 *) malloc(CONTSIZE * sizeof(unicode_info2)); if (utf_tbl) { int j; @@ -5110,7 +5331,11 @@ int initialize_utf_tbl() { #endif void free_utf_tbl() { - if (utf_tbl) free(utf_tbl); + if (utf_tbl_count > 0) utf_tbl_count--; + if (utf_tbl && (utf_tbl_count == 0)) { + free(utf_tbl); + utf_tbl = NULL; + } } #ifdef MOZILLA_CLIENT @@ -5133,11 +5358,11 @@ unsigned short unicodetoupper(unsigned short c, int langnum) return u_toupper(c); #else #ifdef MOZILLA_CLIENT - unsigned short ret(c); - getcaseConv()->ToUpper(c, &ret); - return ret; + PRUnichar ch2; + getcaseConv()->ToUpper((PRUnichar) c, &ch2); + return ch2; #else - return utf_tbl[c].cupper; + return (utf_tbl) ? utf_tbl[c].cupper : c; #endif #endif } @@ -5153,11 +5378,11 @@ unsigned short unicodetolower(unsigned short c, int langnum) return u_tolower(c); #else #ifdef MOZILLA_CLIENT - unsigned short ret(c); - getcaseConv()->ToLower(c, &ret); - return ret; + PRUnichar ch2; + getcaseConv()->ToLower((PRUnichar) c, &ch2); + return ch2; #else - return utf_tbl[c].clower; + return (utf_tbl) ? utf_tbl[c].clower : c; #endif #endif } @@ -5167,10 +5392,72 @@ int unicodeisalpha(unsigned short c) #ifdef OPENOFFICEORG return u_isalpha(c); #else - return utf_tbl[c].cletter; + return (utf_tbl) ? utf_tbl[c].cletter : 0; #endif } +/* get type of capitalization */ +int get_captype(char * word, int nl, cs_info * csconv) { + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int firstcap = 0; + if (csconv == NULL) return NOCAP; + for (char * q = word; *q != '\0'; q++) { + if (csconv[*((unsigned char *)q)].ccase) ncap++; + if (csconv[*((unsigned char *)q)].cupper == csconv[*((unsigned char *)q)].clower) nneutral++; + } + if (ncap) { + firstcap = csconv[*((unsigned char *) word)].ccase; + } + + // now finally set the captype + if (ncap == 0) { + return NOCAP; + } else if ((ncap == 1) && firstcap) { + return INITCAP; + } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { + return ALLCAP; + } else if ((ncap > 1) && firstcap) { + return HUHINITCAP; + } + return HUHCAP; +} + +int get_captype_utf8(w_char * word, int nl, int langnum) { + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int firstcap = 0; + unsigned short idx; + // don't check too long words + if (nl >= MAXWORDLEN) return 0; + // big Unicode character (non BMP area) + if (nl == -1) return NOCAP; + for (int i = 0; i < nl; i++) { + idx = (word[i].h << 8) + word[i].l; + if (idx != unicodetolower(idx, langnum)) ncap++; + if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; + } + if (ncap) { + idx = (word[0].h << 8) + word[0].l; + firstcap = (idx != unicodetolower(idx, langnum)); + } + + // now finally set the captype + if (ncap == 0) { + return NOCAP; + } else if ((ncap == 1) && firstcap) { + return INITCAP; + } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { + return ALLCAP; + } else if ((ncap > 1) && firstcap) { + return HUHINITCAP; + } + return HUHCAP; +} + + // strip all ignored characters in the string void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len) { @@ -5200,14 +5487,14 @@ void remove_ignored_chars(char * word, char * ignored_chars) *word = '\0'; } -int parse_string(char * line, char ** out, const char * name) +int parse_string(char * line, char ** out, int ln) { char * tp = line; char * piece; int i = 0; int np = 0; if (*out) { - HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions\n", ln); return 1; } piece = mystrsep(&tp, 0); @@ -5217,6 +5504,7 @@ int parse_string(char * line, char ** out, const char * name) case 0: { np++; break; } case 1: { *out = mystrdup(piece); + if (!*out) return 1; np++; break; } @@ -5224,19 +5512,19 @@ int parse_string(char * line, char ** out, const char * name) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing %s information\n", name); + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", ln); return 1; } return 0; } -int parse_array(char * line, char ** out, - unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8) { - if (parse_string(line, out, name)) return 1; +int parse_array(char * line, char ** out, unsigned short ** out_utf16, + int * out_utf16_len, int utf8, int ln) { + if (parse_string(line, out, ln)) return 1; if (utf8) { w_char w[MAXWORDLEN]; int n = u8_u16(w, MAXWORDLEN, *out); diff --git a/chrome/third_party/hunspell/src/hunspell/csutil.hxx b/chrome/third_party/hunspell/src/hunspell/csutil.hxx index 7fc6732..0e6192b 100644 --- a/chrome/third_party/hunspell/src/hunspell/csutil.hxx +++ b/chrome/third_party/hunspell/src/hunspell/csutil.hxx @@ -3,10 +3,56 @@ // First some base level utility routines -typedef struct { - unsigned char l; - unsigned char h; -} w_char; +#include "w_char.hxx" + +// casing +#define NOCAP 0 +#define INITCAP 1 +#define ALLCAP 2 +#define HUHCAP 3 +#define HUHINITCAP 4 + +// default encoding and keystring +#define SPELL_ENCODING "ISO8859-1" +#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm" + +// default morphological fields +#define MORPH_STEM "st:" +#define MORPH_ALLOMORPH "al:" +#define MORPH_POS "po:" +#define MORPH_DERI_PFX "dp:" +#define MORPH_INFL_PFX "ip:" +#define MORPH_TERM_PFX "tp:" +#define MORPH_DERI_SFX "ds:" +#define MORPH_INFL_SFX "is:" +#define MORPH_TERM_SFX "ts:" +#define MORPH_SURF_PFX "sp:" +#define MORPH_FREQ "fr:" +#define MORPH_PHON "ph:" +#define MORPH_HYPH "hy:" +#define MORPH_PART "pa:" +#define MORPH_FLAG "fl:" +#define MORPH_HENTRY "_H:" +#define MORPH_TAG_LEN strlen(MORPH_STEM) + +#define MSEP_FLD ' ' +#define MSEP_REC '\n' +#define MSEP_ALT '\v' + +// default flags +#define DEFAULTFLAGS 65510 +#define FORBIDDENWORD 65510 +#define ONLYUPCASEFLAG 65511 + +// hash entry macros +#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ + get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : NULL) +// NULL-free version for warning-free OOo build +#define HENTRY_DATA2(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ + get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : "") +#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL) + +#define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h)) // convert UTF-16 characters to UTF-8 char * u16_u8(char * dest, int size, const w_char * src, int srclen); @@ -26,6 +72,9 @@ void mychomp(char * s); // duplicate string char * mystrdup(const char * s); +// strcat for limited length destination string +char * mystrcat(char * dest, const char * st, int max); + // duplicate reverse of string char * myrevstrdup(const char * s); @@ -41,16 +90,14 @@ char * mystrrep(char *, const char *, const char *); void strlinecat(char * lines, const char * s); // tokenize into lines with new line - int line_tok(const char * text, char *** lines); + int line_tok(const char * text, char *** lines, char breakchar); // tokenize into lines with new line and uniq in place - char * line_uniq(char * text); - -// change \n to c in place - char * line_join(char * text, char c); + char * line_uniq(char * text, char breakchar); + char * line_uniq_app(char ** text, char breakchar); -// leave only last {[^}]*} pattern in string - char * delete_zeros(char * morphout); +// change oldchar to newchar in place + char * tr(char * text, char oldc, char newc); // reverse word int reverseword(char *); @@ -58,6 +105,12 @@ void strlinecat(char * lines, const char * s); // reverse word int reverseword_utf(char *); +// remove duplicates + int uniqlist(char ** list, int n); + +// free character array list + void freelist(char *** list, int n); + // character encoding information struct cs_info { unsigned char ccase; @@ -101,8 +154,12 @@ struct cs_info * get_current_cs(const char * es); const char * get_default_enc(const char * lang); +// get language identifiers of language codes int get_lang_num(const char * lang); +// get characters of the given 8bit encoding with lower- and uppercase forms +char * get_casechars(const char * enc); + // convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding); @@ -127,15 +184,34 @@ void mkallsmall_utf(w_char * u, int nc, int langnum); // convert first nc characters of UTF-8 string to capital void mkallcap_utf(w_char * u, int nc, int langnum); +// get type of capitalization +int get_captype(char * q, int nl, cs_info *); + +// get type of capitalization (UTF-8) +int get_captype_utf8(w_char * q, int nl, int langnum); + // strip all ignored characters in the string void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len); // strip all ignored characters in the string void remove_ignored_chars(char * word, char * ignored_chars); -int parse_string(char * line, char ** out, const char * name); +int parse_string(char * line, char ** out, int ln); + +int parse_array(char * line, char ** out, unsigned short ** out_utf16, + int * out_utf16_len, int utf8, int ln); + +int fieldlen(const char * r); +char * copy_field(char * dest, const char * morph, const char * var); + +int morphcmp(const char * s, const char * t); + +int get_sfxcount(const char * morph); + +// conversion function for protected memory +void store_pointer(char * dest, char * source); -int parse_array(char * line, char ** out, - unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8); +// conversion function for protected memory +char * get_stored_pointer(char * s); #endif diff --git a/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx b/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx index 34736a6..5594582 100644 --- a/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx @@ -135,15 +135,19 @@ char * DictMgr::mystrsep(char ** stringp, const char delim) *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); rv = (char *) malloc(nc+1); - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; + if (rv) { + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + return rv; + } } else { rv = (char *) malloc(n+1); - memcpy(rv, mp, n); - *(rv+n) = '\0'; - *stringp = mp + n; - return rv; + if (rv) { + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + return rv; + } } } return NULL; diff --git a/chrome/third_party/hunspell/src/hunspell/filemgr.cxx b/chrome/third_party/hunspell/src/hunspell/filemgr.cxx new file mode 100644 index 0000000..4150ce6 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/filemgr.cxx @@ -0,0 +1,54 @@ +#include "license.hunspell" +#include "license.myspell" + +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "filemgr.hxx" + +int FileMgr::fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return -1; +} + +FileMgr::FileMgr(const char * file, const char * key) { + linenum = 0; + hin = NULL; + fin = fopen(file, "r"); + if (!fin) { + // check hzipped file + char * st = (char *) malloc(strlen(file) + strlen(HZIP_EXTENSION)); + if (st) { + strcpy(st, file); + strcat(st, HZIP_EXTENSION); + hin = new Hunzip(st, key); + } + } + if (!fin && !hin) fail(MSG_OPEN, file); +} + +FileMgr::~FileMgr() +{ + if (fin) fclose(fin); + if (hin) delete hin; +} + +char * FileMgr::getline() { + const char * l; + linenum++; + if (fin) return fgets(in, BUFSIZE - 1, fin); + if (hin && (l = hin->getline())) return strcpy(in, l); + linenum--; + return NULL; +} + +int FileMgr::getlinenum() { + return linenum; +} diff --git a/chrome/third_party/hunspell/src/hunspell/filemgr.hxx b/chrome/third_party/hunspell/src/hunspell/filemgr.hxx new file mode 100644 index 0000000..fb4d52b --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/filemgr.hxx @@ -0,0 +1,21 @@ +/* file manager class - read lines of files [filename] OR [filename.hz] */ +#ifndef _FILEMGR_HXX_ +#define _FILEMGR_HXX_ +#include "hunzip.hxx" + +class FileMgr +{ +protected: + FILE * fin; + Hunzip * hin; + char in[BUFSIZE + 50]; // input buffer + int fail(const char * err, const char * par); + int linenum; + +public: + FileMgr(const char * filename, const char * key = NULL); + ~FileMgr(); + char * getline(); + int getlinenum(); +}; +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx index ec6f4f3..49ea117 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx @@ -22,18 +22,19 @@ using namespace std; #endif #else -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif // build a hash table from a munched word list + #ifdef HUNSPELL_CHROME_CLIENT HashMgr::HashMgr(hunspell::BDictReader* reader) { bdict_reader = reader; #else -HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) +HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle, const char * key) { #endif tablesize = 0; @@ -41,6 +42,10 @@ HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) flag_mode = FLAG_CHAR; complexprefixes = 0; utf8 = 0; + langnum = 0; + lang = NULL; + enc = NULL; + csconv = 0; ignorechars = NULL; ignorechars_utf16 = NULL; ignorechars_utf16_len = 0; @@ -48,12 +53,13 @@ HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) aliasf = NULL; numaliasm = 0; aliasm = NULL; + forbiddenword = FORBIDDENWORD; // forbidden word signing flag #ifdef HUNSPELL_CHROME_CLIENT // No tables to load, just the AF config. int ec = load_config(); #else load_config(aff_handle); - int ec = load_tables(dic_handle); + int ec = load_tables(dic_handle, key); #endif if (ec) { /* error condition - what should we do here */ @@ -73,29 +79,16 @@ HashMgr::~HashMgr() // now pass through hash table freeing up everything // go through column by column of the table for (int i=0; i < tablesize; i++) { - struct hentry * pt = &tableptr[i]; + struct hentry * pt = tableptr[i]; struct hentry * nt = NULL; - if (pt) { - if (pt->astr && !aliasf) free(pt->astr); - if (pt->word) free(pt->word); -#ifdef HUNSPELL_EXPERIMENTAL - if (pt->description && !aliasm) free(pt->description); -#endif - pt = pt->next; - } while(pt) { nt = pt->next; - if (pt->astr && !aliasf) free(pt->astr); - if (pt->word) free(pt->word); -#ifdef HUNSPELL_EXPERIMENTAL - if (pt->description && !aliasm) free(pt->description); -#endif + if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr); free(pt); pt = nt; } } free(tableptr); - tableptr = NULL; } tablesize = 0; @@ -113,6 +106,15 @@ HashMgr::~HashMgr() free(aliasm); aliasm = NULL; } + +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + if (utf8) free_utf_tbl(); +#endif +#endif + + if (enc) free(enc); + if (lang) free(lang); if (ignorechars) free(ignorechars); if (ignorechars_utf16) free(ignorechars_utf16); @@ -144,7 +146,6 @@ void HashMgr::EmptyHentryCache() { #endif // lookup a root word in the hashtable - struct hentry * HashMgr::lookup(const char *word) const { #ifdef HUNSPELL_CHROME_CLIENT @@ -167,10 +168,10 @@ struct hentry * HashMgr::lookup(const char *word) const #else struct hentry * dp; if (tableptr) { - dp = &tableptr[hash(word)]; - if (dp->word == NULL) return NULL; + dp = tableptr[hash(word)]; + if (!dp) return NULL; for ( ; dp != NULL; dp = dp->next) { - if (strcmp(word,dp->word) == 0) return dp; + if (strcmp(word,&(dp->word)) == 0) return dp; } } return NULL; @@ -178,69 +179,101 @@ struct hentry * HashMgr::lookup(const char *word) const } // add a word to the hash table (private) - -int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc) +int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, + int al, const char * desc, bool onlyupcase) { #ifndef HUNSPELL_CHROME_CLIENT - char * st = mystrdup(word); - if (wl && !st) return 1; + bool upcasehomonym = false; + int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0; + // variable-length hash record with word and optional fields + struct hentry* hp = + (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); + if (!hp) return 1; + char * hpw = &(hp->word); + strcpy(hpw, word); if (ignorechars != NULL) { if (utf8) { - remove_ignored_chars_utf(st, ignorechars_utf16, ignorechars_utf16_len); + remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len); } else { - remove_ignored_chars(st, ignorechars); + remove_ignored_chars(hpw, ignorechars); } } if (complexprefixes) { - if (utf8) reverseword_utf(st); else reverseword(st); + if (utf8) reverseword_utf(hpw); else reverseword(hpw); } - int i = hash(st); - struct hentry * dp = &tableptr[i]; - if (dp->word == NULL) { - dp->wlen = (short) wl; - dp->alen = (short) al; - dp->word = st; - dp->astr = aff; - dp->next = NULL; - dp->next_homonym = NULL; -#ifdef HUNSPELL_EXPERIMENTAL - if (aliasm) { - dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); - } else { - dp->description = mystrdup(desc); - if (desc && !dp->description) return 1; - if (dp->description && complexprefixes) { - if (utf8) reverseword_utf(dp->description); else reverseword(dp->description); - } - } -#endif - } else { - struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry)); - if (!hp) return 1; - hp->wlen = (short) wl; - hp->alen = (short) al; - hp->word = st; - hp->astr = aff; - hp->next = NULL; - hp->next_homonym = NULL; -#ifdef HUNSPELL_EXPERIMENTAL - if (aliasm) { - hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); - } else { - hp->description = mystrdup(desc); - if (desc && !hp->description) return 1; - if (dp->description && complexprefixes) { - if (utf8) reverseword_utf(hp->description); else reverseword(hp->description); + + int i = hash(hpw); + + hp->blen = (unsigned char) wbl; + hp->clen = (unsigned char) wcl; + hp->alen = (short) al; + hp->astr = aff; + hp->next = NULL; + hp->next_homonym = NULL; + + // store the description string or its pointer + if (desc) { + hp->var = H_OPT; + if (aliasm) { + hp->var += H_OPT_ALIASM; + store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); + } else { + strcpy(hpw + wbl + 1, desc); + if (complexprefixes) { + if (utf8) reverseword_utf(HENTRY_DATA(hp)); + else reverseword(HENTRY_DATA(hp)); } + } + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; + } else hp->var = 0; + + struct hentry * dp = tableptr[i]; + if (!dp) { + tableptr[i] = hp; + return 0; } -#endif while (dp->next != NULL) { - if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; + if ((!dp->next_homonym) && (strcmp(&(hp->word), &(dp->word)) == 0)) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + free(hp); + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } dp=dp->next; } - if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; - dp->next = hp; - } + if (strcmp(&(hp->word), &(dp->word)) == 0) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + free(hp); + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } + if (!upcasehomonym) { + dp->next = hp; + } else { + // remove hidden onlyupcase homonym + if (hp->astr) free(hp->astr); + free(hp); + } #endif // HUNSPELL_CHROME_CLIENT std::map<StringPiece, int>::iterator iter = custom_word_to_affix_id_map_.find(word); @@ -255,33 +288,134 @@ int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, c return 0; } -// add a custom dic. word to the hash table (public) -int HashMgr::put_word(const char * word, int wl, char * aff) +int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, + unsigned short * flags, int al, char * dp, int captype) { - unsigned short * flags; - int al = 0; - if (aff) { - al = decode_flags(&flags, aff); - flag_qsort(flags, 0, al); + // add inner capitalized forms to handle the following allcap forms: + // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG + // Allcaps with suffixes: CIA's -> CIA'S + if (((captype == HUHCAP) || (captype == HUHINITCAP) || + ((captype == ALLCAP) && (flags != NULL))) && + !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) { + unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1)); + if (!flags2) return 1; + if (al) memcpy(flags2, flags, al * sizeof(unsigned short)); + flags2[al] = ONLYUPCASEFLAG; + if (utf8) { + char st[BUFSIZE]; + w_char w[BUFSIZE]; + int wlen = u8_u16(w, BUFSIZE, word); + mkallsmall_utf(w, wlen, langnum); + mkallcap_utf(w, 1, langnum); + u16_u8(st, BUFSIZE, w, wlen); + return add_word(st,wbl,wcl,flags2,al+1,dp, true); + } else { + mkallsmall(word, csconv); + mkinitcap(word, csconv); + return add_word(word,wbl,wcl,flags2,al+1,dp, true); + } + } + return 0; +} + +// detect captype and modify word length for UTF-8 encoding +int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { + int len; + if (utf8) { + w_char dest_utf[BUFSIZE]; + len = u8_u16(dest_utf, BUFSIZE, word); + *captype = get_captype_utf8(dest_utf, len, langnum); } else { - flags = NULL; + len = wbl; + *captype = get_captype((char *) word, len, csconv); + } + return len; +} + +// remove word (personal dictionary function for standalone applications) +int HashMgr::remove(const char * word) +{ + struct hentry * dp = lookup(word); + while (dp) { + if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { + unsigned short * flags = + (unsigned short *) malloc(sizeof(short *) * (dp->alen + 1)); + if (!flags) return 1; + for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; + flags[dp->alen] = forbiddenword; + dp->astr = flags; + dp->alen++; + flag_qsort(flags, 0, dp->alen); + } + dp = dp->next_homonym; } - add_word(word, wl, flags, al, NULL); return 0; } -int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern) +/* remove forbidden flag to add a personal word to the hash */ +int HashMgr::remove_forbidden_flag(const char * word) { + struct hentry * dp = lookup(word); + if (!dp) return 1; + while (dp) { + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { + if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic. + else { + unsigned short * flags2 = + (unsigned short *) malloc(sizeof(short *) * (dp->alen - 1)); + if (!flags2) return 1; + int i, j = 0; + for (i = 0; i < dp->alen; i++) { + if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i]; + } + dp->alen--; + dp->astr = flags2; // XXX allowed forbidden words + } + } + dp = dp->next_homonym; + } + return 0; +} + +// add a custom dic. word to the hash table (public) +int HashMgr::add(const char * word) { - unsigned short * flags; - struct hentry * dp = lookup(pattern); - if (!dp || !dp->astr) return 1; - flags = (unsigned short *) malloc (dp->alen * sizeof(short)); - memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); - add_word(word, wl, flags, dp->alen, NULL); + unsigned short * flags = NULL; + int al = 0; + if (remove_forbidden_flag(word)) { + int captype; + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, wbl, &captype); + add_word(word, wbl, wcl, flags, al, NULL, false); + return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); + } return 0; } +int HashMgr::add_with_affix(const char * word, const char * example) +{ + // detect captype and modify word length for UTF-8 encoding + struct hentry * dp = lookup(example); + remove_forbidden_flag(word); + if (dp && dp->astr) { + int captype; + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, wbl, &captype); + if (aliasf) { + add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); + } else { + unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short)); + if (flags) { + memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); + add_word(word, wbl, wcl, flags, dp->alen, NULL, false); + } else return 1; + } + return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype); + } + return 1; +} + // walk the hash table entry by entry - null at end +// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const { #ifdef HUNSPELL_CHROME_CLIENT @@ -312,88 +446,99 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const // lists for the extra affixes. If hp is NULL, create it here. if (!hp) hp = new hentry; - hp->word = word; - hp->wlen = word_len; + hp->word = *word; + hp->blen = word_len; hp->alen = (short)const_cast<HashMgr*>(this)->get_aliasf(affix_ids[0], &hp->astr); hp->next = NULL; hp->next_homonym = NULL; - + hp->var = 0; + hp->clen = 0; return hp; #else - //reset to start - if ((col < 0) || (hp == NULL)) { - col = -1; - hp = NULL; + + if (hp && hp->next != NULL) return hp->next; + for (col++; col < tablesize; col++) { + if (tableptr[col]) return tableptr[col]; } - - if (hp && hp->next != NULL) { - hp = hp->next; - } else { - col++; - hp = (col < tablesize) ? &tableptr[col] : NULL; - // search for next non-blank column entry - while (hp && (hp->word == NULL)) { - col ++; - hp = (col < tablesize) ? &tableptr[col] : NULL; - } - if (col < tablesize) return hp; - hp = NULL; - col = -1; - } - return hp; + // null at end and reset to start + col = -1; + return NULL; #endif } // load a munched word list and build a hash table on the fly -int HashMgr::load_tables(FILE* t_handle) +int HashMgr::load_tables(FILE* t_handle, const char * key) { #ifndef HUNSPELL_CHROME_CLIENT - int wl, al; + int al; char * ap; char * dp; + char * dp2; unsigned short * flags; + char * ts; - // raw dictionary - munched file - FILE * rawdict = _fdopen(_dup(_fileno(t_handle)), "r"); - if (rawdict == NULL) return 1; - fseek(rawdict, 0, SEEK_SET); + // open dictionary file + FileMgr * dict = new FileMgr(tpath, key); + if (dict == NULL) return 1; // first read the first line of file to get hash table size */ - char ts[MAXDELEN]; - if (! fgets(ts, MAXDELEN-1,rawdict)) return 2; + if (!(ts = dict->getline())) { + HUNSPELL_WARNING(stderr, "error: empty dic file\n"); + delete dict; + return 2; + } mychomp(ts); - + /* remove byte order mark */ - if (strncmp(ts,"\xef\xbb\xbf",3) == 0) { + if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) { memmove(ts, ts+3, strlen(ts+3)+1); HUNSPELL_WARNING(stderr, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n"); } - - if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n"); + tablesize = atoi(ts); - if (!tablesize) return 4; + if (tablesize == 0) { + HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n"); + delete dict; + return 4; + } tablesize = tablesize + 5 + USERWORD; if ((tablesize %2) == 0) tablesize++; // allocate the hash table - tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry)); - if (! tableptr) return 3; - for (int i=0; i<tablesize; i++) tableptr[i].word = NULL; + tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *)); + if (! tableptr) { + delete dict; + return 3; + } + for (int i=0; i<tablesize; i++) tableptr[i] = NULL; // loop through all words on much list and add to hash // table and create word and affix strings - while (fgets(ts,MAXDELEN-1,rawdict)) { + while ((ts = dict->getline())) { mychomp(ts); // split each line into word and morphological description - dp = strchr(ts,'\t'); + dp = ts; + while ((dp = strchr(dp, ':'))) { + if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { + for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--); + if (dp < ts) { // missing word + dp = NULL; + } else { + *(dp + 1) = '\0'; + dp = dp + 2; + } + break; + } + dp++; + } - if (dp) { - *dp = '\0'; - dp++; - } else { - dp = NULL; + // tabulator is the old morphological field separator + dp2 = strchr(ts, '\t'); + if (dp2 && (!dp || dp2 < dp)) { + *dp2 = '\0'; + dp = dp2 + 1; } // split each line into word and affix char strings @@ -414,13 +559,13 @@ int HashMgr::load_tables(FILE* t_handle) *ap = '\0'; if (aliasf) { int index = atoi(ap + 1); - al = get_aliasf(index, &flags); + al = get_aliasf(index, &flags, dict); if (!al) { - HUNSPELL_WARNING(stderr, "error - bad flag vector alias: %s\n", ts); + HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum()); *ap = '\0'; } } else { - al = decode_flags(&flags, ap + 1); + al = decode_flags(&flags, ap + 1, dict); flag_qsort(flags, 0, al); } } else { @@ -429,19 +574,22 @@ int HashMgr::load_tables(FILE* t_handle) flags = NULL; } - wl = strlen(ts); - - // add the word and its index - if (add_word(ts,wl,flags,al,dp)) return 5; - + int captype; + int wbl = strlen(ts); + int wcl = get_clen_and_captype(ts, wbl, &captype); + // add the word and its index plus its capitalized form optionally + if (add_word(ts,wbl,wcl,flags,al,dp, false) || + add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) { + delete dict; + return 5; + } } - - fclose(rawdict); + + delete dict; #endif return 0; } - // the hash function is a simple load and rotate // algorithm borrowed @@ -466,15 +614,17 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { switch (flag_mode) { case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) len = strlen(flags); - if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: length of FLAG_LONG flagvector is odd: %s\n", flags); - len = len/2; + if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: bad flagvector\n"); + len /= 2; *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; for (int i = 0; i < len; i++) { (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; } break; } case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233) + int i; len = 1; char * src = flags; unsigned short * dest; @@ -483,23 +633,29 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { if (*p == ',') len++; } *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; dest = *result; for (p = flags; *p; p++) { if (*p == ',') { - *dest = (unsigned short) atoi(src); + i = atoi(src); + if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); + *dest = (unsigned short) i; if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); src = p + 1; dest++; } } - *dest = (unsigned short) atoi(src); + i = atoi(src); + if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); + *dest = (unsigned short) i; if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); break; } case FLAG_UNI: { // UTF-8 characters - w_char w[MAXDELEN/2]; - len = u8_u16(w, MAXDELEN/2, flags); + w_char w[BUFSIZE/2]; + len = u8_u16(w, BUFSIZE/2, flags); *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; memcpy(*result, w, len * sizeof(short)); break; } @@ -507,24 +663,28 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { unsigned short * dest; len = strlen(flags); *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; dest = *result; for (unsigned char * p = (unsigned char *) flags; *p; p++) { *dest = (unsigned short) *p; dest++; } } - } + } return len; } unsigned short HashMgr::decode_flag(const char * f) { unsigned short s = 0; + int i; switch (flag_mode) { case FLAG_LONG: s = ((unsigned short) f[0] << 8) + (unsigned short) f[1]; break; case FLAG_NUM: - s = (unsigned short) atoi(f); + i = atoi(f); + if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); + s = (unsigned short) i; break; case FLAG_UNI: u8_u16((w_char *) &s, 1, f); @@ -532,7 +692,7 @@ unsigned short HashMgr::decode_flag(const char * f) { default: s = (unsigned short) *((unsigned char *)f); } - if (!s) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); + if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); return s; } @@ -569,7 +729,7 @@ int HashMgr::load_config() // diacritics characters. if (strncmp(line,"IGNORE",6) == 0) { parse_array(line, &ignorechars, &ignorechars_utf16, - &ignorechars_utf16_len, "IGNORE", utf8); + &ignorechars_utf16_len, utf8, 0); } // Retrieve the format of an AF line. if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { @@ -591,75 +751,101 @@ int HashMgr::load_config() } #else // read in aff file and set flag mode -int HashMgr::load_config(FILE* aff_handle) +int HashMgr::load_config(FILE* aff_handle, const char * key) { + char * line; // io buffers int firstline = 1; - - // io buffers - char line[MAXDELEN+1]; // open the affix file - FILE * afflst; - afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); + FileMgr * afflst = new FileMgr(affpath, key); if (!afflst) { HUNSPELL_WARNING(stderr, "Error - could not open affix description file\n"); return 1; } - fseek(afflst, 0, SEEK_SET); // read in each line ignoring any that do not // start with a known line type indicator - while (fgets(line,MAXDELEN,afflst)) { + while ((line = afflst->getline())) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line,"\xef\xbb\xbf",3) == 0) memmove(line, line+3, strlen(line+3)+1); + if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1); } /* parse in the try string */ if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { if (flag_mode != FLAG_CHAR) { - HUNSPELL_WARNING(stderr, "error: duplicate FLAG parameter\n"); + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum()); } if (strstr(line, "long")) flag_mode = FLAG_LONG; if (strstr(line, "num")) flag_mode = FLAG_NUM; if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; if (flag_mode == FLAG_CHAR) { - HUNSPELL_WARNING(stderr, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line); + HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum()); } } - if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1; + if (strncmp(line,"FORBIDDENWORD",13) == 0) { + char * st = NULL; + if (parse_string(line, &st, afflst->getlinenum())) { + delete afflst; + return 1; + } + forbiddenword = decode_flag(st); + free(st); + } + if (strncmp(line, "SET", 3) == 0) { + if (parse_string(line, &enc, afflst->getlinenum())) { + delete afflst; + return 1; + } + if (strcmp(enc, "UTF-8") == 0) { + utf8 = 1; +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + initialize_utf_tbl(); +#endif +#endif + } else csconv = get_current_cs(enc); + } + if (strncmp(line, "LANG", 4) == 0) { + if (parse_string(line, &lang, afflst->getlinenum())) { + delete afflst; + return 1; + } + langnum = get_lang_num(lang); + } /* parse in the ignored characters (for example, Arabic optional diacritics characters */ if (strncmp(line,"IGNORE",6) == 0) { - if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { - fclose(afflst); + if (parse_array(line, &ignorechars, &ignorechars_utf16, + &ignorechars_utf16_len, utf8, afflst->getlinenum())) { + delete afflst; return 1; } } if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { if (parse_aliasf(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } -#ifdef HUNSPELL_EXPERIMENTAL if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { if (parse_aliasm(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } -#endif - if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; - if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; + + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; + if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; } - fclose(afflst); + if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING); + delete afflst; return 0; } #endif // HUNSPELL_CHROME_CLIENT @@ -669,11 +855,11 @@ int HashMgr::load_config(FILE* aff_handle) int HashMgr::parse_aliasf(char* line, hunspell::LineIterator* iterator) { #else -int HashMgr::parse_aliasf(char * line, FILE * af) +int HashMgr::parse_aliasf(char * line, FileMgr * af) { #endif if (numaliasf != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -691,8 +877,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) numaliasf = 0; aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *)); @@ -712,7 +897,6 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -721,7 +905,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) free(aliasflen); aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: missing AF table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; } @@ -732,9 +916,9 @@ int HashMgr::parse_aliasf(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXDELEN)) return 1; #else - if (!fgets(nl,MAXDELEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif - mychomp(nl); + mychomp(nl); tp = nl; i = 0; aliasf[j] = NULL; @@ -750,8 +934,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) free(aliasflen); aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } break; @@ -765,7 +948,6 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (!aliasf[j]) { @@ -774,7 +956,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) aliasf = NULL; aliasflen = NULL; numaliasf = 0; - HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } } @@ -810,8 +992,8 @@ hentry* HashMgr::AffixIDsToHentry(char* word, struct hentry* he = new hentry; if (i == 0) first_he = he; - he->word = word; - he->wlen = word_len; + he->word = *word; + he->blen = word_len; he->alen = (short)const_cast<HashMgr*>(this)->get_aliasf(affix_ids[i], &he->astr); he->next = NULL; @@ -854,12 +1036,11 @@ int HashMgr::get_aliasf(int index, unsigned short ** fvec) { return 0; } -#ifdef HUNSPELL_EXPERIMENTAL /* parse morph alias definitions */ -int HashMgr::parse_aliasm(char * line, FILE * af) +int HashMgr::parse_aliasm(char * line, FileMgr * af) { if (numaliasm != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -874,8 +1055,7 @@ int HashMgr::parse_aliasm(char * line, FILE * af) case 1: { numaliasm = atoi(piece); if (numaliasm < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); return 1; } aliasm = (char **) malloc(numaliasm * sizeof(char *)); @@ -890,33 +1070,31 @@ int HashMgr::parse_aliasm(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { numaliasm = 0; free(aliasm); aliasm = NULL; - HUNSPELL_WARNING(stderr, "error: missing AM alias information\n"); + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); return 1; } /* now parse the numaliasm lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numaliasm; j++) { - if (!fgets(nl,MAXDELEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; aliasm[j] = NULL; - piece = mystrsep(&tp, 0); + piece = mystrsep(&tp, ' '); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"AM",2) != 0) { - HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); numaliasm = 0; free(aliasm); aliasm = NULL; @@ -925,24 +1103,34 @@ int HashMgr::parse_aliasm(char * line, FILE * af) break; } case 1: { + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } aliasm[j] = mystrdup(piece); + if (!aliasm[j]) { + numaliasm = 0; + free(aliasm); + aliasm = NULL; + return 1; + } break; } default: break; } i++; } - free(piece); - piece = mystrsep(&tp, 0); + piece = mystrsep(&tp, ' '); } if (!aliasm[j]) { numaliasm = 0; free(aliasm); aliasm = NULL; - HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } } @@ -958,4 +1146,3 @@ char * HashMgr::get_aliasm(int index) { HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); return NULL; } -#endif diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx index 781175e5..acfbbce 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx @@ -1,8 +1,14 @@ #ifndef _HASHMGR_HXX_ #define _HASHMGR_HXX_ +#ifndef MOZILLA_CLIENT #include <cstdio> +#else +#include <stdio.h> +#endif + #include "htypes.hxx" +#include "filemgr.hxx" #ifdef HUNSPELL_CHROME_CLIENT #include <string> @@ -23,20 +29,25 @@ class HashMgr std::map<StringPiece, int> custom_word_to_affix_id_map_; std::vector<std::string*> pointer_to_strings_; #endif - int tablesize; - struct hentry * tableptr; - int userword; - flag flag_mode; - int complexprefixes; - int utf8; - char * ignorechars; - unsigned short * ignorechars_utf16; - int ignorechars_utf16_len; - int numaliasf; // flag vector `compression' with aliases - unsigned short ** aliasf; - unsigned short * aliasflen; - int numaliasm; // morphological desciption `compression' with aliases - char ** aliasm; + int tablesize; + struct hentry ** tableptr; + int userword; + flag flag_mode; + int complexprefixes; + int utf8; + unsigned short forbiddenword; + int langnum; + char * enc; + char * lang; + struct cs_info * csconv; + char * ignorechars; + unsigned short * ignorechars_utf16; + int ignorechars_utf16_len; + int numaliasf; // flag vector `compression' with aliases + unsigned short ** aliasf; + unsigned short * aliasflen; + int numaliasm; // morphological desciption `compression' with aliases + char ** aliasm; public: @@ -55,7 +66,7 @@ public: // This function allows that cache to be emptied and not grow infinitely. void EmptyHentryCache(); #else - HashMgr(FILE* t_handle, FILE* a_handle); + HashMgr(FILE* t_handle, FILE* a_handle, const char * key); #endif ~HashMgr(); @@ -63,22 +74,22 @@ public: int hash(const char *) const; struct hentry * walk_hashtable(int & col, struct hentry * hp) const; - int put_word(const char * word, int wl, char * ap); - int put_word_pattern(const char * word, int wl, const char * pattern); + int add(const char * word); + int add_with_affix(const char * word, const char * pattern); + int remove(const char * word); int decode_flags(unsigned short ** result, char * flags); unsigned short decode_flag(const char * flag); char * encode_flag(unsigned short flag); int is_aliasf(); int get_aliasf(int index, unsigned short ** fvec); -#ifdef HUNSPELL_EXPERIMENTAL int is_aliasm(); char * get_aliasm(int index); -#endif - private: - int load_tables(FILE* t_handle); - int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); + int get_clen_and_captype(const char * word, int wbl, int * captype); + int load_tables(FILE* t_handle, const char * key); + int add_word(const char * word, int wbl, int wcl, unsigned short * ap, + int al, const char * desc, bool onlyupcase); #ifdef HUNSPELL_CHROME_CLIENT int load_config(); @@ -96,13 +107,14 @@ private: HEntryCache hentry_cache; #else - int load_config(FILE* aff_handle); + int load_config(FILE* aff_handle, const char * key); int parse_aliasf(char * line, FILE * af); #endif -#ifdef HUNSPELL_EXPERIMENTAL - int parse_aliasm(char * line, FILE * af); -#endif + int add_hidden_capitalized_word(char * word, int wbl, int wcl, + unsigned short * flags, int al, char * dp, int captype); + int parse_aliasm(char * line, FileMgr * af); + int remove_forbidden_flag(const char * word); }; diff --git a/chrome/third_party/hunspell/src/hunspell/htypes.hxx b/chrome/third_party/hunspell/src/hunspell/htypes.hxx index f8d685a..75d9542 100644 --- a/chrome/third_party/hunspell/src/hunspell/htypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/htypes.hxx @@ -15,25 +15,28 @@ #define ROTATE(v,q) \ (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); + +// hentry options +#define H_OPT (1 << 0) +#define H_OPT_ALIASM (1 << 1) +#define H_OPT_PHON (1 << 2) + +// see also csutil.hxx +#define HENTRY_WORD(h) &(h->word) + // approx. number of user defined words #define USERWORD 1000 struct hentry { - short wlen; - short alen; - /* NOTE: Removed by mbelshe since this is not used. - * The english dictionary is 63K in size, so removing this - * itty bitty field saves us ~250KB of RAM. - char wbeg[2]; - */ - char * word; - unsigned short * astr; - struct hentry * next; - struct hentry * next_homonym; -#ifdef HUNSPELL_EXPERIMENTAL - char * description; -#endif + unsigned char blen; // word length in bytes + unsigned char clen; // word length in characters (different for UTF-8 enc.) + short alen; // length of affix flag vector + unsigned short * astr; // affix flag vector + struct hentry * next; // next word with same hash code + struct hentry * next_homonym; // next homonym word (with same hash code) + char var; // variable fields (only for special pronounciation yet) + char word; // variable-length word (8-bit or UTF-8 encoding) }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.cxx b/chrome/third_party/hunspell/src/hunspell/hunspell.cxx index 42b0603..131ad50 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.cxx @@ -6,16 +6,17 @@ #include <cstring> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #endif #include "hunspell.hxx" #include "hunspell.h" +#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -23,27 +24,34 @@ using namespace std; #ifdef HUNSPELL_CHROME_CLIENT Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length) #else -Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) +Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL) #endif { encoding = NULL; csconv = NULL; utf8 = 0; complexprefixes = 0; +#ifndef HUNSPELL_CHROME_CLIENT + affixpath = mystrdup(affpath); +#endif + maxdic = 0; #ifdef HUNSPELL_CHROME_CLIENT bdict_reader = new hunspell::BDictReader; bdict_reader->Init(bdict_data, bdict_length); - pHMgr = new HashMgr(bdict_reader); - pAMgr = new AffixMgr(bdict_reader, pHMgr); + pHMgr[0] = new HashMgr(bdict_reader); + if (pHMgr[0]) maxdic = 1; + + pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic); #else /* first set up the hash manager */ - pHMgr = new HashMgr(dic_handle, aff_handle); + pHMgr[0] = new HashMgr(dic_handle, aff_handle, key); + if (pHMgr[0]) maxdic = 1; /* next set up the affix manager */ /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(aff_handle, pHMgr); + pAMgr = new AffixMgr(aff_handle, pHMgr, &maxdic, key); #endif /* get the preferred try string and the dictionary */ @@ -65,10 +73,13 @@ Hunspell::~Hunspell() { if (pSMgr) delete pSMgr; if (pAMgr) delete pAMgr; - if (pHMgr) delete pHMgr; + for (int i = 0; i < maxdic; i++) delete pHMgr[i]; + maxdic = 0; pSMgr = NULL; pAMgr = NULL; - pHMgr = NULL; +#ifdef MOZILLA_CLIENT + free(csconv); +#endif csconv= NULL; if (encoding) free(encoding); encoding = NULL; @@ -76,27 +87,38 @@ Hunspell::~Hunspell() #ifdef HUNSPELL_CHROME_CLIENT if (bdict_reader) delete bdict_reader; bdict_reader = NULL; +#else + if (affixpath) free(affixpath); + affixpath = NULL; #endif } +#ifndef HUNSPELL_CHROME_CLIENT +// load extra dictionaries +int Hunspell::add_dic(const char * dpath, const char * key) { + if (maxdic == MAXDIC || !affixpath) return 1; + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); + if (pHMgr[maxdic]) maxdic++; else return 1; + return 0; +} +#endif // make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording // their presence with the abbreviation flag -// also since already going through character by character, +// also since already going through character by character, // set the capitalization type // return the length of the "cleaned" (and UTF-8 encoded) word -int Hunspell::cleanword2(char * dest, const char * src, +int Hunspell::cleanword2(char * dest, const char * src, w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) -{ +{ unsigned char * p = (unsigned char *) dest; const unsigned char * q = (const unsigned char * ) src; - int firstcap = 0; // first skip over any leading blanks while ((*q != '\0') && (*q == ' ')) q++; - + // now strip off any trailing periods (recording their presence) *pabbrev = 0; int nl = strlen((const char *)q); @@ -104,80 +126,43 @@ int Hunspell::cleanword2(char * dest, const char * src, nl--; (*pabbrev)++; } - + // if no characters are left it can't be capitalized - if (nl <= 0) { + if (nl <= 0) { *pcaptype = NOCAP; *p = '\0'; return 0; } - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - *nc = 0; - - if (!utf8) { - while (nl > 0) { - (*nc)++; - if (csconv[(*q)].ccase) ncap++; - if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; - *p++ = *q++; - nl--; - } - // remember to terminate the destination string - *p = '\0'; - if (ncap) { - firstcap = csconv[(unsigned char)(*dest)].ccase; - } - } else { - unsigned short idx; - *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q); + strncpy(dest, (char *) q, nl); + *(dest + nl) = '\0'; + nl = strlen(dest); + if (utf8) { + *nc = u8_u16(dest_utf, MAXWORDLEN, dest); // don't check too long words if (*nc >= MAXWORDLEN) return 0; if (*nc == -1) { // big Unicode character (non BMP area) *pcaptype = NOCAP; - strcpy((char *) p, (char *) q); - return strlen(dest); + return nl; } - *nc -= *pabbrev; - for (int i = 0; i < *nc; i++) { - idx = (dest_utf[i].h << 8) + dest_utf[i].l; - if (idx != unicodetolower(idx, langnum)) ncap++; - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; - } - u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc); - if (ncap) { - idx = (dest_utf[0].h << 8) + dest_utf[0].l; - firstcap = (idx != unicodetolower(idx, langnum)); - } - } - - // now finally set the captype - if (ncap == 0) { - *pcaptype = NOCAP; - } else if ((ncap == 1) && firstcap) { - *pcaptype = INITCAP; - } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) { - *pcaptype = ALLCAP; - } else if ((ncap > 1) && firstcap) { - *pcaptype = HUHINITCAP; + *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); } else { - *pcaptype = HUHCAP; + *pcaptype = get_captype(dest, nl, csconv); + *nc = nl; } - return strlen(dest); -} + return nl; +} -int Hunspell::cleanword(char * dest, const char * src, +int Hunspell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev) -{ +{ unsigned char * p = (unsigned char *) dest; const unsigned char * q = (const unsigned char * ) src; int firstcap = 0; // first skip over any leading blanks while ((*q != '\0') && (*q == ' ')) q++; - + // now strip off any trailing periods (recording their presence) *pabbrev = 0; int nl = strlen((const char *)q); @@ -185,9 +170,9 @@ int Hunspell::cleanword(char * dest, const char * src, nl--; (*pabbrev)++; } - + // if no characters are left it can't be capitalized - if (nl <= 0) { + if (nl <= 0) { *pcaptype = NOCAP; *p = '\0'; return 0; @@ -215,8 +200,9 @@ int Hunspell::cleanword(char * dest, const char * src, nc = u8_u16(t, MAXWORDLEN, src); for (int i = 0; i < nc; i++) { idx = (t[i].h << 8) + t[i].l; - if (idx != unicodetolower(idx, langnum)) ncap++; - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) ncap++; + if (unicodetoupper(idx, langnum) == low) nneutral++; } u16_u8(dest, MAXWORDUTF8LEN, t, nc); if (ncap) { @@ -238,8 +224,7 @@ int Hunspell::cleanword(char * dest, const char * src, *pcaptype = HUHCAP; } return strlen(dest); -} - +} void Hunspell::mkallcap(char * p) { @@ -256,7 +241,7 @@ void Hunspell::mkallcap(char * p) } u16_u8(p, MAXWORDUTF8LEN, u, nc); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].cupper; p++; } @@ -269,15 +254,16 @@ int Hunspell::mkallcap2(char * p, w_char * u, int nc) unsigned short idx; for (int i = 0; i < nc; i++) { idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); + unsigned short up = unicodetoupper(idx, langnum); + if (idx != up) { + u[i].h = (unsigned char) (up >> 8); + u[i].l = (unsigned char) (up & 0x00FF); } } u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); + return strlen(p); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].cupper; p++; } @@ -288,7 +274,7 @@ int Hunspell::mkallcap2(char * p, w_char * u, int nc) void Hunspell::mkallsmall(char * p) { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].clower; p++; } @@ -300,15 +286,16 @@ int Hunspell::mkallsmall2(char * p, w_char * u, int nc) unsigned short idx; for (int i = 0; i < nc; i++) { idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetolower(idx, langnum)) { - u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) { + u[i].h = (unsigned char) (low >> 8); + u[i].l = (unsigned char) (low & 0x00FF); } } u16_u8(p, MAXWORDUTF8LEN, u, nc); return strlen(p); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].clower; p++; } @@ -322,18 +309,18 @@ char * Hunspell::sharps_u8_l1(char * dest, char * source) { *p = *source; for (p++, source++; *(source - 1); p++, source++) { *p = *source; - if (*source == '\x9f') *--p = '\xdf'; + if (*source == '\x9F') *--p = '\xDF'; } return dest; } -// recursive search for right ss-\xdf permutations +// recursive search for right ss - sharp s permutations hentry * Hunspell::spellsharps(char * base, char * pos, int n, int repnum, char * tmp, int * info, char **root) { pos = strstr(pos, "ss"); if (pos && (n < MAXSHARPS)) { - *pos = '\xc3'; - *(pos + 1) = '\x9f'; + *pos = '\xC3'; + *(pos + 1) = '\x9F'; hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); if (h) return h; *pos = 's'; @@ -352,31 +339,32 @@ int Hunspell::is_keepcase(const hentry * rv) { TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); } -/* check and insert a word to beginning of the suggestion array */ -int Hunspell::insert_sug(char ***slst, char * word, int *ns) { - if (spell(word)) { - if (*ns == MAXSUGGESTION) { - (*ns)--; - free((*slst)[*ns]); - } - for (int k = *ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; - (*slst)[0] = mystrdup(word); - (*ns)++; +/* insert a word to the beginning of the suggestion array and return ns */ +int Hunspell::insert_sug(char ***slst, char * word, int ns) { + char * dup = mystrdup(word); + if (!dup) return ns; + if (ns == MAXSUGGESTION) { + ns--; + free((*slst)[ns]); } - return 0; + for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = dup; + return ns + 1; } int Hunspell::spell(const char * word, int * info, char ** root) { #ifdef HUNSPELL_CHROME_CLIENT - if (pHMgr) pHMgr->EmptyHentryCache(); + if (pHMgr) pHMgr[0]->EmptyHentryCache(); #endif struct hentry * rv=NULL; // need larger vector. For example, Turkish capital letter I converted a // 2-byte UTF-8 character (dotless i) by mkallsmall. - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - w_char unicw[MAXWORDLEN + 1]; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + w_char unicw[MAXWORDLEN]; + // Hunspell supports XML input of the simplified API (see manual) + if (strcmp(word, SPELL_XML) == 0) return 1; int nc = strlen(word); int wl2 = 0; if (utf8) { @@ -386,14 +374,18 @@ int Hunspell::spell(const char * word, int * info, char ** root) } int captype = 0; int abbv = 0; - int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + int wl = 0; - if (wl == 0) return 1; + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - if (info) *info = 0; + int info2 = 0; + if (wl == 0 || maxdic == 0) return 1; if (root) *root = NULL; - // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.) + // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) enum { NBEGIN, NNUM, NSEP }; int nstate = NBEGIN; int i; @@ -407,173 +399,179 @@ int Hunspell::spell(const char * word, int * info, char ** root) } else break; } if ((i == wl) && (nstate == NNUM)) return 1; - - // LANG_hu section: number(s) + (percent or degree) with suffixes - if (langnum == LANG_hu) { - if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '\xb0')) - && checkword(cw + i, info, root)) return 1; - } - // END of LANG_hu section + if (!info) info = &info2; else *info = 0; switch(captype) { - case HUHCAP: - case HUHINITCAP: - case NOCAP: { - rv = checkword(cw, info, root); - if ((abbv) && !(rv)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - } - break; - } + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + rv = checkword(cw, info, root); + if ((abbv) && !(rv)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + } + break; + } case ALLCAP: { - rv = checkword(cw, info, root); - if (rv) break; - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - if (rv) break; - } - if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { - char tmpword[MAXWORDUTF8LEN]; - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); + rv = checkword(cw, info, root); + if (rv) break; + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (rv) break; + } + // Spec. prefix handling for Catalan, French, Italian: + // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). + if (pAMgr && strchr(cw, '\'')) { + wl = mkallsmall2(cw, unicw, nc); + char * apostrophe = strchr(cw, '\''); + if (utf8) { + w_char tmpword[MAXWORDLEN]; + *apostrophe = '\0'; + wl2 = u8_u16(tmpword, MAXWORDLEN, cw); + *apostrophe = '\''; + if (wl2 < nc) { + mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); + rv = checkword(cw, info, root); + if (rv) break; + } + } else { + mkinitcap2(apostrophe + 1, unicw, nc); + rv = checkword(cw, info, root); + if (rv) break; + } + mkinitcap2(cw, unicw, nc); + rv = checkword(cw, info, root); + if (rv) break; + } + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { + char tmpword[MAXWORDUTF8LEN]; + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); + } + if ((abbv) && !(rv)) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - wl2 = mkinitcap2(cw, unicw, nc); - rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); - } - if ((abbv) && !(rv)) { - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - } - } - if (rv) break; } } - case INITCAP: { - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); - rv = checkword(wspace, info, root); - if (!rv || (is_keepcase(rv) && !((captype == INITCAP) && - // if CHECKSHARPS: KEEPCASE words with \xdf are allowed - // in INITCAP form, too. - pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "\xc3\x9f")) || - (!utf8 && strchr(wspace, '\xdf')))))) { - wl2 = mkinitcap2(cw, unicw, nc); - rv = checkword(cw, info, root); - if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL; - } - if (abbv && !rv) { - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - if (!rv || is_keepcase(rv)) { - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - rv = checkword(wspace, info, root); - if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL; - } - } - break; - } + if (rv) break; + } + } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + wl2 = mkinitcap2(cw, unicw, nc); + if (captype == INITCAP) *info += SPELL_INITCAP; + rv = checkword(cw, info, root); + if (captype == INITCAP) *info -= SPELL_INITCAP; + // forbid bad capitalization + // (for example, ijs -> Ijs instead of IJs in Dutch) + // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) + if (*info & SPELL_FORBIDDEN) { + rv = NULL; + break; + } + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; + if (rv) break; + + rv = checkword(wspace, info, root); + if (abbv && !rv) { + + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + if (captype == INITCAP) *info += SPELL_INITCAP; + rv = checkword(wspace, info, root); + if (captype == INITCAP) *info -= SPELL_INITCAP; + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; + break; + } + } + if (rv && is_keepcase(rv) && + ((captype == ALLCAP) || + // if CHECKSHARPS: KEEPCASE words with \xDF are allowed + // in INITCAP form, too. + !(pAMgr->get_checksharps() && + ((utf8 && strstr(wspace, "\xC3\x9F")) || + (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; + break; + } } - + if (rv) return 1; - // recursive breaking at break points (not good for morphological analysis) + // recursive breaking at break points if (wordbreak) { char * s; char r; - for (int j = 0; j < pAMgr->get_numbreak(); j++) { + int corr = 0; + wl = strlen(cw); + int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; + // check boundary patterns (^begin and end$) + for (int j = 0; j < numbreak; j++) { + int plen = strlen(wordbreak[j]); + if (plen == 1 || plen > wl) continue; + if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 + && spell(cw + plen - 1)) return 1; + if (wordbreak[j][plen - 1] == '$' && + strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { + r = cw[wl - plen + 1]; + cw[wl - plen + 1] = '\0'; + if (spell(cw)) return 1; + cw[wl - plen + 1] = r; + } + } + // other patterns + for (int j = 0; j < numbreak; j++) { + int result = 0; + int plen = strlen(wordbreak[j]); s=(char *) strstr(cw, wordbreak[j]); - if (s) { + if (s && (s > cw) && (s < cw + wl - plen)) { + if (!spell(s + plen)) continue; r = *s; *s = '\0'; // examine 2 sides of the break point - if (spell(cw) && spell(s + strlen(wordbreak[j]))) { - *s = r; - return 1; - } + if (spell(cw)) return 1; *s = r; + + // LANG_hu: spec. dash rule + if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { + r = s[1]; + s[1] = '\0'; + if (spell(cw)) return 1; // check the first part with dash + s[1] = r; + } + // end of LANG speficic region + } } } - // LANG_hu: compoundings with dashes and n-dashes XXX deprecated! - if (langnum == LANG_hu) { - int n; - // compound word with dash (HU) I18n - char * dash; - int result = 0; - // n-dash - dash = (char *) strstr(cw,"\xe2\x80\x93"); - if (dash && !wordbreak) { - *dash = '\0'; - // examine 2 sides of the dash - if (spell(cw) && spell(dash + 3)) { - *dash = '\xe2'; - return 1; - } - *dash = '\xe2'; - } - dash = (char *) strchr(cw,'-'); - if (dash) { - *dash='\0'; - // examine 2 sides of the dash - if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) return 1; - } else { - // first word ending with dash: word- - char r2 = *(dash + 1); - dash[0]='-'; - dash[1]='\0'; - result = spell(cw); - dash[1] = r2; - dash[0]='\0'; - if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') || - ((dash[1] > '0') && (dash[1] < '9')))) return 1; - } - // affixed number in correct word - if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) { - *dash='-'; - n = 1; - if (*(dash - n) == '.') n++; - // search first not a number character to left from dash - while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { - n++; - } - if ((dash - n) < cw) n--; - // numbers: deprecated - for(; n >= 1; n--) { - if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && - checkword(dash - n, info, root)) return 1; - } - } - } - } return 0; } -//int Hunspell::spell(const char * word) { -// return spell(word, NULL, NULL); -//} - struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) { struct hentry * he = NULL; - int len; + int len, i; char w2[MAXWORDUTF8LEN]; const char * word; @@ -600,26 +598,29 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) } // look word in hash table - if (pHMgr) he = pHMgr->lookup(word); + for (i = 0; (i < maxdic) && !he; i ++) { + he = (pHMgr[i])->lookup(word); // check forbidden and onlyincompound words if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - info += SPELL_FORBIDDEN; + if (info) *info += SPELL_FORBIDDEN; // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { if (pAMgr->get_compoundflag() && TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { - info += SPELL_COMPOUND; + if (info) *info += SPELL_COMPOUND; } } return NULL; } - // he = next not pseudoroot and not onlyincompound homonym or NULL + // he = next not needaffix, onlyincompound homonym or onlyupcase word while (he && (he->astr) && - ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) || - (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) + ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || + (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) )) he = he->next_homonym; + } // check with affixes if (!he && pAMgr) { @@ -627,38 +628,42 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) len = strlen(word); he = pAMgr->affix_check(word, len, 0); - // check compound restriction - if (he && he->astr && pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL; + // check compound restriction and onlyupcase + if (he && he->astr && ( + (pAMgr->get_onlyincompound() && + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || + (info && (*info & SPELL_INITCAP) && + TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { + he = NULL; + } if (he) { if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - info += SPELL_FORBIDDEN; + if (info) *info += SPELL_FORBIDDEN; return NULL; } if (root) { - *root = mystrdup(he->word); - if (complexprefixes) { + *root = mystrdup(&(he->word)); + if (*root && complexprefixes) { if (utf8) reverseword_utf(*root); else reverseword(*root); } } // try check compound word } else if (pAMgr->get_compound()) { - he = pAMgr->compound_check(word, len, - 0,0,100,0,NULL,0,NULL,NULL,0); + he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0); // LANG_hu section: `moving rule' with last dash - if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) { + if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { char * dup = mystrdup(word); + if (!dup) return NULL; dup[len-1] = '\0'; - he = pAMgr->compound_check(dup, len-1, - -5,0,100,0,NULL,1,NULL,NULL,0); + he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0); free(dup); } - // end of LANG speficic region + // end of LANG speficic region if (he) { if (root) { - *root = mystrdup(he->word); - if (complexprefixes) { + *root = mystrdup(&(he->word)); + if (*root && complexprefixes) { if (utf8) reverseword_utf(*root); else reverseword(*root); } } @@ -674,12 +679,18 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) int Hunspell::suggest(char*** slst, const char * word) { #ifdef HUNSPELL_CHROME_CLIENT - if (pHMgr) pHMgr->EmptyHentryCache(); + if (pHMgr) pHMgr[0]->EmptyHentryCache(); #endif - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; - w_char unicw[MAXWORDLEN + 1]; + int onlycmpdsug = 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (!pSMgr || maxdic == 0) return 0; + w_char unicw[MAXWORDLEN]; + *slst = NULL; + // process XML input of the simplified API (see manual) + if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { + return spellml(slst, word); + } int nc = strlen(word); if (utf8) { if (nc >= MAXWORDUTF8LEN) return 0; @@ -688,49 +699,73 @@ int Hunspell::suggest(char*** slst, const char * word) } int captype = 0; int abbv = 0; - int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + if (wl == 0) return 0; int ns = 0; - *slst = NULL; int capwords = 0; - int ngramsugs = 0; switch(captype) { - case NOCAP: { - ns = pSMgr->suggest(slst, cw, ns); + case NOCAP: { + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); break; } - case INITCAP: { + case INITCAP: { capwords = 1; - ns = pSMgr->suggest(slst, cw, ns); + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); if (ns == -1) break; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); break; } case HUHINITCAP: capwords = 1; - case HUHCAP: { - ns = pSMgr->suggest(slst, cw, ns); + case HUHCAP: { + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); if (ns != -1) { int prevns; + // something.The -> something. The + char * dot = strchr(cw, '.'); + if (dot && (dot > cw)) { + int captype_; + if (utf8) { + w_char w_[MAXWORDLEN]; + int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); + captype_ = get_captype_utf8(w_, wl_, langnum); + } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); + if (captype_ == INITCAP) { + char * st = mystrdup(cw); + if (st) st = (char *) realloc(st, wl + 2); + if (st) { + st[(dot - cw) + 1] = ' '; + strcpy(st + (dot - cw) + 2, dot + 1); + ns = insert_sug(slst, st, ns); + free(st); + } + } + } if (captype == HUHINITCAP) { // TheOpenOffice.org -> The OpenOffice.org memcpy(wspace,cw,(wl+1)); mkinitsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); } memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - insert_sug(slst, wspace, &ns); + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); prevns = ns; - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); if (captype == HUHINITCAP) { mkinitcap2(wspace, unicw, nc); - insert_sug(slst, wspace, &ns); - ns = pSMgr->suggest(slst, wspace, ns); + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); } // aNew -> "a New" (instead of "a new") for (int j = prevns; j < ns; j++) { @@ -739,7 +774,7 @@ int Hunspell::suggest(char*** slst, const char * word) int slen = strlen(space + 1); // different case after space (need capitalisation) if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { - w_char w[MAXWORDLEN + 1]; + w_char w[MAXWORDLEN]; int wc = 0; char * r = (*slst)[j]; if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); @@ -754,31 +789,32 @@ int Hunspell::suggest(char*** slst, const char * word) break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace, cw, (wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); if (ns == -1) break; - if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns); + if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) + ns = insert_sug(slst, wspace, ns); mkinitcap2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); for (int j=0; j < ns; j++) { mkallcap((*slst)[j]); if (pAMgr && pAMgr->get_checksharps()) { char * pos; if (utf8) { - pos = strstr((*slst)[j], "\xc3\x9f"); + pos = strstr((*slst)[j], "\xC3\x9F"); while (pos) { *pos = 'S'; *(pos+1) = 'S'; - pos = strstr(pos+2, "\xc3\x9f"); + pos = strstr(pos+2, "\xC3\x9F"); } } else { - pos = strchr((*slst)[j], '\xdf'); + pos = strchr((*slst)[j], '\xDF'); while (pos) { (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); - mystrrep((*slst)[j], "\xdf", "SS"); - pos = strchr((*slst)[j], '\xdf'); + mystrrep((*slst)[j], "\xDF", "SS"); + pos = strchr((*slst)[j], '\xDF'); } } } @@ -807,37 +843,76 @@ int Hunspell::suggest(char*** slst, const char * word) // END OF LANG_hu section // try ngram approach since found nothing - if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { - ngramsugs = 1; + if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { switch(captype) { case NOCAP: { - ns = pSMgr->ngsuggest(*slst, cw, pHMgr); + ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); break; } + case HUHINITCAP: + capwords = 1; case HUHCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); - break; + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + break; } - case INITCAP: { + case INITCAP: { capwords = 1; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); break; } case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); - for (int j=0; j < ns; j++) + int oldns = ns; + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + for (int j = oldns; j < ns; j++) mkallcap((*slst)[j]); break; } } } + // try dash suggestion (Afo-American -> Afro-American) + if (strchr(cw, '-')) { + char * pos = strchr(cw, '-'); + char * ppos = cw; + int nodashsug = 1; + char ** nlst = NULL; + int nn = 0; + int last = 0; + for (int j = 0; j < ns && nodashsug == 1; j++) { + if (strchr((*slst)[j], '-')) nodashsug = 0; + } + while (nodashsug && !last) { + if (*pos == '\0') last = 1; else *pos = '\0'; + if (!spell(ppos)) { + nn = suggest(&nlst, ppos); + for (int j = nn - 1; j >= 0; j--) { + strncpy(wspace, cw, ppos - cw); + strcpy(wspace + (ppos - cw), nlst[j]); + if (!last) { + strcat(wspace, "-"); + strcat(wspace, pos + 1); + } + ns = insert_sug(slst, wspace, ns); + free(nlst[j]); + } + if (nlst != NULL) free(nlst); + nodashsug = 0; + } + if (!last) { + *pos = '-'; + ppos = pos + 1; + pos = strchr(ppos, '-'); + } + if (!pos) pos = cw + strlen(cw); + } + } + // word reversing wrapper for complex prefixes if (complexprefixes) { for (int j = 0; j < ns; j++) { @@ -858,14 +933,14 @@ int Hunspell::suggest(char*** slst, const char * word) } } - // suggest keepcase - if (pAMgr->get_keepcase()) { + // remove bad capitalized and forbidden forms + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { switch (captype) { case INITCAP: case ALLCAP: { int l = 0; for (int j=0; j < ns; j++) { - if (!spell((*slst)[j])) { + if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { char s[MAXSWUTF8L]; w_char w[MAXSWL]; int len; @@ -876,21 +951,21 @@ int Hunspell::suggest(char*** slst, const char * word) len = strlen(s); } mkallsmall2(s, w, len); - free((*slst)[j]); + free((*slst)[j]); if (spell(s)) { (*slst)[l] = mystrdup(s); - l++; + if ((*slst)[l]) l++; } else { mkinitcap2(s, w, len); if (spell(s)) { (*slst)[l] = mystrdup(s); - l++; + if ((*slst)[l]) l++; } } } else { (*slst)[l] = (*slst)[j]; l++; - } + } } ns = l; } @@ -909,9 +984,28 @@ int Hunspell::suggest(char*** slst, const char * word) } l++; } + + // output conversion + rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; + for (int j = 0; rl && j < ns; j++) { + if (rl->conv((*slst)[j], wspace)) { + free((*slst)[j]); + (*slst)[j] = mystrdup(wspace); + } + } + + // if suggestions removed by nosuggest, onlyincompound parameters + if (l == 0 && *slst) { + free(*slst); + *slst = NULL; + } return l; } +void Hunspell::free_list(char *** slst, int n) { + freelist(slst, n); +} + char * Hunspell::get_dic_encoding() { return encoding; @@ -921,9 +1015,9 @@ char * Hunspell::get_dic_encoding() // XXX need UTF-8 support int Hunspell::suggest_auto(char*** slst, const char * word) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (!pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -936,15 +1030,15 @@ int Hunspell::suggest_auto(char*** slst, const char * word) if (wl == 0) return 0; int ns = 0; *slst = NULL; // HU, nsug in pSMgr->suggest - + switch(captype) { - case NOCAP: { + case NOCAP: { ns = pSMgr->suggest_auto(slst, cw, ns); if (ns>0) break; break; } - case INITCAP: { + case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_auto(slst, wspace, ns); @@ -952,10 +1046,11 @@ int Hunspell::suggest_auto(char*** slst, const char * word) mkinitcap((*slst)[j]); ns = pSMgr->suggest_auto(slst, cw, ns); break; - + } - case HUHCAP: { + case HUHINITCAP: + case HUHCAP: { ns = pSMgr->suggest_auto(slst, cw, ns); if (ns == 0) { memcpy(wspace,cw,(wl+1)); @@ -965,7 +1060,7 @@ int Hunspell::suggest_auto(char*** slst, const char * word) break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_auto(slst, wspace, ns); @@ -1011,103 +1106,89 @@ int Hunspell::suggest_auto(char*** slst, const char * word) } } } - // END OF LANG_hu section + // END OF LANG_hu section return ns; } +#endif -// XXX need UTF-8 support -int Hunspell::stem(char*** slst, const char * word) +int Hunspell::stem(char*** slst, char ** desc, int n) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; - int wl = strlen(word); - if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; - } else { - if (wl >= MAXWORDLEN) return 0; - } - int captype = 0; - int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; - - int ns = 0; - - *slst = NULL; // HU, nsug in pSMgr->suggest - - switch(captype) { - case HUHCAP: - case NOCAP: { - ns = pSMgr->suggest_stems(slst, cw, ns); - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - break; - } - - case INITCAP: { - - ns = pSMgr->suggest_stems(slst, cw, ns); - - if (ns == 0) { - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - - } - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - mkallsmall(wspace); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - break; - - } - - case ALLCAP: { - ns = pSMgr->suggest_stems(slst, cw, ns); - if (ns != 0) break; - - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - - if (ns == 0) { - mkinitcap(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - } + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + *slst = NULL; + if (n == 0) return 0; + *result2 = '\0'; + for (int i = 0; i < n; i++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[i]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - mkallsmall(wspace); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int k = 0; k < pln; k++) { + // add derivational suffixes + if (strstr(pl[k], MORPH_DERI_SFX)) { + // remove inflectional suffixes + char * is = strstr(pl[k], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + for (int j = 0; j < genl; j++) { + sprintf(result2 + strlen(result2), "%c%s%s", + MSEP_REC, result, gen[j]); + } + freelist(&gen, genl); + } + } else { + sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); + if (strstr(pl[k], MORPH_SURF_PFX)) { + copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); + } + copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); + } + } + freelist(&pl, pln); + } + int sln = line_tok(result2, slst, MSEP_REC); + return uniqlist(*slst, sln); +} - break; - } - } - - return ns; +int Hunspell::stem(char*** slst, const char * word) +{ + char ** pl; + int pln = analyze(&pl, word); + int pln2 = stem(slst, pl, pln); + freelist(&pl, pln); + return pln2; } +#ifdef HUNSPELL_EXPERIMENTAL int Hunspell::suggest_pos_stems(char*** slst, const char * word) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (! pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1118,14 +1199,14 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); if (wl == 0) return 0; - + int ns = 0; // ns=0 = normalized input *slst = NULL; // HU, nsug in pSMgr->suggest - + switch(captype) { case HUHCAP: - case NOCAP: { + case NOCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); if ((abbv) && (ns == 0)) { @@ -1138,7 +1219,7 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) break; } - case INITCAP: { + case INITCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); @@ -1147,15 +1228,15 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) mkallsmall(wspace); ns = pSMgr->suggest_pos_stems(slst, wspace, ns); } - + break; - + } - case ALLCAP: { + case ALLCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); if (ns != 0) break; - + memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_pos_stems(slst, wspace, ns); @@ -1225,19 +1306,21 @@ int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) return nc; } -int Hunspell::put_word(const char * word) +int Hunspell::add(const char * word) { - if (pHMgr) { - return pHMgr->put_word(word, strlen(word), NULL); - } + if (pHMgr[0]) return (pHMgr[0])->add(word); return 0; } -int Hunspell::put_word_pattern(const char * word, const char * pattern) +int Hunspell::add_with_affix(const char * word, const char * example) { - if (pHMgr) { - return pHMgr->put_word_pattern(word, strlen(word), pattern); - } + if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); + return 0; +} + +int Hunspell::remove(const char * word) +{ + if (pHMgr[0]) return (pHMgr[0])->remove(word); return 0; } @@ -1251,22 +1334,38 @@ struct cs_info * Hunspell::get_csconv() return csconv; } -#ifdef HUNSPELL_EXPERIMENTAL -// XXX need UTF-8 support -char * Hunspell::morph(const char * word) +void Hunspell::cat_result(char * result, char * st) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; - int wl = strlen(word); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } +} + +int Hunspell::analyze(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + w_char unicw[MAXWORDLEN]; + int wl2 = 0; + *slst = NULL; + if (! pSMgr || maxdic == 0) return 0; + int nc = strlen(word); if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; + if (nc >= MAXWORDUTF8LEN) return 0; } else { - if (wl >= MAXWORDLEN) return 0; + if (nc >= MAXWORDLEN) return 0; } int captype = 0; int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + if (wl == 0) { if (abbv) { for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; @@ -1277,7 +1376,7 @@ char * Hunspell::morph(const char * word) char result[MAXLNLEN]; char * st = NULL; - + *result = '\0'; int n = 0; @@ -1287,177 +1386,103 @@ char * Hunspell::morph(const char * word) // test numbers // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { - while ((n < wl) && + while ((n < wl) && (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { n++; if ((cw[n] == '.') || (cw[n] == ',')) { - if (((n2 == 0) && (n > 3)) || + if (((n2 == 0) && (n > 3)) || ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; n2++; n3 = n; } } - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL; - if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xb0')) && checkword(cw+n, NULL, NULL))) { - strcat(result, cw); + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { + mystrcat(result, cw, MAXLNLEN); result[n - 1] = '\0'; - if (n == wl) { - st = pSMgr->suggest_morph(cw + n - 1); - if (st) { - strcat(result, st); - free(st); - } - } else { + if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + else { char sign = cw[n]; cw[n] = '\0'; - st = pSMgr->suggest_morph(cw + n - 1); - if (st) { - strcat(result, st); - free(st); - } - strcat(result, "+"); // XXX SPEC. MORPHCODE + cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE cw[n] = sign; - st = pSMgr->suggest_morph(cw + n); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(cw + n)); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } // END OF LANG_hu section - + switch(captype) { - case NOCAP: { - st = pSMgr->suggest_morph(cw); - if (st) { - strcat(result, st); - free(st); - } - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - } - break; - } - case INITCAP: { + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + } + break; + } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } - st = pSMgr->suggest_morph(cw); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - if (abbv) { - memcpy(wspace,cw,wl); + wl2 = mkinitcap2(cw, unicw, nc); + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); + + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + + cat_result(result, pSMgr->suggest_morph(wspace)); } break; } - case HUHCAP: { - st = pSMgr->suggest_morph(cw); - if (st) { - strcat(result, st); - free(st); - } -#if 0 - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); + case ALLCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(cw)); } -#endif - break; - } - case ALLCAP: { + wl = mkallsmall2(cw, unicw, nc); memcpy(wspace,cw,(wl+1)); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); + wl2 = mkinitcap2(cw, unicw, nc); + + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + + cat_result(result, pSMgr->suggest_morph(wspace)); } - if (abbv) { - memcpy(wspace,cw,(wl+1)); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - if (*result) strcat(result, "\n"); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - } break; } } - if (result && (*result)) { + if (*result) { // word reversing wrapper for complex prefixes if (complexprefixes) { if (utf8) reverseword_utf(result); else reverseword(result); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); + } // compound word with dash (HU) I18n @@ -1466,24 +1491,24 @@ char * Hunspell::morph(const char * word) // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); if ((langnum == LANG_hu) && dash) { - *dash='\0'; + *dash='\0'; // examine 2 sides of the dash if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) return pSMgr->suggest_morph(cw); + if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC); } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. if (spell(cw) && (spell("-e"))) { st = pSMgr->suggest_morph(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - strcat(result,"+"); // XXX spec. separator in MORPHCODE + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE st = pSMgr->suggest_morph("-e"); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } else { // first word ending with dash: word- XXX ??? @@ -1495,22 +1520,22 @@ char * Hunspell::morph(const char * word) dash[0]='\0'; if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) { - st = morph(cw); + st = pSMgr->suggest_morph(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - strcat(result,"+"); // XXX spec. separator in MORPHCODE + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE } - st = morph(dash+1); + st = pSMgr->suggest_morph(dash+1); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } // affixed number in correct word - if (nresult && (dash > cw) && (((*(dash-1)<='9') && + if (nresult && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)=='.'))) { *dash='-'; n = 1; @@ -1525,195 +1550,338 @@ char * Hunspell::morph(const char * word) // 56-hoz, 6-hoz for(; n >= 1; n--) { if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { - strcat(result, cw); + mystrcat(result, cw, MAXLNLEN); result[dash - cw - n] = '\0'; st = pSMgr->suggest_morph(dash - n); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } } } - return NULL; + return 0; } +int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) +{ + *slst = NULL; + if (!pSMgr || !pln) return 0; + char **pl2; + int pl2n = analyze(&pl2, word); + int captype = 0; + int abbv = 0; + char cw[MAXWORDUTF8LEN]; + cleanword(cw, word, &captype, &abbv); + char result[MAXLNLEN]; + *result = '\0'; + + for (int i = 0; i < pln; i++) { + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); + } + freelist(&pl2, pl2n); + + if (*result) { + // allcap + if (captype == ALLCAP) mkallcap(result); + + // line split + int linenum = line_tok(result, slst, MSEP_REC); + + // capitalize + if (captype == INITCAP || captype == HUHINITCAP) { + for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); + } + + // temporary filtering of prefix related errors (eg. + // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") + + int r = 0; + for (int j=0; j < linenum; j++) { + if (!spell((*slst)[j])) { + free((*slst)[j]); + (*slst)[j] = NULL; + } else { + if (r < j) (*slst)[r] = (*slst)[j]; + r++; + } + } + if (r > 0) return r; + free(*slst); + *slst = NULL; + } + return 0; +} + +int Hunspell::generate(char*** slst, const char * word, const char * pattern) +{ + char **pl; + int pln = analyze(&pl, pattern); + int n = generate(slst, word, pl, pln); + freelist(&pl, pln); + return uniqlist(*slst, n); +} + +// minimal XML parser functions +int Hunspell::get_xml_par(char * dest, const char * par, int max) +{ + char * d = dest; + if (!par) return 0; + char end = *par; + char * dmax = dest + max; + if (end == '>') end = '<'; + else if (end != '\'' && end != '"') return 0; // bad XML + for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; + *d = '\0'; + mystrrep(dest, "<", "<"); + mystrrep(dest, "&", "&"); + return d - dest; +} + +// return the beginning of the element (attr == NULL) or the attribute +const char * Hunspell::get_xml_pos(const char * s, const char * attr) +{ + const char * end = strchr(s, '>'); + const char * p = s; + if (attr == NULL) return end; + do { + p = strstr(p, attr); + if (!p || p >= end) return 0; + } while (*(p-1) != ' ' && *(p-1) != '\n'); + return p + strlen(attr); +} + +int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { + char cw[MAXWORDUTF8LEN]; + if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && + strcmp(cw, value) == 0) return 1; + return 0; +} + +int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { + int n = 0; + char * p; + if (!list) return 0; + for (p = list; (p = strstr(p, tag)); p++) n++; + if (n == 0) return 0; + *slst = (char **) malloc(sizeof(char *) * n); + if (!*slst) return 0; + for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { + int l = strlen(p); + (*slst)[n] = (char *) malloc(l); + if (!(*slst)[n]) return (n > 0 ? n - 1 : 0); + get_xml_par((*slst)[n], p + strlen(tag) - 1, l); + } + return n; +} + +int Hunspell::spellml(char*** slst, const char * word) +{ + char *q, *q2; + char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; + q = (char *) strstr(word, "<query"); + if (!q) return 0; // bad XML input + q2 = strchr(q, '>'); + if (!q2) return 0; // bad XML input + q2 = strstr(q2, "<word"); + if (!q2) return 0; // bad XML input + if (check_xml_par(q, "type=", "analyze")) { + int n = 0, s = 0; + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) n = analyze(slst, cw); + if (n == 0) return 0; + // convert the result to <code><a>ana1</a><a>ana2</a></code> format + for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); + char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->& + if (!r) return 0; + strcpy(r, "<code>"); + for (int i = 0; i < n; i++) { + int l = strlen(r); + strcpy(r + l, "<a>"); + strcpy(r + l + 3, (*slst)[i]); + mystrrep(r + l + 3, "\t", " "); + mystrrep(r + l + 3, "<", "<"); + mystrrep(r + l + 3, "&", "&"); + strcat(r, "</a>"); + free((*slst)[i]); + } + strcat(r, "</code>"); + (*slst)[0] = r; + return 1; + } else if (check_xml_par(q, "type=", "stem")) { + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) return stem(slst, cw); + } else if (check_xml_par(q, "type=", "generate")) { + int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN); + if (n == 0) return 0; + char * q3 = strstr(q2 + 1, "<word"); + if (q3) { + if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN)) { + return generate(slst, cw, cw2); + } + } else { + char ** slst2; + if ((q2 = strstr(q2 + 1, "<code")) && + (n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { + int n2 = generate(slst, cw, slst2, n); + freelist(&slst2, n); + return uniqlist(*slst, n2); + } + } + } + return 0; +} + + +#ifdef HUNSPELL_EXPERIMENTAL // XXX need UTF-8 support char * Hunspell::morph_with_correction(const char * word) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (! pSMgr || maxdic == 0) return NULL; int wl = strlen(word); if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; + if (wl >= MAXWORDUTF8LEN) return NULL; } else { - if (wl >= MAXWORDLEN) return 0; + if (wl >= MAXWORDLEN) return NULL; } int captype = 0; int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; + if (wl == 0) return NULL; char result[MAXLNLEN]; char * st = NULL; - + *result = '\0'; - - + + switch(captype) { - case NOCAP: { + case NOCAP: { st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } } break; } - case INITCAP: { + case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - } - st = pSMgr->suggest_morph_for_spelling_error(cw); + } + st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); - } + } mkinitcap(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); - } + } } break; } - case HUHCAP: { + case HUHCAP: { st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); - } + } break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace,cw,(wl+1)); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - } + } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - if (abbv) { + if (abbv) { memcpy(wspace,cw,(wl+1)); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - if (*result) strcat(result, "\n"); + if (*result) mystrcat(result, "\n", MAXLNLEN); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); - free(st); - } + mystrcat(result, st, MAXLNLEN); + free(st); + } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - } + } break; } } - if (result) return mystrdup(result); + if (*result) return mystrdup(result); return NULL; } -/* analyze word - * return line count - * XXX need a better data structure for morphological analysis */ -int Hunspell::analyze(char ***out, const char *word) { - int n = 0; - if (!word) return 0; - char * m = morph(word); - if(!m) return 0; - if (!out) return line_tok(m, out); - - // without memory allocation - /* BUG missing buffer size checking */ - int i, p; - for(p = 0, i = 0; m[i]; i++) { - if(m[i] == '\n' || !m[i+1]) { - n++; - strncpy((*out)[n++], m + p, i - p + 1); - if (m[i] == '\n') (*out)[n++][i - p] = '\0'; - if(!m[i+1]) break; - p = i + 1; - } - } - free(m); - return n; -} - #endif // END OF HUNSPELL_EXPERIMENTAL CODE Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) @@ -1725,6 +1893,17 @@ Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) #endif } + +Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key) +{ +#ifdef HUNSPELL_CHROME_CLIENT + return NULL; +#else + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); +#endif +} + void Hunspell_destroy(Hunhandle *pHunspell) { delete (Hunspell*)(pHunspell); @@ -1745,3 +1924,57 @@ int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) return ((Hunspell*)pHunspell)->suggest(slst, word); } +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->analyze(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->stem(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int n) +{ + return ((Hunspell*)pHunspell)->stem(slst, desc, n); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, word2); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); +} + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +int Hunspell_add(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->add(word); +} + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, + const char * example) { + return ((Hunspell*)pHunspell)->add_with_affix(word, example); +} + + /* remove word from the run-time dictionary */ + +int Hunspell_remove(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->remove(word); +} + +void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n) { + freelist(slst, n); +} diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.h b/chrome/third_party/hunspell/src/hunspell/hunspell.h index b04b83a..f926052 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.h +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.h @@ -7,15 +7,25 @@ extern "C" { typedef struct Hunhandle Hunhandle; -Hunhandle *Hunspell_create(const char * affpath, const char * dpath); -void Hunspell_destroy(Hunhandle *pHunspell); +#ifdef _MSC_VER +#define DLL __declspec ( dllexport ) +#else +#define DLL +#endif + +DLL Hunhandle *Hunspell_create(const char * affpath, const char * dpath); + +DLL Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key); + +DLL void Hunspell_destroy(Hunhandle *pHunspell); /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word */ -int Hunspell_spell(Hunhandle *pHunspell, const char *); +DLL int Hunspell_spell(Hunhandle *pHunspell, const char *); -char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); +DLL char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); /* suggest(suggestions, word) - search suggestions * input: pointer to an array of strings pointer and the (bad) word @@ -24,7 +34,63 @@ char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); * a newly allocated array of strings (*slts will be NULL when number * of suggestion equals 0.) */ -int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); +DLL int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); + + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + +DLL int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word); + + /* stem(result, word) - stemmer function */ + +DLL int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); + + /* stem(result, analysis, n) - get stems from a morph. analysis + * example: + * char ** result, result2; + * int n1 = Hunspell_analyze(result, "words"); + * int n2 = Hunspell_stem2(result2, result, n1); + */ + +DLL int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + +DLL int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2); + + /* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = Hunspell_generate2(result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + +DLL int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +DLL int Hunspell_add(Hunhandle *pHunspell, const char * word); + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +DLL int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, const char * example); + + /* remove word from the run-time dictionary */ + +DLL int Hunspell_remove(Hunhandle *pHunspell, const char * word); + + /* free suggestion lists */ + +DLL void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n); #ifdef __cplusplus } diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.hxx b/chrome/third_party/hunspell/src/hunspell/hunspell.hxx index bc6f7d5..bb26b5b 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.hxx +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.hxx @@ -1,30 +1,20 @@ -#include "license.hunspell" -#include "license.myspell" - #include "hashmgr.hxx" #include "affixmgr.hxx" #include "suggestmgr.hxx" -#include "csutil.hxx" #include "langnum.hxx" #define SPELL_COMPOUND (1 << 0) #define SPELL_FORBIDDEN (1 << 1) +#define SPELL_ALLCAP (1 << 2) +#define SPELL_NOCAP (1 << 3) +#define SPELL_INITCAP (1 << 4) -#define NOCAP 0 -#define INITCAP 1 -#define ALLCAP 2 -#define HUHCAP 3 -#define HUHINITCAP 4 +#define SPELL_XML "<?xml?>" +#define MAXDIC 20 #define MAXSUGGESTION 15 #define MAXSHARPS 5 -#if defined(W32) && defined(LIBRARY) -#define DLLTEST2_API __declspec(dllexport) -#else -#define DLLTEST2_API -#endif - #ifndef _MYSPELLMGR_HXX_ #define _MYSPELLMGR_HXX_ @@ -32,15 +22,27 @@ #include "chrome/third_party/hunspell/google/bdict_reader.h" #endif -#ifdef W32 -class DLLTEST2_API Hunspell +#ifdef HUNSPELL_STATIC + #define DLLEXPORT +#else + #ifdef HUNSPELL_EXPORTS + #define DLLEXPORT __declspec( dllexport ) + #else + #define DLLEXPORT __declspec( dllimport ) + #endif +#endif + +#ifdef WIN32 +class DLLEXPORT Hunspell #else class Hunspell #endif { AffixMgr* pAMgr; - HashMgr* pHMgr; + HashMgr* pHMgr[MAXDIC]; + int maxdic; SuggestMgr* pSMgr; + char * affixpath; char * encoding; struct cs_info * csconv; int langnum; @@ -61,11 +63,16 @@ public: #ifdef HUNSPELL_CHROME_CLIENT Hunspell(const unsigned char* bdict_data, size_t bdict_length); #else - Hunspell(FILE* aff_handle, FILE* dic_handle); + Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL); #endif ~Hunspell(); +#ifndef HUNSPELL_CHROME_CLIENT + /* load extra dictionaries (only dic files) */ + int add_dic(const char * dpath, const char * key = NULL); +#endif + /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word * @@ -87,17 +94,62 @@ public: */ int suggest(char*** slst, const char * word); + + /* deallocate suggestion lists */ + + void free_list(char *** slst, int n); + char * get_dic_encoding(); - /* handling custom dictionary */ + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + + int analyze(char*** slst, const char * word); + + /* stem(result, word) - stemmer function */ - int put_word(const char * word); + int stem(char*** slst, const char * word); + + /* stem(result, analysis, n) - get stems from a morph. analysis + * example: + * char ** result, result2; + * int n1 = analyze(&result, "words"); + * int n2 = stem(&result2, result, n1); + */ + + int stem(char*** slst, char ** morph, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + + int generate(char*** slst, const char * word, const char * word2); - /* pattern is a sample dictionary word - * put word into custom dictionary with affix flags of pattern word + /* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = generate(&result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + + int generate(char*** slst, const char * word, char ** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + + int add(const char * word); + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. */ - int put_word_pattern(const char * word, const char * pattern); + int add_with_affix(const char * word, const char * example); + + /* remove word from the run-time dictionary */ + + int remove(const char * word); /* other */ @@ -107,25 +159,14 @@ public: struct cs_info * get_csconv(); const char * get_version(); - - /* experimental functions */ + + /* experimental and deprecated functions */ #ifdef HUNSPELL_EXPERIMENTAL - /* suffix is an affix flag string, similarly in dictionary files */ - + /* suffix is an affix flag string, similarly in dictionary files */ int put_word_suffix(const char * word, const char * suffix); - - /* morphological analysis */ - - char * morph(const char * word); - int analyze(char*** out, const char *word); - char * morph_with_correction(const char * word); - /* stemmer function */ - - int stem(char*** slst, const char * word); - /* spec. suggestions */ int suggest_auto(char*** slst, const char * word); int suggest_pos_stems(char*** slst, const char * word); @@ -146,8 +187,14 @@ private: char * sharps_u8_l1(char * dest, char * source); hentry * spellsharps(char * base, char *, int, int, char * tmp, int * info, char **root); int is_keepcase(const hentry * rv); - int insert_sug(char ***slst, char * word, int *ns); - + int insert_sug(char ***slst, char * word, int ns); + void cat_result(char * result, char * st); + char * stem_description(const char * desc); + int spellml(char*** slst, const char * word); + int get_xml_par(char * dest, const char * par, int maxl); + const char * get_xml_pos(const char * s, const char * attr); + int get_xml_list(char ***slst, char * list, const char * tag); + int check_xml_par(const char * q, const char * attr, const char * value); }; diff --git a/chrome/third_party/hunspell/src/hunspell/hunzip.cxx b/chrome/third_party/hunspell/src/hunspell/hunzip.cxx new file mode 100644 index 0000000..f9091b8 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/hunzip.cxx @@ -0,0 +1,196 @@ +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "hunzip.hxx" + +#define CODELEN 65536 +#define BASEBITREC 5000 + +#define UNCOMPRESSED '\002' +#define MAGIC "hz0" +#define MAGIC_ENCRYPT "hz1" +#define MAGICLEN (sizeof(MAGIC) - 1) + +int Hunzip::fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return -1; +} + +Hunzip::Hunzip(const char * file, const char * key) { + bufsiz = 0; + lastbit = 0; + inc = 0; + outc = 0; + dec = NULL; + filename = (char *) malloc(strlen(file) + 1); + if (filename) strcpy(filename, file); + if (getcode(key) == -1) bufsiz = -1; + else bufsiz = getbuf(); +} + +int Hunzip::getcode(const char * key) { + unsigned char c[2]; + int i, j, n, p; + int allocatedbit = BASEBITREC; + const char * enc = key; + + fin = fopen(filename, "rb"); + if (!fin) return -1; + + // read magic number + if ((fread(in, 1, 3, fin) < MAGICLEN) + || !(strncmp(MAGIC, in, MAGICLEN) == 0 || + strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) { + return fail(MSG_FORMAT, filename); + } + + // check encryption + if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) { + unsigned char cs; + if (!key) return fail(MSG_KEY, filename); + if (fread(&c, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); + for (cs = 0; *enc; enc++) cs ^= *enc; + if (cs != c[0]) return fail(MSG_KEY, filename); + enc = key; + } else key = NULL; + + // read record count + if (fread(&c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); + + if (key) { + c[0] ^= *enc; + if (*(++enc) == '\0') enc = key; + c[1] ^= *enc; + } + + n = ((int) c[0] << 8) + c[1]; + dec = (struct bit *) malloc(BASEBITREC * sizeof(struct bit)); + if (!dec) return fail(MSG_MEMORY, filename); + dec[0].v[0] = 0; + dec[0].v[1] = 0; + + // read codes + for (i = 0; i < n; i++) { + unsigned char l; + if (fread(c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') enc = key; + c[0] ^= *enc; + if (*(++enc) == '\0') enc = key; + c[1] ^= *enc; + } + if (fread(&l, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') enc = key; + l ^= *enc; + } + if (fread(in, 1, l/8+1, fin) < (size_t) l/8+1) return fail(MSG_FORMAT, filename); + if (key) for (j = 0; j <= l/8; j++) { + if (*(++enc) == '\0') enc = key; + in[j] ^= *enc; + } + p = 0; + for (j = 0; j < l; j++) { + int b = (in[j/8] & (1 << (7 - (j % 8)))) ? 1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + lastbit++; + if (lastbit == allocatedbit) { + allocatedbit += BASEBITREC; + dec = (struct bit *) realloc(dec, allocatedbit * sizeof(struct bit)); + } + dec[lastbit].v[0] = 0; + dec[lastbit].v[1] = 0; + dec[oldp].v[b] = lastbit; + p = lastbit; + } + } + dec[p].c[0] = c[0]; + dec[p].c[1] = c[1]; + } + return 0; +} + +Hunzip::~Hunzip() +{ + if (dec) free(dec); + if (fin) fclose(fin); + if (filename) free(filename); +} + +int Hunzip::getbuf() { + int p = 0; + int o = 0; + do { + if (inc == 0) inbits = fread(in, 1, BUFSIZE, fin) * 8; + for (; inc < inbits; inc++) { + int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + if (oldp == lastbit) { + fclose(fin); + fin = NULL; + // add last odd byte + if (dec[lastbit].c[0]) out[o++] = dec[lastbit].c[1]; + return o; + } + out[o++] = dec[oldp].c[0]; + out[o++] = dec[oldp].c[1]; + if (o == BUFSIZE) return o; + p = dec[p].v[b]; + } + } + inc = 0; + } while (inbits == BUFSIZE * 8); + return fail(MSG_FORMAT, filename); +} + +const char * Hunzip::getline() { + char linebuf[BUFSIZE]; + int l = 0, eol = 0, left = 0, right = 0; + if (bufsiz == -1) return NULL; + while (l < bufsiz && !eol) { + linebuf[l++] = out[outc]; + switch (out[outc]) { + case '\t': break; + case 31: { // escape + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + linebuf[l - 1] = out[outc]; + break; + } + case ' ': break; + default: if (((unsigned char) out[outc]) < 47) { + if (out[outc] > 32) { + right = out[outc] - 31; + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + } + if (out[outc] == 30) left = 9; else left = out[outc]; + linebuf[l-1] = '\n'; + eol = 1; + } + } + if (++outc == bufsiz) { + outc = 0; + bufsiz = fin ? getbuf(): -1; + } + } + if (right) strcpy(linebuf + l - 1, line + strlen(line) - right - 1); + else linebuf[l] = '\0'; + strcpy(line + left, linebuf); + return line; +} diff --git a/chrome/third_party/hunspell/src/hunspell/hunzip.hxx b/chrome/third_party/hunspell/src/hunspell/hunzip.hxx new file mode 100644 index 0000000..52109d1 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/hunzip.hxx @@ -0,0 +1,41 @@ +/* hunzip: file decompression for sorted dictionaries with optional encryption, + * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */ + +#ifndef _HUNZIP_HXX_ +#define _HUNZIP_HXX_ + +#define BUFSIZE 65536 +#define HZIP_EXTENSION ".hz" + +#define MSG_OPEN "error: %s: cannot open\n" +#define MSG_FORMAT "error: %s: not in hzip format\n" +#define MSG_MEMORY "error: %s: missing memory\n" +#define MSG_KEY "error: %s: missing or bad password\n" + +struct bit { + unsigned char c[2]; + int v[2]; +}; + +class Hunzip +{ + +protected: + char * filename; + FILE * fin; + int bufsiz, lastbit, inc, inbits, outc; + struct bit * dec; // code table + char in[BUFSIZE]; // input buffer + char out[BUFSIZE + 1]; // Huffman-decoded buffer + char line[BUFSIZE + 50]; // decoded line + int getcode(const char * key); + int getbuf(); + int fail(const char * err, const char * par); + +public: + Hunzip(const char * filename, const char * key = NULL); + ~Hunzip(); + const char * getline(); +}; + +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/license.hunspell b/chrome/third_party/hunspell/src/hunspell/license.hunspell index f1cf161..3afee61 100644 --- a/chrome/third_party/hunspell/src/hunspell/license.hunspell +++ b/chrome/third_party/hunspell/src/hunspell/license.hunspell @@ -14,7 +14,7 @@ * The Original Code is Hunspell, based on MySpell. * * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Kevin Hendricks (MySpell) and Laszlo Nemeth (Hunspell). * Portions created by the Initial Developers are Copyright (C) 2002-2005 * the Initial Developers. All Rights Reserved. * @@ -24,22 +24,22 @@ * Giuseppe Modugno * Gianluca Turconi * Simon Brouwer - * Noll János - * Bíró Árpád - * Goldman Eleonóra - * Sarlós Tamás - * Bencsáth Boldizsár - * Halácsy Péter - * Dvornik László - * Gefferth András + * Noll Janos + * Biro Arpad + * Goldman Eleonora + * Sarlos Tamas + * Bencsath Boldizsar + * Halacsy Peter + * Dvornik Laszlo + * Gefferth Andras * Nagy Viktor - * Varga Dániel + * Varga Daniel * Chris Halls * Rene Engelhard * Bram Moolenaar * Dafydd Jones - * Harri Pitkänen - * András Tímár + * Harri Pitkanen + * Andras Timar * Tor Lillqvist * * Alternatively, the contents of this file may be used under the terms of diff --git a/chrome/third_party/hunspell/src/hunspell/phonet.cxx b/chrome/third_party/hunspell/src/hunspell/phonet.cxx new file mode 100644 index 0000000..ca20796 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/phonet.cxx @@ -0,0 +1,299 @@ +/* phonetic.c - generic replacement aglogithms for phonetic transformation + Copyright (C) 2000 Bjoern Jacke + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation; + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; If not, see + <http://www.gnu.org/licenses/>. + + Changelog: + + 2000-01-05 Bjoern Jacke <bjoern at j3e.de> + Initial Release insprired by the article about phonetic + transformations out of c't 25/1999 + + 2007-07-26 Bjoern Jacke <bjoern at j3e.de> + Released under MPL/GPL/LGPL tri-license for Hunspell + + 2007-08-23 Laszlo Nemeth <nemeth at OOo> + Porting from Aspell to Hunspell using C-like structs +*/ + +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#include <cctype> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> +#endif + +#include "csutil.hxx" +#include "phonet.hxx" + +void init_phonet_hash(phonetable & parms) + { + int i, k; + + for (i = 0; i < HASHSIZE; i++) { + parms.hash[i] = -1; + } + + for (i = 0; parms.rules[i][0] != '\0'; i += 2) { + /** set hash value **/ + k = (unsigned char) parms.rules[i][0]; + + if (parms.hash[k] < 0) { + parms.hash[k] = i; + } + } + } + + // like strcpy but safe if the strings overlap + // but only if dest < src + static inline void strmove(char * dest, char * src) { + while (*src) + *dest++ = *src++; + *dest = '\0'; + } + +int myisalpha(char ch) { + if ((unsigned char) ch < 128) return isalpha(ch); + return 1; +} + +/* phonetic transcription algorithm */ +/* see: http://aspell.net/man-html/Phonetic-Code.html */ +/* convert string to uppercase before this call */ +int phonet (const char * inword, char * target, + int len, + phonetable & parms) + { + /** Do phonetic transformation. **/ + /** "len" = length of "inword" incl. '\0'. **/ + + /** result: >= 0: length of "target" **/ + /** otherwise: error **/ + + int i,j,k=0,n,p,z; + int k0,n0,p0=-333,z0; + char c, c0; + const char * s; + typedef unsigned char uchar; + char word[MAXPHONETUTF8LEN + 1]; + if (len == -1) len = strlen(inword); + if (len > MAXPHONETUTF8LEN) return 0; + strcpy(word, inword); + + /** check word **/ + i = j = z = 0; + while ((c = word[i]) != '\0') { + n = parms.hash[(uchar) c]; + z0 = 0; + + if (n >= 0) { + /** check all rules for the same letter **/ + while (parms.rules[n][0] == c) { + + /** check whole string **/ + k = 1; /** number of found letters **/ + p = 5; /** default priority **/ + s = parms.rules[n]; + s++; /** important for (see below) "*(s-1)" **/ + + while (*s != '\0' && word[i+k] == *s + && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) { + k++; + s++; + } + if (*s == '(') { + /** check letters in "(..)" **/ + if (myisalpha(word[i+k]) // ...could be implied? + && strchr(s+1, word[i+k]) != NULL) { + k++; + while (*s != ')') + s++; + s++; + } + } + p0 = (int) *s; + k0 = k; + while (*s == '-' && k > 1) { + k--; + s++; + } + if (*s == '<') + s++; + if (isdigit ((unsigned char) *s)) { + /** determine priority **/ + p = *s - '0'; + s++; + } + if (*s == '^' && *(s+1) == '^') + s++; + + if (*s == '\0' + || (*s == '^' + && (i == 0 || ! myisalpha(word[i-1])) + && (*(s+1) != '$' + || (! myisalpha(word[i+k0]) ))) + || (*s == '$' && i > 0 + && myisalpha(word[i-1]) + && (! myisalpha(word[i+k0]) ))) + { + /** search for followup rules, if: **/ + /** parms.followup and k > 1 and NO '-' in searchstring **/ + c0 = word[i+k-1]; + n0 = parms.hash[(uchar) c0]; + +// if (parms.followup && k > 1 && n0 >= 0 + if (k > 1 && n0 >= 0 + && p0 != (int) '-' && word[i+k] != '\0') { + /** test follow-up rule for "word[i+k]" **/ + while (parms.rules[n0][0] == c0) { + + /** check whole string **/ + k0 = k; + p0 = 5; + s = parms.rules[n0]; + s++; + while (*s != '\0' && word[i+k0] == *s + && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) { + k0++; + s++; + } + if (*s == '(') { + /** check letters **/ + if (myisalpha(word[i+k0]) + && strchr (s+1, word[i+k0]) != NULL) { + k0++; + while (*s != ')' && *s != '\0') + s++; + if (*s == ')') + s++; + } + } + while (*s == '-') { + /** "k0" gets NOT reduced **/ + /** because "if (k0 == k)" **/ + s++; + } + if (*s == '<') + s++; + if (isdigit ((unsigned char) *s)) { + p0 = *s - '0'; + s++; + } + + if (*s == '\0' + /** *s == '^' cuts **/ + || (*s == '$' && ! myisalpha(word[i+k0]))) + { + if (k0 == k) { + /** this is just a piece of the string **/ + n0 += 2; + continue; + } + + if (p0 < p) { + /** priority too low **/ + n0 += 2; + continue; + } + /** rule fits; stop search **/ + break; + } + n0 += 2; + } /** End of "while (parms.rules[n0][0] == c0)" **/ + + if (p0 >= p && parms.rules[n0][0] == c0) { + n += 2; + continue; + } + } /** end of follow-up stuff **/ + + /** replace string **/ + s = parms.rules[n+1]; + p0 = (parms.rules[n][0] != '\0' + && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0; + if (p0 == 1 && z == 0) { + /** rule with '<' is used **/ + if (j > 0 && *s != '\0' + && (target[j-1] == c || target[j-1] == *s)) { + j--; + } + z0 = 1; + z = 1; + k0 = 0; + while (*s != '\0' && word[i+k0] != '\0') { + word[i+k0] = *s; + k0++; + s++; + } + if (k > k0) + strmove (&word[0]+i+k0, &word[0]+i+k); + + /** new "actual letter" **/ + c = word[i]; + } + else { /** no '<' rule used **/ + i += k - 1; + z = 0; + while (*s != '\0' + && *(s+1) != '\0' && j < len) { + if (j == 0 || target[j-1] != *s) { + target[j] = *s; + j++; + } + s++; + } + /** new "actual letter" **/ + c = *s; + if (parms.rules[n][0] != '\0' + && strstr (parms.rules[n]+1, "^^") != NULL) { + if (c != '\0') { + target[j] = c; + j++; + } + strmove (&word[0], &word[0]+i+1); + i = 0; + z0 = 1; + } + } + break; + } /** end of follow-up stuff **/ + n += 2; + } /** end of while (parms.rules[n][0] == c) **/ + } /** end of if (n >= 0) **/ + if (z0 == 0) { +// if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' +// && (!parms.collapse_result || j == 0 || target[j-1] != c)){ + if (k && !p0 && j < len && c != '\0' + && (1 || j == 0 || target[j-1] != c)){ + /** condense only double letters **/ + target[j] = c; + ///printf("\n setting \n"); + j++; + } + + i++; + z = 0; + k=0; + } + } /** end of while ((c = word[i]) != '\0') **/ + + target[j] = '\0'; + return (j); + + } /** end of function "phonet" **/ diff --git a/chrome/third_party/hunspell/src/hunspell/phonet.hxx b/chrome/third_party/hunspell/src/hunspell/phonet.hxx new file mode 100644 index 0000000..d1cf995 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/phonet.hxx @@ -0,0 +1,50 @@ +/* phonetic.c - generic replacement aglogithms for phonetic transformation + Copyright (C) 2000 Bjoern Jacke + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation; + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; If not, see + <http://www.gnu.org/licenses/>. + + Changelog: + + 2000-01-05 Bjoern Jacke <bjoern at j3e.de> + Initial Release insprired by the article about phonetic + transformations out of c't 25/1999 + + 2007-07-26 Bjoern Jacke <bjoern at j3e.de> + Released under MPL/GPL/LGPL tri-license for Hunspell + + 2007-08-23 Laszlo Nemeth <nemeth at OOo> + Porting from Aspell to Hunspell using C-like structs +*/ + +#ifndef __PHONETHXX__ +#define __PHONETHXX__ + +#define HASHSIZE 256 +#define MAXPHONETLEN 256 +#define MAXPHONETUTF8LEN (MAXPHONETLEN * 4) + +struct phonetable { + char utf8; + cs_info * lang; + int num; + char * * rules; + int hash[HASHSIZE]; +}; + +void init_phonet_hash(phonetable & parms); + +int phonet (const char * inword, char * target, + int len, phonetable & phone); + +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/replist.cxx b/chrome/third_party/hunspell/src/hunspell/replist.cxx new file mode 100644 index 0000000..7846470 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/replist.cxx @@ -0,0 +1,95 @@ +#include "license.hunspell" +#include "license.myspell" + +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "replist.hxx" +#include "csutil.hxx" + +RepList::RepList(int n) { + dat = (replentry **) malloc(sizeof(replentry *) * n); + if (dat == 0) size = 0; else size = n; + pos = 0; +} + +RepList::~RepList() +{ + for (int i = 0; i < pos; i++) { + free(dat[i]->pattern); + free(dat[i]->pattern2); + free(dat[i]); + } + free(dat); +} + +int RepList::get_pos() { + return pos; +} + +replentry * RepList::item(int n) { + return dat[n]; +} + +int RepList::near(const char * word) { + int p1 = 0; + int p2 = pos; + while ((p2 - p1) > 1) { + int m = (p1 + p2) / 2; +// fprintf(stderr, "m: %d p1: %d p2: %d dat: %s\n", m, p1, p2, dat[m]->pattern); + int c = strcmp(word, dat[m]->pattern); + if (c <= 0) { + if (c < 0) p2 = m; else p1 = p2 = m; + } else p1 = m; + } +// fprintf(stderr, "NEAR: %s (word: %s)\n", dat[p1]->pattern, word); + return p1; +} + +int RepList::match(const char * word, int n) { + if (strncmp(word, dat[n]->pattern, strlen(dat[n]->pattern)) == 0) return strlen(dat[n]->pattern); + return 0; +} + +int RepList::add(char * pat1, char * pat2) { + if (pos >= size || pat1 == NULL || pat2 == NULL) return 1; + replentry * r = (replentry *) malloc(sizeof(replentry)); + if (r == NULL) return 1; + r->pattern = mystrrep(pat1, "_", " "); + r->pattern2 = mystrrep(pat2, "_", " "); + dat[pos++] = r; + for (int i = pos - 1; i > 0; i--) { + r = dat[i]; + if (strcmp(r->pattern, dat[i - 1]->pattern) < 0) { + dat[i] = dat[i - 1]; + dat[i - 1] = r; + } else break; + } + return 0; +} + +int RepList::conv(const char * word, char * dest) { + int stl = 0; + int change = 0; +// for (int i = 0; i < pos; i++) fprintf(stderr, "%d. %s\n", i, dat[i]->pattern); + for (int i = 0; i < strlen(word); i++) { + int n = near(word + i); + int l = match(word + i, n); + if (l) { + strcpy(dest + stl, dat[n]->pattern2); + stl += strlen(dat[n]->pattern2); + i += l - 1; + change = 1; + } else dest[stl++] = word[i]; + } + dest[stl] = '\0'; +// fprintf(stderr, "i: %s o: %s change: %d\n", word, dest, change); + return change; +} diff --git a/chrome/third_party/hunspell/src/hunspell/replist.hxx b/chrome/third_party/hunspell/src/hunspell/replist.hxx new file mode 100644 index 0000000..16da313 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/replist.hxx @@ -0,0 +1,29 @@ +/* string replacement list class */ +#ifndef _REPLIST_HXX_ +#define _REPLIST_HXX_ +#ifdef HUNSPELL_CHROME_CLIENT +// Compilation issues in spellchecker.cc think near is a macro, therefore +// removing it here solves that problem. +#undef near +#endif +#include "w_char.hxx" + +class RepList +{ +protected: + replentry ** dat; + int size; + int pos; + +public: + RepList(int n); + ~RepList(); + + int get_pos(); + int add(char * pat1, char * pat2); + replentry * item(int n); + int near(const char * word); + int match(const char * word, int n); + int conv(const char * word, char * dest); +}; +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx b/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx index 222701b..5914b6a 100644 --- a/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx @@ -14,13 +14,16 @@ #endif #include "suggestmgr.hxx" +#include "htypes.hxx" +#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif +const w_char W_VLINE = { '\0', '|' }; SuggestMgr::SuggestMgr(const char * tryme, int maxn, AffixMgr * aptr) @@ -30,36 +33,54 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, // try when building candidate suggestions pAMgr = aptr; + ckeyl = 0; + ckey = NULL; + ckey_utf = NULL; + ctryl = 0; ctry = NULL; ctry_utf = NULL; + utf8 = 0; + langnum = 0; + complexprefixes = 0; + maxSug = maxn; nosplitsugs = 0; maxngramsugs = MAXNGRAMSUGS; - utf8 = 0; - complexprefixes = 0; - if (pAMgr) { char * enc = pAMgr->get_encoding(); csconv = get_current_cs(enc); free(enc); + langnum = pAMgr->get_langnum(); + ckey = pAMgr->get_key_string(); nosplitsugs = pAMgr->get_nosplitsugs(); if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs(); utf8 = pAMgr->get_utf8(); complexprefixes = pAMgr->get_complexprefixes(); } - if (tryme) { + if (ckey) { if (utf8) { w_char t[MAXSWL]; + ckeyl = u8_u16(t, MAXSWL, ckey); + ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char)); + if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char)); + } else { + ckeyl = strlen(ckey); + } + } + + if (tryme) { + ctry = mystrdup(tryme); + if (ctry) ctryl = strlen(ctry); + if (ctry && utf8) { + w_char t[MAXSWL]; ctryl = u8_u16(t, MAXSWL, tryme); ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char)); - memcpy(ctry_utf, t, ctryl * sizeof(w_char)); - } else { - ctry = mystrdup(tryme); - ctryl = strlen(ctry); + if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char)); + else ctryl = 0; } } } @@ -68,6 +89,11 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, SuggestMgr::~SuggestMgr() { pAMgr = NULL; + if (ckey) free(ckey); + ckey = NULL; + if (ckey_utf) free(ckey_utf); + ckey_utf = NULL; + ckeyl = 0; if (ctry) free(ctry); ctry = NULL; if (ctry_utf) free(ctry_utf); @@ -77,7 +103,7 @@ SuggestMgr::~SuggestMgr() } int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, - int * timer, time_t * timelimit) { + int * timer, clock_t * timelimit) { int cwrd = 1; if (ns == maxSug) return maxSug; for (int k=0; k < ns; k++) { @@ -96,13 +122,15 @@ int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int // generate suggestions for a mispelled word // pass in address of array of char * pointers +// onlycompoundsug: probably bad suggestions (need for ngram sugs, too) -int SuggestMgr::suggest(char*** slst, const char * w, int nsug) +int SuggestMgr::suggest(char*** slst, const char * w, int nsug, + int * onlycompoundsug) { - int nocompoundtwowords = 0; - char ** wlst; - w_char word_utf[MAXSWL]; - int wl = 0; + int nocompoundtwowords = 0; + char ** wlst; + w_char word_utf[MAXSWL]; + int wl = 0; char w2[MAXWORDUTF8LEN]; const char * word = w; @@ -141,8 +169,8 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) nsug = replchars(wlst, word, nsug, cpdsuggest); // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) { - nsug = mapchars(wlst, word, nsug); + if ((nsug < maxSug) && (nsug > -1)) { + nsug = mapchars(wlst, word, nsug, cpdsuggest); } // did we swap the order of chars by mistake @@ -157,6 +185,22 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) longswapchar(wlst, word, nsug, cpdsuggest); } + // did we just hit the wrong key in place of a good char (case and keyboard) + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + badcharkey(wlst, word, nsug, cpdsuggest); + } + + // only suggest compound words when no other suggestion + if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1; + + // did we add a char that should not be there + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + extrachar(wlst, word, nsug, cpdsuggest); + } + + // did we forgot a char if ((nsug < maxSug) && (nsug > -1)) { nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : @@ -169,12 +213,6 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) movechar(wlst, word, nsug, cpdsuggest); } - // did we add a char that should not be there - if ((nsug < maxSug) && (nsug > -1)) { - nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : - extrachar(wlst, word, nsug, cpdsuggest); - } - // did we just hit the wrong key in place of a good char if ((nsug < maxSug) && (nsug > -1)) { nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : @@ -187,10 +225,6 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) doubletwochars(wlst, word, nsug, cpdsuggest); } - - // only suggest compound words when no other suggestion - if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; - // perhaps we forgot to hit space and two words ran together if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) { nsug = twowords(wlst, word, nsug, cpdsuggest); @@ -205,6 +239,8 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) free(wlst); wlst = NULL; } + + if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1; *slst = wlst; return nsug; @@ -242,8 +278,8 @@ int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) nsug = replchars(wlst, word, nsug, cpdsuggest); // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) - nsug = mapchars(wlst, word, nsug); + if ((nsug < maxSug) && (nsug > -1)) + nsug = mapchars(wlst, word, nsug, cpdsuggest); if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; @@ -273,7 +309,7 @@ int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, char candidate[MAXSWUTF8L]; w_char candidate_utf[MAXSWL]; memcpy(candidate_utf, word, wl * sizeof(w_char)); - mkallcap_utf(candidate_utf, wl, pAMgr->get_langnum()); + mkallcap_utf(candidate_utf, wl, langnum); u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); } @@ -288,9 +324,9 @@ int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest) } // suggestions for when chose the wrong char out of a related set -int SuggestMgr::mapchars(char** wlst, const char * word, int ns) +int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) { - time_t timelimit; + clock_t timelimit; int timer; int wl = strlen(word); @@ -300,18 +336,19 @@ int SuggestMgr::mapchars(char** wlst, const char * word, int ns) struct mapentry* maptable = pAMgr->get_maptable(); if (maptable==NULL) return ns; - timelimit = time(NULL); + timelimit = clock(); timer = MINTIMER; if (utf8) { w_char w[MAXSWL]; int len = u8_u16(w, MAXSWL, word); - ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit); - } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit); + ns = map_related_utf(w, len, 0, cpdsuggest, wlst, ns, maptable, nummap, &timer, &timelimit); + } else ns = map_related(word, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit); return ns; } -int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, - const mapentry* maptable, int nummap, int * timer, time_t * timelimit) +int SuggestMgr::map_related(const char * word, int i, char** wlst, + int cpdsuggest, int ns, + const mapentry* maptable, int nummap, int * timer, clock_t * timelimit) { char c = *(word + i); if (c == 0) { @@ -319,8 +356,7 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, int wl = strlen(word); for (int m=0; m < ns; m++) if (strcmp(word,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && (checkword(word, wl, 0, timer, timelimit) || - checkword(word, wl, 1, timer, timelimit))) { + if ((cwrd) && checkword(word, wl, cpdsuggest, timer, timelimit)) { if (ns < maxSug) { wlst[ns] = mystrdup(word); if (wlst[ns] == NULL) return -1; @@ -334,23 +370,27 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, if (strchr(maptable[j].set,c) != 0) { in_map = 1; char * newword = mystrdup(word); + if (!newword) return -1; for (int k = 0; k < maptable[j].len; k++) { *(newword + i) = *(maptable[j].set + k); - ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit); - if (!(*timelimit)) return ns; + ns = map_related(newword, (i+1), wlst, cpdsuggest, + ns, maptable, nummap, timer, timelimit); + if (!(*timer)) return ns; } free(newword); } } if (!in_map) { i++; - ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit); + ns = map_related(word, i, wlst, cpdsuggest, + ns, maptable, nummap, timer, timelimit); } return ns; } -int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns, - const mapentry* maptable, int nummap, int * timer, time_t * timelimit) +int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest, + char** wlst, int ns, const mapentry* maptable, int nummap, + int * timer, clock_t * timelimit) { if (i == len) { int cwrd = 1; @@ -360,8 +400,7 @@ int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int wl = strlen(s); for (int m=0; m < ns; m++) if (strcmp(s,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && (checkword(s, wl, 0, timer, timelimit) || - checkword(s, wl, 1, timer, timelimit))) { + if ((cwrd) && checkword(s, wl, cpdsuggest, timer, timelimit)) { if (ns < maxSug) { wlst[ns] = mystrdup(s); if (wlst[ns] == NULL) return -1; @@ -377,15 +416,17 @@ int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int in_map = 1; for (int k = 0; k < maptable[j].len; k++) { *(word + i) = *(maptable[j].set_utf16 + k); - ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit); - if (!(*timelimit)) return ns; + ns = map_related_utf(word, len, i + 1, cpdsuggest, + wlst, ns, maptable, nummap, timer, timelimit); + if (!(*timer)) return ns; } *((unsigned short *) word + i) = c; } } if (!in_map) { i++; - ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit); + ns = map_related_utf(word, len, i, cpdsuggest, + wlst, ns, maptable, nummap, timer, timelimit); } return ns; } @@ -416,6 +457,23 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest strcpy(candidate+(r-word)+lenr, r+lenp); ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; + // check REP suggestions with space + char * sp = strchr(candidate, ' '); + if (sp) { + *sp = '\0'; + if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) { + int oldns = ns; + *sp = ' '; + ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + if (oldns < ns) { + free(wlst[ns - 1]); + wlst[ns - 1] = mystrdup(candidate); + if (!wlst[ns - 1]) return -1; + } + } + *sp = ' '; + } r++; // search for the next letter } } @@ -454,7 +512,7 @@ int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in int state=0; if (wl < 5 || ! pAMgr) return ns; for (int i=2; i < wl; i++) { - if ((word[i].l==word[i-2].l) && (word[i].h==word[i-2].h)) { + if (w_char_eq(word[i], word[i-2])) { state++; if (state==3) { memcpy(candidate_utf, word, (i - 1) * sizeof(w_char)); @@ -471,25 +529,108 @@ int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in return ns; } +// error is wrong char in place of correct one (case and keyboard related version) +int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest) +{ + char tmpc; + char candidate[MAXSWUTF8L]; + int wl = strlen(word); + strcpy(candidate, word); + // swap out each char one by one and try uppercase and neighbor + // keyboard chars in its place to see if that makes a good word + + for (int i=0; i < wl; i++) { + tmpc = candidate[i]; + // check with uppercase letters + candidate[i] = csconv[((unsigned char)tmpc)].cupper; + if (tmpc != candidate[i]) { + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + candidate[i] = tmpc; + } + // check neighbor characters in keyboard string + if (!ckey) continue; + char * loc = strchr(ckey, tmpc); + while (loc) { + if ((loc > ckey) && (*(loc - 1) != '|')) { + candidate[i] = *(loc - 1); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { + candidate[i] = *(loc + 1); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + loc = strchr(loc + 1, tmpc); + } + candidate[i] = tmpc; + } + return ns; +} + +// error is wrong char in place of correct one (case and keyboard related version) +int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) +{ + w_char tmpc; + w_char candidate_utf[MAXSWL]; + char candidate[MAXSWUTF8L]; + memcpy(candidate_utf, word, wl * sizeof(w_char)); + // swap out each char one by one and try all the tryme + // chars in its place to see if that makes a good word + for (int i=0; i < wl; i++) { + tmpc = candidate_utf[i]; + // check with uppercase letters + mkallcap_utf(candidate_utf + i, 1, langnum); + if (!w_char_eq(tmpc, candidate_utf[i])) { + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + candidate_utf[i] = tmpc; + } + // check neighbor characters in keyboard string + if (!ckey) continue; + w_char * loc = ckey_utf; + while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++; + while (loc < (ckey_utf + ckeyl)) { + if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) { + candidate_utf[i] = *(loc - 1); + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) { + candidate_utf[i] = *(loc + 1); + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)); + } + candidate_utf[i] = tmpc; + } + return ns; +} + // error is wrong char in place of correct one int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char tmpc; char candidate[MAXSWUTF8L]; - time_t timelimit = time(NULL); + clock_t timelimit = clock(); int timer = MINTIMER; int wl = strlen(word); strcpy(candidate, word); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int i=0; i < wl; i++) { - tmpc = candidate[i]; - for (int j=0; j < ctryl; j++) { + for (int j=0; j < ctryl; j++) { + for (int i=wl-1; i >= 0; i--) { + tmpc = candidate[i]; if (ctry[j] == tmpc) continue; candidate[i] = ctry[j]; ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; + if (!timer) return ns; candidate[i] = tmpc; } } @@ -502,20 +643,20 @@ int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i w_char tmpc; w_char candidate_utf[MAXSWL]; char candidate[MAXSWUTF8L]; - time_t timelimit = time(NULL); + clock_t timelimit = clock(); int timer = MINTIMER; memcpy(candidate_utf, word, wl * sizeof(w_char)); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int i=0; i < wl; i++) { - tmpc = candidate_utf[i]; - for (int j=0; j < ctryl; j++) { - if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue; + for (int j=0; j < ctryl; j++) { + for (int i=wl-1; i >= 0; i--) { + tmpc = candidate_utf[i]; + if (w_char_eq(tmpc, ctry_utf[j])) continue; candidate_utf[i] = ctry_utf[j]; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; + if (!timer) return ns; candidate_utf[i] = tmpc; } } @@ -525,18 +666,20 @@ int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i // error is word has an extra letter it does not need int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest) { - char candidate[MAXSWUTF8L]; + char candidate[MAXSWUTF8L]; w_char candidate_utf[MAXSWL]; - const w_char * p; - w_char * r; + w_char * p; + w_char tmpc = W_VLINE; // not used value, only for VCC warning message if (wl < 2) return ns; // try omitting one char of word at a time - memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char)); - for (p = word, r = candidate_utf; p < word + wl; ) { - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); + memcpy(candidate_utf, word, wl * sizeof(w_char)); + for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) { + w_char tmpc2 = *p; + if (p < candidate_utf + wl - 1) *p = tmpc; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - *r++ = *p++; + tmpc = tmpc2; } return ns; } @@ -544,47 +687,41 @@ int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, // error is word has an extra letter it does not need int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest) { + char tmpc = '\0'; char candidate[MAXSWUTF8L]; - const char * p; - char * r; + char * p; int wl = strlen(word); if (wl < 2) return ns; // try omitting one char of word at a time - strcpy (candidate, word + 1); - for (p = word, r = candidate; *p != 0; ) { + strcpy (candidate, word); + for (p = candidate + wl - 1; p >=candidate; p--) { + char tmpc2 = *p; + *p = tmpc; ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - *r++ = *p++; + tmpc = tmpc2; } return ns; } - // error is missing a letter it needs int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char candidate[MAXSWUTF8L]; - const char * p; - char * q; - time_t timelimit = time(NULL); + char * p; + clock_t timelimit = clock(); int timer = MINTIMER; int wl = strlen(word); - // try inserting a tryme character before every letter - strcpy(candidate + 1, word); - for (p = word, q = candidate; *p != 0; ) { - for (int i = 0; i < ctryl; i++) { - *q = ctry[i]; + // try inserting a tryme character before every letter (and the null terminator) + for (int i = 0; i < ctryl; i++) { + strcpy(candidate, word); + for (p = candidate + wl; p >= candidate; p--) { + *(p+1) = *p; + *p = ctry[i]; ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; + if (!timer) return ns; } - *q++ = *p++; - } - // now try adding one to end */ - for (int i = 0; i < ctryl; i++) { - *q = ctry[i]; - ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; } return ns; } @@ -594,31 +731,20 @@ int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns { w_char candidate_utf[MAXSWL]; char candidate[MAXSWUTF8L]; - const w_char * p; - w_char * q; - int cwrd; - time_t timelimit = time(NULL); + w_char * p; + clock_t timelimit = clock(); int timer = MINTIMER; - // try inserting a tryme character before every letter - memcpy (candidate_utf + 1, word, wl * sizeof(w_char)); - for (p = word, q = candidate_utf; p < (word + wl); ) { - for (int i = 0; i < ctryl; i++) { - *q = ctry_utf[i]; - cwrd = 1; + // try inserting a tryme character at the end of the word and before every letter + for (int i = 0; i < ctryl; i++) { + memcpy (candidate_utf, word, wl * sizeof(w_char)); + for (p = candidate_utf + wl; p >= candidate_utf; p--) { + *(p + 1) = *p; + *p = ctry_utf[i]; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; - } - *q++ = *p++; - } - // now try adding one to end */ - for (int i = 0; i < ctryl; i++) { - *q = ctry_utf[i]; - cwrd = 1; - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; + if (!timer) return ns; + } } return ns; } @@ -636,19 +762,19 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest int wl=strlen(word); if (wl < 3) return ns; - if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl); + if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); strcpy(candidate + 1, word); - // split the string into two pieces after every char // if both pieces are good words make them a suggestion for (p = candidate + 1; p[1] != '\0'; p++) { p[-1] = *p; // go to end of the UTF-8 character while (utf8 && ((p[1] & 0xc0) == 0x80)) { + *p = p[1]; p++; - p[-1] = *p; } + if (utf8 && p[1] == '\0') break; // last UTF-8 character *p = '\0'; c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL); if (c1) { @@ -657,7 +783,7 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest *p = ' '; // spec. Hungarian code (need a better compound word support) - if ((pAMgr->get_langnum() == LANG_hu) && !forbidden && + if ((langnum == LANG_hu) && !forbidden && // if 3 repeating letter, use - instead of space (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || // or multiple compounding, with more, than 6 syllables @@ -673,6 +799,23 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest ns++; } } else return ns; + // add two word suggestion with dash, if TRY string contains + // "a" or "-" + // NOTE: cwrd doesn't modified for REP twoword sugg. + if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && + mystrlen(p + 1) > 1 && + mystrlen(candidate) - mystrlen(p) > 1) { + *p = '-'; + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if (ns < maxSug) { + if (cwrd) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } + } else return ns; + } } } } @@ -698,6 +841,24 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest p[1] = *p; *p = tmpc; } + // try double swaps for short words + // ahev -> have, owudl -> would + if (wl == 4 || wl == 5) { + candidate[0] = word[1]; + candidate[1] = word[0]; + candidate[2] = word[2]; + candidate[wl - 2] = word[wl - 1]; + candidate[wl - 1] = word[wl - 2]; + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + if (wl == 5) { + candidate[0] = word[0]; + candidate[1] = word[2]; + candidate[2] = word[1]; + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + } return ns; } @@ -708,6 +869,7 @@ int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, char candidate[MAXSWUTF8L]; w_char * p; w_char tmpc; + int len = 0; // try swapping adjacent chars one by one memcpy (candidate_utf, word, wl * sizeof(w_char)); for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) { @@ -715,11 +877,32 @@ int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, *p = p[1]; p[1] = tmpc; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (len == 0) len = strlen(candidate); + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; p[1] = *p; *p = tmpc; } + // try double swaps for short words + // ahev -> have, owudl -> would, suodn -> sound + if (wl == 4 || wl == 5) { + candidate_utf[0] = word[1]; + candidate_utf[1] = word[0]; + candidate_utf[2] = word[2]; + candidate_utf[wl - 2] = word[wl - 1]; + candidate_utf[wl - 1] = word[wl - 2]; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + if (wl == 5) { + candidate_utf[0] = word[0]; + candidate_utf[1] = word[2]; + candidate_utf[2] = word[1]; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + } return ns; } @@ -794,7 +977,7 @@ int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest *(q-1) = *q; *q = tmpc; if ((q-p) < 2) continue; // omit swap char - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } strcpy(candidate, word); @@ -805,7 +988,7 @@ int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest *(q+1) = *q; *q = tmpc; if ((p-q) < 2) continue; // omit swap char - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } strcpy(candidate, word); @@ -830,7 +1013,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, *q = tmpc; if ((q-p) < 2) continue; // omit swap char u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } memcpy (candidate_utf, word, wl * sizeof(w_char)); @@ -842,7 +1025,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, *q = tmpc; if ((p-q) < 2) continue; // omit swap char u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } memcpy (candidate_utf, word, wl * sizeof(w_char)); @@ -851,28 +1034,33 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, } // generate a set of suggestions for very poorly spelled words -int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) +int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md) { int i, j; int lval; - int sc; - int lp; + int sc, scphon; + int lp, lpphon; int nonbmp = 0; - if (!pHMgr) return 0; - // exhaustively search through all root words // keeping track of the MAX_ROOTS most similar root words struct hentry * roots[MAX_ROOTS]; + char * rootsphon[MAX_ROOTS]; int scores[MAX_ROOTS]; + int scoresphon[MAX_ROOTS]; for (i = 0; i < MAX_ROOTS; i++) { roots[i] = NULL; scores[i] = -100 * i; + rootsphon[i] = NULL; + scoresphon[i] = -100 * i; } lp = MAX_ROOTS - 1; - + lpphon = MAX_ROOTS - 1; + scphon = scoresphon[MAX_ROOTS-1]; + char w2[MAXWORDUTF8LEN]; + char f[MAXSWUTF8L]; char * word = w; // word reversing wrapper for complex prefixes @@ -896,8 +1084,8 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) struct hentry* hp = NULL; int col = -1; - - #ifdef HUNSPELL_CHROME_CLIENT + +#ifdef HUNSPELL_CHROME_CLIENT // A static array of hentries required for walking the hash table. struct hentry static_hentry[MAX_ROOTS]; @@ -906,31 +1094,61 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) static const int kMaxWordLen = 128; char hentry_word[MAX_ROOTS][kMaxWordLen]; unsigned short hentry_astr[MAX_ROOTS]; - #endif +#endif - while ((hp = pHMgr->walk_hashtable(col, hp))) { + phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; + char target[MAXSWUTF8L]; + char candidate[MAXSWUTF8L]; + if (ph) { + strcpy(candidate, word); + mkallcap(candidate, csconv); + phonet(candidate, target, n, *ph); + } + + for (i = 0; i < md; i++) { + while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) || + TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) || TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue; - sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE); + + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) + + leftcommonsubstring(word, HENTRY_WORD(hp)); + + // check special pronounciation + if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { + int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) + + leftcommonsubstring(word, f); + if (sc2 > sc) sc = sc2; + } + + if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) { + char target2[MAXSWUTF8L]; + strcpy(candidate, HENTRY_WORD(hp)); + mkallcap(candidate, csconv); + phonet(candidate, target2, -1, *ph); + scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); + } + if (sc > scores[lp]) { scores[lp] = sc; - #ifdef HUNSPELL_CHROME_CLIENT +#ifdef HUNSPELL_CHROME_CLIENT roots[lp] = &static_hentry[lp]; roots[lp]->alen = hp->alen; if (hp->astr) hentry_astr[lp] = *hp->astr; roots[lp]->astr = &hentry_astr[lp]; - roots[lp]->wlen = hp->wlen; - strcpy(&hentry_word[lp][0], hp->word); - roots[lp]->word = &hentry_word[lp][0]; + roots[lp]->blen = hp->blen; + strcpy(&hentry_word[lp][0], &hp->word); + roots[lp]->word = hentry_word[lp][0]; roots[lp]->next = NULL; roots[lp]->next_homonym = NULL; - #else + roots[lp]->var = 0; + roots[lp]->clen = 0; +#else roots[lp] = hp; - #endif - +#endif lval = sc; for (j=0; j < MAX_ROOTS; j++) if (scores[j] < lval) { @@ -938,7 +1156,18 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) lval = scores[j]; } } - } + + if (scphon > scoresphon[lpphon]) { + scoresphon[lpphon] = scphon; + rootsphon[lpphon] = HENTRY_WORD(hp); + lval = scphon; + for (j=0; j < MAX_ROOTS; j++) + if (scoresphon[j] < lval) { + lpphon = j; + lval = scoresphon[j]; + } + } + }} // find minimum threshhold for a passable suggestion // mangle original word three differnt ways @@ -948,11 +1177,11 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) if (utf8) { for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; u16_u8(mw, MAXSWUTF8L, u8, n); - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING); } else { strcpy(mw, word); for (int k=sp; k < n; k+=4) *(mw + k) = '*'; - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING); } } thresh = thresh / 3; @@ -962,9 +1191,11 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // and use length adjusted ngram scores to select // possible suggestions char * guess[MAX_GUESS]; + char * guessorig[MAX_GUESS]; int gscore[MAX_GUESS]; for(i=0;i<MAX_GUESS;i++) { guess[i] = NULL; + guessorig[i] = NULL; gscore[i] = -100 * i; } @@ -974,31 +1205,46 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword)); if (! glst) { if (nonbmp) utf8 = 1; - return 0; + return ns; } for (i = 0; i < MAX_ROOTS; i++) { - if (roots[i]) { struct hentry * rp = roots[i]; - int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen, - rp->astr, rp->alen, word, nc); + int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, + rp->astr, rp->alen, word, nc, + ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL)); for (int k = 0; k < nw ; k++) { - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH); + sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) + + leftcommonsubstring(word, glst[k].word); + if ((sc > thresh)) { if (sc > gscore[lp]) { - if (guess[lp]) free (guess[lp]); + if (guess[lp]) { + free (guess[lp]); + if (guessorig[lp]) { + free(guessorig[lp]); + guessorig[lp] = NULL; + } + } gscore[lp] = sc; guess[lp] = glst[k].word; + guessorig[lp] = glst[k].orig; lval = sc; for (j=0; j < MAX_GUESS; j++) if (gscore[j] < lval) { lp = j; lval = gscore[j]; } - } else free (glst[k].word); - } else free(glst[k].word); + } else { + free(glst[k].word); + if (glst[k].orig) free(glst[k].orig); + } + } else { + free(glst[k].word); + if (glst[k].orig) free(glst[k].orig); + } } } } @@ -1007,7 +1253,9 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // now we are done generating guesses // sort in order of decreasing score - bubblesort(&guess[0], &gscore[0], MAX_GUESS); + + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); + if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); // weight suggestions with a similarity index, based on // the longest common subsequent algorithm and resort @@ -1021,7 +1269,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) if (utf8) { w_char _w[MAXSWL]; len = u8_u16(_w, MAXSWL, guess[i]); - mkallsmall_utf(_w, len, pAMgr->get_langnum()); + mkallsmall_utf(_w, len, langnum); u16_u8(gl, MAXSWUTF8L, _w, len); } else { strcpy(gl, guess[i]); @@ -1039,10 +1287,10 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // heuristic weigthing of ngram scores gscore[i] += - // length of longest common subsequent minus lenght difference + // length of longest common subsequent minus length difference 2 * _lcs - abs((int) (n - len)) + - // weight equal first letter - equalfirstletter(word, gl) + + // weight length of the left common substring + leftcommonsubstring(word, gl) + // weight equal character positions ((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) + // swap character (not neighboring) @@ -1050,25 +1298,84 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) } } - bubblesort(&guess[0], &gscore[0], MAX_GUESS); + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); + +// phonetic version + if (ph) for (i=0; i < MAX_ROOTS; i++) { + if (rootsphon[i]) { + // lowering rootphon[i] + char gl[MAXSWUTF8L]; + int len; + if (utf8) { + w_char _w[MAXSWL]; + len = u8_u16(_w, MAXSWL, rootsphon[i]); + mkallsmall_utf(_w, len, langnum); + u16_u8(gl, MAXSWUTF8L, _w, len); + } else { + strcpy(gl, rootsphon[i]); + mkallsmall(gl, csconv); + len = strlen(rootsphon[i]); + } + + // heuristic weigthing of ngram scores + scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) + + // weight length of the left common substring + leftcommonsubstring(word, gl); + } + } + + if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); // copy over + int oldns = ns; - int ns = 0; int same = 0; for (i=0; i < MAX_GUESS; i++) { if (guess[i]) { - if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { + if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { int unique = 1; - // we have excellent suggestion(s) + // leave only excellent suggestions, if exists if (gscore[i] > 1000) same = 1; - for (j=0; j < ns; j++) + for (j = 0; j < ns; j++) { // don't suggest previous suggestions or a previous suggestion with prefixes or affixes - if (strstr(guess[i], wlst[j]) || + if ((!guessorig[i] && strstr(guess[i], wlst[j])) || + (guessorig[i] && strstr(guessorig[i], wlst[j])) || // check forbidden words !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; - if (unique) wlst[ns++] = guess[i]; else free(guess[i]); - } else free(guess[i]); + } + if (unique) { + wlst[ns++] = guess[i]; + if (guessorig[i]) { + free(guess[i]); + wlst[ns-1] = guessorig[i]; + } + } else { + free(guess[i]); + if (guessorig[i]) free(guessorig[i]); + } + } else { + free(guess[i]); + if (guessorig[i]) free(guessorig[i]); + } + } + } + + oldns = ns; + if (ph) for (i=0; i < MAX_ROOTS; i++) { + if (rootsphon[i]) { + if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) { + int unique = 1; + for (j = 0; j < ns; j++) { + // don't suggest previous suggestions or a previous suggestion with prefixes or affixes + if (strstr(rootsphon[i], wlst[j]) || + // check forbidden words + !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0; + } + if (unique) { + wlst[ns++] = mystrdup(rootsphon[i]); + if (!wlst[ns - 1]) return ns - 1; + } + } } } @@ -1083,19 +1390,16 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // obsolote MySpell-HU modifications: // return value 2 and 3 marks compounding with hyphen (-) // `3' marks roots without suffix -int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit) +int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit) { struct hentry * rv=NULL; int nosuffix = 0; - + // check time limit if (timer) { (*timer)--; if (!(*timer) && timelimit) { - if (time(NULL) > *timelimit) { - *timelimit = 0; - return 0; - } + if ((clock() - *timelimit) > TIMELIMIT) return 0; *timer = MAXPLUSTIMER; } } @@ -1103,7 +1407,7 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if (pAMgr) { if (cpdsuggest==1) { if (pAMgr->get_compound()) { - rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1); + rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT if (rv) return 3; // XXX obsolote categorisation } return 0; @@ -1114,10 +1418,15 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if (rv) { if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; - if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || - TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; + while (rv) { + if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { + rv = rv->next_homonym; + } else break; + } } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX - + if (rv) { nosuffix=1; } else { @@ -1130,8 +1439,9 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time } // check forbidden words - if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) - || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; if (rv) { // XXX obsolote @@ -1149,7 +1459,7 @@ int SuggestMgr::check_forbidden(const char * word, int len) if (pAMgr) { rv = pAMgr->lookup(word); - if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; if (!(pAMgr->prefix_check(word,len,1))) rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix @@ -1160,184 +1470,6 @@ int SuggestMgr::check_forbidden(const char * word, int len) } #ifdef HUNSPELL_EXPERIMENTAL -// suggest stems, XXX experimental code -int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug) -{ - char buf[MAXSWUTF8L]; - char ** wlst; - int prevnsug = nsug; - - char w2[MAXWORDUTF8LEN]; - const char * word = w; - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - strcpy(w2, w); - if (utf8) reverseword_utf(w2); else reverseword(w2); - word = w2; - } - - if (*slst) { - wlst = *slst; - } else { - wlst = (char **) calloc(maxSug, sizeof(char *)); - if (wlst == NULL) return -1; - } - // perhaps there are a fix stem in the dictionary - if ((nsug < maxSug) && (nsug > -1)) { - - nsug = fixstems(wlst, word, nsug); - if (nsug == prevnsug) { - char * s = mystrdup(word); - char * p = s + strlen(s); - while ((*p != '-') && (p != s)) p--; - if (*p == '-') { - *p = '\0'; - nsug = fixstems(wlst, s, nsug); - if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) { - char * t; - buf[0] = '\0'; - for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number? - if (*t != '\0') strcpy(buf, "# "); - strcat(buf, s); - wlst[nsug] = mystrdup(buf); - if (wlst[nsug] == NULL) return -1; - nsug++; - } - p++; - nsug = fixstems(wlst, p, nsug); - } - - free(s); - } - } - - if (nsug < 0) { - for (int i=0;i<maxSug; i++) - if (wlst[i] != NULL) free(wlst[i]); - free(wlst); - return -1; - } - - *slst = wlst; - return nsug; -} - - -// there are fix stems in dictionary -int SuggestMgr::fixstems(char ** wlst, const char * word, int ns) -{ - char buf[MAXSWUTF8L]; - char prefix[MAXSWUTF8L] = ""; - - int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound - int cpdindex = 0; - struct hentry * rv = NULL; - - int wl = strlen(word); - int cmpdstemnum; - int cmpdstem[MAXCOMPOUND]; - - if (pAMgr) { - rv = pAMgr->lookup(word); - if (rv) { - dicstem = 0; - } else { - // try stripping off affixes - rv = pAMgr->affix_check(word, wl); - - // else try check compound word - if (!rv && pAMgr->get_compound()) { - rv = pAMgr->compound_check(word, wl, - 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1); - - if (rv) { - dicstem = 2; - for (int j = 0; j < cmpdstemnum; j++) { - cpdindex += cmpdstem[j]; - } - if(! (pAMgr->lookup(word + cpdindex))) - pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix - } - } - - - if (pAMgr->get_prefix()) { - strcpy(prefix, pAMgr->get_prefix()); - } - - // XXX obsolete, will be a general solution for stemming - if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU) - } - - } - - - - if ((rv) && (ns < maxSug)) { - - // check fixstem flag and not_valid_stem flag - // first word - if ((ns < maxSug) && (dicstem < 2)) { - strcpy(buf, prefix); - if ((dicstem > 0) && pAMgr->get_derived()) { - // XXX obsolote - if (strlen(prefix) == 1) { - strcat(buf, (pAMgr->get_derived()) + 1); - } else { - strcat(buf, pAMgr->get_derived()); - } - } else { - // special stem in affix description - const char * wordchars = pAMgr->get_wordchars(); - if (rv->description && - (strchr(wordchars, *(rv->description)))) { - char * desc = (rv->description) + 1; - while (strchr(wordchars, *desc)) desc++; - strncat(buf, rv->description, desc - (rv->description)); - } else { - strcat(buf, rv->word); - } - } - wlst[ns] = mystrdup(buf); - if (wlst[ns] == NULL) return -1; - ns++; - } - - if (dicstem == 2) { - - // compound stem - -// if (rv->astr && (strchr(rv->astr, '0') == NULL)) { - if (rv->astr) { - strcpy(buf, word); - buf[cpdindex] = '\0'; - if (prefix) strcat(buf, prefix); - if (pAMgr->get_derived()) { - strcat(buf, pAMgr->get_derived()); - } else { - // special stem in affix description - const char * wordchars = pAMgr->get_wordchars(); - if (rv->description && - (strchr(wordchars, *(rv->description)))) { - char * desc = (rv->description) + 1; - while (strchr(wordchars, *desc)) desc++; - strncat(buf, rv->description, desc - (rv->description)); - } else { - strcat(buf, rv->word); - } - } - if (ns < maxSug) { - wlst[ns] = mystrdup(buf); - if (wlst[ns] == NULL) return -1; - ns++; - } - } - } - } - return ns; -} - // suggest possible stems int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) { @@ -1377,6 +1509,7 @@ int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) *slst = wlst; return nsug; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * SuggestMgr::suggest_morph(const char * w) @@ -1405,20 +1538,25 @@ char * SuggestMgr::suggest_morph(const char * w) while (rv) { if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || - TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen))) - strcat(result, word); - if (rv->description) strcat(result, rv->description); - strcat(result, "\n"); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, word, MAXLNLEN); + } + if (HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + } + mystrcat(result, "\n", MAXLNLEN); } rv = rv->next_homonym; } st = pAMgr->affix_check_morph(word,strlen(word)); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } @@ -1426,28 +1564,177 @@ char * SuggestMgr::suggest_morph(const char * w) pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0,NULL, 0, &r, NULL); - return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL; + return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL; } +#ifdef HUNSPELL_EXPERIMENTAL char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) { char * p = NULL; char ** wlst = (char **) calloc(maxSug, sizeof(char *)); + if (!**wlst) return NULL; // we will use only the first suggestion for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; - int ns = suggest(&wlst, word, maxSug - 1); + int ns = suggest(&wlst, word, maxSug - 1, NULL); if (ns == maxSug) { p = suggest_morph(wlst[maxSug - 1]); free(wlst[maxSug - 1]); } if (wlst) free(wlst); - return p; + return p; } #endif // END OF HUNSPELL_EXPERIMENTAL CODE +/* affixation */ +char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) +{ + char result[MAXLNLEN]; + *result = '\0'; + int sfxcount = get_sfxcount(pattern); + + if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; + + if (HENTRY_DATA(rv)) { + char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, + HENTRY_DATA(rv), pattern, 0); + if (aff) { + mystrcat(result, aff, MAXLNLEN); + mystrcat(result, "\n", MAXLNLEN); + free(aff); + } + } + + // check all allomorphs + char allomorph[MAXLNLEN]; + char * p = NULL; + if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); + while (p) { + struct hentry * rv2 = NULL; + p += MORPH_TAG_LEN; + int plen = fieldlen(p); + strncpy(allomorph, p, plen); + allomorph[plen] = '\0'; + rv2 = pAMgr->lookup(allomorph); + while (rv2) { +// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) { + if (HENTRY_DATA(rv2)) { + char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM); + if (st && (strncmp(st + MORPH_TAG_LEN, + HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { + char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen, + HENTRY_DATA(rv2), pattern, 0); + if (aff) { + mystrcat(result, aff, MAXLNLEN); + mystrcat(result, "\n", MAXLNLEN); + free(aff); + } + } + } + rv2 = rv2->next_homonym; + } + p = strstr(p + plen, MORPH_ALLOMORPH); + } + + return (*result) ? mystrdup(result) : NULL; +} + +char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + char newpattern[MAXLNLEN]; + *newpattern = '\0'; + if (n == 0) return 0; + *result2 = '\0'; + struct hentry * rv = NULL; + if (!pAMgr) return NULL; + +// search affixed forms with and without derivational suffixes + while(1) { + + for (int k = 0; k < n; k++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[k]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } + + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int i = 0; i < pln; i++) { + // remove inflectional and terminal suffixes + char * is = strstr(pl[i], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * ts = strstr(pl[i], MORPH_TERM_SFX); + while (ts) { + *ts = '_'; + ts = strstr(pl[i], MORPH_TERM_SFX); + } + char * st = strstr(s, MORPH_STEM); + if (st) { + copy_field(tok, st, MORPH_STEM); + rv = pAMgr->lookup(tok); + while (rv) { + char newpat[MAXLNLEN]; + strcpy(newpat, pl[i]); + strcat(newpat, pattern); + char * sg = suggest_hentry_gen(rv, newpat); + if (!sg) sg = suggest_hentry_gen(rv, pattern); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + sg = NULL; + for (int j = 0; j < genl; j++) { + if (strstr(pl[i], MORPH_SURF_PFX)) { + int r2l = strlen(result2); + result2[r2l] = MSEP_REC; + strcpy(result2 + r2l + 1, result); + copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); + mystrcat(result2, gen[j], MAXLNLEN); + } else { + sprintf(result2 + strlen(result2), "%c%s%s", + MSEP_REC, result, gen[j]); + } + } + freelist(&gen, genl); + } + rv = rv->next_homonym; + } + } + } + freelist(&pl, pln); + } + + if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; + strcpy(newpattern, pattern); + pattern = newpattern; + char * ds = strstr(pattern, MORPH_DERI_SFX); + while (ds) { + strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); + ds = strstr(pattern, MORPH_DERI_SFX); + } + } + return (*result2 ? mystrdup(result2) : NULL); +} + // generate an n-gram score comparing s1 and s2 -int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) +int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) { int nscore = 0; int ns; @@ -1459,13 +1746,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) w_char su2[MAXSWL]; l1 = u8_u16(su1, MAXSWL, s1); l2 = u8_u16(su2, MAXSWL, s2); - if (!l2 || (l1==-1) || (l2==-1)) return 0; - // decapitalize dictionary word - if (complexprefixes) { - mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum()); - } else { - mkallsmall_utf(su2, 1, pAMgr->get_langnum()); - } + if ((l2 <= 0) || (l1 == -1)) return 0; + // lowering dictionary word + if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum); for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { @@ -1489,13 +1772,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) char t[MAXSWUTF8L]; l1 = strlen(s1); l2 = strlen(s2); - if (!l2) return 0; + if (l2 == 0) return 0; strcpy(t, s2); - if (complexprefixes) { - *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; - } else { - mkallsmall(t, csconv); - } + if (opt & NGRAM_LOWERING) mkallsmall(t, csconv); for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { @@ -1510,13 +1789,14 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) } ns = 0; - if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2; - if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; + if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2; + if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; ns = (nscore - ((ns > 0) ? ns : 0)); return ns; } -int SuggestMgr::equalfirstletter(char * s1, const char * s2) { +// length of the left common substring of s1 and (decapitalised) s2 +int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { if (utf8) { w_char su1[MAXSWL]; w_char su2[MAXSWL]; @@ -1526,9 +1806,17 @@ int SuggestMgr::equalfirstletter(char * s1, const char * s2) { int l2 = u8_u16(su2, MAXSWL, s2); if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; } else { + int i; u8_u16(su1, 1, s1); u8_u16(su2, 1, s2); - if (*((short *)su1) == *((short *)su2)) return 1; + unsigned short idx = (su2->h << 8) + su2->l; + if (*((short *)su1) != *((short *)su2) && + (*((unsigned short *)su1) != unicodetolower(idx, langnum))) return 0; + int l1 = u8_u16(su1, MAXSWL, s1); + int l2 = u8_u16(su2, MAXSWL, s2); + for(i = 1; (i < l1) && (i < l2) && + (*((short *)(su1 + i)) == *((short *)(su2 + i))); i++); + return i; } } else { if (complexprefixes) { @@ -1536,7 +1824,13 @@ int SuggestMgr::equalfirstletter(char * s1, const char * s2) { int l2 = strlen(s2); if (*(s2+l1-1) == *(s2+l2-1)) return 1; } else { - if (*s1 == *s2) return 1; + char * olds = s1; + // decapitalise dictionary word + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0; + do { + s1++; s2++; + } while ((*s1 == *s2) && (*s1 != '\0')); + return s1 - olds; } } return 0; @@ -1554,9 +1848,9 @@ int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_sw int l2 = u8_u16(su2, MAXSWL, s2); // decapitalize dictionary word if (complexprefixes) { - mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum()); + mkallsmall_utf(su2+l2-1, 1, langnum); } else { - mkallsmall_utf(su2, 1, pAMgr->get_langnum()); + mkallsmall_utf(su2, 1, langnum); } for (int i = 0; (i < l1) && (i < l2); i++) { if (((short *) su1)[i] == ((short *) su2)[i]) { @@ -1603,7 +1897,7 @@ int SuggestMgr::mystrlen(const char * word) { } // sort in decreasing order of score -void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) +void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n ) { int m = 1; while (m < n) { @@ -1616,6 +1910,11 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) rword[j-1] = rword[j]; rsc[j] = sctmp; rword[j] = wdtmp; + if (rword2) { + wdtmp = rword2[j-1]; + rword2[j-1] = rword2[j]; + rword2[j] = wdtmp; + } j--; } else break; } @@ -1642,6 +1941,12 @@ void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char * } c = (char *) malloc((m + 1) * (n + 1)); b = (char *) malloc((m + 1) * (n + 1)); + if (!c || !b) { + if (c) free(c); + if (b) free(b); + *result = NULL; + return; + } for (i = 1; i <= m; i++) c[i*(n+1)] = 0; for (j = 0; j <= n; j++) c[j] = 0; for (i = 1; i <= m; i++) { @@ -1673,6 +1978,7 @@ int SuggestMgr::lcslen(const char * s, const char* s2) { char * result; int len = 0; lcs(s, s2, &m, &n, &result); + if (!result) return 0; i = m; j = n; while ((i != 0) && (j != 0)) { @@ -1684,6 +1990,6 @@ int SuggestMgr::lcslen(const char * s, const char* s2) { i--; } else j--; } - if (result) free(result); + free(result); return len; } diff --git a/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx b/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx index 70af7f1..0e61572 100644 --- a/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx @@ -5,15 +5,18 @@ #define MAXSWUTF8L (MAXSWL * 4) #define MAX_ROOTS 100 #define MAX_WORDS 100 -#define MAX_GUESS 100 -#define MAXNGRAMSUGS 5 +#define MAX_GUESS 200 +#define MAXNGRAMSUGS 4 +#define MAXPHONSUGS 2 -#define MINTIMER 500 -#define MAXPLUSTIMER 500 +// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function +#define TIMELIMIT (CLOCKS_PER_SEC >> 2) +#define MINTIMER 100 +#define MAXPLUSTIMER 100 -#define NGRAM_IGNORE_LENGTH 0 -#define NGRAM_LONGER_WORSE 1 -#define NGRAM_ANY_MISMATCH 2 +#define NGRAM_LONGER_WORSE (1 << 0) +#define NGRAM_ANY_MISMATCH (1 << 1) +#define NGRAM_LOWERING (1 << 2) #include "atypes.hxx" #include "affixmgr.hxx" @@ -25,6 +28,10 @@ enum { LCS_UP, LCS_LEFT, LCS_UPLEFT }; class SuggestMgr { + char * ckey; + int ckeyl; + w_char * ckey_utf; + char * ctry; int ctryl; w_char * ctry_utf; @@ -33,6 +40,7 @@ class SuggestMgr int maxSug; struct cs_info * csconv; int utf8; + int langnum; int nosplitsugs; int maxngramsugs; int complexprefixes; @@ -42,19 +50,20 @@ public: SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr); ~SuggestMgr(); - int suggest(char*** slst, const char * word, int nsug); - int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr); + int suggest(char*** slst, const char * word, int nsug, int * onlycmpdsug); + int ngsuggest(char ** wlst, char * word, int ns, HashMgr** pHMgr, int md); int suggest_auto(char*** slst, const char * word, int nsug); int suggest_stems(char*** slst, const char * word, int nsug); int suggest_pos_stems(char*** slst, const char * word, int nsug); char * suggest_morph(const char * word); + char * suggest_gen(char ** pl, int pln, char * pattern); char * suggest_morph_for_spelling_error(const char * word); private: int testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, - int * timer, time_t * timelimit); - int checkword(const char *, int, int, int *, time_t *); + int * timer, clock_t * timelimit); + int checkword(const char *, int, int, int *, clock_t *); int check_forbidden(const char *, int); int capchars(char **, const char *, int, int); @@ -65,6 +74,7 @@ private: int longswapchar(char **, const char *, int, int); int movechar(char **, const char *, int, int); int extrachar(char **, const char *, int, int); + int badcharkey(char **, const char *, int, int); int badchar(char **, const char *, int, int); int twowords(char **, const char *, int, int); int fixstems(char **, const char *, int); @@ -73,21 +83,23 @@ private: int doubletwochars_utf(char**, const w_char *, int wl, int, int); int forgotchar_utf(char**, const w_char *, int wl, int, int); int extrachar_utf(char**, const w_char *, int wl, int, int); + int badcharkey_utf(char **, const w_char *, int wl, int, int); int badchar_utf(char **, const w_char *, int wl, int, int); int swapchar_utf(char **, const w_char *, int wl, int, int); int longswapchar_utf(char **, const w_char *, int, int, int); int movechar_utf(char **, const w_char *, int, int, int); - int mapchars(char**, const char *, int); - int map_related(const char *, int, char ** wlst, int, const mapentry*, int, int *, time_t *); - int map_related_utf(w_char *, int, int, char ** wlst, int, const mapentry*, int, int *, time_t *); - int ngram(int n, char * s1, const char * s2, int uselen); + int mapchars(char**, const char *, int, int); + int map_related(const char *, int, char ** wlst, int, int, const mapentry*, int, int *, clock_t *); + int map_related_utf(w_char *, int, int, int, char ** wlst, int, const mapentry*, int, int *, clock_t *); + int ngram(int n, char * s1, const char * s2, int opt); int mystrlen(const char * word); - int equalfirstletter(char * s1, const char * s2); + int leftcommonsubstring(char * s1, const char * s2); int commoncharacterpositions(char * s1, const char * s2, int * is_swap); - void bubblesort( char ** rwd, int * rsc, int n); + void bubblesort( char ** rwd, char ** rwd2, int * rsc, int n); void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result); int lcslen(const char * s, const char* s2); + char * suggest_hentry_gen(hentry * rv, char * pattern); }; diff --git a/chrome/third_party/hunspell/src/hunspell/w_char.hxx b/chrome/third_party/hunspell/src/hunspell/w_char.hxx new file mode 100644 index 0000000..99cfe63 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/w_char.hxx @@ -0,0 +1,19 @@ +#ifndef __WCHARHXX__ +#define __WCHARHXX__ + +#ifndef GCC +typedef struct { +#else +typedef struct __attribute__ ((packed)) { +#endif + unsigned char l; + unsigned char h; +} w_char; + +// two character arrays +struct replentry { + char * pattern; + char * pattern2; +}; + +#endif |