diff options
author | mhm@chromium.org <mhm@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-08-01 02:31:34 +0000 |
---|---|---|
committer | mhm@chromium.org <mhm@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-08-01 02:31:34 +0000 |
commit | 963f23274defde0c604f7643616a10f0215e1fe0 (patch) | |
tree | 04ffcc9faaedd37a9b273f38fba4c78e047f8a63 /chrome/third_party | |
parent | 877591d6f4d26e8725ab72f3bce38819d1640bf4 (diff) | |
download | chromium_src-963f23274defde0c604f7643616a10f0215e1fe0.zip chromium_src-963f23274defde0c604f7643616a10f0215e1fe0.tar.gz chromium_src-963f23274defde0c604f7643616a10f0215e1fe0.tar.bz2 |
Update Hunspell to the latest stable version to use the latest bdict format.
Updated Hunspell to version 1.2.8, which properly handles UTF-8 and fixes many bugs. This CL uses the BDict format and removes the usage of FileMgr. Removed the unwanted constructors that take a "key" parameter from Hunspell, since BDict is used instead. Removed all line numbers from the error messages, since we don't support them.
BUG= 14756 (http://crbug.com/14756)
TEST= Compiled Hunspell, Compiled Chromium, Ran Chromium, Fixed some of my spelling mistakes. Ran unit tests for SpellCheckTest.*
Review URL: http://codereview.chromium.org/155841
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@22243 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/third_party')
30 files changed, 5228 insertions, 2772 deletions
diff --git a/chrome/third_party/hunspell/README.chromium b/chrome/third_party/hunspell/README.chromium index 5e6f6f7..e19a1b1 100644 --- a/chrome/third_party/hunspell/README.chromium +++ b/chrome/third_party/hunspell/README.chromium @@ -6,24 +6,13 @@ This is a partial copy of Hunspell 1.1.5, with the following changes: reference in src/hunspell/csutil.cxx changed accordingly * Change the input params of the constructors to receive a FILE* instead of a file path. This is required to use hunspell in the sandbox. - The patch is in google.patch. +* Remove all HUNSPELL_WARNING parameters since we are not using HashMgr + anymore, just show the msg not the line number. +* Remove the key variable from Hunspell, HashMgr and AffixMgr since Bdict + is being used instead. The English dictionary distributed by Firefox has been checked in to the dictionaries directory. It has several additions over the default myspell/hunspell dictionary. -* Workaround for non-ASCII characters - -Visual Studio on Japanese Windows assumes the source files to be -encoded in Shift_JIS. The compiler is unhappy with non-ASCII letters -in the source files of Hunspell. The same problem happens with other -CJK Windows as well. Here is the workaround for this problem: - -Convert 8-bit bytes to hexadecimal escaped forms by - - % perl -i -De 's/([\x80-\xff])/sprintf("\\x%02x", $1)/ge' src/*.cxx - - -Note that Hunspell upstream is going to fix this problem. We'll no -longer need the workaround if the problem is fixed in the upstream. 
diff --git a/chrome/third_party/hunspell/google.patch b/chrome/third_party/hunspell/google.patch deleted file mode 100644 index ae7fd9d..0000000 --- a/chrome/third_party/hunspell/google.patch +++ /dev/null @@ -1,212 +0,0 @@ -Index: src/hunspell/affixmgr.cxx -=================================================================== ---- src/hunspell/affixmgr.cxx (revision 3811) -+++ src/hunspell/affixmgr.cxx (working copy) -@@ -25,7 +27,7 @@ - #endif - #endif - --AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) -+AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) - { - // register hash manager and load affix data from aff file - pHMgr = ptr; -@@ -104,8 +106,8 @@ - contclasses[j] = 0; - } - -- if (parse_file(affpath)) { -- HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); -+ if (parse_file(aff_handle)) { -+ HUNSPELL_WARNING(stderr, "Failure loading aff file\n"); - wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); - } - -@@ -232,7 +234,7 @@ - - - // read in aff file and build up prefix and suffix entry objects --int AffixMgr::parse_file(const char * affpath) -+int AffixMgr::parse_file(FILE* aff_handle) - { - - // io buffers -@@ -250,11 +252,12 @@ - - // open the affix file - FILE * afflst; -- afflst = fopen(affpath,"r"); -+ afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); - if (!afflst) { -- HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); -+ HUNSPELL_WARNING(stderr, "error: could not open affix description file\n"); - return 1; - } -+ fseek(afflst, 0, SEEK_SET); - - // step one is to parse the affix file building up the internal - // affix data structures -Index: src/hunspell/affixmgr.hxx -=================================================================== ---- src/hunspell/affixmgr.hxx (revision 3811) -+++ src/hunspell/affixmgr.hxx (working copy) -@@ -93,7 +93,7 @@ - - public: - -- AffixMgr(const char * affpath, HashMgr * ptr); -+ AffixMgr(FILE* aff_handle, HashMgr * ptr); - 
~AffixMgr(); - struct hentry * affix_check(const char * word, int len, - const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); -@@ -179,7 +179,7 @@ - int get_checksharps(void); - - private: -- int parse_file(const char * affpath); -+ int parse_file(FILE* aff_handle); - // int parse_string(char * line, char ** out, const char * name); - int parse_flag(char * line, unsigned short * out, const char * name); - int parse_num(char * line, int * out, const char * name); -Index: src/hunspell/hashmgr.cxx -=================================================================== ---- src/hunspell/hashmgr.cxx (revision 3811) -+++ src/hunspell/hashmgr.cxx (working copy) -@@ -29,7 +31,7 @@ - - // build a hash table from a munched word list - --HashMgr::HashMgr(const char * tpath, const char * apath) -+HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) - { - tablesize = 0; - tableptr = NULL; -@@ -43,8 +45,8 @@ - aliasf = NULL; - numaliasm = 0; - aliasm = NULL; -- load_config(apath); -- int ec = load_tables(tpath); -+ load_config(aff_handle); -+ int ec = load_tables(dic_handle); - if (ec) { - /* error condition - what should we do here */ - HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec); -@@ -240,7 +242,7 @@ - } - - // load a munched word list and build a hash table on the fly --int HashMgr::load_tables(const char * tpath) -+int HashMgr::load_tables(FILE* t_handle) - { - int wl, al; - char * ap; -@@ -248,8 +250,9 @@ - unsigned short * flags; - - // raw dictionary - munched file -- FILE * rawdict = fopen(tpath, "r"); -+ FILE * rawdict = _fdopen(_dup(_fileno(t_handle)), "r"); - if (rawdict == NULL) return 1; -+ fseek(rawdict, 0, SEEK_SET); - - // first read the first line of file to get hash table size */ - char ts[MAXDELEN]; -@@ -442,7 +445,7 @@ - } - - // read in aff file and set flag mode --int HashMgr::load_config(const char * affpath) -+int HashMgr::load_config(FILE* aff_handle) - { - int firstline = 1; - -@@ -451,11 +454,12 @@ - - // open 
the affix file - FILE * afflst; -- afflst = fopen(affpath,"r"); -+ afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); - if (!afflst) { -- HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath); -+ HUNSPELL_WARNING(stderr, "Error - could not open affix description file\n"); - return 1; - } -+ fseek(afflst, 0, SEEK_SET); - - // read in each line ignoring any that do not - // start with a known line type indicator -Index: src/hunspell/hashmgr.hxx -=================================================================== ---- src/hunspell/hashmgr.hxx (revision 3811) -+++ src/hunspell/hashmgr.hxx (working copy) -@@ -25,7 +25,7 @@ - - - public: -- HashMgr(const char * tpath, const char * apath); -+ HashMgr(FILE* t_handle, FILE* a_handle); - ~HashMgr(); - - struct hentry * lookup(const char *) const; -@@ -46,9 +46,9 @@ - - - private: -- int load_tables(const char * tpath); -+ int load_tables(FILE* t_handle); - int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); -- int load_config(const char * affpath); -+ int load_config(FILE* aff_handle); - int parse_aliasf(char * line, FILE * af); - #ifdef HUNSPELL_EXPERIMENTAL - int parse_aliasm(char * line, FILE * af); -Index: src/hunspell/hunspell.cxx -=================================================================== ---- src/hunspell/hunspell.cxx (revision 3811) -+++ src/hunspell/hunspell.cxx (working copy) -@@ -20,7 +20,7 @@ - #endif - #endif - --Hunspell::Hunspell(const char * affpath, const char * dpath) -+Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) - { - encoding = NULL; - csconv = NULL; -@@ -28,11 +28,11 @@ - complexprefixes = 0; - - /* first set up the hash manager */ -- pHMgr = new HashMgr(dpath, affpath); -+ pHMgr = new HashMgr(dic_handle, aff_handle); - - /* next set up the affix manager */ - /* it needs access to the hash manager lookup methods */ -- pAMgr = new AffixMgr(affpath,pHMgr); -+ pAMgr = new AffixMgr(aff_handle, pHMgr); - - /* get the 
preferred try string and the dictionary */ - /* encoding from the Affix Manager for that dictionary */ -@@ -1694,9 +1694,9 @@ - - #endif // END OF HUNSPELL_EXPERIMENTAL CODE - --Hunhandle *Hunspell_create(const char * affpath, const char * dpath) -+Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) - { -- return (Hunhandle*)(new Hunspell(affpath, dpath)); -+ return (Hunhandle*)(new Hunspell(aff_handle, dic_handle)); - } - - void Hunspell_destroy(Hunhandle *pHunspell) -Index: src/hunspell/hunspell.hxx -=================================================================== ---- src/hunspell/hunspell.hxx (revision 3811) -+++ src/hunspell/hunspell.hxx (working copy) -@@ -48,7 +48,7 @@ - * input: path of affix file and dictionary file - */ - -- Hunspell(const char * affpath, const char * dpath); -+ Hunspell(FILE* aff_handle, FILE* dic_handle); diff --git a/chrome/third_party/hunspell/hunspell.gyp b/chrome/third_party/hunspell/hunspell.gyp index a19f103..217fa5e 100644 --- a/chrome/third_party/hunspell/hunspell.gyp +++ b/chrome/third_party/hunspell/hunspell.gyp @@ -16,6 +16,7 @@ '../../../third_party/icu38/icu38.gyp:icuuc', ], 'defines': [ + 'HUNSPELL_STATIC', 'HUNSPELL_CHROME_CLIENT', 'OPENOFFICEORG', ], @@ -35,21 +36,31 @@ 'src/hunspell/csutil.hxx', 'src/hunspell/dictmgr.cxx', 'src/hunspell/dictmgr.hxx', + 'src/hunspell/filemgr.cxx', + 'src/hunspell/filemgr.hxx', 'src/hunspell/hashmgr.cxx', 'src/hunspell/hashmgr.hxx', 'src/hunspell/htypes.hxx', 'src/hunspell/hunspell.cxx', 'src/hunspell/hunspell.h', 'src/hunspell/hunspell.hxx', + 'src/hunspell/hunzip.cxx', + 'src/hunspell/hunzip.hxx', 'src/hunspell/langnum.hxx', + 'src/hunspell/phonet.cxx', + 'src/hunspell/phonet.hxx', + 'src/hunspell/replist.cxx', + 'src/hunspell/replist.hxx', 'src/hunspell/suggestmgr.cxx', 'src/hunspell/suggestmgr.hxx', 'src/hunspell/utf_info.hxx', + 'src/hunspell/w_char.hxx', 'src/parsers/textparser.cxx', 'src/parsers/textparser.hxx', ], 'direct_dependent_settings': { 'defines': [ + 
'HUNSPELL_STATIC', 'HUNSPELL_CHROME_CLIENT', 'USE_HUNSPELL', ], diff --git a/chrome/third_party/hunspell/src/hunspell/affentry.cxx b/chrome/third_party/hunspell/src/hunspell/affentry.cxx index 517646f..7c2dab4 100644 --- a/chrome/third_party/hunspell/src/hunspell/affentry.cxx +++ b/chrome/third_party/hunspell/src/hunspell/affentry.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -17,7 +17,7 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -29,22 +29,23 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - - aflag = dp->aflag; // flag + + aflag = dp->aflag; // flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // number of conditions to match - opts = dp->opts; // cross product flag + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); + if (opts & aeLONGCOND) { + memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); next = NULL; nextne = NULL; nexteq = NULL; -#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; -#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -58,15 +59,8 @@ PfxEntry::~PfxEntry() pmyMgr = NULL; appnd = NULL; strip = NULL; - if (opts & aeUTF8) { - for (int i = 0; i < numconds; i++) { - if (conds.utf8.wchars[i]) - free(conds.utf8.wchars[i]); - } - } -#ifdef HUNSPELL_EXPERIMENTAL + if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); -#endif if (contclass && 
!(opts & aeALIASF)) free(contclass); } @@ -75,8 +69,9 @@ char * PfxEntry::add(const char * word, int len) { char tword[MAXWORDUTF8LEN + 4]; - if ((len > stripl) && (len >= numconds) && test_condition(word) && - (!stripl || (strncmp(word, strip, stripl) == 0)) && + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word) && + (!stripl || (strncmp(word, strip, stripl) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add prefix */ char * pp = tword; @@ -87,51 +82,87 @@ char * PfxEntry::add(const char * word, int len) strcpy(pp, (word + stripl)); return mystrdup(tword); } - return NULL; + return NULL; } +inline char * PfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return *p ? p : NULL; + } + return NULL; +} inline int PfxEntry::test_condition(const char * st) { - int cond; - unsigned char * cp = (unsigned char *)st; - if (!(opts & aeUTF8)) { // 256-character codepage - for (cond = 0; cond < numconds; cond++) { - if ((conds.base[*cp++] & (1 << cond)) == 0) return 0; - } - } else { // UTF-8 encoding - unsigned short wc; - for (cond = 0; cond < numconds; cond++) { - // a simple 7-bit ASCII character in UTF-8 - if ((*cp >> 7) == 0) { - // also check limit (end of word) - if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0; - // UTF-8 multibyte character - } else { - // not dot wildcard in rule - if (!conds.utf8.all[cond]) { - if (conds.utf8.neg[cond]) { - u8_u16((w_char *) &wc, 1, (char *) cp); - if (conds.utf8.wchars[cond] && - flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short) conds.utf8.wlen[cond])) return 0; - } else { - if (!conds.utf8.wchars[cond]) return 0; - u8_u16((w_char *) &wc, 1, (char *) cp); - if 
(!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short)conds.utf8.wlen[cond])) return 0; - } + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { + neg = false; + ingroup = false; + p = nextchar(p); + pos = st; break; + } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { + if ((neg && ingroup) || (!neg && !ingroup)) return 0; + pos = NULL; + p = nextchar(p); + // skip the next character + if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0' && p) return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0' && p) return 0; // word <= condition + break; + } + default: { + if (*st == *p) { + st++; + p = nextchar(p); + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte + while (p && (*p & 0xc0) == 0x80) { // character + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + p = nextchar(p); + st++; + } + if (pos && st != pos) { + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + } + } else if (pos) { + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + } + } else if (pos) { // group + p = nextchar(p); + } else return 0; } - // jump to next UTF-8 character - for(cp++; (*cp & 0xc0) == 0x80; cp++); - } } + if (!p) return 1; } - return 1; } - -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -145,7 +176,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound tmpl = len - appndl; 
- if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -166,8 +197,8 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with pseudoroot flag - ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) @@ -175,14 +206,14 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound he = he->next_homonym; // check homonyms } while (he); } - - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix //if ((opts & aeXPRODUCT) && in_compound) { if ((opts & aeXPRODUCT)) { - he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); if (he) return he; } @@ -191,7 +222,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::check_twosfx(const char * word, int len, char in_compound, const FLAG needflag) { @@ -206,7 +237,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl 
>= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -225,8 +257,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // cross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -238,8 +270,7 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag) { @@ -253,7 +284,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -272,8 +304,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -285,7 +317,7 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -293,7 +325,7 @@ char * 
PfxEntry::check_morph(const char * word, int len, char in_compound, const char tmpword[MAXWORDUTF8LEN + 4]; char result[MAXLNLEN]; char * st; - + *result = '\0'; // on entry prefix is 0 length or already matches the beginning of the word. @@ -303,7 +335,8 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -324,41 +357,56 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with pseudoroot flag - ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) { - if (morphcode) strcat(result, morphcode); else strcat(result,getKey()); - if (he->description) { - if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word); - strcat(result,he->description); + if (morphcode) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, morphcode, MAXLNLEN); + } else mystrcat(result,getKey(), MAXLNLEN); + if (!HENTRY_FIND(he, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(he), MAXLNLEN); + } + // store the pointer of the hash entry + if (HENTRY_DATA(he)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(he), MAXLNLEN); + } else { + // return with debug information + char * flag = pmyMgr->encode_flag(getFlag()); + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, 
MORPH_FLAG, MAXLNLEN); + mystrcat(result, flag, MAXLNLEN); + free(flag); } - strcat(result, "\n"); + mystrcat(result, "\n", MAXLNLEN); } he = he->next_homonym; } while (he); } - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, FLAG_NULL, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } } } } - + if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) { @@ -366,22 +414,22 @@ SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - aflag = dp->aflag; // char flag + aflag = dp->aflag; // char flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // number of conditions to match - opts = dp->opts; // cross product flag + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); + if (opts & aeLONGCOND) { + memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); rappnd = myrevstrdup(appnd); - -#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; -#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -395,15 +443,9 @@ SfxEntry::~SfxEntry() if (strip) free(strip); pmyMgr = NULL; appnd = NULL; - strip = NULL; - if (opts & 
aeUTF8) { - for (int i = 0; i < numconds; i++) { - if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); - } - } -#ifdef HUNSPELL_EXPERIMENTAL + strip = NULL; + if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); -#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -413,7 +455,8 @@ char * SfxEntry::add(const char * word, int len) char tword[MAXWORDUTF8LEN + 4]; /* make sure all conditions match */ - if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) && + if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && + (len >= numconds) && test_condition(word + len, word) && (!stripl || (strcmp(word + len - stripl, strip) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add suffix */ @@ -428,56 +471,114 @@ char * SfxEntry::add(const char * word, int len) return NULL; } +inline char * SfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return *p ? 
p : NULL; + } + return NULL; +} inline int SfxEntry::test_condition(const char * st, const char * beg) { - int cond; - unsigned char * cp = (unsigned char *) st; - if (!(opts & aeUTF8)) { // 256-character codepage - // D\xf6m\xf6lki affix algorithm - for (cond = numconds; --cond >= 0; ) { - if ((conds.base[*--cp] & (1 << cond)) == 0) return 0; - } - } else { // UTF-8 encoding - unsigned short wc; - for (cond = numconds; --cond >= 0; ) { - // go to next character position and check limit - if ((char *) --cp < beg) return 0; - // a simple 7-bit ASCII character in UTF-8 - if ((*cp >> 7) == 0) { - if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0; - // UTF-8 multibyte character - } else { - // go to first character of UTF-8 multibyte character - for (; (*cp & 0xc0) == 0x80; cp--); - // not dot wildcard in rule - if (!conds.utf8.all[cond]) { - if (conds.utf8.neg[cond]) { - u8_u16((w_char *) &wc, 1, (char *) cp); - if (conds.utf8.wchars[cond] && - flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short) conds.utf8.wlen[cond])) return 0; - } else { - if (!conds.utf8.wchars[cond]) return 0; - u8_u16((w_char *) &wc, 1, (char *) cp); - if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short)conds.utf8.wlen[cond])) return 0; + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + st--; + int i = 1; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { p = nextchar(p); pos = st; break; } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { if (!neg && !ingroup) return 0; + i++; + // skip the next character + if (!ingroup) { + for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + st--; + } + pos = NULL; + neg = false; + ingroup = false; + p = nextchar(p); + if (st < beg && p) return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are 
not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + if (st < beg) { // word <= condition + if (p) return 0; else return 1; + } + if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character + st--; + if (st < beg) { // word <= condition + if (p) return 0; else return 1; + } } + break; + } + default: { + if (*st == *p) { + p = nextchar(p); + if ((opts & aeUTF8) && (*st & 0x80)) { + st--; + while (p && (st >= beg)) { + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + // first byte of the UTF-8 multibyte character + if ((*p & 0xc0) != 0x80) break; + p = nextchar(p); + st--; + } + if (pos && st != pos) { + if (neg) return 0; + else if (i == numconds) return 1; + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); + st--; + } + if (p && *p != ']') p = nextchar(p); + } else if (pos) { + if (neg) return 0; + else if (i == numconds) return 1; + ingroup = true; + while (p && *p != ']' && (p = nextchar(p))); +// if (p && *p != ']') p = nextchar(p); + st--; + } + if (!pos) { + i++; + st--; + } + if (st < beg && p && *p != ']') return 0; // word <= condition + } else if (pos) { // group + p = nextchar(p); + } else return 0; } - } } + if (!p) return 1; } - return 1; } - - -// see if this suffix is present in the word +// see if this suffix is present in the word struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, const FLAG badflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -497,8 +598,9 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, tmpl = len - appndl; // the second condition is not enough for UTF-8 strings // it checked in test_condition() - - if ((tmpl > 0) && 
(tmpl + stripl >= numconds)) { + + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -514,7 +616,8 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, // now make sure all of the conditions on characters // are met. Please see the appendix at the end of - // this file for more info on exactly what is being // tested + // this file for more info on exactly what is being + // tested // if all conditions are met then check if resulting // root word in the dictionary @@ -528,21 +631,21 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, do { // check conditional suffix (enabled by prefix) if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && - TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - (((optflags & aeXPRODUCT) == 0) || + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || TESTAFF(he->astr, ep->getFlag(), he->alen) || // enabled by prefix ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) ) && // handle cont. 
class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // check only in compound homonyms (bad flags) (!badflag || !TESTAFF(he->astr, badflag, he->alen) - ) && + ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) @@ -550,12 +653,12 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, he = he->next_homonym; // check homonyms } while (he); - // obsolote stemming code (used only by the + // obsolote stemming code (used only by the // experimental SuffixMgr:suggest_pos_stems) // store resulting root in wlst } else if (wlst && (*ns < maxSug)) { int cwrd = 1; - for (int k=0; k < *ns; k++) + for (int k=0; k < *ns; k++) if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; if (cwrd) { wlst[*ns] = mystrdup(tmpword); @@ -572,11 +675,11 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, return NULL; } -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -596,7 +699,8 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -620,7 +724,7 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, if (test_condition((char *) cp, (char *) tmpword)) { if (ppfx) { // handle conditional suffix - if ((contclass) && 
TESTAFF(contclass, ep->getFlag(), contclasslen)) + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); else he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); @@ -633,19 +737,18 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; PfxEntry* ep = (PfxEntry *) ppfx; char * st; char result[MAXLNLEN]; - + *result = '\0'; // if this suffix is being cross checked with a prefix @@ -661,7 +764,8 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) && + (tmpl + stripl >= numconds)) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -689,16 +793,17 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { if (((PfxEntry *) ppfx)->getMorph()) { - strcat(result, ((PfxEntry *) ppfx)->getMorph()); + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); } - strcat(result,st); + mystrcat(result,st, MAXLNLEN); free(st); mychomp(result); } } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); mychomp(result); } @@ -706,7 +811,7 @@ char * 
SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } else { st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); mychomp(result); } @@ -716,28 +821,28 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE // get next homonym with same affix -struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, const FLAG cclass, const FLAG needflag) { PfxEntry* ep = (PfxEntry *) ppfx; + FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL; while (he->next_homonym) { he = he->next_homonym; - if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - ((optflags & aeXPRODUCT) == 0 || - TESTAFF(he->astr, ep->getFlag(), he->alen) || + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || + TESTAFF(he->astr, eFlag, he->alen) || // handle conditional suffix - ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) ) && // handle cont. 
class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) diff --git a/chrome/third_party/hunspell/src/hunspell/affentry.hxx b/chrome/third_party/hunspell/src/hunspell/affentry.hxx index bb21773..ef1f86d 100644 --- a/chrome/third_party/hunspell/src/hunspell/affentry.hxx +++ b/chrome/third_party/hunspell/src/hunspell/affentry.hxx @@ -54,6 +54,7 @@ public: inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; } + inline char * nextchar(char * p); inline int test_condition(const char * st); }; @@ -123,7 +124,9 @@ public: inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; } + inline char * nextchar(char * p); inline int test_condition(const char * st, const char * begin); + }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx b/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx index 9f53a67..29bc9f7 100644 --- a/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/affixmgr.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -20,21 +20,24 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif #ifdef HUNSPELL_CHROME_CLIENT -AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr* ptr) +AffixMgr::AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md) { bdict_reader = reader; #else -AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) +AffixMgr::AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) { #endif // register hash manager and load affix data from aff 
file - pHMgr = ptr; + pHMgr = ptr[0]; + alldic = ptr; + maxdic = md; + keystring = NULL; trystring = NULL; encoding=NULL; utf8 = 0; @@ -45,10 +48,15 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) numbreak = 0; reptable = NULL; numrep = 0; + iconvtable = NULL; + oconvtable = NULL; checkcpdtable = NULL; + // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN) + simplifiedcpd = 0; numcheckcpd = 0; defcpdtable = NULL; numdefcpd = 0; + phone = NULL; compoundflag = FLAG_NULL; // permits word in compound forms compoundbegin = FLAG_NULL; // may be first word in compound forms compoundmiddle = FLAG_NULL; // may be middle word in compound forms @@ -60,11 +68,12 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds checkcompoundtriple = 0; // forbid compounds with triple letters - forbiddenword = FLAG_NULL; // forbidden word signing flag + simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt) + forbiddenword = FORBIDDENWORD; // forbidden word signing flag nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag lang = NULL; // language langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) - pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes + needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes cpdwordmax = -1; // default: unlimited wordcount in compound words cpdmin = -1; // undefined cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words @@ -88,14 +97,14 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) lemma_present = FLAG_NULL; circumfix = FLAG_NULL; onlyincompound = FLAG_NULL; - flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file maxngramsugs = -1; // undefined nosplitsugs = 0; sugswithdots = 0; keepcase = 
0; checksharps = 0; + substandard = FLAG_NULL; + fullstrip = 0; - derived = NULL; // XXX not threadsafe variable for experimental stemming sfx = NULL; pfx = NULL; @@ -109,14 +118,14 @@ AffixMgr::AffixMgr(FILE* aff_handle, HashMgr* ptr) #ifdef HUNSPELL_CHROME_CLIENT if (parse_file()) { #else + for (int j=0; j < CONTSIZE; j++) { contclasses[j] = 0; } - if (parse_file(aff_handle)) { + if (parse_file(affpath, key)) { #endif HUNSPELL_WARNING(stderr, "Failure loading aff file\n"); - wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); } if (cpdmin == -1) cpdmin = MINCPDLEN; @@ -154,6 +163,8 @@ AffixMgr::~AffixMgr() sStart[j] = NULL; } + if (keystring) free(keystring); + keystring=NULL; if (trystring) free(trystring); trystring=NULL; if (encoding) free(encoding); @@ -178,16 +189,26 @@ AffixMgr::~AffixMgr() breaktable = NULL; } numbreak = 0; - if (reptable) { + if (reptable) { for (int j=0; j < numrep; j++) { free(reptable[j].pattern); free(reptable[j].pattern2); - reptable[j].pattern = NULL; - reptable[j].pattern2 = NULL; } free(reptable); reptable = NULL; } + if (iconvtable) delete iconvtable; + if (oconvtable) delete oconvtable; + if (phone && phone->rules) { + for (int j=0; j < phone->num + 1; j++) { + free(phone->rules[j * 2]); + free(phone->rules[j * 2 + 1]); + } + free(phone->rules); + free(phone); + phone = NULL; + } + if (defcpdtable) { for (int j=0; j < numdefcpd; j++) { free(defcpdtable[j].def); @@ -201,8 +222,10 @@ AffixMgr::~AffixMgr() for (int j=0; j < numcheckcpd; j++) { free(checkcpdtable[j].pattern); free(checkcpdtable[j].pattern2); + free(checkcpdtable[j].pattern3); checkcpdtable[j].pattern = NULL; checkcpdtable[j].pattern2 = NULL; + checkcpdtable[j].pattern3 = NULL; } free(checkcpdtable); checkcpdtable = NULL; @@ -217,7 +240,7 @@ AffixMgr::~AffixMgr() FREE_FLAG(compoundroot); FREE_FLAG(forbiddenword); FREE_FLAG(nosuggest); - FREE_FLAG(pseudoroot); + FREE_FLAG(needaffix); FREE_FLAG(lemma_present); FREE_FLAG(circumfix); 
FREE_FLAG(onlyincompound); @@ -236,30 +259,20 @@ AffixMgr::~AffixMgr() if (ignorechars) free(ignorechars); if (ignorechars_utf16) free(ignorechars_utf16); if (version) free(version); - if (derived) free(derived); checknum=0; } // read in aff file and build up prefix and suffix entry objects #ifdef HUNSPELL_CHROME_CLIENT - // Hack to avoid having to comment out all the fclose calls below on errors. - #define fclose(a); - int AffixMgr::parse_file() #else -int AffixMgr::parse_file(FILE* aff_handle) +int AffixMgr::parse_file(FILE* aff_handle, const char * key) #endif { - // io buffers - char line[MAXLNLEN+1]; - - // affix type - char ft; - - // first line indicator for removing byte order mark - int firstline = 1; - + char * line = new char[MAXLNLEN+1]; // io buffers + char ft; // affix type + // open the affix file #ifdef HUNSPELL_CHROME_CLIENT // We're always UTF-8 @@ -286,44 +299,65 @@ int AffixMgr::parse_file(FILE* aff_handle) char dupflags[CONTSIZE]; char dupflags_ini = 1; - FILE * afflst; - afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); + // first line indicator for removing byte order mark + int firstline = 1; + + // open the affix file + FileMgr * afflst = new FileMgr(affpath, key); if (!afflst) { - HUNSPELL_WARNING(stderr, "error: could not open affix description file\n"); + HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); return 1; } - fseek(afflst, 0, SEEK_SET); // step one is to parse the affix file building up the internal // affix data structures - // read in each line ignoring any that do not // start with a known line type indicator - while (fgets(line,MAXLNLEN,afflst)) { + while ((line = afflst->getline())) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line,"\xef\xbb\xbf",3) == 0) { + if (strncmp(line,"\xEF\xBB\xBF",3) == 0) { memmove(line, line+3, strlen(line+3)+1); HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility 
with old Hunspell versions\n"); } } #endif + /* parse in the keyboard string */ + if (strncmp(line,"KEY",3) == 0) { +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &keystring, 0)) { +#else + if (parse_string(line, &keystring, afflst->getlinenum())) { + delete afflst; +#endif + return 1; + } + } + /* parse in the try string */ if (strncmp(line,"TRY",3) == 0) { - if (parse_string(line, &trystring, "TRY")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &trystring, 0)) { +#else + if (parse_string(line, &trystring, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } /* parse in the name of the character set used by the .dict and .aff */ if (strncmp(line,"SET",3) == 0) { - if (parse_string(line, &encoding, "SET")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &encoding, 0)) { +#else + if (parse_string(line, &encoding, afflst->getlinenum())) { + delete afflst; +#endif return 1; } if (strcmp(encoding, "UTF-8") == 0) { @@ -342,8 +376,12 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the flag used by the controlled compound words */ if (strncmp(line,"COMPOUNDFLAG",12) == 0) { - if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundflag)) { +#else + if (parse_flag(line, &compoundflag, afflst)) { + delete afflst; +#endif return 1; } } @@ -351,13 +389,21 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { if (complexprefixes) { - if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundend)) { +#else + if (parse_flag(line, &compoundend, afflst)) { + delete afflst; +#endif return 1; } } else { - if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundbegin)) { +#else 
+ if (parse_flag(line, &compoundbegin, afflst)) { + delete afflst; +#endif return 1; } } @@ -365,21 +411,33 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { - if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundmiddle)) { +#else + if (parse_flag(line, &compoundmiddle, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDEND",11) == 0) { if (complexprefixes) { - if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundbegin)) { +#else + if (parse_flag(line, &compoundbegin, afflst)) { + delete afflst; +#endif return 1; } } else { - if (parse_flag(line, &compoundend, "COMPOUNDEND")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundend)) { +#else + if (parse_flag(line, &compoundend, afflst)) { + delete afflst; +#endif return 1; } } @@ -387,32 +445,48 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the data used by compound_check() method */ if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { - if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_num(line, &cpdwordmax)) { +#else + if (parse_num(line, &cpdwordmax, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag sign compounds in dictionary */ if (strncmp(line,"COMPOUNDROOT",12) == 0) { - if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundroot)) { +#else + if (parse_flag(line, &compoundroot, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { - if (parse_flag(line, &compoundpermitflag, 
"COMPOUNDPERMITFLAG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundpermitflag)) { +#else + if (parse_flag(line, &compoundpermitflag, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { - if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &compoundforbidflag)) { +#else + if (parse_flag(line, &compoundforbidflag, afflst)) { + delete afflst; +#endif return 1; } } @@ -429,69 +503,105 @@ int AffixMgr::parse_file(FILE* aff_handle) checkcompoundtriple = 1; } + if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) { + simplifiedtriple = 1; + } + if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) { checkcompoundcase = 1; } if (strncmp(line,"NOSUGGEST",9) == 0) { - if (parse_flag(line, &nosuggest, "NOSUGGEST")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &nosuggest)) { +#else + if (parse_flag(line, &nosuggest, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by forbidden words */ if (strncmp(line,"FORBIDDENWORD",13) == 0) { - if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &forbiddenword)) { +#else + if (parse_flag(line, &forbiddenword, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by forbidden words */ if (strncmp(line,"LEMMA_PRESENT",13) == 0) { - if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &lemma_present)) { +#else + if (parse_flag(line, &lemma_present, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by circumfixes */ if (strncmp(line,"CIRCUMFIX",9) == 0) { - if (parse_flag(line, &circumfix, "CIRCUMFIX")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, 
&circumfix)) { +#else + if (parse_flag(line, &circumfix, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by fogemorphemes */ if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { - if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &onlyincompound)) { +#else + if (parse_flag(line, &onlyincompound, afflst)) { + delete afflst; +#endif return 1; } } - /* parse in the flag used by `pseudoroots' */ + /* parse in the flag used by `needaffixs' */ if (strncmp(line,"PSEUDOROOT",10) == 0) { - if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &needaffix)) { +#else + if (parse_flag(line, &needaffix, afflst)) { + delete afflst; +#endif return 1; } } - /* parse in the flag used by `pseudoroots' */ + /* parse in the flag used by `needaffixs' */ if (strncmp(line,"NEEDAFFIX",9) == 0) { - if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &needaffix)) { +#else + if (parse_flag(line, &needaffix, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the minimal length for words in compounds */ if (strncmp(line,"COMPOUNDMIN",11) == 0) { - if (parse_num(line, &cpdmin, "COMPOUNDMIN")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_num(line, &cpdmin)) { +#else + if (parse_num(line, &cpdmin, afflst)) { + delete afflst; +#endif return 1; } if (cpdmin < 1) cpdmin = 1; @@ -499,16 +609,24 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the max. 
words and syllables in compounds */ if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { +#ifdef HUNSPELL_CHROME_CLIENT if (parse_cpdsyllable(line)) { - fclose(afflst); +#else + if (parse_cpdsyllable(line, afflst)) { + delete afflst; +#endif return 1; } } /* parse in the flag used by compound_check() method */ if (strncmp(line,"SYLLABLENUM",11) == 0) { - if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &cpdsyllablenum, 0)) { +#else + if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } @@ -520,34 +638,74 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the extra word characters */ if (strncmp(line,"WORDCHARS",9) == 0) { - if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, 0)) { +#else + if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ if (strncmp(line,"IGNORE",6) == 0) { - if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, 0)) { +#else + if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) { + delete afflst; +#endif return 1; } } - /* parse in the typical fault correcting table */ #ifndef HUNSPELL_CHROME_CLIENT + /* parse in the typical fault correcting table */ if (strncmp(line,"REP",3) == 0) { if (parse_reptable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } #endif + /* parse in the input conversion table */ + if 
(strncmp(line,"ICONV",5) == 0) { + if (parse_convtable(line, afflst, &iconvtable, "ICONV")) { +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif + return 1; + } + } + + /* parse in the input conversion table */ + if (strncmp(line,"OCONV",5) == 0) { + if (parse_convtable(line, afflst, &oconvtable, "OCONV")) { +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif + return 1; + } + } + + /* parse in the phonetic translation table */ + if (strncmp(line,"PHONE",5) == 0) { + if (parse_phonetable(line, afflst)) { +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif + return 1; + } + } + /* parse in the checkcompoundpattern table */ if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { if (parse_checkcpdtable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } @@ -555,7 +713,9 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the defcompound table */ if (strncmp(line,"COMPOUNDRULE",12) == 0) { if (parse_defcpdtable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } @@ -563,7 +723,9 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the related character map table */ if (strncmp(line,"MAP",3) == 0) { if (parse_maptable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } @@ -571,30 +733,38 @@ int AffixMgr::parse_file(FILE* aff_handle) /* parse in the word breakpoints table */ if (strncmp(line,"BREAK",5) == 0) { if (parse_breaktable(line, afflst)) { - fclose(afflst); +#ifndef HUNSPELL_CHROME_CLIENT + delete afflst; +#endif return 1; } } /* parse in the language for language specific codes */ if (strncmp(line,"LANG",4) == 0) { - if (parse_string(line, &lang, "LANG")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_string(line, &lang, 0)) { +#else + if (parse_string(line, &lang, afflst->getlinenum())) { + delete afflst; +#endif return 1; } langnum = get_lang_num(lang); } if 
(strncmp(line,"VERSION",7) == 0) { - if (parse_string(line, &version, "VERSION")) { - fclose(afflst); - return 1; - } + for(line = line + 7; *line == ' ' || *line == '\t'; line++); + version = mystrdup(line); } if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { - if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_num(line, &maxngramsugs)) { +#else + if (parse_num(line, &maxngramsugs, afflst)) { + delete afflst; +#endif return 1; } } @@ -603,14 +773,34 @@ int AffixMgr::parse_file(FILE* aff_handle) nosplitsugs=1; } + if (strncmp(line,"FULLSTRIP",9) == 0) { + fullstrip=1; + } + if (strncmp(line,"SUGSWITHDOTS",12) == 0) { sugswithdots=1; } /* parse in the flag used by forbidden words */ if (strncmp(line,"KEEPCASE",8) == 0) { - if (parse_flag(line, &keepcase, "KEEPCASE")) { - fclose(afflst); +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &keepcase)) { +#else + if (parse_flag(line, &keepcase, afflst)) { + delete afflst; +#endif + return 1; + } + } + + /* parse in the flag used by the affix generator */ + if (strncmp(line,"SUBSTANDARD",11) == 0) { +#ifdef HUNSPELL_CHROME_CLIENT + if (parse_flag(line, &substandard)) { +#else + if (parse_flag(line, &substandard, afflst)) { + delete afflst; +#endif return 1; } } @@ -630,7 +820,7 @@ int AffixMgr::parse_file(FILE* aff_handle) dupflags_ini = 0; } if (parse_affix(line, ft, afflst, dupflags)) { - fclose(afflst); + delete afflst; process_pfx_tree_to_list(); process_sfx_tree_to_list(); return 1; @@ -640,7 +830,7 @@ int AffixMgr::parse_file(FILE* aff_handle) } #ifndef HUNSPELL_CHROME_CLIENT - fclose(afflst); + delete afflst; #endif // convert affix trees to sorted list @@ -673,8 +863,8 @@ int AffixMgr::parse_file(FILE* aff_handle) process_pfx_order(); process_sfx_order(); - // expand wordchars string, based on csutil (for external tokenization) - + /* get encoding for CHECKCOMPOUNDCASE */ + if (!utf8) { char * enc = get_encoding(); csconv = get_current_cs(enc); 
free(enc); @@ -695,16 +885,20 @@ int AffixMgr::parse_file(FILE* aff_handle) } wordchars = mystrdup(expw); + } - // temporary BREAK definition for German dash handling (OOo issue 64400) - if ((langnum == LANG_de) && (!breaktable)) { - breaktable = (char **) malloc(sizeof(char *)); + // default BREAK definition + if (!breaktable) { + breaktable = (char **) malloc(sizeof(char *) * 3); if (!breaktable) return 1; breaktable[0] = mystrdup("-"); - numbreak = 1; + breaktable[1] = mystrdup("^-"); + breaktable[2] = mystrdup("-$"); + if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3; } return 0; } + #ifdef HUNSPELL_CHROME_CLIENT #undef fclose #endif @@ -977,197 +1171,52 @@ int AffixMgr::process_sfx_order() return 0; } +// add flags to the result for dictionary debugging +void AffixMgr::debugflag(char * result, unsigned short flag) { + char * st = encode_flag(flag); + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_FLAG, MAXLNLEN); + if (st) { + mystrcat(result, st, MAXLNLEN); + free(st); + } +} - -// takes aff file condition string and creates the -// conds array - please see the appendix at the end of the -// file affentry.cxx which describes what is going on here -// in much more detail - -int AffixMgr::encodeit(struct affentry * ptr, char * cs) +// calculate the character length of the condition +int AffixMgr::condlen(char * st) { - unsigned char c; - int i, j, k; - unsigned char mbr[MAXLNLEN]; - w_char wmbr[MAXLNLEN]; - w_char * wpos = wmbr; - - // now clear the conditions array */ - for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0; - - // now parse the string to create the conds array */ - int nc = strlen(cs); - unsigned char neg = 0; // complement indicator - int grp = 0; // group indicator - unsigned char n = 0; // number of conditions - int ec = 0; // end condition indicator - int nm = 0; // number of member in group - - // if no condition just return - if (strcmp(cs,".")==0) { - ptr->numconds = 0; - return 0; + int l = 0; + 
bool group = false; + for(; *st; st++) { + if (*st == '[') { + group = true; + l++; + } else if (*st == ']') group = false; + else if (!group && (!utf8 || + (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; } + return l; +} - i = 0; - while (i < nc) { - c = *((unsigned char *)(cs + i)); - - // start group indicator - if (c == '[') { - grp = 1; - c = 0; - } - - // complement flag - if ((grp == 1) && (c == '^')) { - neg = 1; - c = 0; - } - - // end goup indicator - if (c == ']') { - ec = 1; - c = 0; - } - - // add character of group to list - if ((grp == 1) && (c != 0)) { - *(mbr + nm) = c; - nm++; - c = 0; - } - - // end of condition - if (c != 0) { - ec = 1; +int AffixMgr::encodeit(struct affentry * ptr, char * cs) +{ + if (strcmp(cs,".") != 0) { + ptr->numconds = (char) condlen(cs); + strncpy(ptr->c.conds, cs, MAXCONDLEN); + // long condition (end of conds padded by strncpy) + if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { + ptr->opts += aeLONGCOND; + ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); + if (!ptr->c.l.conds2) return 1; } - - if (ec) { - if (!utf8) { - if (grp == 1) { - if (neg == 0) { - // set the proper bits in the condition array vals for those chars - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n); - } - } else { - // complement so set all of them and then unset indicated ones - for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n); - } - } - neg = 0; - grp = 0; - nm = 0; - } else { - // not a group so just set the proper bit for this char - // but first handle special case of . 
inside condition - if (c == '.') { - // wild card character so set them all - for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); - } else { - ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n); - } - } - n++; - ec = 0; - } else { // UTF-8 character set - if (grp == 1) { - ptr->conds.utf8.neg[n] = neg; - if (neg == 0) { - // set the proper bits in the condition array vals for those chars - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - if (k >> 7) { - u8_u16(wpos, 1, (char *) mbr + j); - wpos++; - if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character - } else { - ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n); - } - } - } else { // neg == 1 - // complement so set all of them and then unset indicated ones - for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - if (k >> 7) { - u8_u16(wpos, 1, (char *) mbr + j); - wpos++; - if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character - } else { - ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n); - } - } - } - neg = 0; - grp = 0; - nm = 0; - ptr->conds.utf8.wlen[n] = wpos - wmbr; - if ((wpos - wmbr) != 0) { - ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr)); - if (!ptr->conds.utf8.wchars[n]) return 1; - memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr)); - flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]); - wpos = wmbr; - } - } else { // grp == 0 - // is UTF-8 character? 
- if (c >> 7) { - ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char)); - if (!ptr->conds.utf8.wchars[n]) return 1; - ptr->conds.utf8.wlen[n] = 1; - u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i); - if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character - } else { - ptr->conds.utf8.wchars[n] = NULL; - // not a group so just set the proper bit for this char - // but first handle special case of . inside condition - if (c == '.') { - ptr->conds.utf8.all[n] = 1; - // wild card character so set them all - for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); - } else { - ptr->conds.utf8.all[n] = 0; - ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n); - } - } - neg = 0; - } - n++; - if (n > 8) { - HUNSPELL_WARNING(stderr, "Number of conditions is larger than 8. This" - "version of Hunspell does not support more than 8 conditions." - "Please, get rid of affentries with more than 8 conditions."); - break; - } - ec = 0; - neg = 0; - } - } - - i++; + } else { + ptr->numconds = 0; + ptr->c.conds[0] = '\0'; } - ptr->numconds = n; return 0; } - // return 1 if s1 is a leading subset of s2 -/* inline int AffixMgr::isSubset(const char * s1, const char * s2) - { - while ((*s1 == *s2) && *s1) { - s1++; - s2++; - } - return (*s1 == '\0'); - } -*/ - - // return 1 if s1 is a leading subset of s2 (dots are for infixes) +// return 1 if s1 is a leading subset of s2 (dots are for infixes) inline int AffixMgr::isSubset(const char * s1, const char * s2) { while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { @@ -1277,7 +1326,6 @@ struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL // check word for prefixes char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, const FLAG needflag) @@ -1295,7 +1343,7 @@ char * AffixMgr::prefix_check_morph(const char * word, int len, 
char in_compound while (pe) { st = pe->check_morph(word,len,in_compound, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } // if (rv) return rv; @@ -1313,7 +1361,7 @@ char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound // fogemorpheme if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); pfx = (AffEntry *)pptr; } free(st); @@ -1346,7 +1394,7 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, while (pe) { st = pe->check_twosfx_morph(word,len,in_compound, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } pe = pe->getNext(); @@ -1360,7 +1408,7 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (isSubset(pptr->getKey(),word)) { st = pptr->check_twosfx_morph(word, len, in_compound, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); pfx = (AffEntry *)pptr; } @@ -1373,8 +1421,6 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - // Is word a non compound with a REP substitution (see checkcompoundrep)? 
int AffixMgr::cpdrep_check(const char * word, int wl) @@ -1424,11 +1470,15 @@ int AffixMgr::cpdrep_check(const char * word, int wl) } // forbid compoundings when there are special patterns at word bound -int AffixMgr::cpdpat_check(const char * word, int pos) +int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2) { int len; for (int i = 0; i < numcheckcpd; i++) { if (isSubset(checkcpdtable[i].pattern2, word + pos) && + (!r1 || !checkcpdtable[i].cond || + (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) && + (!r2 || !checkcpdtable[i].cond2 || + (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) && (len = strlen(checkcpdtable[i].pattern)) && (pos > len) && (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1; } @@ -1446,10 +1496,8 @@ int AffixMgr::cpdcase_check(const char * word, int pos) u8_u16(&w, 1, p); unsigned short a = (u.h << 8) + u.l; unsigned short b = (w.h << 8) + w.l; - // CHROME MODIFICATION: We add the checks for the dashes as they are used - // below in the non-UTF-8 case. This seems to be a bug in Hunspell. It - // causes some of the tests to fail since we convert everything to UTF-8. - if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && (a != '-') && (b != '-')) return 1; + if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) && + (a != '-') && (b != '-')) return 1; } else { unsigned char a = *(word + pos - 1); unsigned char b = *(word + pos); @@ -1465,15 +1513,35 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** signed short btwp[MAXWORDLEN]; // word positions for metacharacters int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions short bt = 0; - int i; + int i, j; int ok; int w = 0; + if (!*words) { w = 1; *words = def; } (*words)[wnum] = rv; + // has the last word COMPOUNDRULE flag? 
+ if (rv->alen == 0) { + (*words)[wnum] = NULL; + if (w) *words = NULL; + return 0; + } + ok = 0; + for (i = 0; i < numdefcpd; i++) { + for (j = 0; j < defcpdtable[i].len; j++) { + if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' && + TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1; + } + } + if (ok == 0) { + (*words)[wnum] = NULL; + if (w) *words = NULL; + return 0; + } + for (i = 0; i < numdefcpd; i++) { signed short pp = 0; // pattern position signed short wp = 0; // "words" position @@ -1518,17 +1586,18 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2; if (defcpdtable[i].len <= r) return 1; - } + } // backtrack if (bt) do { ok = 1; btnum[bt - 1]--; pp = btpp[bt - 1]; - wp = btwp[bt - 1] + btnum[bt - 1]; + wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; } while ((btnum[bt - 1] < 0) && --bt); } while (bt); - if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; + if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; + // check zero ending while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) && ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2; @@ -1568,7 +1637,7 @@ short AffixMgr::get_syllable(const char * word, int wlen) } else if (cpdvowels_utf16) { w_char w[MAXWORDUTF8LEN]; int i = u8_u16(w, MAXWORDUTF8LEN, word); - for (; i; i--) { + for (; i > 0; i--) { if (flag_bsearch((unsigned short *) cpdvowels_utf16, ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; } @@ -1576,15 +1645,29 @@ short AffixMgr::get_syllable(const char * word, int wlen) return num; } +void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) { + if (utf8) { + int i; + for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) { + for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; 
(*cmin)++); + } + for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) { + for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--); + } + } else { + *cmin = cpdmin; + *cmax = len - cpdmin + 1; + } +} + // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) struct hentry * AffixMgr::compound_check(const char * word, int len, short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL, - char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0) + char hu_mov_rule = 0, char is_sug = 0) { int i; short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; - int oldcmpdstemnum = 0; struct hentry * rv = NULL; struct hentry * rv_first; struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking @@ -1592,31 +1675,17 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, char ch; int cmin; int cmax; - + int striple = 0; + int scpd = 0; + int soldi = 0; + int oldcmin = 0; + int oldcmax = 0; + int oldlen = 0; + int checkedstriple = 0; + int checked_prefix; -#ifdef HUNSTEM - if (cmpdstemnum) { - if (wordnum == 0) { - *cmpdstemnum = 1; - } else { - (*cmpdstemnum)++; - } - } -#endif - if (utf8) { - for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { - cmin++; - for (; (word[cmin] & 0xc0) == 0x80; cmin++); - } - for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { - cmax--; - for (; (word[cmax] & 0xc0) == 0x80; cmax--); - } - } else { - cmin = cpdmin; - cmax = len - cpdmin + 1; - } + setcminmax(&cmin, &cmax, word, len); strcpy(st, word); @@ -1632,20 +1701,42 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if (i >= cmax) return NULL; } - + do { // simplified checkcompoundpattern loop + + if (scpd > 0) { + for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 || + strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++); + + if (scpd > numcheckcpd) break; // 
break simplified checkcompoundpattern loop + strcpy(st + i, checkcpdtable[scpd-1].pattern); + soldi = i; + i += strlen(checkcpdtable[scpd-1].pattern); + strcpy(st + i, checkcpdtable[scpd-1].pattern2); + strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3)); + + oldlen = len; + len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3); + oldcmin = cmin; + oldcmax = cmax; + setcminmax(&cmin, &cmax, st, len); + + cmax = len - cpdmin + 1; + } + + ch = st[i]; st[i] = '\0'; sfx = NULL; pfx = NULL; - + // FIRST WORD - + rv = lookup(st); // perhaps without prefix // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -1653,8 +1744,10 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, TESTAFF(rv->astr, compoundmiddle, rv->alen)) || (numdefcpd && ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || - (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) - ))) { + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))) || + (scpd != 0 && checkcpdtable[scpd-1].cond != FLAG_NULL && + !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen))) + ) { rv = rv->next_homonym; } @@ -1671,6 +1764,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, rv = NULL; } } + if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN)) || @@ -1679,9 +1773,9 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) checked_prefix = 1; - // else check forbiddenwords and pseudoroot + // else check forbiddenwords and needaffix } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, pseudoroot, rv->alen) || + TESTAFF(rv->astr, needaffix, rv->alen) || (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) )) { st[i] = ch; @@ -1709,7 +1803,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((SfxEntry*)sfx)->getContLen())))) { rv = NULL; } - + // check compoundmiddle flag in suffix and prefix if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && ((pfx && ((PfxEntry*)pfx)->getCont() && @@ -1719,7 +1813,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle, ((SfxEntry*)sfx)->getContLen())))) { rv = NULL; - } + } // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || @@ -1749,19 +1843,20 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ) ) // END of LANG_hu section + ) && + ( + // test CHECKCOMPOUNDPATTERN conditions + scpd == 0 || checkcpdtable[scpd-1].cond == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond, rv->alen) ) - && ! (( checkcompoundtriple && // test triple letters + && ! 
(( checkcompoundtriple && scpd == 0 && !words && // test triple letters (word[i-1]==word[i]) && ( - ((i>1) && (word[i-1]==word[i-2])) || + ((i>1) && (word[i-1]==word[i-2])) || ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' ) ) || - ( - // test CHECKCOMPOUNDPATTERN - numcheckcpd && cpdpat_check(word, i) - ) || - ( - checkcompoundcase && cpdcase_check(word, i) + ( + checkcompoundcase && scpd == 0 && !words && cpdcase_check(word, i) )) ) // LANG_hu section: spec. Hungarian rule @@ -1769,15 +1864,14 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) || TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen()) - ) + ) ) ) -// END of LANG_hu section - ) { + ) { // first word is ok condition // LANG_hu section: spec. Hungarian rule if (langnum == LANG_hu) { - // calculate syllable number of the word + // calculate syllable number of the word numsyllable += get_syllable(st, i); // + 1 word, if syllable number of the prefix > 1 (hungarian convention) @@ -1785,23 +1879,35 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, } // END of LANG_hu section -#ifdef HUNSTEM - if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i; -#endif // NEXT WORD(S) rv_first = rv; - rv = lookup((word+i)); // perhaps without prefix + st[i] = ch; + + do { // striple loop + + // check simplifiedtriple + if (simplifiedtriple) { + if (striple) { + checkedstriple = 1; + i--; // check "fahrt" instead of "ahrt" in "Schiffahrt" + } else if (i > 2 && *(word+i - 1) == *(word + i - 2)) striple = 1; + } + + rv = lookup((st+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && 
TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || - (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { + (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))) || + (scpd != 0 && checkcpdtable[scpd-1].cond2 != FLAG_NULL && + !TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) + )) { rv = rv->next_homonym; } - if (rv && words && words[wnum + 1]) return rv; + if (rv && words && words[wnum + 1]) return rv_first; oldnumsyllable2 = numsyllable; oldwordnum2 = wordnum; @@ -1833,20 +1939,27 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable==0) || - (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) - ) - && ( + ((cpdmaxsyllable!=0) && + (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable)) + ) && + ( + // test CHECKCOMPOUNDPATTERN + !numcheckcpd || scpd != 0 || !cpdpat_check(word, i, rv_first, rv) + ) && + ( (!checkcompounddup || (rv != rv_first)) ) + // test CHECKCOMPOUNDPATTERN conditions + && (scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen)) ) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word,len)) return NULL; - return rv; + return rv_first; } - numsyllable = oldnumsyllable2 ; + numsyllable = oldnumsyllable2; wordnum = oldwordnum2; // perhaps second word has prefix or/and suffix @@ -1858,12 +1971,20 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, pfx = NULL; rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END); } - + if (!rv && numdefcpd && words) { rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); - if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv; + if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv_first; + rv = NULL; } + // 
test CHECKCOMPOUNDPATTERN conditions (allowed forms) + if (rv && !(scpd == 0 || checkcpdtable[scpd-1].cond2 == FLAG_NULL || + TESTAFF(rv->astr, checkcpdtable[scpd-1].cond2, rv->alen))) rv = NULL; + + // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds) + if (rv && numcheckcpd && scpd == 0 && cpdpat_check(word, i, rv_first, rv)) rv = NULL; + // check non_compound flag in suffix and prefix if ((rv) && ((pfx && ((PfxEntry*)pfx)->getCont() && @@ -1887,7 +2008,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if (langnum == LANG_hu) { // calculate syllable number of the word numsyllable += get_syllable(word + i, strlen(word + i)); - + // - affix syllable num. // XXX only second suffix (inflections, not derivations) if (sfxappnd) { @@ -1895,13 +2016,13 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, numsyllable -= get_syllable(tmp, strlen(tmp)); free(tmp); } - + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; // increment syllable num, if last word has a SYLLABLENUM flag // and the suffix is beginning `s' - + if (cpdsyllablenum) { switch (sfxflag) { case 'c': { numsyllable+=2; break; } @@ -1910,7 +2031,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, } } } - + // increment word number, if the second word has a compoundroot flag if ((rv) && (compoundroot) && (TESTAFF(rv->astr, compoundroot, rv->alen))) { @@ -1924,7 +2045,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, if ((rv) && ( ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) || - ((cpdmaxsyllable == 0) || + ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable)) ) && ( @@ -1932,41 +2053,61 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, )) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && 
cpdrep_check(word, len)) return NULL; - return rv; + return rv_first; } numsyllable = oldnumsyllable2; wordnum = oldwordnum2; -#ifdef HUNSTEM - if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum; -#endif + // perhaps second word is a compound word (recursive call) if (wordnum < maxwordnum) { - rv = compound_check((word+i),strlen(word+i), wordnum+1, - numsyllable, maxwordnum, wnum + 1, words, - 0, cmpdstemnum, cmpdstem, is_sug); + rv = compound_check((st+i),strlen(st+i), wordnum+1, + numsyllable, maxwordnum, wnum + 1, words, 0, is_sug); + if (rv && numcheckcpd && (scpd == 0 && cpdpat_check(word, i, rv_first, rv) || + scpd != 0 && !cpdpat_check(word, i, rv_first, rv))) rv = NULL; } else { rv=NULL; } if (rv) { // forbid compound word, if it is a non compound word with typical fault if (checkcompoundrep && cpdrep_check(word, len)) return NULL; - return rv; - } else { -#ifdef HUNSTEM - if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum; -#endif + return rv_first; } + } while (striple && !checkedstriple); // end of striple loop + + if (checkedstriple) { + i++; + checkedstriple = 0; + striple = 0; + } + + } // first word is ok condition + + if (soldi != 0) { + i = soldi; + soldi = 0; + len = oldlen; + cmin = oldcmin; + cmax = oldcmax; } - st[i] = ch; + scpd++; + + } while (simplifiedcpd && scpd <= numcheckcpd); // end of simplifiedcpd loop + + if (soldi != 0) { + i = soldi; + strcpy(st, word); // XXX add more optim. + soldi = 0; + } else st[i] = ch; + + scpd = 0; wordnum = oldwordnum; numsyllable = oldnumsyllable; } - + return NULL; -} +} -#ifdef HUNSPELL_EXPERIMENTAL // check if compound word is correctly spelled // hu_mov_rule = spec. 
Hungarian rule (XXX) int AffixMgr::compound_check_morph(const char * word, int len, @@ -1982,26 +2123,14 @@ int AffixMgr::compound_check_morph(const char * word, int len, struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking char st [MAXWORDUTF8LEN + 4]; char ch; - + int checked_prefix; char presult[MAXLNLEN]; int cmin; int cmax; - - if (utf8) { - for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { - cmin++; - for (; (word[cmin] & 0xc0) == 0x80; cmin++); - } - for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { - cmax--; - for (; (word[cmax] & 0xc0) == 0x80; cmax--); - } - } else { - cmin = cpdmin; - cmax = len - cpdmin + 1; - } + + setcminmax(&cmin, &cmax, word, len); strcpy(st, word); @@ -2015,7 +2144,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, for (; (st[i] & 0xc0) == 0x80; i++); if (i >= cmax) return 0; } - + ch = st[i]; st[i] = '\0'; sfx = NULL; @@ -2023,12 +2152,12 @@ int AffixMgr::compound_check_morph(const char * word, int len, // FIRST WORD *presult = '\0'; if (partresult) strcat(presult, partresult); - + rv = lookup(st); // perhaps without prefix // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -2042,13 +2171,16 @@ int AffixMgr::compound_check_morph(const char * word, int len, } if (rv) { - if (rv->description) { - if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) - strcat(presult, st); - strcat(presult, rv->description); + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st); } - } - + // store the pointer of the hash entry +// sprintf(presult + strlen(presult), 
"%c%s%p", MSEP_FLD, MORPH_HENTRY, rv); + if (HENTRY_DATA(rv)) { + sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA2(rv)); + } + } if (!rv) { if (compoundflag && !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { @@ -2062,7 +2194,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = NULL; } } - + if (rv || (((wordnum == 0) && compoundbegin && ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || @@ -2071,35 +2203,28 @@ int AffixMgr::compound_check_morph(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) { - //char * p = prefix_check_morph(st, i, 0, compound); + // char * p = prefix_check_morph(st, i, 0, compound); char * p = NULL; if (compoundflag) p = affix_check_morph(st, i, compoundflag); if (!p || (*p == '\0')) { + if (p) free(p); + p = NULL; if ((wordnum == 0) && compoundbegin) { p = affix_check_morph(st, i, compoundbegin); } else if ((wordnum > 0) && compoundmiddle) { p = affix_check_morph(st, i, compoundmiddle); } } - if (*p != '\0') { - line_uniq(p); - if (strchr(p, '\n')) { - strcat(presult, "("); - strcat(presult, line_join(p, '|')); - strcat(presult, ")"); - } else { - strcat(presult, p); - } - } - if (presult[strlen(presult) - 1] == '\n') { - presult[strlen(presult) - 1] = '\0'; + if (p && (*p != '\0')) { + sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, + MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); } + if (p) free(p); checked_prefix = 1; - //strcat(presult, "+"); } // else check forbiddenwords } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, pseudoroot, rv->alen))) { + TESTAFF(rv->astr, needaffix, rv->alen))) { st[i] = ch; continue; } @@ -2162,7 +2287,7 @@ int 
AffixMgr::compound_check_morph(const char * word, int len, ) // END of LANG_hu section ) - && ! (( checkcompoundtriple && // test triple letters + && ! (( checkcompoundtriple && !words && // test triple letters (word[i-1]==word[i]) && ( ((i>1) && (word[i-1]==word[i-2])) || ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' @@ -2170,10 +2295,10 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) || ( // test CHECKCOMPOUNDPATTERN - numcheckcpd && cpdpat_check(word, i) + numcheckcpd && !words && cpdpat_check(word, i, rv, NULL) ) || ( - checkcompoundcase && cpdcase_check(word, i) + checkcompoundcase && !words && cpdcase_check(word, i) )) ) // LANG_hu section: spec. Hungarian rule @@ -2202,7 +2327,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = lookup((word+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { @@ -2211,11 +2336,21 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && words[wnum + 1]) { strcat(*result, presult); - if (complexprefixes && rv->description) strcat(*result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) - strcat(*result, rv->word); - if (!complexprefixes && rv->description) strcat(*result, rv->description); + strcat(*result, " "); + strcat(*result, MORPH_PART); + strcat(*result, word+i); + if (complexprefixes && HENTRY_DATA(rv)) strcat(*result, HENTRY_DATA2(rv)); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + strcat(*result, " "); + strcat(*result, MORPH_STEM); + strcat(*result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// 
sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); + if (!complexprefixes && HENTRY_DATA(rv)) { + strcat(*result, " "); + strcat(*result, HENTRY_DATA2(rv)); + } strcat(*result, "\n"); ok = 1; return 0; @@ -2240,7 +2375,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, st[i] = ch; continue; } - + // second word is acceptable, as a root? // hungarian conventions: compounding is acceptable, // when compound forms consist of 2 words, or if more, @@ -2251,8 +2386,8 @@ int AffixMgr::compound_check_morph(const char * word, int len, ) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable==0) || - (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) + ((cpdmaxsyllable!=0) && + (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable)) ) && ( (!checkcompounddup || (rv != rv_first)) @@ -2261,12 +2396,23 @@ int AffixMgr::compound_check_morph(const char * word, int len, { // bad compound word strcat(*result, presult); - - if (rv->description) { - if (complexprefixes) strcat(*result, rv->description); - if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) - strcat(*result, rv->word); - if (!complexprefixes) strcat(*result, rv->description); + strcat(*result, " "); + strcat(*result, MORPH_PART); + strcat(*result, word+i); + + if (HENTRY_DATA(rv)) { + if (complexprefixes) strcat(*result, HENTRY_DATA2(rv)); + if (! 
HENTRY_FIND(rv, MORPH_STEM)) { + strcat(*result, " "); + strcat(*result, MORPH_STEM); + strcat(*result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); + if (!complexprefixes) { + strcat(*result, " "); + strcat(*result, HENTRY_DATA2(rv)); + } } strcat(*result, "\n"); ok = 1; @@ -2292,20 +2438,16 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) + if ((!m || *m == '\0') && compoundend) { + if (m) free(m); m = affix_check_morph((word+i),strlen(word+i), compoundend); + } strcat(*result, presult); - if (m) { - line_uniq(m); - if (strchr(m, '\n')) { - strcat(*result, "("); - strcat(*result, line_join(m, '|')); - strcat(*result, ")"); - } else { - strcat(*result, m); - } - free(m); + if (m || (*m != '\0')) { + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); } + if (m) free(m); strcat(*result, "\n"); ok = 1; } @@ -2324,7 +2466,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen)) - && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) { + && (! 
TESTAFF(rv->astr, needaffix, rv->alen))) { st[i] = ch; continue; } @@ -2368,7 +2510,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, if ((rv) && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || - ((cpdmaxsyllable==0) || + ((cpdmaxsyllable!=0) && (numsyllable <= cpdmaxsyllable)) ) && ( @@ -2376,21 +2518,17 @@ int AffixMgr::compound_check_morph(const char * word, int len, )) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) + if ((!m || *m == '\0') && compoundend) { + if (m) free(m); m = affix_check_morph((word+i),strlen(word+i), compoundend); + } strcat(*result, presult); - if (m) { - line_uniq(m); - if (strchr(m, '\n')) { - strcat(*result, "("); - strcat(*result, line_join(m, '|')); - strcat(*result, ")"); - } else { - strcat(*result, m); - } - free(m); + if (m && (*m != '\0')) { + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); } - strcat(*result, "\n"); + if (m) free(m); + sprintf(*result + strlen(*result), "%c", MSEP_REC); ok = 1; } @@ -2411,7 +2549,6 @@ int AffixMgr::compound_check_morph(const char * word, int len, } return 0; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE // return 1 if s1 (reversed) is a leading subset of end of s2 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) @@ -2442,8 +2579,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, const FLAG cclass, const FLAG needflag, char in_compound) { struct hentry * rv = NULL; - char result[MAXLNLEN]; - PfxEntry* ep = (PfxEntry *) ppfx; // first handle the special case of 0 length suffixes @@ -2467,11 +2602,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix 
(cclass || - !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) ) { @@ -2509,11 +2644,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) ) { @@ -2523,17 +2658,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, sfx=(AffEntry *)sptr; // BUG: sfx not stateless sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless - if (cclass || sptr->getCont()) { - if (!derived) { - derived = mystrdup(word); - } else { - strcpy(result, derived); // XXX check size - strcat(result, "\n"); - strcat(result, word); - free(derived); - derived = mystrdup(result); - } - } return rv; } } @@ -2588,7 +2712,6 @@ struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag) { @@ -2610,12 +2733,18 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); if (st) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) 
ppfx)->getMorph()) { + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - if (se->getMorph()) strcat(result, se->getMorph()); - strcat(result, "\n"); + if (se->getMorph()) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, se->getMorph(), MAXLNLEN); + } else debugflag(result, se->getFlag()); + mystrcat(result, "\n", MAXLNLEN); } } se = se->getNext(); @@ -2637,19 +2766,14 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, free(st); result3[0] = '\0'; -#ifdef DEBUG - unsigned short flag = sptr->getFlag(); - if (flag_mode == FLAG_NUM) { - sprintf(result3, "<%d>", sptr->getKey()); - } else if (flag_mode == FLAG_LONG) { - sprintf(result3, "<%c%c>", flag >> 8, (flag << 8) >>8); - } else sprintf(result3, "<%c>", flag); - strcat(result3, ":"); -#endif - if (sptr->getMorph()) strcat(result3, sptr->getMorph()); + + if (sptr->getMorph()) { + mystrcat(result3, " ", MAXLNLEN); + mystrcat(result3, sptr->getMorph(), MAXLNLEN); + } else debugflag(result3, sptr->getFlag()); strlinecat(result2, result3); - strcat(result2, "\n"); - strcat(result, result2); + mystrcat(result2, "\n", MAXLNLEN); + mystrcat(result, result2, MAXLNLEN); } } sptr = sptr->getNextEQ(); @@ -2657,7 +2781,7 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, sptr = sptr->getNextNE(); } } - if (result) return mystrdup(result); + if (*result) return mystrdup(result); return NULL; } @@ -2692,26 +2816,40 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + !(se->getCont() && 
TESTAFF(se->getCont(), needaffix, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) )) rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); + } + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); + + if (!complexprefixes && HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); } - if (complexprefixes && rv->description) strcat(result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) - strcat(result, rv->word); - if (!complexprefixes && rv->description) strcat(result, rv->description); - if (se->getMorph()) strcat(result, se->getMorph()); - strcat(result, "\n"); + if (se->getMorph()) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, se->getMorph(), MAXLNLEN); + } else debugflag(result, se->getFlag()); + mystrcat(result, "\n", MAXLNLEN); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } } @@ -2741,30 +2879,36 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // pseudoroot on first suffix + // needaffix on first suffix (cclass || !(sptr->getCont() && - TESTAFF(sptr->getCont(), 
pseudoroot, sptr->getContLen()))) + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + mystrcat(result, ((PfxEntry *) ppfx)->getMorph(), MAXLNLEN); + mystrcat(result, " ", MAXLNLEN); + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } - if (complexprefixes && rv->description) strcat(result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word); - if (!complexprefixes && rv->description) strcat(result, rv->description); -#ifdef DEBUG - unsigned short flag = sptr->getFlag(); - if (flag_mode == FLAG_NUM) { - sprintf(result, "<%d>", sptr->getKey()); - } else if (flag_mode == FLAG_LONG) { - sprintf(result, "<%c%c>", flag >> 8, (flag << 8) >>8); - } else sprintf(result, "<%c>", flag); - strcat(result, ":"); -#endif + if (complexprefixes && HENTRY_DATA(rv)) mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + if (! 
HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, HENTRY_WORD(rv), MAXLNLEN); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); - if (sptr->getMorph()) strcat(result, sptr->getMorph()); - strcat(result, "\n"); + if (!complexprefixes && HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + } + + if (sptr->getMorph()) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, sptr->getMorph(), MAXLNLEN); + } else debugflag(result, sptr->getFlag()); + mystrcat(result, "\n", MAXLNLEN); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } sptr = sptr->getNextEQ(); @@ -2776,15 +2920,11 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - // check if word with affixes is correctly spelled struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) { struct hentry * rv= NULL; - if (derived) free(derived); - derived = NULL; // check all prefixes (also crossed with suffixes if allowed) rv = prefix_check(word, len, in_compound, needflag); @@ -2806,7 +2946,6 @@ struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG ne return rv; } -#ifdef HUNSPELL_EXPERIMENTAL // check if word with affixes is correctly spelled char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) { @@ -2818,14 +2957,14 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl // check all prefixes (also crossed with suffixes if allowed) st = prefix_check_morph(word, len, in_compound); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } // if still not found check all suffixes st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, 
in_compound); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } @@ -2835,54 +2974,133 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl // if still not found check all two-level suffixes st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } // if still not found check all two-level suffixes st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } } - + return mystrdup(result); } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE + +char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, + unsigned short al, char * morph, char * targetmorph, int level) +{ + // handle suffixes + char * stemmorph; + char * stemmorphcatpos; + char mymorph[MAXLNLEN]; + + if (!morph && !targetmorph) return NULL; + + // check substandard flag + if (TESTAFF(ap, substandard, al)) return NULL; + + if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); + +// int targetcount = get_sfxcount(targetmorph); + + // use input suffix fields, if exist + if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { + stemmorph = mymorph; + strcpy(stemmorph, morph); + strcat(stemmorph, " "); + stemmorphcatpos = stemmorph + strlen(stemmorph); + } else { + stemmorph = morph; + stemmorphcatpos = NULL; + } + + for (int i = 0; i < al; i++) { + const unsigned char c = (unsigned char) (ap[i] & 0x00FF); + SfxEntry * sptr = (SfxEntry *)sFlag[c]; + while (sptr) { + if (sptr->getFlag() == ap[i] && sptr->getMorph() && ((sptr->getContLen() == 0) || + // don't generate forms with substandard affixes + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { + + if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); + else stemmorph = (char *) sptr->getMorph(); + + int cmp = morphcmp(stemmorph, targetmorph); + + if (cmp == 0) { + char * newword = 
sptr->add(ts, wl); + if (newword) { + hentry * check = pHMgr->lookup(newword); // XXX extra dic + if (!check || !check->astr || + !TESTAFF(check->astr, forbiddenword, check->alen)) { + return newword; + } + free(newword); + } + } + + // recursive call for secondary suffixes + if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && +// (get_sfxcount(stemmorph) < targetcount) && + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { + char * newword = sptr->add(ts, wl); + if (newword) { + char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(), + sptr->getContLen(), stemmorph, targetmorph, 1); + + if (newword2) { + free(newword); + return newword2; + } + free(newword); + newword = NULL; + } + } + } + sptr = (SfxEntry *)sptr ->getFlgNxt(); + } + } + return NULL; +} int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, int badl) + int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, + char * phon) { - int nh=0; - // first add root word to list - if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) || + if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { wlst[nh].word = mystrdup(ts); + if (!wlst[nh].word) return 0; wlst[nh].allow = (1 == 0); + wlst[nh].orig = NULL; nh++; + // add special phonetic version + if (phon && (nh < maxn)) { + wlst[nh].word = mystrdup(phon); + if (!wlst[nh].word) return nh - 1; + wlst[nh].allow = (1 == 0); + wlst[nh].orig = mystrdup(ts); + if (!wlst[nh].orig) return nh - 1; + nh++; + } } // handle suffixes for (int i = 0; i < al; i++) { -#ifdef HUNSPELL_CHROME_CLIENT - // This change is taken from a future version of Hunspell. In other - // places, the index is clamped to a byte, so I think this is correct. - // Our array is only 256 entries anyway, so it is required. 
const unsigned char c = (unsigned char) (ap[i] & 0x00FF); -#else - unsigned short c = (unsigned short) ap[i]; -#endif SfxEntry * sptr = (SfxEntry *)sFlag[c]; while (sptr) { - if (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && - (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0)) && - // check pseudoroot flag - !(sptr->getCont() && ((pseudoroot && - TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && + (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && + // check needaffix flag + !(sptr->getCont() && ((needaffix && + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || (circumfix && TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || (onlyincompound && @@ -2892,8 +3110,22 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (newword) { if (nh < maxn) { wlst[nh].word = newword; - wlst[nh].allow = sptr->allowCross(); - nh++; + wlst[nh].allow = sptr->allowCross(); + wlst[nh].orig = NULL; + nh++; + // add special phonetic version + if (phon && (nh < maxn)) { + char st[MAXWORDUTF8LEN]; + strcpy(st, phon); + strcat(st, sptr->getKey()); + reverseword(st + strlen(phon)); + wlst[nh].word = mystrdup(st); + if (!wlst[nh].word) return nh - 1; + wlst[nh].allow = (1 == 0); + wlst[nh].orig = mystrdup(newword); + if (!wlst[nh].orig) return nh - 1; + nh++; + } } else { free(newword); } @@ -2909,15 +3141,10 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts for (int j=1;j<n ;j++) if (wlst[j].allow) { for (int k = 0; k < al; k++) { -#ifdef HUNSPELL_CHROME_CLIENT - // See similar change above. 
const unsigned char c = (unsigned char) (ap[k] & 0x00FF); -#else - unsigned short c = (unsigned short) ap[k]; -#endif PfxEntry * cptr = (PfxEntry *) pFlag[c]; while (cptr) { - if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && + if ((cptr->getFlag() == ap[k]) && cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { int l1 = strlen(wlst[j].word); char * newword = cptr->add(wlst[j].word, l1); @@ -2925,6 +3152,7 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (nh < maxn) { wlst[nh].word = newword; wlst[nh].allow = cptr->allowCross(); + wlst[nh].orig = NULL; nh++; } else { free(newword); @@ -2939,19 +3167,14 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts // now handle pure prefixes for (int m = 0; m < al; m ++) { -#ifdef HUNSPELL_CHROME_CLIENT - // See similar change above. const unsigned char c = (unsigned char) (ap[m] & 0x00FF); -#else - unsigned short c = (unsigned short) ap[m]; -#endif PfxEntry * ptr = (PfxEntry *) pFlag[c]; while (ptr) { - if (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && - (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0)) && - // check pseudoroot flag - !(ptr->getCont() && ((pseudoroot && - TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) || + if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && + (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && + // check needaffix flag + !(ptr->getCont() && ((needaffix && + TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || (circumfix && TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || (onlyincompound && @@ -2962,6 +3185,7 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts if (nh < maxn) { wlst[nh].word = newword; wlst[nh].allow = ptr->allowCross(); + wlst[nh].orig = NULL; nh++; } else { free(newword); @@ -2975,8 +3199,6 @@ 
int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts return nh; } - - // return length of replacing table int AffixMgr::get_numrep() { @@ -2990,6 +3212,27 @@ struct replentry * AffixMgr::get_reptable() return reptable; } +// return iconv table +RepList * AffixMgr::get_iconvtable() +{ + if (! iconvtable ) return NULL; + return iconvtable; +} + +// return oconv table +RepList * AffixMgr::get_oconvtable() +{ + if (! oconvtable ) return NULL; + return oconvtable; +} + +// return replacing table +struct phonetable * AffixMgr::get_phonetable() +{ + if (! phone ) return NULL; + return phone; +} + // return length of character map table int AffixMgr::get_nummap() { @@ -3019,9 +3262,7 @@ char ** AffixMgr::get_breaktable() // return text encoding of dictionary char * AffixMgr::get_encoding() { - if (! encoding ) { - encoding = mystrdup("ISO8859-1"); - } + if (! encoding ) encoding = mystrdup(SPELL_ENCODING); return mystrdup(encoding); } @@ -3037,6 +3278,12 @@ int AffixMgr::get_complexprefixes() return complexprefixes; } +// return FULLSTRIP option +int AffixMgr::get_fullstrip() +{ + return fullstrip; +} + FLAG AffixMgr::get_keepcase() { return keepcase; @@ -3047,11 +3294,17 @@ int AffixMgr::get_checksharps() return checksharps; } +char * AffixMgr::encode_flag(unsigned short aflag) +{ + return pHMgr->encode_flag(aflag); +} + + // return the preferred ignore string for suggestions char * AffixMgr::get_ignore() { if (!ignorechars) return NULL; - return mystrdup(ignorechars); + return ignorechars; } // return the preferred ignore string for suggestions @@ -3061,6 +3314,13 @@ unsigned short * AffixMgr::get_ignore_utf16(int * len) return ignorechars_utf16; } +// return the keyboard string for suggestions +char * AffixMgr::get_key_string() +{ + if (! 
keystring ) keystring = mystrdup(SPELL_KEYSTRING); + return mystrdup(keystring); +} + // return the preferred try string for suggestions char * AffixMgr::get_try_string() { @@ -3105,9 +3365,9 @@ FLAG AffixMgr::get_nosuggest() } // return the forbidden words flag modify flag -FLAG AffixMgr::get_pseudoroot() +FLAG AffixMgr::get_needaffix() { - return pseudoroot; + return needaffix; } // return the onlyincompound flag @@ -3147,12 +3407,6 @@ const char * AffixMgr::get_suffix() return sfxappnd; } -// return the value of derived form (base word with first suffix). -const char * AffixMgr::get_derived() -{ - return derived; -} - // return the value of suffix const char * AffixMgr::get_version() { @@ -3168,8 +3422,12 @@ FLAG AffixMgr::get_lemma_present() // utility method to look up root words in hash table struct hentry * AffixMgr::lookup(const char * word) { - if (! pHMgr) return NULL; - return pHMgr->lookup(word); + int i; + struct hentry * he = NULL; + for (i = 0; i < *maxdic && !he; i++) { + he = (alldic[i])->lookup(word); + } + return he; } // return the value of suffix @@ -3203,33 +3461,47 @@ int AffixMgr::get_sugswithdots(void) } /* parse flag */ -int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) { +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_flag(char * line, unsigned short * out) +#else +int AffixMgr::parse_flag(char * line, unsigned short * out, FileMgr * af) +#endif +{ char * s = NULL; - if (*out != FLAG_NULL) { - HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); + if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { + HUNSPELL_WARNING(stderr, "error:multiple definitions of an affix file parameter\n"); return 1; } - if (parse_string(line, &s, name)) return 1; + if (parse_string(line, &s, 0)) return 1; *out = pHMgr->decode_flag(s); free(s); return 0; } /* parse num */ -int AffixMgr::parse_num(char * line, int * out, const char * name) { +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_num(char * line, int * out) 
+#else +int AffixMgr::parse_num(char * line, int * out, FileMgr * af) +#endif +{ char * s = NULL; if (*out != -1) { - HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); + HUNSPELL_WARNING(stderr, "error: multiple definitions of an affix file parameter\n"); return 1; } - if (parse_string(line, &s, name)) return 1; + if (parse_string(line, &s, 0)) return 1; *out = atoi(s); free(s); return 0; } /* parse in the max syllablecount of compound words and */ +#ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_cpdsyllable(char * line) +#else +int AffixMgr::parse_cpdsyllable(char * line, FileMgr * af) +#endif { char * tp = line; char * piece; @@ -3262,7 +3534,6 @@ int AffixMgr::parse_cpdsyllable(char * line) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np < 2) { @@ -3273,12 +3544,12 @@ int AffixMgr::parse_cpdsyllable(char * line) return 0; } -/* parse in the typical fault correcting table */ #ifndef HUNSPELL_CHROME_CLIENT -int AffixMgr::parse_reptable(char * line, FILE * af) +/* parse in the typical fault correcting table */ +int AffixMgr::parse_reptable(char * line, FileMgr * af) { if (numrep != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n"); + HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum()); return 1; } char * tp = line; @@ -3293,8 +3564,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) case 1: { numrep = atoi(piece); if (numrep < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n", af->getlinenum()); return 1; } reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); @@ -3306,18 +3576,17 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing replacement table information\n"); + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", 
af->getlinenum()); return 1; } /* now parse the numrep lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numrep; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3329,8 +3598,8 @@ int AffixMgr::parse_reptable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece,"REP",3) != 0) { - HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); + numrep = 0; return 1; } break; @@ -3341,27 +3610,210 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); + numrep = 0; + return 1; + } + } + return 0; +} +#endif + +/* parse in the typical fault correcting table */ +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_convtable(char * line, hunspell::LineIterator* iterator, RepList ** rl, const char * keyword) +#else +int AffixMgr::parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword) +#endif +{ + if (*rl) { + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + int numrl = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numrl = atoi(piece); + if (numrl < 1) { + HUNSPELL_WARNING(stderr, "error: incorrect entry number\n"); + return 1; + } + *rl = new RepList(numrl); + if (!rl) return 1; + np++; + break; + } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: missing data\n"); + return 1; + } + + /* now parse the num lines to 
read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numrl; j++) { +#ifdef HUNSPELL_CHROME_CLIENT + if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) + return 1; +#else + if (!(nl = af->getline())) return 1; +#endif + mychomp(nl); + tp = nl; + i = 0; + char * pattern = NULL; + char * pattern2 = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece, keyword, sizeof(keyword)) != 0) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + delete *rl; + *rl = NULL; + return 1; + } + break; + } + case 1: { pattern = mystrrep(mystrdup(piece),"_"," "); break; } + case 2: { + pattern2 = mystrrep(mystrdup(piece),"_"," "); + break; + } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (!pattern || !pattern2) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } + (*rl)->add(pattern, pattern2); } return 0; } + + +/* parse in the typical fault correcting table */ +#ifdef HUNSPELL_CHROME_CLIENT +int AffixMgr::parse_phonetable(char * line, hunspell::LineIterator* iterator) +#else +int AffixMgr::parse_phonetable(char * line, FileMgr * af) #endif +{ + if (phone) { + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + phone = (phonetable *) malloc(sizeof(struct phonetable)); + phone->num = atoi(piece); + phone->rules = NULL; + phone->utf8 = (char) utf8; + if (!phone) return 1; + if (phone->num < 1) { + HUNSPELL_WARNING(stderr, "error: line bad entry number\n"); + return 1; + } + phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *)); + if (!phone->rules) return 1; + np++; + break; + } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if (np != 2) { + HUNSPELL_WARNING(stderr, "error: missing data\n"); + return 
1; + } + + /* now parse the phone->num lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < phone->num; j++) { +#ifdef HUNSPELL_CHROME_CLIENT + if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) + return 1; +#else + if (!(nl = af->getline())) return 1; +#endif + mychomp(nl); + tp = nl; + i = 0; + phone->rules[j * 2] = NULL; + phone->rules[j * 2 + 1] = NULL; + piece = mystrsep(&tp, 0); + while (piece) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"PHONE",5) != 0) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + phone->num = 0; + return 1; + } + break; + } + case 1: { phone->rules[j * 2] = mystrrep(mystrdup(piece),"_",""); break; } + case 2: { phone->rules[j * 2 + 1] = mystrrep(mystrdup(piece),"_",""); break; } + default: break; + } + i++; + } + piece = mystrsep(&tp, 0); + } + if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + phone->num = 0; + return 1; + } + } + phone->rules[phone->num * 2] = mystrdup(""); + phone->rules[phone->num * 2 + 1] = mystrdup(""); + init_phonet_hash(*phone); + return 0; +} /* parse in the checkcompoundpattern table */ #if HUNSPELL_CHROME_CLIENT int AffixMgr::parse_checkcpdtable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_checkcpdtable(char * line, FILE * af) +int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) #endif { if (numcheckcpd != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3376,11 +3828,10 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) case 1: { numcheckcpd = atoi(piece); if (numcheckcpd < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } - checkcpdtable = (replentry *) 
malloc(numcheckcpd * sizeof(struct replentry)); + checkcpdtable = (patentry *) malloc(numcheckcpd * sizeof(struct patentry)); if (!checkcpdtable) return 1; np++; break; @@ -3389,14 +3840,13 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing compound pattern table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; - } - + } + /* now parse the numcheckcpd lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numcheckcpd; j++) { @@ -3404,36 +3854,55 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; i = 0; checkcpdtable[j].pattern = NULL; checkcpdtable[j].pattern2 = NULL; + checkcpdtable[j].pattern3 = NULL; + checkcpdtable[j].cond = FLAG_NULL; + checkcpdtable[j].cond2 = FLAG_NULL; piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { - HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numcheckcpd = 0; return 1; } break; } - case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; } - case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; } + case 1: { + checkcpdtable[j].pattern = mystrdup(piece); + char * p = strchr(checkcpdtable[j].pattern, '/'); + if (p) { + *p = '\0'; + checkcpdtable[j].cond = pHMgr->decode_flag(p + 1); + } + break; } + case 2: { + checkcpdtable[j].pattern2 = mystrdup(piece); + char * p = strchr(checkcpdtable[j].pattern2, '/'); + if (p) { + *p = '\0'; + checkcpdtable[j].cond2 = pHMgr->decode_flag(p + 1); + } + break; + } + case 3: { checkcpdtable[j].pattern3 = mystrdup(piece); simplifiedcpd = 1; break; } 
default: break; } i++; } - free(piece); piece = mystrsep(&tp, 0); } if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { - HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numcheckcpd = 0; return 1; } } @@ -3444,11 +3913,11 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_defcpdtable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_defcpdtable(char * line, FILE * af) +int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) #endif { if (numdefcpd != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3463,8 +3932,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) case 1: { numdefcpd = atoi(piece); if (numdefcpd < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); @@ -3476,11 +3944,10 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing compound rule table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; } @@ -3491,7 +3958,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3503,26 +3970,46 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { - HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: 
table is corrupt\n"); + numdefcpd = 0; return 1; } break; } - case 1: { - defcpdtable[j].len = - pHMgr->decode_flags(&(defcpdtable[j].def), piece); + case 1: { // handle parenthesized flags + if (strchr(piece, '(')) { + defcpdtable[j].def = (FLAG *) malloc(sizeof(piece) * sizeof(FLAG)); + defcpdtable[j].len = 0; + int end = 0; + FLAG * conv; + while (!end) { + char * par = piece + 1; + while (*par != '(' && *par != ')' && *par != '\0') par++; + if (*par == '\0') end = 1; else *par = '\0'; + if (*piece == '(') piece++; + if (*piece == '*' || *piece == '?') { + defcpdtable[j].def[defcpdtable[j].len++] = (FLAG) *piece; + } else if (*piece != '\0') { + int l = pHMgr->decode_flags(&conv, piece); + for (int k = 0; k < l; k++) defcpdtable[j].def[defcpdtable[j].len++] = conv[k]; + free(conv); + } + piece = par + 1; + } + } else { + defcpdtable[j].len = pHMgr->decode_flags(&(defcpdtable[j].def), piece); + } break; } default: break; } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (!defcpdtable[j].len) { - HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: line table is corrupt\n"); + numdefcpd = 0; return 1; } } @@ -3534,11 +4021,11 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_maptable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_maptable(char * line, FILE * af) +int AffixMgr::parse_maptable(char * line, FileMgr * af) #endif { if (nummap != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3553,8 +4040,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) case 1: { nummap = atoi(piece); if (nummap < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } maptable = (mapentry *) malloc(nummap * 
sizeof(struct mapentry)); @@ -3566,11 +4052,10 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing map table information\n"); + HUNSPELL_WARNING(stderr, "error: line missing data\n"); return 1; } @@ -3581,7 +4066,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3594,8 +4079,8 @@ int AffixMgr::parse_maptable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece,"MAP",3) != 0) { - HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + nummap = 0; return 1; } break; @@ -3623,11 +4108,11 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) { - HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + nummap = 0; return 1; } } @@ -3638,11 +4123,11 @@ int AffixMgr::parse_maptable(char * line, FILE * af) #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_breaktable(char * line, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_breaktable(char * line, FILE * af) +int AffixMgr::parse_breaktable(char * line, FileMgr * af) #endif { if (numbreak != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -3657,8 +4142,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) case 1: { numbreak = atoi(piece); if (numbreak < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); 
return 1; } breaktable = (char **) malloc(numbreak * sizeof(char *)); @@ -3670,11 +4154,10 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing word breakpoint table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; } @@ -3685,7 +4168,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3696,8 +4179,8 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) switch(i) { case 0: { if (strncmp(piece,"BREAK",5) != 0) { - HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numbreak = 0; return 1; } break; @@ -3710,21 +4193,45 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (!breaktable) { - HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); + numbreak = 0; return 1; } } return 0; } +void AffixMgr::reverse_condition(char * piece) { + int neg = 0; + for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { + switch(*k) { + case '[': { + if (neg) *(k+1) = '['; else *k = ']'; + break; + } + case ']': { + *k = '['; + if (neg) *(k+1) = '^'; + neg = 0; + break; + } + case '^': { + if (*(k+1) == ']') neg = 1; else *(k+1) = *k; + break; + } + default: { + if (neg) *(k+1) = *k; + } + } + } +} #ifdef HUNSPELL_CHROME_CLIENT int AffixMgr::parse_affix(char * line, const char at, hunspell::LineIterator* iterator) #else -int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags) +int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags) #endif { int numents = 0; // number of affentry structures to 
parse @@ -3748,6 +4255,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // split affix header line into pieces int np = 0; + piece = mystrsep(&tp, 0); while (piece) { if (*piece != '\0') { @@ -3762,10 +4270,11 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag #ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates. if (((at == 'S') && (dupflags[aflag] & dupSFX)) || ((at == 'P') && (dupflags[aflag] & dupPFX))) { - HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl); + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of an affix flag\n", + af->getlinenum()); // return 1; XXX permissive mode for bad dictionaries } - dupflags[aflag] += ((at == 'S') ? dupSFX : dupPFX); + dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); #endif break; } @@ -3778,19 +4287,18 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag numents = atoi(piece); if (numents == 0) { char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s header has incorrect entry count in line %s\n", - err, nl); - free(err); + if (err) { + HUNSPELL_WARNING(stderr, "error: line bad entry number\n"); + free(err); + } return 1; } - ptr = (struct affentry *) calloc(numents, sizeof(struct affentry)); + ptr = (struct affentry *) malloc(numents * sizeof(struct affentry)); if (!ptr) return 1; ptr->opts = ff; if (utf8) ptr->opts += aeUTF8; if (pHMgr->is_aliasf()) ptr->opts += aeALIASF; -#ifdef HUNSPELL_EXPERIMENTAL if (pHMgr->is_aliasm()) ptr->opts += aeALIASM; -#endif ptr->aflag = aflag; } @@ -3798,14 +4306,15 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag } i++; } - free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces if (np != 4) { - char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl); 
- free(err); + char * err = pHMgr->encode_flag(aflag); + if (err) { + HUNSPELL_WARNING(stderr, "error: missing data\n"); + free(err); + } free(ptr); return 1; } @@ -3819,7 +4328,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag if (!iterator->AdvanceAndCopy(nl, MAXLNLEN)) return 1; #else - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif mychomp(nl); tp = nl; @@ -3834,7 +4343,8 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // piece 1 - is type case 0: { np++; - if (nptr != ptr) nptr->opts = ptr->opts; + if (nptr != ptr) nptr->opts = ptr->opts & + (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); break; } @@ -3843,10 +4353,10 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag np++; if (pHMgr->decode_flag(piece) != aflag) { char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); - HUNSPELL_WARNING(stderr, "error: possible incorrect count\n"); - free(err); - free(piece); + if (err) { + HUNSPELL_WARNING(stderr, "error: affix %s is corrupt\n", err); + free(err); + } return 1; } @@ -3873,9 +4383,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // piece 4 - is affix string or 0 for null case 3: { char * dash; -#ifdef HUNSPELL_EXPERIMENTAL nptr->morphcode = NULL; -#endif nptr->contclass = NULL; nptr->contclasslen = 0; np++; @@ -3890,15 +4398,16 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag remove_ignored_chars(piece,ignorechars); } } - + if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } nptr->appnd = mystrdup(piece); - + if (pHMgr->is_aliasf()) { int index = atoi(dash + 1); nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass)); + if (!nptr->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1); } else { 
nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1); flag_qsort(nptr->contclass, 0, nptr->contclasslen); @@ -3921,9 +4430,9 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } - nptr->appnd = mystrdup(piece); + nptr->appnd = mystrdup(piece); } - + nptr->appndl = (unsigned char) strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); @@ -3937,82 +4446,66 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag case 4: { np++; if (complexprefixes) { - int neg = 0; if (utf8) reverseword_utf(piece); else reverseword(piece); - // reverse condition - for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { - switch(*k) { - case '[': { - if (neg) *(k+1) = '['; else *k = ']'; - break; - } - case ']': { - *k = '['; - if (neg) *(k+1) = '^'; - neg = 0; - break; - } - case '^': { - if (*(k+1) == ']') neg = 1; else *(k+1) = *k; - break; - } - default: { - if (neg) *(k+1) = *k; - } - } - } + reverse_condition(piece); } if (nptr->stripl && (strcmp(piece, ".") != 0) && - redundant_condition(at, nptr->strip, nptr->stripl, piece, nl)) + redundant_condition(at, nptr->strip, nptr->stripl, piece, 0)) strcpy(piece, "."); - if (encodeit(nptr,piece)) return 1; + if (at == 'S') { + reverseword(piece); + reverse_condition(piece); + } + if (encodeit(nptr, piece)) return 1; break; } - -#ifdef HUNSPELL_EXPERIMENTAL + case 5: { np++; if (pHMgr->is_aliasm()) { int index = atoi(piece); nptr->morphcode = pHMgr->get_aliasm(index); } else { - if (complexprefixes) { + if (complexprefixes) { // XXX - fix me for morph. gen. 
if (utf8) reverseword_utf(piece); else reverseword(piece); } + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } nptr->morphcode = mystrdup(piece); + if (!nptr->morphcode) return 1; } break; } -#endif - default: break; } i++; } - free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces if (np < 4) { char * err = pHMgr->encode_flag(aflag); - HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); - free(err); + if (err) { + HUNSPELL_WARNING(stderr, "error: affix %s is corrupt\n", err); + free(err); + } free(ptr); return 1; } #ifdef DEBUG -#ifdef HUNSPELL_EXPERIMENTAL // detect unnecessary fields, excepting comments if (basefieldnum) { int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); if (fieldnum != basefieldnum) - HUNSPELL_WARNING(stderr, "warning: bad field number:\n%s\n", nl); + HUNSPELL_WARNING(stderr, "warning: line %d: bad field number\n", af->getlinenum()); } else { basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 
5 : 6); } #endif -#endif nptr++; } @@ -4028,12 +4521,12 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag build_sfxtree((AffEntry *)sfxptr); } nptr++; - } + } free(ptr); return 0; } -int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) { +int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, int linenum) { int condl = strlen(cond); int i; int j; @@ -4046,7 +4539,8 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { if (cond[j] != '[') { if (cond[j] != strip[i]) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } else { neg = (cond[j+1] == '^') ? 1 : 0; @@ -4056,12 +4550,12 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char if (strip[i] == cond[j]) in = 1; } while ((j < (condl - 1)) && (cond[j] != ']')); if (j == (condl - 1) && (cond[j] != ']')) { - HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "error: line %d: missing ] in condition:\n%s\n", linenum); return 0; } if ((!neg && !in) || (neg && in)) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); - return 0; + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } } @@ -4074,7 +4568,8 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { if (cond[j] != ']') { if (cond[j] != strip[i]) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible 
stripping characters and condition\n", linenum); + return 0; } } else { in = 0; @@ -4083,18 +4578,18 @@ int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char if (strip[i] == cond[j]) in = 1; } while ((j > 0) && (cond[j] != '[')); if ((j == 0) && (cond[j] != '[')) { - HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line); + HUNSPELL_WARNING(stderr, "error: error: %d: missing ] in condition:\n%s\n", linenum); return 0; } neg = (cond[j+1] == '^') ? 1 : 0; if ((!neg && !in) || (neg && in)) { - HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line); - return 0; + HUNSPELL_WARNING(stderr, "warning: line %d: incompatible stripping characters and condition\n", linenum); + return 0; } } } if (j < 0) return 1; - } + } } return 0; } diff --git a/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx b/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx index e960068..fa4e217 100644 --- a/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/affixmgr.hxx @@ -13,6 +13,8 @@ using namespace std; #include "atypes.hxx" #include "baseaffix.hxx" #include "hashmgr.hxx" +#include "phonet.hxx" +#include "replist.hxx" // check flag duplication #define dupSFX (1 << 0) @@ -66,12 +68,15 @@ class AffixMgr AffEntry * sFlag[CONTSIZE]; #endif HashMgr * pHMgr; + HashMgr ** alldic; + int * maxdic; + char * keystring; char * trystring; char * encoding; struct cs_info * csconv; int utf8; int complexprefixes; - FLAG compoundflag; + FLAG compoundflag; FLAG compoundbegin; FLAG compoundmiddle; FLAG compoundend; @@ -82,20 +87,25 @@ class AffixMgr int checkcompoundrep; int checkcompoundcase; int checkcompoundtriple; + int simplifiedtriple; FLAG forbiddenword; FLAG nosuggest; - FLAG pseudoroot; + FLAG needaffix; int cpdmin; int numrep; replentry * reptable; + RepList * iconvtable; + RepList * oconvtable; int nummap; mapentry * maptable; int numbreak; char ** breaktable; int 
numcheckcpd; - replentry * checkcpdtable; + patentry * checkcpdtable; + int simplifiedcpd; int numdefcpd; flagentry * defcpdtable; + phonetable * phone; int maxngramsugs; int nosplitsugs; int sugswithdots; @@ -125,7 +135,9 @@ class AffixMgr FLAG circumfix; FLAG onlyincompound; FLAG keepcase; + FLAG substandard; int checksharps; + int fullstrip; int havecontclass; // boolean variable #ifdef HUNSPELL_CHROME_CLIENT @@ -133,68 +145,81 @@ class AffixMgr #else char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix) #endif - flag flag_mode; - + public: + #ifdef HUNSPELL_CHROME_CLIENT - AffixMgr(hunspell::BDictReader* reader, HashMgr* ptr); + AffixMgr(hunspell::BDictReader* reader, HashMgr** ptr, int * md); #else - AffixMgr(FILE* aff_handle, HashMgr * ptr); + AffixMgr(FILE* aff_handle, HashMgr** ptr, int * md, const char * key) #endif ~AffixMgr(); struct hentry * affix_check(const char * word, int len, - const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); + const unsigned short needflag = (unsigned short) 0, + char in_compound = IN_CPD_NOT); struct hentry * prefix_check(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isSubset(const char * s1, const char * s2); struct hentry * prefix_check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isRevSubset(const char * s1, const char * end_of_s2, int len); - struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx, - char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + struct hentry * suffix_check(const char * word, int len, int sfxopts, + AffEntry* ppfx, char ** wlst, int maxSug, int * ns, + const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); struct hentry * suffix_check_twosfx(const char * word, int len, int sfxopts, 
AffEntry* ppfx, const FLAG needflag = FLAG_NULL); char * affix_check_morph(const char * word, int len, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_morph(const char * word, int len, - char in_compound, const FLAG needflag = FLAG_NULL); - char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx, - const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + char in_compound, const FLAG needflag = FLAG_NULL); + char * suffix_check_morph (const char * word, int len, int sfxopts, + AffEntry * ppfx, const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); char * suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL); - int expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, int); + char * morphgen(char * ts, int wl, const unsigned short * ap, + unsigned short al, char * morph, char * targetmorph, int level); - short get_syllable (const char * word, int wlen); - int cpdrep_check(const char * word, int len); - int cpdpat_check(const char * word, int len); - int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all); - int cpdcase_check(const char * word, int len); - inline int candidate_check(const char * word, int len); - struct hentry * compound_check(const char * word, int len, - short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug); + int expand_rootword(struct guessword * wlst, int maxn, const char * ts, + int wl, const unsigned short * ap, unsigned short al, char * bad, + int, 
char *); - int compound_check_morph(const char * word, int len, - short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, char ** result, char * partresult); + short get_syllable (const char * word, int wlen); + int cpdrep_check(const char * word, int len); + int cpdpat_check(const char * word, int len, hentry * r1, hentry * r2); + int defcpd_check(hentry *** words, short wnum, hentry * rv, + hentry ** rwords, char all); + int cpdcase_check(const char * word, int len); + inline int candidate_check(const char * word, int len); + void setcminmax(int * cmin, int * cmax, const char * word, int len); + struct hentry * compound_check(const char * word, int len, short wordnum, + short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char is_sug); - struct hentry * lookup(const char * word); + int compound_check_morph(const char * word, int len, short wordnum, + short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char ** result, char * partresult); + + struct hentry * lookup(const char * word); int get_numrep(); struct replentry * get_reptable(); + RepList * get_iconvtable(); + RepList * get_oconvtable(); + struct phonetable * get_phonetable(); int get_nummap(); struct mapentry * get_maptable(); int get_numbreak(); char ** get_breaktable(); char * get_encoding(); int get_langnum(); + char * get_key_string(); char * get_try_string(); const char * get_wordchars(); unsigned short * get_wordchars_utf16(int * len); @@ -205,8 +230,7 @@ public: FLAG get_compoundbegin(); FLAG get_forbiddenword(); FLAG get_nosuggest(); -// FLAG get_circumfix(); - FLAG get_pseudoroot(); + FLAG get_needaffix(); FLAG get_onlyincompound(); FLAG get_compoundroot(); FLAG get_lemma_present(); @@ -225,6 +249,8 @@ public: int get_sugswithdots(void); FLAG get_keepcase(void); int get_checksharps(void); + char * encode_flag(unsigned short aflag); + int get_fullstrip(); private: #ifdef 
HUNSPELL_CHROME_CLIENT @@ -232,31 +258,37 @@ private: hunspell::BDictReader* bdict_reader; int parse_file(); -#else - int parse_file(FILE* aff_handle); -#endif -// int parse_string(char * line, char ** out, const char * name); - int parse_flag(char * line, unsigned short * out, const char * name); - int parse_num(char * line, int * out, const char * name); -// int parse_array(char * line, char ** out, unsigned short ** out_utf16, -// int * out_utf16_len, const char * name); - int parse_cpdsyllable(char * linfe); -#ifdef HUNSPELL_CHROME_CLIENT - // We just change the FILE* to be an iterator. + int parse_flag(char * line, unsigned short * out); + int parse_num(char * line, int * out); + int parse_cpdsyllable(char * line); + + int parse_reptable(char * line, hunspell::LineIterator* iterator); + int parse_convtable(char * line, hunspell::LineIterator* iterator, RepList ** rl, const char * keyword); + int parse_phonetable(char * line, hunspell::LineIterator* iterator); int parse_maptable(char * line, hunspell::LineIterator* iterator); - int parse_checkcpdtable(char * line, hunspell::LineIterator* iterator); int parse_breaktable(char * line, hunspell::LineIterator* iterator); + int parse_checkcpdtable(char * line, hunspell::LineIterator* iterator); int parse_defcpdtable(char * line, hunspell::LineIterator* iterator); int parse_affix(char * line, const char at, hunspell::LineIterator* iterator); #else - int parse_reptable(char * line, FILE * af); - int parse_maptable(char * line, FILE * af); - int parse_breaktable(char * line, FILE * af); - int parse_checkcpdtable(char * line, FILE * af); - int parse_defcpdtable(char * line, FILE * af); - int parse_affix(char * line, const char at, FILE * af, char * dupflags); + int parse_file(FILE* aff_handle, const char * key); + int parse_flag(char * line, unsigned short * out, FileMgr * af); + int parse_num(char * line, int * out, FileMgr * af); + int parse_cpdsyllable(char * line, FileMgr * af); + + int parse_reptable(char * line, 
FileMgr * af); + int parse_convtable(char * line, FileMgr * af, RepList ** rl, const char * keyword); + int parse_phonetable(char * line, FileMgr * af); + int parse_maptable(char * line, FileMgr * af); + int parse_breaktable(char * line, FileMgr * af); + int parse_checkcpdtable(char * line, FileMgr * af); + int parse_defcpdtable(char * line, FileMgr * af); + int parse_affix(char * line, const char at, FileMgr * af, char * dupflags); #endif + void reverse_condition(char *); + void debugflag(char * result, unsigned short flag); + int condlen(char *); int encodeit(struct affentry * ptr, char * cs); int build_pfxtree(AffEntry* pfxptr); int build_sfxtree(AffEntry* sfxptr); @@ -266,7 +298,8 @@ private: AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr); int process_pfx_tree_to_list(); int process_sfx_tree_to_list(); - int redundant_condition(char, char * strip, int stripl, const char * cond, char *); + int redundant_condition(char, char * strip, int stripl, + const char * cond, int); }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/atypes.hxx b/chrome/third_party/hunspell/src/hunspell/atypes.hxx index 4f6c1ea..4753f9c 100644 --- a/chrome/third_party/hunspell/src/hunspell/atypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/atypes.hxx @@ -5,27 +5,28 @@ #ifdef HUNSPELL_WARNING_ON #define HUNSPELL_WARNING fprintf #else -#define HUNSPELL_WARNING +// empty inline function to switch off warnings (instead of the C99 standard variadic macros) +static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} #endif #endif // HUNSTEM def. 
#define HUNSTEM -#include "csutil.hxx" #include "hashmgr.hxx" +#include "w_char.hxx" #define SETSIZE 256 #define CONTSIZE 65536 #define MAXWORDLEN 100 -#define MAXWORDUTF8LEN (MAXWORDLEN * 4) +#define MAXWORDUTF8LEN 256 // affentry options #define aeXPRODUCT (1 << 0) #define aeUTF8 (1 << 1) #define aeALIASF (1 << 2) #define aeALIASM (1 << 3) -#define aeINFIX (1 << 4) +#define aeLONGCOND (1 << 4) // compound options #define IN_CPD_NOT 0 @@ -33,10 +34,12 @@ #define IN_CPD_END 2 #define IN_CPD_OTHER 3 -#define MAXLNLEN 8192 * 4 +#define MAXLNLEN 8192 #define MINCPDLEN 3 #define MAXCOMPOUND 10 +#define MAXCONDLEN 20 +#define MAXCONDLEN_1 (MAXCONDLEN - sizeof(char *)) #define MAXACC 1000 @@ -55,26 +58,22 @@ struct affentry char numconds; char opts; unsigned short aflag; - union { - char base[SETSIZE]; - struct { - char ascii[SETSIZE/2]; - char neg[8]; - char all[8]; - w_char * wchars[8]; - int wlen[8]; - } utf8; - } conds; -#ifdef HUNSPELL_EXPERIMENTAL - char * morphcode; -#endif unsigned short * contclass; short contclasslen; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char * conds2; + } l; + } c; + char * morphcode; }; -struct replentry { - char * pattern; - char * pattern2; +struct guessword { + char * word; + bool allow; + char * orig; }; struct mapentry { @@ -88,14 +87,12 @@ struct flagentry { int len; }; -struct guessword { - char * word; - bool allow; +struct patentry { + char * pattern; + char * pattern2; + char * pattern3; + FLAG cond; + FLAG cond2; }; #endif - - - - - diff --git a/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx b/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx index d6a5cd6..03a876d 100644 --- a/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx +++ b/chrome/third_party/hunspell/src/hunspell/baseaffix.hxx @@ -6,26 +6,23 @@ class AffEntry public: protected: - char * appnd; - char * strip; - unsigned char appndl; - unsigned char stripl; - char numconds; - char opts; - unsigned short aflag; - 
union { - char base[SETSIZE]; - struct { - char ascii[SETSIZE/2]; - char neg[8]; - char all[8]; - w_char * wchars[8]; - int wlen[8]; - } utf8; - } conds; - char * morphcode; - unsigned short * contclass; - short contclasslen; + char * appnd; + char * strip; + unsigned char appndl; + unsigned char stripl; + char numconds; + char opts; + unsigned short aflag; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char * conds2; + } l; + } c; + char * morphcode; + unsigned short * contclass; + short contclasslen; }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/csutil.cxx b/chrome/third_party/hunspell/src/hunspell/csutil.cxx index 4424b98..c07e34d 100644 --- a/chrome/third_party/hunspell/src/hunspell/csutil.cxx +++ b/chrome/third_party/hunspell/src/hunspell/csutil.cxx @@ -5,10 +5,12 @@ #include <cstdlib> #include <cstring> #include <cstdio> +#include <cctype> #else #include <stdlib.h> #include <string.h> #include <stdio.h> +#include <ctype.h> #endif #include "csutil.hxx" @@ -43,17 +45,18 @@ static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID); using namespace std; #endif #else -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif -struct unicode_info2 * utf_tbl = NULL; +static struct unicode_info2 * utf_tbl = NULL; +static int utf_tbl_count = 0; // utf_tbl can be used by multiple Hunspell instances /* only UTF-16 (BMP) implementation */ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { - char * u8 = dest; - char * u8_max = u8 + size; + signed char * u8 = (signed char *)dest; + signed char * u8_max = (signed char *)(u8 + size); const w_char * u2 = src; const w_char * u2_max = src + srclen; while ((u2 < u2_max) && (u8 < u8_max)) { @@ -100,12 +103,12 @@ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { /* only UTF-16 (BMP) implementation */ int u8_u16(w_char * dest, int size, const char * src) { - const char * u8 = src; + const signed char * u8 = (const signed char *)src; w_char 
* u2 = dest; w_char * u2_max = u2 + size; while ((u2 < u2_max) && *u8) { - switch ((*u8) & 0xf0) { + switch ((*u8) & 0xf0) { case 0x00: case 0x10: case 0x20: @@ -122,7 +125,7 @@ int u8_u16(w_char * dest, int size, const char * src) { case 0x90: case 0xa0: case 0xb0: { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; break; @@ -134,7 +137,7 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -148,12 +151,12 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. 
character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -218,13 +221,11 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { char * mystrsep(char ** stringp, const char delim) { - char * rv = NULL; char * mp = *stringp; - int n = strlen(mp); - if (n > 0) { + if (*mp != '\0') { char * dp; if (delim) { - dp = (char *)memchr(mp,(int)((unsigned char)delim),n); + dp = strchr(mp, delim); } else { // don't use isspace() here, the string can be in some random charset // that's way different than the locale's @@ -234,22 +235,16 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (dp) { *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); - rv = (char *) malloc(nc+1); - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; + *(mp+nc) = '\0'; + return mp; } else { - rv = (char *) malloc(n+1); - memcpy(rv, mp, n); - *(rv+n) = '\0'; - *stringp = mp + n; - return rv; + *stringp = mp + strlen(mp); + return mp; } } return NULL; } - // replaces strdup with ansi version char * mystrdup(const char * s) { @@ -257,12 +252,27 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (s) { int sl = strlen(s); d = (char *) malloc(((sl+1) * sizeof(char))); - if (d) memcpy(d,s,((sl+1)*sizeof(char))); + if (d) { + memcpy(d,s,((sl+1)*sizeof(char))); + return d; + } + HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); } return d; } - - + + // strcat for limited length destination string + char * mystrcat(char * dest, const char * st, int max) { + int len; + int len2; + if (dest == NULL || st == NULL) return dest; + len = strlen(dest); + len2 = strlen(st); + if (len + len2 + 1 > max) return dest; + strcpy(dest + len, st); + return dest; + } + // remove cross-platform text line end characters void mychomp(char * s) { @@ -289,112 +299,258 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { return d; } -#ifdef 
HUNSPELL_EXPERIMENTAL - // append s to ends of every lines in text - void strlinecat(char * dest, const char * s) - { - char * dup = mystrdup(dest); - char * source = dup; - int len = strlen(s); - while (*source) { - if (*source == '\n') { - strncpy(dest, s, len); - dest += len; - } - *dest = *source; - source++; dest++; - } - strcpy(dest, s); - free(dup); - } - // break text to lines // return number of lines -int line_tok(const char * text, char *** lines) { +int line_tok(const char * text, char *** lines, char breakchar) { int linenum = 0; char * dup = mystrdup(text); - char * p = strchr(dup, '\n'); + char * p = strchr(dup, breakchar); while (p) { linenum++; *p = '\0'; p++; - p = strchr(p, '\n'); + p = strchr(p, breakchar); + } + linenum++; +// fprintf(stderr, "LINEN:%d %p %p\n", linenum, lines, *lines); + *lines = (char **) malloc(linenum * sizeof(char *)); +// fprintf(stderr, "hello\n"); + if (!(*lines)) { + free(dup); + return 0; } - *lines = (char **) calloc(linenum + 1, sizeof(char *)); - if (!(*lines)) return -1; - p = dup; - for (int i = 0; i < linenum + 1; i++) { - (*lines)[i] = mystrdup(p); + p = dup; + int l = 0; + for (int i = 0; i < linenum; i++) { + if (*p != '\0') { + (*lines)[l] = mystrdup(p); + if (!(*lines)[l]) { + for (i = 0; i < l; i++) free((*lines)[i]); + free(dup); + return 0; + } + l++; + } p += strlen(p) + 1; } free(dup); - return linenum; + if (!l) free(*lines); + return l; } // uniq line in place -char * line_uniq(char * text) { +char * line_uniq(char * text, char breakchar) { char ** lines; - int linenum = line_tok(text, &lines); + int linenum = line_tok(text, &lines, breakchar); int i; strcpy(text, lines[0]); - for ( i = 1; i<=linenum; i++ ) { + for ( i = 1; i < linenum; i++ ) { int dup = 0; for (int j = 0; j < i; j++) { if (strcmp(lines[i], lines[j]) == 0) dup = 1; } if (!dup) { - if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n"); + if ((i > 1) || (*(lines[0]) != '\0')) { + sprintf(text + strlen(text), "%c", breakchar); + } 
strcat(text, lines[i]); } } - for ( i = 0; i<=linenum; i++ ) { + for ( i = 0; i < linenum; i++ ) { if (lines[i]) free(lines[i]); } if (lines) free(lines); return text; } +// uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) " +char * line_uniq_app(char ** text, char breakchar) { + if (!strchr(*text, breakchar)) { + return *text; + } + + char ** lines; + int i; + int linenum = line_tok(*text, &lines, breakchar); + int dup = 0; + for (i = 0; i < linenum; i++) { + for (int j = 0; j < (i - 1); j++) { + if (strcmp(lines[i], lines[j]) == 0) { + *(lines[i]) = '\0'; + dup++; + break; + } + } + } + if ((linenum - dup) == 1) { + strcpy(*text, lines[0]); + freelist(&lines, linenum); + return *text; + } + char * newtext = (char *) malloc(strlen(*text) + 2 * linenum + 3 + 1); + if (newtext) { + free(*text); + *text = newtext; + } else { + freelist(&lines, linenum); + return *text; + } + strcpy(*text," ( "); + for (i = 0; i < linenum; i++) if (*(lines[i])) { + sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); + } + (*text)[strlen(*text) - 2] = ')'; // " ) " + freelist(&lines, linenum); + return *text; +} + + // append s to ends of every lines in text + void strlinecat(char * dest, const char * s) + { + char * dup = mystrdup(dest); + char * source = dup; + int len = strlen(s); + if (dup) { + while (*source) { + if (*source == '\n') { + strncpy(dest, s, len); + dest += len; + } + *dest = *source; + source++; dest++; + } + strcpy(dest, s); + free(dup); + } + } + // change \n to char c -char * line_join(char * text, char c) { +char * tr(char * text, char oldc, char newc) { char * p; - for (p = text; *p; p++) if (*p == '\n') *p = c; + for (p = text; *p; p++) if (*p == oldc) *p = newc; return text; } -// leave only last {[^}]*} substring for handling zero morphemes -char * delete_zeros(char * morphout) { - char * p = morphout; - char * q = p; - char * q2 = NULL; - int suffix = 0; - - for (;*p && *(p+1);) { - switch (*p) { - case '{': - q2 = q; - q--; - 
break; - case '}': - if (q2) { - suffix = 1; - q--; - } - break; - default: - if (suffix) { - q = q2; - } - suffix = 0; - *q = *p; +// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields +// in the first line of the inputs +// return 0, if inputs equal +// return 1, if inputs may equal with a secondary suffix +// otherwise return -1 +int morphcmp(const char * s, const char * t) +{ + int se = 0; + int te = 0; + const char * sl; + const char * tl; + const char * olds; + const char * oldt; + if (!s || !t) return 1; + olds = s; + sl = strchr(s, '\n'); + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s= strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + tl = strchr(t, '\n'); + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + while (s && t && (!sl || sl > s) && (!tl || tl > t)) { + s += MORPH_TAG_LEN; + t += MORPH_TAG_LEN; + se = 0; + te = 0; + while ((*s == *t) && !se && !te) { + s++; + t++; + switch(*s) { + case ' ': + case '\n': + case '\t': + case '\0': se = 1; + } + switch(*t) { + case ' ': + case '\n': + case '\t': + case '\0': te = 1; + } + } + if (!se || !te) { + // not terminal suffix difference + if (olds) return -1; + return 1; + } + olds = s; + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; } - p++; - q++; } - *q = '\0'; - return morphout; + if (!s && !t && se && te) return 0; + return 1; +} + +int get_sfxcount(const char * morph) +{ + if (!morph || !*morph) return 0; + int n = 0; + const char * old = 
morph; + morph = strstr(morph, MORPH_DERI_SFX); + if (!morph) morph = strstr(old, MORPH_INFL_SFX); + if (!morph) morph = strstr(old, MORPH_TERM_SFX); + while (morph) { + n++; + old = morph; + morph = strstr(morph + 1, MORPH_DERI_SFX); + if (!morph) morph = strstr(old + 1, MORPH_INFL_SFX); + if (!morph) morph = strstr(old + 1, MORPH_TERM_SFX); + } + return n; +} + + +int fieldlen(const char * r) +{ + int n = 0; + while (r && *r != '\t' && *r != '\0' && *r != '\n' && *r != ' ') { + r++; + n++; + } + return n; +} + +char * copy_field(char * dest, const char * morph, const char * var) +{ + if (!morph) return NULL; + const char * beg = strstr(morph, var); + if (beg) { + char * d = dest; + for (beg += MORPH_TAG_LEN; *beg != ' ' && *beg != '\t' && + *beg != '\n' && *beg != '\0'; d++, beg++) { + *d = *beg; + } + *d = '\0'; + return dest; + } + return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * mystrrep(char * word, const char * pat, const char * rep) { char * pos = strstr(word, pat); @@ -445,6 +601,34 @@ char * mystrrep(char * word, const char * pat, const char * rep) { u16_u8(word, MAXWORDUTF8LEN, w, l); return 0; } + + int uniqlist(char ** list, int n) { + int i; + if (n < 2) return n; + for (i = 0; i < n; i++) { + for (int j = 0; j < i; j++) { + if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { + free(list[i]); + list[i] = NULL; + break; + } + } + } + int m = 1; + for (i = 1; i < n; i++) if (list[i]) { + list[m] = list[i]; + m++; + } + return m; + } + + void freelist(char *** list, int n) { + if (list && *list && n > 0) { + for (int i = 0; i < n; i++) if ((*list)[i]) free((*list)[i]); + free(*list); + *list = NULL; + } + } // convert null terminated string to all caps void mkallcap(char * p, const struct cs_info * csconv) @@ -478,8 +662,8 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { for (int i = 0; i < nc; i++) { unsigned short idx = (u[i].h << 8) + u[i].l; if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char) 
(unicodetolower(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); + u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); + u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); } } } @@ -490,6 +674,20 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; } + // conversion function for protected memory + void store_pointer(char * dest, char * source) + { + memcpy(dest, &source, sizeof(char *)); + } + + // conversion function for protected memory + char * get_stored_pointer(char * s) + { + char * p; + memcpy(&p, s, sizeof(char *)); + return p; + } + #ifndef MOZILLA_CLIENT // convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding) @@ -782,7 +980,7 @@ struct cs_info iso1_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; @@ -1042,7 +1240,7 @@ struct cs_info iso2_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; @@ -1302,7 +1500,7 @@ struct cs_info iso3_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso4_tbl[] = { @@ -1561,7 +1759,7 @@ struct cs_info iso4_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso5_tbl[] = { @@ -1820,7 +2018,7 @@ struct cs_info iso5_tbl[] = { { 0x00, 0xfc, 0xac }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xae }, -{ 0x00, 0xff, 0xaf }, +{ 0x00, 0xff, 0xaf } }; struct cs_info iso6_tbl[] = { @@ -2079,7 +2277,7 @@ struct cs_info iso6_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso7_tbl[] = { @@ -2338,7 +2536,7 @@ struct cs_info iso7_tbl[] = { { 0x00, 
0xfc, 0xbc }, { 0x00, 0xfd, 0xbe }, { 0x00, 0xfe, 0xbf }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso8_tbl[] = { @@ -2597,7 +2795,7 @@ struct cs_info iso8_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso9_tbl[] = { @@ -2856,7 +3054,7 @@ struct cs_info iso9_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0x49 }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso10_tbl[] = { @@ -3115,7 +3313,7 @@ struct cs_info iso10_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info koi8r_tbl[] = { @@ -3374,7 +3572,7 @@ struct cs_info koi8r_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff }, +{ 0x01, 0xdf, 0xff } }; struct cs_info koi8u_tbl[] = { @@ -3633,7 +3831,7 @@ struct cs_info koi8u_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff }, +{ 0x01, 0xdf, 0xff } }; struct cs_info cp1251_tbl[] = { @@ -3892,7 +4090,7 @@ struct cs_info cp1251_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xdf }, +{ 0x00, 0xff, 0xdf } }; struct cs_info iso13_tbl[] = { @@ -4151,7 +4349,7 @@ struct cs_info iso13_tbl[] = { { 0x00, 0xFC, 0xDC }, { 0x00, 0xFD, 0xDD }, { 0x00, 0xFE, 0xDE }, -{ 0x00, 0xFF, 0xFF }, +{ 0x00, 0xFF, 0xFF } }; @@ -4411,7 +4609,7 @@ struct cs_info iso14_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso15_tbl[] = { @@ -4670,7 +4868,7 @@ struct cs_info iso15_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xbe }, +{ 0x00, 0xff, 0xbe } }; struct cs_info iscii_devanagari_tbl[] = { @@ -4929,10 +5127,10 @@ struct cs_info iscii_devanagari_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 
0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; -struct enc_entry encds[] = { +static struct enc_entry encds[] = { {"ISO8859-1",iso1_tbl}, {"ISO8859-2",iso2_tbl}, {"ISO8859-3",iso3_tbl}, @@ -4949,7 +5147,7 @@ struct enc_entry encds[] = { {"ISO8859-13", iso13_tbl}, {"ISO8859-14", iso14_tbl}, {"ISO8859-15", iso15_tbl}, -{"ISCII-DEVANAGARI", iscii_devanagari_tbl}, +{"ISCII-DEVANAGARI", iscii_devanagari_tbl} }; struct cs_info * get_current_cs(const char * es) { @@ -4958,6 +5156,7 @@ struct cs_info * get_current_cs(const char * es) { for (int i = 0; i < n; i++) { if (strcmp(es,encds[i].enc_name) == 0) { ccs = encds[i].cs_table; + break; } } return ccs; @@ -5038,6 +5237,26 @@ struct cs_info * get_current_cs(const char * es) { } #endif +// primitive isalpha() replacement for tokenization +char * get_casechars(const char * enc) { + struct cs_info * csconv = get_current_cs(enc); + char expw[MAXLNLEN]; + char * p = expw; + for (int i = 0; i <= 255; i++) { + if ((csconv[i].cupper != csconv[i].clower)) { + *p = (char) i; + p++; + } + } + *p = '\0'; +#ifdef MOZILLA_CLIENT + delete csconv; +#endif + return mystrdup(expw); +} + + + struct lang_map lang2enc[] = { {"ar", "UTF-8", LANG_ar}, {"az", "UTF-8", LANG_az}, @@ -5090,6 +5309,8 @@ int get_lang_num(const char * lang) { #ifndef OPENOFFICEORG #ifndef MOZILLA_CLIENT int initialize_utf_tbl() { + utf_tbl_count++; + if (utf_tbl) return 0; utf_tbl = (unicode_info2 *) malloc(CONTSIZE * sizeof(unicode_info2)); if (utf_tbl) { int j; @@ -5110,7 +5331,11 @@ int initialize_utf_tbl() { #endif void free_utf_tbl() { - if (utf_tbl) free(utf_tbl); + if (utf_tbl_count > 0) utf_tbl_count--; + if (utf_tbl && (utf_tbl_count == 0)) { + free(utf_tbl); + utf_tbl = NULL; + } } #ifdef MOZILLA_CLIENT @@ -5133,11 +5358,11 @@ unsigned short unicodetoupper(unsigned short c, int langnum) return u_toupper(c); #else #ifdef MOZILLA_CLIENT - unsigned short ret(c); - getcaseConv()->ToUpper(c, &ret); - return ret; + PRUnichar ch2; + 
getcaseConv()->ToUpper((PRUnichar) c, &ch2); + return ch2; #else - return utf_tbl[c].cupper; + return (utf_tbl) ? utf_tbl[c].cupper : c; #endif #endif } @@ -5153,11 +5378,11 @@ unsigned short unicodetolower(unsigned short c, int langnum) return u_tolower(c); #else #ifdef MOZILLA_CLIENT - unsigned short ret(c); - getcaseConv()->ToLower(c, &ret); - return ret; + PRUnichar ch2; + getcaseConv()->ToLower((PRUnichar) c, &ch2); + return ch2; #else - return utf_tbl[c].clower; + return (utf_tbl) ? utf_tbl[c].clower : c; #endif #endif } @@ -5167,10 +5392,72 @@ int unicodeisalpha(unsigned short c) #ifdef OPENOFFICEORG return u_isalpha(c); #else - return utf_tbl[c].cletter; + return (utf_tbl) ? utf_tbl[c].cletter : 0; #endif } +/* get type of capitalization */ +int get_captype(char * word, int nl, cs_info * csconv) { + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int firstcap = 0; + if (csconv == NULL) return NOCAP; + for (char * q = word; *q != '\0'; q++) { + if (csconv[*((unsigned char *)q)].ccase) ncap++; + if (csconv[*((unsigned char *)q)].cupper == csconv[*((unsigned char *)q)].clower) nneutral++; + } + if (ncap) { + firstcap = csconv[*((unsigned char *) word)].ccase; + } + + // now finally set the captype + if (ncap == 0) { + return NOCAP; + } else if ((ncap == 1) && firstcap) { + return INITCAP; + } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { + return ALLCAP; + } else if ((ncap > 1) && firstcap) { + return HUHINITCAP; + } + return HUHCAP; +} + +int get_captype_utf8(w_char * word, int nl, int langnum) { + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int firstcap = 0; + unsigned short idx; + // don't check too long words + if (nl >= MAXWORDLEN) return 0; + // big Unicode character (non BMP area) + if (nl == -1) return NOCAP; + for (int i = 0; i < nl; i++) { + idx = (word[i].h << 8) + word[i].l; + if (idx != unicodetolower(idx, langnum)) ncap++; + 
if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; + } + if (ncap) { + idx = (word[0].h << 8) + word[0].l; + firstcap = (idx != unicodetolower(idx, langnum)); + } + + // now finally set the captype + if (ncap == 0) { + return NOCAP; + } else if ((ncap == 1) && firstcap) { + return INITCAP; + } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { + return ALLCAP; + } else if ((ncap > 1) && firstcap) { + return HUHINITCAP; + } + return HUHCAP; +} + + // strip all ignored characters in the string void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len) { @@ -5200,14 +5487,14 @@ void remove_ignored_chars(char * word, char * ignored_chars) *word = '\0'; } -int parse_string(char * line, char ** out, const char * name) +int parse_string(char * line, char ** out, int ln) { char * tp = line; char * piece; int i = 0; int np = 0; if (*out) { - HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions\n", ln); return 1; } piece = mystrsep(&tp, 0); @@ -5217,6 +5504,7 @@ int parse_string(char * line, char ** out, const char * name) case 0: { np++; break; } case 1: { *out = mystrdup(piece); + if (!*out) return 1; np++; break; } @@ -5224,19 +5512,19 @@ int parse_string(char * line, char ** out, const char * name) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { - HUNSPELL_WARNING(stderr, "error: missing %s information\n", name); + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", ln); return 1; } return 0; } -int parse_array(char * line, char ** out, - unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8) { - if (parse_string(line, out, name)) return 1; +int parse_array(char * line, char ** out, unsigned short ** out_utf16, + int * out_utf16_len, int utf8, int ln) { + if (parse_string(line, out, ln)) return 1; if (utf8) { w_char w[MAXWORDLEN]; int n = u8_u16(w, MAXWORDLEN, *out); 
diff --git a/chrome/third_party/hunspell/src/hunspell/csutil.hxx b/chrome/third_party/hunspell/src/hunspell/csutil.hxx index 7fc6732..0e6192b 100644 --- a/chrome/third_party/hunspell/src/hunspell/csutil.hxx +++ b/chrome/third_party/hunspell/src/hunspell/csutil.hxx @@ -3,10 +3,56 @@ // First some base level utility routines -typedef struct { - unsigned char l; - unsigned char h; -} w_char; +#include "w_char.hxx" + +// casing +#define NOCAP 0 +#define INITCAP 1 +#define ALLCAP 2 +#define HUHCAP 3 +#define HUHINITCAP 4 + +// default encoding and keystring +#define SPELL_ENCODING "ISO8859-1" +#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm" + +// default morphological fields +#define MORPH_STEM "st:" +#define MORPH_ALLOMORPH "al:" +#define MORPH_POS "po:" +#define MORPH_DERI_PFX "dp:" +#define MORPH_INFL_PFX "ip:" +#define MORPH_TERM_PFX "tp:" +#define MORPH_DERI_SFX "ds:" +#define MORPH_INFL_SFX "is:" +#define MORPH_TERM_SFX "ts:" +#define MORPH_SURF_PFX "sp:" +#define MORPH_FREQ "fr:" +#define MORPH_PHON "ph:" +#define MORPH_HYPH "hy:" +#define MORPH_PART "pa:" +#define MORPH_FLAG "fl:" +#define MORPH_HENTRY "_H:" +#define MORPH_TAG_LEN strlen(MORPH_STEM) + +#define MSEP_FLD ' ' +#define MSEP_REC '\n' +#define MSEP_ALT '\v' + +// default flags +#define DEFAULTFLAGS 65510 +#define FORBIDDENWORD 65510 +#define ONLYUPCASEFLAG 65511 + +// hash entry macros +#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ + get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : NULL) +// NULL-free version for warning-free OOo build +#define HENTRY_DATA2(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ + get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : "") +#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? 
strstr(HENTRY_DATA(h), p) : NULL) + +#define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h)) // convert UTF-16 characters to UTF-8 char * u16_u8(char * dest, int size, const w_char * src, int srclen); @@ -26,6 +72,9 @@ void mychomp(char * s); // duplicate string char * mystrdup(const char * s); +// strcat for limited length destination string +char * mystrcat(char * dest, const char * st, int max); + // duplicate reverse of string char * myrevstrdup(const char * s); @@ -41,16 +90,14 @@ char * mystrrep(char *, const char *, const char *); void strlinecat(char * lines, const char * s); // tokenize into lines with new line - int line_tok(const char * text, char *** lines); + int line_tok(const char * text, char *** lines, char breakchar); // tokenize into lines with new line and uniq in place - char * line_uniq(char * text); - -// change \n to c in place - char * line_join(char * text, char c); + char * line_uniq(char * text, char breakchar); + char * line_uniq_app(char ** text, char breakchar); -// leave only last {[^}]*} pattern in string - char * delete_zeros(char * morphout); +// change oldchar to newchar in place + char * tr(char * text, char oldc, char newc); // reverse word int reverseword(char *); @@ -58,6 +105,12 @@ void strlinecat(char * lines, const char * s); // reverse word int reverseword_utf(char *); +// remove duplicates + int uniqlist(char ** list, int n); + +// free character array list + void freelist(char *** list, int n); + // character encoding information struct cs_info { unsigned char ccase; @@ -101,8 +154,12 @@ struct cs_info * get_current_cs(const char * es); const char * get_default_enc(const char * lang); +// get language identifiers of language codes int get_lang_num(const char * lang); +// get characters of the given 8bit encoding with lower- and uppercase forms +char * get_casechars(const char * enc); + // convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding); @@ 
-127,15 +184,34 @@ void mkallsmall_utf(w_char * u, int nc, int langnum); // convert first nc characters of UTF-8 string to capital void mkallcap_utf(w_char * u, int nc, int langnum); +// get type of capitalization +int get_captype(char * q, int nl, cs_info *); + +// get type of capitalization (UTF-8) +int get_captype_utf8(w_char * q, int nl, int langnum); + // strip all ignored characters in the string void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len); // strip all ignored characters in the string void remove_ignored_chars(char * word, char * ignored_chars); -int parse_string(char * line, char ** out, const char * name); +int parse_string(char * line, char ** out, int ln); + +int parse_array(char * line, char ** out, unsigned short ** out_utf16, + int * out_utf16_len, int utf8, int ln); + +int fieldlen(const char * r); +char * copy_field(char * dest, const char * morph, const char * var); + +int morphcmp(const char * s, const char * t); + +int get_sfxcount(const char * morph); + +// conversion function for protected memory +void store_pointer(char * dest, char * source); -int parse_array(char * line, char ** out, - unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8); +// conversion function for protected memory +char * get_stored_pointer(char * s); #endif diff --git a/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx b/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx index 34736a6..5594582 100644 --- a/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/dictmgr.cxx @@ -135,15 +135,19 @@ char * DictMgr::mystrsep(char ** stringp, const char delim) *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); rv = (char *) malloc(nc+1); - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; + if (rv) { + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + return rv; + } } else { rv = (char *) malloc(n+1); - memcpy(rv, mp, n); - *(rv+n) = '\0'; - 
*stringp = mp + n; - return rv; + if (rv) { + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + return rv; + } } } return NULL; diff --git a/chrome/third_party/hunspell/src/hunspell/filemgr.cxx b/chrome/third_party/hunspell/src/hunspell/filemgr.cxx new file mode 100644 index 0000000..4150ce6 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/filemgr.cxx @@ -0,0 +1,54 @@ +#include "license.hunspell" +#include "license.myspell" + +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "filemgr.hxx" + +int FileMgr::fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return -1; +} + +FileMgr::FileMgr(const char * file, const char * key) { + linenum = 0; + hin = NULL; + fin = fopen(file, "r"); + if (!fin) { + // check hzipped file + char * st = (char *) malloc(strlen(file) + strlen(HZIP_EXTENSION)); + if (st) { + strcpy(st, file); + strcat(st, HZIP_EXTENSION); + hin = new Hunzip(st, key); + } + } + if (!fin && !hin) fail(MSG_OPEN, file); +} + +FileMgr::~FileMgr() +{ + if (fin) fclose(fin); + if (hin) delete hin; +} + +char * FileMgr::getline() { + const char * l; + linenum++; + if (fin) return fgets(in, BUFSIZE - 1, fin); + if (hin && (l = hin->getline())) return strcpy(in, l); + linenum--; + return NULL; +} + +int FileMgr::getlinenum() { + return linenum; +} diff --git a/chrome/third_party/hunspell/src/hunspell/filemgr.hxx b/chrome/third_party/hunspell/src/hunspell/filemgr.hxx new file mode 100644 index 0000000..fb4d52b --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/filemgr.hxx @@ -0,0 +1,21 @@ +/* file manager class - read lines of files [filename] OR [filename.hz] */ +#ifndef _FILEMGR_HXX_ +#define _FILEMGR_HXX_ +#include "hunzip.hxx" + +class FileMgr +{ +protected: + FILE * fin; + Hunzip * hin; + char in[BUFSIZE + 50]; // input buffer + int fail(const char * err, const char * par); + int linenum; 
+ +public: + FileMgr(const char * filename, const char * key = NULL); + ~FileMgr(); + char * getline(); + int getlinenum(); +}; +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx index ec6f4f3..49ea117 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.cxx @@ -22,18 +22,19 @@ using namespace std; #endif #else -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif // build a hash table from a munched word list + #ifdef HUNSPELL_CHROME_CLIENT HashMgr::HashMgr(hunspell::BDictReader* reader) { bdict_reader = reader; #else -HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) +HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle, const char * key) { #endif tablesize = 0; @@ -41,6 +42,10 @@ HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) flag_mode = FLAG_CHAR; complexprefixes = 0; utf8 = 0; + langnum = 0; + lang = NULL; + enc = NULL; + csconv = 0; ignorechars = NULL; ignorechars_utf16 = NULL; ignorechars_utf16_len = 0; @@ -48,12 +53,13 @@ HashMgr::HashMgr(FILE* dic_handle, FILE* aff_handle) aliasf = NULL; numaliasm = 0; aliasm = NULL; + forbiddenword = FORBIDDENWORD; // forbidden word signing flag #ifdef HUNSPELL_CHROME_CLIENT // No tables to load, just the AF config. 
int ec = load_config(); #else load_config(aff_handle); - int ec = load_tables(dic_handle); + int ec = load_tables(dic_handle, key); #endif if (ec) { /* error condition - what should we do here */ @@ -73,29 +79,16 @@ HashMgr::~HashMgr() // now pass through hash table freeing up everything // go through column by column of the table for (int i=0; i < tablesize; i++) { - struct hentry * pt = &tableptr[i]; + struct hentry * pt = tableptr[i]; struct hentry * nt = NULL; - if (pt) { - if (pt->astr && !aliasf) free(pt->astr); - if (pt->word) free(pt->word); -#ifdef HUNSPELL_EXPERIMENTAL - if (pt->description && !aliasm) free(pt->description); -#endif - pt = pt->next; - } while(pt) { nt = pt->next; - if (pt->astr && !aliasf) free(pt->astr); - if (pt->word) free(pt->word); -#ifdef HUNSPELL_EXPERIMENTAL - if (pt->description && !aliasm) free(pt->description); -#endif + if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr); free(pt); pt = nt; } } free(tableptr); - tableptr = NULL; } tablesize = 0; @@ -113,6 +106,15 @@ HashMgr::~HashMgr() free(aliasm); aliasm = NULL; } + +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + if (utf8) free_utf_tbl(); +#endif +#endif + + if (enc) free(enc); + if (lang) free(lang); if (ignorechars) free(ignorechars); if (ignorechars_utf16) free(ignorechars_utf16); @@ -144,7 +146,6 @@ void HashMgr::EmptyHentryCache() { #endif // lookup a root word in the hashtable - struct hentry * HashMgr::lookup(const char *word) const { #ifdef HUNSPELL_CHROME_CLIENT @@ -167,10 +168,10 @@ struct hentry * HashMgr::lookup(const char *word) const #else struct hentry * dp; if (tableptr) { - dp = &tableptr[hash(word)]; - if (dp->word == NULL) return NULL; + dp = tableptr[hash(word)]; + if (!dp) return NULL; for ( ; dp != NULL; dp = dp->next) { - if (strcmp(word,dp->word) == 0) return dp; + if (strcmp(word,&(dp->word)) == 0) return dp; } } return NULL; @@ -178,69 +179,101 @@ struct hentry * HashMgr::lookup(const char *word) const } // add 
a word to the hash table (private) - -int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc) +int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, + int al, const char * desc, bool onlyupcase) { #ifndef HUNSPELL_CHROME_CLIENT - char * st = mystrdup(word); - if (wl && !st) return 1; + bool upcasehomonym = false; + int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0; + // variable-length hash record with word and optional fields + struct hentry* hp = + (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); + if (!hp) return 1; + char * hpw = &(hp->word); + strcpy(hpw, word); if (ignorechars != NULL) { if (utf8) { - remove_ignored_chars_utf(st, ignorechars_utf16, ignorechars_utf16_len); + remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len); } else { - remove_ignored_chars(st, ignorechars); + remove_ignored_chars(hpw, ignorechars); } } if (complexprefixes) { - if (utf8) reverseword_utf(st); else reverseword(st); + if (utf8) reverseword_utf(hpw); else reverseword(hpw); } - int i = hash(st); - struct hentry * dp = &tableptr[i]; - if (dp->word == NULL) { - dp->wlen = (short) wl; - dp->alen = (short) al; - dp->word = st; - dp->astr = aff; - dp->next = NULL; - dp->next_homonym = NULL; -#ifdef HUNSPELL_EXPERIMENTAL - if (aliasm) { - dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); - } else { - dp->description = mystrdup(desc); - if (desc && !dp->description) return 1; - if (dp->description && complexprefixes) { - if (utf8) reverseword_utf(dp->description); else reverseword(dp->description); - } - } -#endif - } else { - struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry)); - if (!hp) return 1; - hp->wlen = (short) wl; - hp->alen = (short) al; - hp->word = st; - hp->astr = aff; - hp->next = NULL; - hp->next_homonym = NULL; -#ifdef HUNSPELL_EXPERIMENTAL - if (aliasm) { - hp->description = (desc) ? 
get_aliasm(atoi(desc)) : mystrdup(desc); - } else { - hp->description = mystrdup(desc); - if (desc && !hp->description) return 1; - if (dp->description && complexprefixes) { - if (utf8) reverseword_utf(hp->description); else reverseword(hp->description); + + int i = hash(hpw); + + hp->blen = (unsigned char) wbl; + hp->clen = (unsigned char) wcl; + hp->alen = (short) al; + hp->astr = aff; + hp->next = NULL; + hp->next_homonym = NULL; + + // store the description string or its pointer + if (desc) { + hp->var = H_OPT; + if (aliasm) { + hp->var += H_OPT_ALIASM; + store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); + } else { + strcpy(hpw + wbl + 1, desc); + if (complexprefixes) { + if (utf8) reverseword_utf(HENTRY_DATA(hp)); + else reverseword(HENTRY_DATA(hp)); } + } + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; + } else hp->var = 0; + + struct hentry * dp = tableptr[i]; + if (!dp) { + tableptr[i] = hp; + return 0; } -#endif while (dp->next != NULL) { - if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; + if ((!dp->next_homonym) && (strcmp(&(hp->word), &(dp->word)) == 0)) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + free(hp); + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } dp=dp->next; } - if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; - dp->next = hp; - } + if (strcmp(&(hp->word), &(dp->word)) == 0) { + // remove hidden onlyupcase homonym + if (!onlyupcase) { + if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) { + free(dp->astr); + dp->astr = hp->astr; + dp->alen = hp->alen; + free(hp); + return 0; + } else { + dp->next_homonym = hp; + } + } else { + upcasehomonym = true; + } + } + if (!upcasehomonym) { + dp->next = hp; + } else { + // remove hidden onlyupcase homonym + 
if (hp->astr) free(hp->astr); + free(hp); + } #endif // HUNSPELL_CHROME_CLIENT std::map<StringPiece, int>::iterator iter = custom_word_to_affix_id_map_.find(word); @@ -255,33 +288,134 @@ int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, c return 0; } -// add a custom dic. word to the hash table (public) -int HashMgr::put_word(const char * word, int wl, char * aff) +int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, + unsigned short * flags, int al, char * dp, int captype) { - unsigned short * flags; - int al = 0; - if (aff) { - al = decode_flags(&flags, aff); - flag_qsort(flags, 0, al); + // add inner capitalized forms to handle the following allcap forms: + // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG + // Allcaps with suffixes: CIA's -> CIA'S + if (((captype == HUHCAP) || (captype == HUHINITCAP) || + ((captype == ALLCAP) && (flags != NULL))) && + !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) { + unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1)); + if (!flags2) return 1; + if (al) memcpy(flags2, flags, al * sizeof(unsigned short)); + flags2[al] = ONLYUPCASEFLAG; + if (utf8) { + char st[BUFSIZE]; + w_char w[BUFSIZE]; + int wlen = u8_u16(w, BUFSIZE, word); + mkallsmall_utf(w, wlen, langnum); + mkallcap_utf(w, 1, langnum); + u16_u8(st, BUFSIZE, w, wlen); + return add_word(st,wbl,wcl,flags2,al+1,dp, true); + } else { + mkallsmall(word, csconv); + mkinitcap(word, csconv); + return add_word(word,wbl,wcl,flags2,al+1,dp, true); + } + } + return 0; +} + +// detect captype and modify word length for UTF-8 encoding +int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { + int len; + if (utf8) { + w_char dest_utf[BUFSIZE]; + len = u8_u16(dest_utf, BUFSIZE, word); + *captype = get_captype_utf8(dest_utf, len, langnum); } else { - flags = NULL; + len = wbl; + *captype = get_captype((char *) word, len, csconv); + } + return len; +} + +// remove word 
(personal dictionary function for standalone applications) +int HashMgr::remove(const char * word) +{ + struct hentry * dp = lookup(word); + while (dp) { + if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { + unsigned short * flags = + (unsigned short *) malloc(sizeof(short *) * (dp->alen + 1)); + if (!flags) return 1; + for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; + flags[dp->alen] = forbiddenword; + dp->astr = flags; + dp->alen++; + flag_qsort(flags, 0, dp->alen); + } + dp = dp->next_homonym; } - add_word(word, wl, flags, al, NULL); return 0; } -int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern) +/* remove forbidden flag to add a personal word to the hash */ +int HashMgr::remove_forbidden_flag(const char * word) { + struct hentry * dp = lookup(word); + if (!dp) return 1; + while (dp) { + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { + if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic. + else { + unsigned short * flags2 = + (unsigned short *) malloc(sizeof(short *) * (dp->alen - 1)); + if (!flags2) return 1; + int i, j = 0; + for (i = 0; i < dp->alen; i++) { + if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i]; + } + dp->alen--; + dp->astr = flags2; // XXX allowed forbidden words + } + } + dp = dp->next_homonym; + } + return 0; +} + +// add a custom dic. 
word to the hash table (public) +int HashMgr::add(const char * word) { - unsigned short * flags; - struct hentry * dp = lookup(pattern); - if (!dp || !dp->astr) return 1; - flags = (unsigned short *) malloc (dp->alen * sizeof(short)); - memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); - add_word(word, wl, flags, dp->alen, NULL); + unsigned short * flags = NULL; + int al = 0; + if (remove_forbidden_flag(word)) { + int captype; + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, wbl, &captype); + add_word(word, wbl, wcl, flags, al, NULL, false); + return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); + } return 0; } +int HashMgr::add_with_affix(const char * word, const char * example) +{ + // detect captype and modify word length for UTF-8 encoding + struct hentry * dp = lookup(example); + remove_forbidden_flag(word); + if (dp && dp->astr) { + int captype; + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, wbl, &captype); + if (aliasf) { + add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); + } else { + unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short)); + if (flags) { + memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); + add_word(word, wbl, wcl, flags, dp->alen, NULL, false); + } else return 1; + } + return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype); + } + return 1; +} + // walk the hash table entry by entry - null at end +// initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const { #ifdef HUNSPELL_CHROME_CLIENT @@ -312,88 +446,99 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const // lists for the extra affixes. If hp is NULL, create it here. 
if (!hp) hp = new hentry; - hp->word = word; - hp->wlen = word_len; + hp->word = *word; + hp->blen = word_len; hp->alen = (short)const_cast<HashMgr*>(this)->get_aliasf(affix_ids[0], &hp->astr); hp->next = NULL; hp->next_homonym = NULL; - + hp->var = 0; + hp->clen = 0; return hp; #else - //reset to start - if ((col < 0) || (hp == NULL)) { - col = -1; - hp = NULL; + + if (hp && hp->next != NULL) return hp->next; + for (col++; col < tablesize; col++) { + if (tableptr[col]) return tableptr[col]; } - - if (hp && hp->next != NULL) { - hp = hp->next; - } else { - col++; - hp = (col < tablesize) ? &tableptr[col] : NULL; - // search for next non-blank column entry - while (hp && (hp->word == NULL)) { - col ++; - hp = (col < tablesize) ? &tableptr[col] : NULL; - } - if (col < tablesize) return hp; - hp = NULL; - col = -1; - } - return hp; + // null at end and reset to start + col = -1; + return NULL; #endif } // load a munched word list and build a hash table on the fly -int HashMgr::load_tables(FILE* t_handle) +int HashMgr::load_tables(FILE* t_handle, const char * key) { #ifndef HUNSPELL_CHROME_CLIENT - int wl, al; + int al; char * ap; char * dp; + char * dp2; unsigned short * flags; + char * ts; - // raw dictionary - munched file - FILE * rawdict = _fdopen(_dup(_fileno(t_handle)), "r"); - if (rawdict == NULL) return 1; - fseek(rawdict, 0, SEEK_SET); + // open dictionary file + FileMgr * dict = new FileMgr(tpath, key); + if (dict == NULL) return 1; // first read the first line of file to get hash table size */ - char ts[MAXDELEN]; - if (! 
fgets(ts, MAXDELEN-1,rawdict)) return 2; + if (!(ts = dict->getline())) { + HUNSPELL_WARNING(stderr, "error: empty dic file\n"); + delete dict; + return 2; + } mychomp(ts); - + /* remove byte order mark */ - if (strncmp(ts,"\xef\xbb\xbf",3) == 0) { + if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) { memmove(ts, ts+3, strlen(ts+3)+1); HUNSPELL_WARNING(stderr, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n"); } - - if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n"); + tablesize = atoi(ts); - if (!tablesize) return 4; + if (tablesize == 0) { + HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n"); + delete dict; + return 4; + } tablesize = tablesize + 5 + USERWORD; if ((tablesize %2) == 0) tablesize++; // allocate the hash table - tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry)); - if (! tableptr) return 3; - for (int i=0; i<tablesize; i++) tableptr[i].word = NULL; + tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *)); + if (! 
tableptr) { + delete dict; + return 3; + } + for (int i=0; i<tablesize; i++) tableptr[i] = NULL; // loop through all words on much list and add to hash // table and create word and affix strings - while (fgets(ts,MAXDELEN-1,rawdict)) { + while ((ts = dict->getline())) { mychomp(ts); // split each line into word and morphological description - dp = strchr(ts,'\t'); + dp = ts; + while ((dp = strchr(dp, ':'))) { + if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) { + for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--); + if (dp < ts) { // missing word + dp = NULL; + } else { + *(dp + 1) = '\0'; + dp = dp + 2; + } + break; + } + dp++; + } - if (dp) { - *dp = '\0'; - dp++; - } else { - dp = NULL; + // tabulator is the old morphological field separator + dp2 = strchr(ts, '\t'); + if (dp2 && (!dp || dp2 < dp)) { + *dp2 = '\0'; + dp = dp2 + 1; } // split each line into word and affix char strings @@ -414,13 +559,13 @@ int HashMgr::load_tables(FILE* t_handle) *ap = '\0'; if (aliasf) { int index = atoi(ap + 1); - al = get_aliasf(index, &flags); + al = get_aliasf(index, &flags, dict); if (!al) { - HUNSPELL_WARNING(stderr, "error - bad flag vector alias: %s\n", ts); + HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum()); *ap = '\0'; } } else { - al = decode_flags(&flags, ap + 1); + al = decode_flags(&flags, ap + 1, dict); flag_qsort(flags, 0, al); } } else { @@ -429,19 +574,22 @@ int HashMgr::load_tables(FILE* t_handle) flags = NULL; } - wl = strlen(ts); - - // add the word and its index - if (add_word(ts,wl,flags,al,dp)) return 5; - + int captype; + int wbl = strlen(ts); + int wcl = get_clen_and_captype(ts, wbl, &captype); + // add the word and its index plus its capitalized form optionally + if (add_word(ts,wbl,wcl,flags,al,dp, false) || + add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) { + delete dict; + return 5; + } } - - fclose(rawdict); + + delete dict; #endif return 0; } - // the hash function 
is a simple load and rotate // algorithm borrowed @@ -466,15 +614,17 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { switch (flag_mode) { case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) len = strlen(flags); - if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: length of FLAG_LONG flagvector is odd: %s\n", flags); - len = len/2; + if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: bad flagvector\n"); + len /= 2; *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; for (int i = 0; i < len; i++) { (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; } break; } case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233) + int i; len = 1; char * src = flags; unsigned short * dest; @@ -483,23 +633,29 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { if (*p == ',') len++; } *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; dest = *result; for (p = flags; *p; p++) { if (*p == ',') { - *dest = (unsigned short) atoi(src); + i = atoi(src); + if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); + *dest = (unsigned short) i; if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); src = p + 1; dest++; } } - *dest = (unsigned short) atoi(src); + i = atoi(src); + if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); + *dest = (unsigned short) i; if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); break; } case FLAG_UNI: { // UTF-8 characters - w_char w[MAXDELEN/2]; - len = u8_u16(w, MAXDELEN/2, flags); + w_char w[BUFSIZE/2]; + len = u8_u16(w, BUFSIZE/2, flags); *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; memcpy(*result, w, len * sizeof(short)); break; } @@ -507,24 +663,28 @@ int 
HashMgr::decode_flags(unsigned short ** result, char * flags) { unsigned short * dest; len = strlen(flags); *result = (unsigned short *) malloc(len * sizeof(short)); + if (!*result) return -1; dest = *result; for (unsigned char * p = (unsigned char *) flags; *p; p++) { *dest = (unsigned short) *p; dest++; } } - } + } return len; } unsigned short HashMgr::decode_flag(const char * f) { unsigned short s = 0; + int i; switch (flag_mode) { case FLAG_LONG: s = ((unsigned short) f[0] << 8) + (unsigned short) f[1]; break; case FLAG_NUM: - s = (unsigned short) atoi(f); + i = atoi(f); + if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1); + s = (unsigned short) i; break; case FLAG_UNI: u8_u16((w_char *) &s, 1, f); @@ -532,7 +692,7 @@ unsigned short HashMgr::decode_flag(const char * f) { default: s = (unsigned short) *((unsigned char *)f); } - if (!s) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); + if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n"); return s; } @@ -569,7 +729,7 @@ int HashMgr::load_config() // diacritics characters. if (strncmp(line,"IGNORE",6) == 0) { parse_array(line, &ignorechars, &ignorechars_utf16, - &ignorechars_utf16_len, "IGNORE", utf8); + &ignorechars_utf16_len, utf8, 0); } // Retrieve the format of an AF line. 
if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { @@ -591,75 +751,101 @@ int HashMgr::load_config() } #else // read in aff file and set flag mode -int HashMgr::load_config(FILE* aff_handle) +int HashMgr::load_config(FILE* aff_handle, const char * key) { + char * line; // io buffers int firstline = 1; - - // io buffers - char line[MAXDELEN+1]; // open the affix file - FILE * afflst; - afflst = _fdopen(_dup(_fileno(aff_handle)), "r"); + FileMgr * afflst = new FileMgr(affpath, key); if (!afflst) { HUNSPELL_WARNING(stderr, "Error - could not open affix description file\n"); return 1; } - fseek(afflst, 0, SEEK_SET); // read in each line ignoring any that do not // start with a known line type indicator - while (fgets(line,MAXDELEN,afflst)) { + while ((line = afflst->getline())) { mychomp(line); /* remove byte order mark */ if (firstline) { firstline = 0; - if (strncmp(line,"\xef\xbb\xbf",3) == 0) memmove(line, line+3, strlen(line+3)+1); + if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1); } /* parse in the try string */ if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { if (flag_mode != FLAG_CHAR) { - HUNSPELL_WARNING(stderr, "error: duplicate FLAG parameter\n"); + HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum()); } if (strstr(line, "long")) flag_mode = FLAG_LONG; if (strstr(line, "num")) flag_mode = FLAG_NUM; if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; if (flag_mode == FLAG_CHAR) { - HUNSPELL_WARNING(stderr, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line); + HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum()); } } - if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1; + if (strncmp(line,"FORBIDDENWORD",13) == 0) { + char * st = NULL; + if (parse_string(line, &st, afflst->getlinenum())) { + delete afflst; + return 1; + } + 
forbiddenword = decode_flag(st); + free(st); + } + if (strncmp(line, "SET", 3) == 0) { + if (parse_string(line, &enc, afflst->getlinenum())) { + delete afflst; + return 1; + } + if (strcmp(enc, "UTF-8") == 0) { + utf8 = 1; +#ifndef OPENOFFICEORG +#ifndef MOZILLA_CLIENT + initialize_utf_tbl(); +#endif +#endif + } else csconv = get_current_cs(enc); + } + if (strncmp(line, "LANG", 4) == 0) { + if (parse_string(line, &lang, afflst->getlinenum())) { + delete afflst; + return 1; + } + langnum = get_lang_num(lang); + } /* parse in the ignored characters (for example, Arabic optional diacritics characters */ if (strncmp(line,"IGNORE",6) == 0) { - if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { - fclose(afflst); + if (parse_array(line, &ignorechars, &ignorechars_utf16, + &ignorechars_utf16_len, utf8, afflst->getlinenum())) { + delete afflst; return 1; } } if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { if (parse_aliasf(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } -#ifdef HUNSPELL_EXPERIMENTAL if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { if (parse_aliasm(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } -#endif - if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; - if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; + + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; + if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; } - fclose(afflst); + if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING); + delete afflst; return 0; } #endif // HUNSPELL_CHROME_CLIENT @@ -669,11 +855,11 @@ int HashMgr::load_config(FILE* aff_handle) int HashMgr::parse_aliasf(char* line, hunspell::LineIterator* iterator) { #else -int HashMgr::parse_aliasf(char * line, FILE * af) +int HashMgr::parse_aliasf(char * line, FileMgr * af) { #endif if (numaliasf != 0) { - 
HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -691,8 +877,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) numaliasf = 0; aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: bad entry number\n"); return 1; } aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *)); @@ -712,7 +897,6 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -721,7 +905,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) free(aliasflen); aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: missing AF table information\n"); + HUNSPELL_WARNING(stderr, "error: missing data\n"); return 1; } @@ -732,9 +916,9 @@ int HashMgr::parse_aliasf(char * line, FILE * af) if (!iterator->AdvanceAndCopy(nl, MAXDELEN)) return 1; #else - if (!fgets(nl,MAXDELEN,af)) return 1; + if (!(nl = af->getline())) return 1; #endif - mychomp(nl); + mychomp(nl); tp = nl; i = 0; aliasf[j] = NULL; @@ -750,8 +934,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) free(aliasflen); aliasf = NULL; aliasflen = NULL; - HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } break; @@ -765,7 +948,6 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (!aliasf[j]) { @@ -774,7 +956,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) aliasf = NULL; aliasflen = NULL; numaliasf = 0; - HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } } @@ -810,8 +992,8 @@ hentry* HashMgr::AffixIDsToHentry(char* word, struct hentry* he = new hentry; if (i == 0) first_he = he; - 
he->word = word; - he->wlen = word_len; + he->word = *word; + he->blen = word_len; he->alen = (short)const_cast<HashMgr*>(this)->get_aliasf(affix_ids[i], &he->astr); he->next = NULL; @@ -854,12 +1036,11 @@ int HashMgr::get_aliasf(int index, unsigned short ** fvec) { return 0; } -#ifdef HUNSPELL_EXPERIMENTAL /* parse morph alias definitions */ -int HashMgr::parse_aliasm(char * line, FILE * af) +int HashMgr::parse_aliasm(char * line, FileMgr * af) { if (numaliasm != 0) { - HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n"); + HUNSPELL_WARNING(stderr, "error: multiple table definitions\n"); return 1; } char * tp = line; @@ -874,8 +1055,7 @@ int HashMgr::parse_aliasm(char * line, FILE * af) case 1: { numaliasm = atoi(piece); if (numaliasm < 1) { - HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum()); return 1; } aliasm = (char **) malloc(numaliasm * sizeof(char *)); @@ -890,33 +1070,31 @@ int HashMgr::parse_aliasm(char * line, FILE * af) } i++; } - free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { numaliasm = 0; free(aliasm); aliasm = NULL; - HUNSPELL_WARNING(stderr, "error: missing AM alias information\n"); + HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum()); return 1; } /* now parse the numaliasm lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numaliasm; j++) { - if (!fgets(nl,MAXDELEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; aliasm[j] = NULL; - piece = mystrsep(&tp, 0); + piece = mystrsep(&tp, ' '); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"AM",2) != 0) { - HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n"); - free(piece); + HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum()); numaliasm = 0; free(aliasm); aliasm = NULL; @@ 
-925,24 +1103,34 @@ int HashMgr::parse_aliasm(char * line, FILE * af) break; } case 1: { + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); } aliasm[j] = mystrdup(piece); + if (!aliasm[j]) { + numaliasm = 0; + free(aliasm); + aliasm = NULL; + return 1; + } break; } default: break; } i++; } - free(piece); - piece = mystrsep(&tp, 0); + piece = mystrsep(&tp, ' '); } if (!aliasm[j]) { numaliasm = 0; free(aliasm); aliasm = NULL; - HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); + HUNSPELL_WARNING(stderr, "error: table is corrupt\n"); return 1; } } @@ -958,4 +1146,3 @@ char * HashMgr::get_aliasm(int index) { HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); return NULL; } -#endif diff --git a/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx b/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx index 781175e5..acfbbce 100644 --- a/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/hashmgr.hxx @@ -1,8 +1,14 @@ #ifndef _HASHMGR_HXX_ #define _HASHMGR_HXX_ +#ifndef MOZILLA_CLIENT #include <cstdio> +#else +#include <stdio.h> +#endif + #include "htypes.hxx" +#include "filemgr.hxx" #ifdef HUNSPELL_CHROME_CLIENT #include <string> @@ -23,20 +29,25 @@ class HashMgr std::map<StringPiece, int> custom_word_to_affix_id_map_; std::vector<std::string*> pointer_to_strings_; #endif - int tablesize; - struct hentry * tableptr; - int userword; - flag flag_mode; - int complexprefixes; - int utf8; - char * ignorechars; - unsigned short * ignorechars_utf16; - int ignorechars_utf16_len; - int numaliasf; // flag vector `compression' with aliases - unsigned short ** aliasf; - unsigned short * aliasflen; - int numaliasm; // morphological desciption `compression' with aliases - char ** aliasm; + int tablesize; + struct hentry ** tableptr; + int userword; + flag flag_mode; + int 
complexprefixes; + int utf8; + unsigned short forbiddenword; + int langnum; + char * enc; + char * lang; + struct cs_info * csconv; + char * ignorechars; + unsigned short * ignorechars_utf16; + int ignorechars_utf16_len; + int numaliasf; // flag vector `compression' with aliases + unsigned short ** aliasf; + unsigned short * aliasflen; + int numaliasm; // morphological desciption `compression' with aliases + char ** aliasm; public: @@ -55,7 +66,7 @@ public: // This function allows that cache to be emptied and not grow infinitely. void EmptyHentryCache(); #else - HashMgr(FILE* t_handle, FILE* a_handle); + HashMgr(FILE* t_handle, FILE* a_handle, const char * key); #endif ~HashMgr(); @@ -63,22 +74,22 @@ public: int hash(const char *) const; struct hentry * walk_hashtable(int & col, struct hentry * hp) const; - int put_word(const char * word, int wl, char * ap); - int put_word_pattern(const char * word, int wl, const char * pattern); + int add(const char * word); + int add_with_affix(const char * word, const char * pattern); + int remove(const char * word); int decode_flags(unsigned short ** result, char * flags); unsigned short decode_flag(const char * flag); char * encode_flag(unsigned short flag); int is_aliasf(); int get_aliasf(int index, unsigned short ** fvec); -#ifdef HUNSPELL_EXPERIMENTAL int is_aliasm(); char * get_aliasm(int index); -#endif - private: - int load_tables(FILE* t_handle); - int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); + int get_clen_and_captype(const char * word, int wbl, int * captype); + int load_tables(FILE* t_handle, const char * key); + int add_word(const char * word, int wbl, int wcl, unsigned short * ap, + int al, const char * desc, bool onlyupcase); #ifdef HUNSPELL_CHROME_CLIENT int load_config(); @@ -96,13 +107,14 @@ private: HEntryCache hentry_cache; #else - int load_config(FILE* aff_handle); + int load_config(FILE* aff_handle, const char * key); int parse_aliasf(char * line, FILE * af); 
#endif -#ifdef HUNSPELL_EXPERIMENTAL - int parse_aliasm(char * line, FILE * af); -#endif + int add_hidden_capitalized_word(char * word, int wbl, int wcl, + unsigned short * flags, int al, char * dp, int captype); + int parse_aliasm(char * line, FileMgr * af); + int remove_forbidden_flag(const char * word); }; diff --git a/chrome/third_party/hunspell/src/hunspell/htypes.hxx b/chrome/third_party/hunspell/src/hunspell/htypes.hxx index f8d685a..75d9542 100644 --- a/chrome/third_party/hunspell/src/hunspell/htypes.hxx +++ b/chrome/third_party/hunspell/src/hunspell/htypes.hxx @@ -15,25 +15,28 @@ #define ROTATE(v,q) \ (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); + +// hentry options +#define H_OPT (1 << 0) +#define H_OPT_ALIASM (1 << 1) +#define H_OPT_PHON (1 << 2) + +// see also csutil.hxx +#define HENTRY_WORD(h) &(h->word) + // approx. number of user defined words #define USERWORD 1000 struct hentry { - short wlen; - short alen; - /* NOTE: Removed by mbelshe since this is not used. - * The english dictionary is 63K in size, so removing this - * itty bitty field saves us ~250KB of RAM. - char wbeg[2]; - */ - char * word; - unsigned short * astr; - struct hentry * next; - struct hentry * next_homonym; -#ifdef HUNSPELL_EXPERIMENTAL - char * description; -#endif + unsigned char blen; // word length in bytes + unsigned char clen; // word length in characters (different for UTF-8 enc.) 
+ short alen; // length of affix flag vector + unsigned short * astr; // affix flag vector + struct hentry * next; // next word with same hash code + struct hentry * next_homonym; // next homonym word (with same hash code) + char var; // variable fields (only for special pronounciation yet) + char word; // variable-length word (8-bit or UTF-8 encoding) }; #endif diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.cxx b/chrome/third_party/hunspell/src/hunspell/hunspell.cxx index 42b0603..131ad50 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.cxx +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.cxx @@ -6,16 +6,17 @@ #include <cstring> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #endif #include "hunspell.hxx" #include "hunspell.h" +#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -23,27 +24,34 @@ using namespace std; #ifdef HUNSPELL_CHROME_CLIENT Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length) #else -Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle) +Hunspell::Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL) #endif { encoding = NULL; csconv = NULL; utf8 = 0; complexprefixes = 0; +#ifndef HUNSPELL_CHROME_CLIENT + affixpath = mystrdup(affpath); +#endif + maxdic = 0; #ifdef HUNSPELL_CHROME_CLIENT bdict_reader = new hunspell::BDictReader; bdict_reader->Init(bdict_data, bdict_length); - pHMgr = new HashMgr(bdict_reader); - pAMgr = new AffixMgr(bdict_reader, pHMgr); + pHMgr[0] = new HashMgr(bdict_reader); + if (pHMgr[0]) maxdic = 1; + + pAMgr = new AffixMgr(bdict_reader, pHMgr, &maxdic); #else /* first set up the hash manager */ - pHMgr = new HashMgr(dic_handle, aff_handle); + pHMgr[0] = new HashMgr(dic_handle, aff_handle, key); + if (pHMgr[0]) maxdic = 1; /* next set up the affix manager */ /* it needs access to the hash manager lookup methods */ - 
pAMgr = new AffixMgr(aff_handle, pHMgr); + pAMgr = new AffixMgr(aff_handle, pHMgr, &maxdic, key); #endif /* get the preferred try string and the dictionary */ @@ -65,10 +73,13 @@ Hunspell::~Hunspell() { if (pSMgr) delete pSMgr; if (pAMgr) delete pAMgr; - if (pHMgr) delete pHMgr; + for (int i = 0; i < maxdic; i++) delete pHMgr[i]; + maxdic = 0; pSMgr = NULL; pAMgr = NULL; - pHMgr = NULL; +#ifdef MOZILLA_CLIENT + free(csconv); +#endif csconv= NULL; if (encoding) free(encoding); encoding = NULL; @@ -76,27 +87,38 @@ Hunspell::~Hunspell() #ifdef HUNSPELL_CHROME_CLIENT if (bdict_reader) delete bdict_reader; bdict_reader = NULL; +#else + if (affixpath) free(affixpath); + affixpath = NULL; #endif } +#ifndef HUNSPELL_CHROME_CLIENT +// load extra dictionaries +int Hunspell::add_dic(const char * dpath, const char * key) { + if (maxdic == MAXDIC || !affixpath) return 1; + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); + if (pHMgr[maxdic]) maxdic++; else return 1; + return 0; +} +#endif // make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording // their presence with the abbreviation flag -// also since already going through character by character, +// also since already going through character by character, // set the capitalization type // return the length of the "cleaned" (and UTF-8 encoded) word -int Hunspell::cleanword2(char * dest, const char * src, +int Hunspell::cleanword2(char * dest, const char * src, w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) -{ +{ unsigned char * p = (unsigned char *) dest; const unsigned char * q = (const unsigned char * ) src; - int firstcap = 0; // first skip over any leading blanks while ((*q != '\0') && (*q == ' ')) q++; - + // now strip off any trailing periods (recording their presence) *pabbrev = 0; int nl = strlen((const char *)q); @@ -104,80 +126,43 @@ int Hunspell::cleanword2(char * dest, const char * src, nl--; (*pabbrev)++; } - + // if no characters 
are left it can't be capitalized - if (nl <= 0) { + if (nl <= 0) { *pcaptype = NOCAP; *p = '\0'; return 0; } - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - *nc = 0; - - if (!utf8) { - while (nl > 0) { - (*nc)++; - if (csconv[(*q)].ccase) ncap++; - if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; - *p++ = *q++; - nl--; - } - // remember to terminate the destination string - *p = '\0'; - if (ncap) { - firstcap = csconv[(unsigned char)(*dest)].ccase; - } - } else { - unsigned short idx; - *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q); + strncpy(dest, (char *) q, nl); + *(dest + nl) = '\0'; + nl = strlen(dest); + if (utf8) { + *nc = u8_u16(dest_utf, MAXWORDLEN, dest); // don't check too long words if (*nc >= MAXWORDLEN) return 0; if (*nc == -1) { // big Unicode character (non BMP area) *pcaptype = NOCAP; - strcpy((char *) p, (char *) q); - return strlen(dest); + return nl; } - *nc -= *pabbrev; - for (int i = 0; i < *nc; i++) { - idx = (dest_utf[i].h << 8) + dest_utf[i].l; - if (idx != unicodetolower(idx, langnum)) ncap++; - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; - } - u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc); - if (ncap) { - idx = (dest_utf[0].h << 8) + dest_utf[0].l; - firstcap = (idx != unicodetolower(idx, langnum)); - } - } - - // now finally set the captype - if (ncap == 0) { - *pcaptype = NOCAP; - } else if ((ncap == 1) && firstcap) { - *pcaptype = INITCAP; - } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) { - *pcaptype = ALLCAP; - } else if ((ncap > 1) && firstcap) { - *pcaptype = HUHINITCAP; + *pcaptype = get_captype_utf8(dest_utf, *nc, langnum); } else { - *pcaptype = HUHCAP; + *pcaptype = get_captype(dest, nl, csconv); + *nc = nl; } - return strlen(dest); -} + return nl; +} -int Hunspell::cleanword(char * dest, const char * src, +int Hunspell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev) -{ +{ unsigned char * 
p = (unsigned char *) dest; const unsigned char * q = (const unsigned char * ) src; int firstcap = 0; // first skip over any leading blanks while ((*q != '\0') && (*q == ' ')) q++; - + // now strip off any trailing periods (recording their presence) *pabbrev = 0; int nl = strlen((const char *)q); @@ -185,9 +170,9 @@ int Hunspell::cleanword(char * dest, const char * src, nl--; (*pabbrev)++; } - + // if no characters are left it can't be capitalized - if (nl <= 0) { + if (nl <= 0) { *pcaptype = NOCAP; *p = '\0'; return 0; @@ -215,8 +200,9 @@ int Hunspell::cleanword(char * dest, const char * src, nc = u8_u16(t, MAXWORDLEN, src); for (int i = 0; i < nc; i++) { idx = (t[i].h << 8) + t[i].l; - if (idx != unicodetolower(idx, langnum)) ncap++; - if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) ncap++; + if (unicodetoupper(idx, langnum) == low) nneutral++; } u16_u8(dest, MAXWORDUTF8LEN, t, nc); if (ncap) { @@ -238,8 +224,7 @@ int Hunspell::cleanword(char * dest, const char * src, *pcaptype = HUHCAP; } return strlen(dest); -} - +} void Hunspell::mkallcap(char * p) { @@ -256,7 +241,7 @@ void Hunspell::mkallcap(char * p) } u16_u8(p, MAXWORDUTF8LEN, u, nc); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].cupper; p++; } @@ -269,15 +254,16 @@ int Hunspell::mkallcap2(char * p, w_char * u, int nc) unsigned short idx; for (int i = 0; i < nc; i++) { idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetoupper(idx, langnum)) { - u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); + unsigned short up = unicodetoupper(idx, langnum); + if (idx != up) { + u[i].h = (unsigned char) (up >> 8); + u[i].l = (unsigned char) (up & 0x00FF); } } u16_u8(p, MAXWORDUTF8LEN, u, nc); - return strlen(p); + return strlen(p); } else { - while (*p != '\0') { + while (*p != '\0') { *p = 
csconv[((unsigned char) *p)].cupper; p++; } @@ -288,7 +274,7 @@ int Hunspell::mkallcap2(char * p, w_char * u, int nc) void Hunspell::mkallsmall(char * p) { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].clower; p++; } @@ -300,15 +286,16 @@ int Hunspell::mkallsmall2(char * p, w_char * u, int nc) unsigned short idx; for (int i = 0; i < nc; i++) { idx = (u[i].h << 8) + u[i].l; - if (idx != unicodetolower(idx, langnum)) { - u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); - u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); + unsigned short low = unicodetolower(idx, langnum); + if (idx != low) { + u[i].h = (unsigned char) (low >> 8); + u[i].l = (unsigned char) (low & 0x00FF); } } u16_u8(p, MAXWORDUTF8LEN, u, nc); return strlen(p); } else { - while (*p != '\0') { + while (*p != '\0') { *p = csconv[((unsigned char) *p)].clower; p++; } @@ -322,18 +309,18 @@ char * Hunspell::sharps_u8_l1(char * dest, char * source) { *p = *source; for (p++, source++; *(source - 1); p++, source++) { *p = *source; - if (*source == '\x9f') *--p = '\xdf'; + if (*source == '\x9F') *--p = '\xDF'; } return dest; } -// recursive search for right ss-\xdf permutations +// recursive search for right ss - sharp s permutations hentry * Hunspell::spellsharps(char * base, char * pos, int n, int repnum, char * tmp, int * info, char **root) { pos = strstr(pos, "ss"); if (pos && (n < MAXSHARPS)) { - *pos = '\xc3'; - *(pos + 1) = '\x9f'; + *pos = '\xC3'; + *(pos + 1) = '\x9F'; hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root); if (h) return h; *pos = 's'; @@ -352,31 +339,32 @@ int Hunspell::is_keepcase(const hentry * rv) { TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); } -/* check and insert a word to beginning of the suggestion array */ -int Hunspell::insert_sug(char ***slst, char * word, int *ns) { - if (spell(word)) { - if (*ns == MAXSUGGESTION) { - (*ns)--; - free((*slst)[*ns]); - } - for (int k = *ns; k > 0; k--) 
(*slst)[k] = (*slst)[k - 1]; - (*slst)[0] = mystrdup(word); - (*ns)++; +/* insert a word to the beginning of the suggestion array and return ns */ +int Hunspell::insert_sug(char ***slst, char * word, int ns) { + char * dup = mystrdup(word); + if (!dup) return ns; + if (ns == MAXSUGGESTION) { + ns--; + free((*slst)[ns]); } - return 0; + for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = dup; + return ns + 1; } int Hunspell::spell(const char * word, int * info, char ** root) { #ifdef HUNSPELL_CHROME_CLIENT - if (pHMgr) pHMgr->EmptyHentryCache(); + if (pHMgr) pHMgr[0]->EmptyHentryCache(); #endif struct hentry * rv=NULL; // need larger vector. For example, Turkish capital letter I converted a // 2-byte UTF-8 character (dotless i) by mkallsmall. - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - w_char unicw[MAXWORDLEN + 1]; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + w_char unicw[MAXWORDLEN]; + // Hunspell supports XML input of the simplified API (see manual) + if (strcmp(word, SPELL_XML) == 0) return 1; int nc = strlen(word); int wl2 = 0; if (utf8) { @@ -386,14 +374,18 @@ int Hunspell::spell(const char * word, int * info, char ** root) } int captype = 0; int abbv = 0; - int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + int wl = 0; - if (wl == 0) return 1; + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); - if (info) *info = 0; + int info2 = 0; + if (wl == 0 || maxdic == 0) return 1; if (root) *root = NULL; - // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.) + // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) 
enum { NBEGIN, NNUM, NSEP }; int nstate = NBEGIN; int i; @@ -407,173 +399,179 @@ int Hunspell::spell(const char * word, int * info, char ** root) } else break; } if ((i == wl) && (nstate == NNUM)) return 1; - - // LANG_hu section: number(s) + (percent or degree) with suffixes - if (langnum == LANG_hu) { - if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '\xb0')) - && checkword(cw + i, info, root)) return 1; - } - // END of LANG_hu section + if (!info) info = &info2; else *info = 0; switch(captype) { - case HUHCAP: - case HUHINITCAP: - case NOCAP: { - rv = checkword(cw, info, root); - if ((abbv) && !(rv)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - } - break; - } + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + rv = checkword(cw, info, root); + if ((abbv) && !(rv)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + } + break; + } case ALLCAP: { - rv = checkword(cw, info, root); - if (rv) break; - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - if (rv) break; - } - if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { - char tmpword[MAXWORDUTF8LEN]; - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); + rv = checkword(cw, info, root); + if (rv) break; + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (rv) break; + } + // Spec. prefix handling for Catalan, French, Italian: + // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia). 
+ if (pAMgr && strchr(cw, '\'')) { + wl = mkallsmall2(cw, unicw, nc); + char * apostrophe = strchr(cw, '\''); + if (utf8) { + w_char tmpword[MAXWORDLEN]; + *apostrophe = '\0'; + wl2 = u8_u16(tmpword, MAXWORDLEN, cw); + *apostrophe = '\''; + if (wl2 < nc) { + mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1); + rv = checkword(cw, info, root); + if (rv) break; + } + } else { + mkinitcap2(apostrophe + 1, unicw, nc); + rv = checkword(cw, info, root); + if (rv) break; + } + mkinitcap2(cw, unicw, nc); + rv = checkword(cw, info, root); + if (rv) break; + } + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { + char tmpword[MAXWORDUTF8LEN]; + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); + } + if ((abbv) && !(rv)) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - wl2 = mkinitcap2(cw, unicw, nc); - rv = spellsharps(cw, cw, 0, 0, tmpword, info, root); - } - if ((abbv) && !(rv)) { - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - if (!rv) { - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root); - } - } - if (rv) break; } } - case INITCAP: { - wl = mkallsmall2(cw, unicw, nc); - memcpy(wspace,cw,(wl+1)); - rv = checkword(wspace, info, root); - if (!rv || (is_keepcase(rv) && !((captype == INITCAP) && - // if CHECKSHARPS: KEEPCASE words with \xdf are allowed - // in INITCAP form, too. 
- pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "\xc3\x9f")) || - (!utf8 && strchr(wspace, '\xdf')))))) { - wl2 = mkinitcap2(cw, unicw, nc); - rv = checkword(cw, info, root); - if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL; - } - if (abbv && !rv) { - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = checkword(wspace, info, root); - if (!rv || is_keepcase(rv)) { - memcpy(wspace, cw, wl2); - *(wspace+wl2) = '.'; - *(wspace+wl2+1) = '\0'; - rv = checkword(wspace, info, root); - if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL; - } - } - break; - } + if (rv) break; + } + } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + wl2 = mkinitcap2(cw, unicw, nc); + if (captype == INITCAP) *info += SPELL_INITCAP; + rv = checkword(cw, info, root); + if (captype == INITCAP) *info -= SPELL_INITCAP; + // forbid bad capitalization + // (for example, ijs -> Ijs instead of IJs in Dutch) + // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag) + if (*info & SPELL_FORBIDDEN) { + rv = NULL; + break; + } + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; + if (rv) break; + + rv = checkword(wspace, info, root); + if (abbv && !rv) { + + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = checkword(wspace, info, root); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + if (captype == INITCAP) *info += SPELL_INITCAP; + rv = checkword(wspace, info, root); + if (captype == INITCAP) *info -= SPELL_INITCAP; + if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL; + break; + } + } + if (rv && is_keepcase(rv) && + ((captype == ALLCAP) || + // if CHECKSHARPS: KEEPCASE words with \xDF are allowed + // in INITCAP form, too. 
+ !(pAMgr->get_checksharps() && + ((utf8 && strstr(wspace, "\xC3\x9F")) || + (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL; + break; + } } - + if (rv) return 1; - // recursive breaking at break points (not good for morphological analysis) + // recursive breaking at break points if (wordbreak) { char * s; char r; - for (int j = 0; j < pAMgr->get_numbreak(); j++) { + int corr = 0; + wl = strlen(cw); + int numbreak = pAMgr ? pAMgr->get_numbreak() : 0; + // check boundary patterns (^begin and end$) + for (int j = 0; j < numbreak; j++) { + int plen = strlen(wordbreak[j]); + if (plen == 1 || plen > wl) continue; + if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0 + && spell(cw + plen - 1)) return 1; + if (wordbreak[j][plen - 1] == '$' && + strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) { + r = cw[wl - plen + 1]; + cw[wl - plen + 1] = '\0'; + if (spell(cw)) return 1; + cw[wl - plen + 1] = r; + } + } + // other patterns + for (int j = 0; j < numbreak; j++) { + int result = 0; + int plen = strlen(wordbreak[j]); s=(char *) strstr(cw, wordbreak[j]); - if (s) { + if (s && (s > cw) && (s < cw + wl - plen)) { + if (!spell(s + plen)) continue; r = *s; *s = '\0'; // examine 2 sides of the break point - if (spell(cw) && spell(s + strlen(wordbreak[j]))) { - *s = r; - return 1; - } + if (spell(cw)) return 1; *s = r; + + // LANG_hu: spec. dash rule + if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) { + r = s[1]; + s[1] = '\0'; + if (spell(cw)) return 1; // check the first part with dash + s[1] = r; + } + // end of LANG speficic region + } } } - // LANG_hu: compoundings with dashes and n-dashes XXX deprecated! 
- if (langnum == LANG_hu) { - int n; - // compound word with dash (HU) I18n - char * dash; - int result = 0; - // n-dash - dash = (char *) strstr(cw,"\xe2\x80\x93"); - if (dash && !wordbreak) { - *dash = '\0'; - // examine 2 sides of the dash - if (spell(cw) && spell(dash + 3)) { - *dash = '\xe2'; - return 1; - } - *dash = '\xe2'; - } - dash = (char *) strchr(cw,'-'); - if (dash) { - *dash='\0'; - // examine 2 sides of the dash - if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) return 1; - } else { - // first word ending with dash: word- - char r2 = *(dash + 1); - dash[0]='-'; - dash[1]='\0'; - result = spell(cw); - dash[1] = r2; - dash[0]='\0'; - if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') || - ((dash[1] > '0') && (dash[1] < '9')))) return 1; - } - // affixed number in correct word - if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) { - *dash='-'; - n = 1; - if (*(dash - n) == '.') n++; - // search first not a number character to left from dash - while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { - n++; - } - if ((dash - n) < cw) n--; - // numbers: deprecated - for(; n >= 1; n--) { - if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && - checkword(dash - n, info, root)) return 1; - } - } - } - } return 0; } -//int Hunspell::spell(const char * word) { -// return spell(word, NULL, NULL); -//} - struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) { struct hentry * he = NULL; - int len; + int len, i; char w2[MAXWORDUTF8LEN]; const char * word; @@ -600,26 +598,29 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) } // look word in hash table - if (pHMgr) he = pHMgr->lookup(word); + for (i = 0; (i < maxdic) && !he; i ++) { + he = (pHMgr[i])->lookup(word); // check forbidden and onlyincompound words if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - info += 
SPELL_FORBIDDEN; + if (info) *info += SPELL_FORBIDDEN; // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { if (pAMgr->get_compoundflag() && TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { - info += SPELL_COMPOUND; + if (info) *info += SPELL_COMPOUND; } } return NULL; } - // he = next not pseudoroot and not onlyincompound homonym or NULL + // he = next not needaffix, onlyincompound homonym or onlyupcase word while (he && (he->astr) && - ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) || - (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) + ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || + (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) )) he = he->next_homonym; + } // check with affixes if (!he && pAMgr) { @@ -627,38 +628,42 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) len = strlen(word); he = pAMgr->affix_check(word, len, 0); - // check compound restriction - if (he && he->astr && pAMgr->get_onlyincompound() && - TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL; + // check compound restriction and onlyupcase + if (he && he->astr && ( + (pAMgr->get_onlyincompound() && + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || + (info && (*info & SPELL_INITCAP) && + TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) { + he = NULL; + } if (he) { if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { - info += SPELL_FORBIDDEN; + if (info) *info += SPELL_FORBIDDEN; return NULL; } if (root) { - *root = mystrdup(he->word); - if (complexprefixes) { + *root = mystrdup(&(he->word)); + if (*root && complexprefixes) { if (utf8) reverseword_utf(*root); else reverseword(*root); } } // try check compound word } else if 
(pAMgr->get_compound()) { - he = pAMgr->compound_check(word, len, - 0,0,100,0,NULL,0,NULL,NULL,0); + he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0); // LANG_hu section: `moving rule' with last dash - if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) { + if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) { char * dup = mystrdup(word); + if (!dup) return NULL; dup[len-1] = '\0'; - he = pAMgr->compound_check(dup, len-1, - -5,0,100,0,NULL,1,NULL,NULL,0); + he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0); free(dup); } - // end of LANG speficic region + // end of LANG speficic region if (he) { if (root) { - *root = mystrdup(he->word); - if (complexprefixes) { + *root = mystrdup(&(he->word)); + if (*root && complexprefixes) { if (utf8) reverseword_utf(*root); else reverseword(*root); } } @@ -674,12 +679,18 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) int Hunspell::suggest(char*** slst, const char * word) { #ifdef HUNSPELL_CHROME_CLIENT - if (pHMgr) pHMgr->EmptyHentryCache(); + if (pHMgr) pHMgr[0]->EmptyHentryCache(); #endif - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; - w_char unicw[MAXWORDLEN + 1]; + int onlycmpdsug = 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (!pSMgr || maxdic == 0) return 0; + w_char unicw[MAXWORDLEN]; + *slst = NULL; + // process XML input of the simplified API (see manual) + if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) { + return spellml(slst, word); + } int nc = strlen(word); if (utf8) { if (nc >= MAXWORDUTF8LEN) return 0; @@ -688,49 +699,73 @@ int Hunspell::suggest(char*** slst, const char * word) } int captype = 0; int abbv = 0; - int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? 
pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + if (wl == 0) return 0; int ns = 0; - *slst = NULL; int capwords = 0; - int ngramsugs = 0; switch(captype) { - case NOCAP: { - ns = pSMgr->suggest(slst, cw, ns); + case NOCAP: { + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); break; } - case INITCAP: { + case INITCAP: { capwords = 1; - ns = pSMgr->suggest(slst, cw, ns); + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); if (ns == -1) break; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); break; } case HUHINITCAP: capwords = 1; - case HUHCAP: { - ns = pSMgr->suggest(slst, cw, ns); + case HUHCAP: { + ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug); if (ns != -1) { int prevns; + // something.The -> something. The + char * dot = strchr(cw, '.'); + if (dot && (dot > cw)) { + int captype_; + if (utf8) { + w_char w_[MAXWORDLEN]; + int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1); + captype_ = get_captype_utf8(w_, wl_, langnum); + } else captype_ = get_captype(dot+1, strlen(dot+1), csconv); + if (captype_ == INITCAP) { + char * st = mystrdup(cw); + if (st) st = (char *) realloc(st, wl + 2); + if (st) { + st[(dot - cw) + 1] = ' '; + strcpy(st + (dot - cw) + 2, dot + 1); + ns = insert_sug(slst, st, ns); + free(st); + } + } + } if (captype == HUHINITCAP) { // TheOpenOffice.org -> The OpenOffice.org memcpy(wspace,cw,(wl+1)); mkinitsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); } memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - insert_sug(slst, wspace, &ns); + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); prevns = ns; - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); if (captype == HUHINITCAP) { 
mkinitcap2(wspace, unicw, nc); - insert_sug(slst, wspace, &ns); - ns = pSMgr->suggest(slst, wspace, ns); + if (spell(wspace)) ns = insert_sug(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); } // aNew -> "a New" (instead of "a new") for (int j = prevns; j < ns; j++) { @@ -739,7 +774,7 @@ int Hunspell::suggest(char*** slst, const char * word) int slen = strlen(space + 1); // different case after space (need capitalisation) if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) { - w_char w[MAXWORDLEN + 1]; + w_char w[MAXWORDLEN]; int wc = 0; char * r = (*slst)[j]; if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1); @@ -754,31 +789,32 @@ int Hunspell::suggest(char*** slst, const char * word) break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace, cw, (wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); if (ns == -1) break; - if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns); + if (pAMgr && pAMgr->get_keepcase() && spell(wspace)) + ns = insert_sug(slst, wspace, ns); mkinitcap2(wspace, unicw, nc); - ns = pSMgr->suggest(slst, wspace, ns); + ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug); for (int j=0; j < ns; j++) { mkallcap((*slst)[j]); if (pAMgr && pAMgr->get_checksharps()) { char * pos; if (utf8) { - pos = strstr((*slst)[j], "\xc3\x9f"); + pos = strstr((*slst)[j], "\xC3\x9F"); while (pos) { *pos = 'S'; *(pos+1) = 'S'; - pos = strstr(pos+2, "\xc3\x9f"); + pos = strstr(pos+2, "\xC3\x9F"); } } else { - pos = strchr((*slst)[j], '\xdf'); + pos = strchr((*slst)[j], '\xDF'); while (pos) { (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); - mystrrep((*slst)[j], "\xdf", "SS"); - pos = strchr((*slst)[j], '\xdf'); + mystrrep((*slst)[j], "\xDF", "SS"); + pos = strchr((*slst)[j], '\xDF'); } } } @@ -807,37 +843,76 @@ int Hunspell::suggest(char*** slst, const char * word) // END OF LANG_hu section // try ngram approach since found nothing 
- if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { - ngramsugs = 1; + if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { switch(captype) { case NOCAP: { - ns = pSMgr->ngsuggest(*slst, cw, pHMgr); + ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); break; } + case HUHINITCAP: + capwords = 1; case HUHCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); - break; + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + break; } - case INITCAP: { + case INITCAP: { capwords = 1; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); break; } case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); - for (int j=0; j < ns; j++) + int oldns = ns; + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); + for (int j = oldns; j < ns; j++) mkallcap((*slst)[j]); break; } } } + // try dash suggestion (Afo-American -> Afro-American) + if (strchr(cw, '-')) { + char * pos = strchr(cw, '-'); + char * ppos = cw; + int nodashsug = 1; + char ** nlst = NULL; + int nn = 0; + int last = 0; + for (int j = 0; j < ns && nodashsug == 1; j++) { + if (strchr((*slst)[j], '-')) nodashsug = 0; + } + while (nodashsug && !last) { + if (*pos == '\0') last = 1; else *pos = '\0'; + if (!spell(ppos)) { + nn = suggest(&nlst, ppos); + for (int j = nn - 1; j >= 0; j--) { + strncpy(wspace, cw, ppos - cw); + strcpy(wspace + (ppos - cw), nlst[j]); + if (!last) { + strcat(wspace, "-"); + strcat(wspace, pos + 1); + } + ns = insert_sug(slst, wspace, ns); + free(nlst[j]); + } + if (nlst != NULL) free(nlst); + nodashsug = 0; + } + if (!last) { + *pos = '-'; + ppos = pos + 1; + pos = strchr(ppos, '-'); + } + if (!pos) pos = cw + strlen(cw); + } + } + // word reversing wrapper for complex prefixes if (complexprefixes) { for (int j = 0; j < 
ns; j++) { @@ -858,14 +933,14 @@ int Hunspell::suggest(char*** slst, const char * word) } } - // suggest keepcase - if (pAMgr->get_keepcase()) { + // remove bad capitalized and forbidden forms + if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) { switch (captype) { case INITCAP: case ALLCAP: { int l = 0; for (int j=0; j < ns; j++) { - if (!spell((*slst)[j])) { + if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) { char s[MAXSWUTF8L]; w_char w[MAXSWL]; int len; @@ -876,21 +951,21 @@ int Hunspell::suggest(char*** slst, const char * word) len = strlen(s); } mkallsmall2(s, w, len); - free((*slst)[j]); + free((*slst)[j]); if (spell(s)) { (*slst)[l] = mystrdup(s); - l++; + if ((*slst)[l]) l++; } else { mkinitcap2(s, w, len); if (spell(s)) { (*slst)[l] = mystrdup(s); - l++; + if ((*slst)[l]) l++; } } } else { (*slst)[l] = (*slst)[j]; l++; - } + } } ns = l; } @@ -909,9 +984,28 @@ int Hunspell::suggest(char*** slst, const char * word) } l++; } + + // output conversion + rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL; + for (int j = 0; rl && j < ns; j++) { + if (rl->conv((*slst)[j], wspace)) { + free((*slst)[j]); + (*slst)[j] = mystrdup(wspace); + } + } + + // if suggestions removed by nosuggest, onlyincompound parameters + if (l == 0 && *slst) { + free(*slst); + *slst = NULL; + } return l; } +void Hunspell::free_list(char *** slst, int n) { + freelist(slst, n); +} + char * Hunspell::get_dic_encoding() { return encoding; @@ -921,9 +1015,9 @@ char * Hunspell::get_dic_encoding() // XXX need UTF-8 support int Hunspell::suggest_auto(char*** slst, const char * word) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! 
pSMgr) return 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (!pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -936,15 +1030,15 @@ int Hunspell::suggest_auto(char*** slst, const char * word) if (wl == 0) return 0; int ns = 0; *slst = NULL; // HU, nsug in pSMgr->suggest - + switch(captype) { - case NOCAP: { + case NOCAP: { ns = pSMgr->suggest_auto(slst, cw, ns); if (ns>0) break; break; } - case INITCAP: { + case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_auto(slst, wspace, ns); @@ -952,10 +1046,11 @@ int Hunspell::suggest_auto(char*** slst, const char * word) mkinitcap((*slst)[j]); ns = pSMgr->suggest_auto(slst, cw, ns); break; - + } - case HUHCAP: { + case HUHINITCAP: + case HUHCAP: { ns = pSMgr->suggest_auto(slst, cw, ns); if (ns == 0) { memcpy(wspace,cw,(wl+1)); @@ -965,7 +1060,7 @@ int Hunspell::suggest_auto(char*** slst, const char * word) break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_auto(slst, wspace, ns); @@ -1011,103 +1106,89 @@ int Hunspell::suggest_auto(char*** slst, const char * word) } } } - // END OF LANG_hu section + // END OF LANG_hu section return ns; } +#endif -// XXX need UTF-8 support -int Hunspell::stem(char*** slst, const char * word) +int Hunspell::stem(char*** slst, char ** desc, int n) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! 
pSMgr) return 0; - int wl = strlen(word); - if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; - } else { - if (wl >= MAXWORDLEN) return 0; - } - int captype = 0; - int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; - - int ns = 0; - - *slst = NULL; // HU, nsug in pSMgr->suggest - - switch(captype) { - case HUHCAP: - case NOCAP: { - ns = pSMgr->suggest_stems(slst, cw, ns); - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - break; - } - - case INITCAP: { - - ns = pSMgr->suggest_stems(slst, cw, ns); - - if (ns == 0) { - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - - } - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - mkallsmall(wspace); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - break; - - } - - case ALLCAP: { - ns = pSMgr->suggest_stems(slst, cw, ns); - if (ns != 0) break; - - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - - if (ns == 0) { - mkinitcap(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - } + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + *slst = NULL; + if (n == 0) return 0; + *result2 = '\0'; + for (int i = 0; i < n; i++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[i]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - mkallsmall(wspace); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + 
while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int k = 0; k < pln; k++) { + // add derivational suffixes + if (strstr(pl[k], MORPH_DERI_SFX)) { + // remove inflectional suffixes + char * is = strstr(pl[k], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + for (int j = 0; j < genl; j++) { + sprintf(result2 + strlen(result2), "%c%s%s", + MSEP_REC, result, gen[j]); + } + freelist(&gen, genl); + } + } else { + sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); + if (strstr(pl[k], MORPH_SURF_PFX)) { + copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); + } + copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); + } + } + freelist(&pl, pln); + } + int sln = line_tok(result2, slst, MSEP_REC); + return uniqlist(*slst, sln); +} - break; - } - } - - return ns; +int Hunspell::stem(char*** slst, const char * word) +{ + char ** pl; + int pln = analyze(&pl, word); + int pln2 = stem(slst, pl, pln); + freelist(&pl, pln); + return pln2; } +#ifdef HUNSPELL_EXPERIMENTAL int Hunspell::suggest_pos_stems(char*** slst, const char * word) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (! 
pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1118,14 +1199,14 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); if (wl == 0) return 0; - + int ns = 0; // ns=0 = normalized input *slst = NULL; // HU, nsug in pSMgr->suggest - + switch(captype) { case HUHCAP: - case NOCAP: { + case NOCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); if ((abbv) && (ns == 0)) { @@ -1138,7 +1219,7 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) break; } - case INITCAP: { + case INITCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); @@ -1147,15 +1228,15 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) mkallsmall(wspace); ns = pSMgr->suggest_pos_stems(slst, wspace, ns); } - + break; - + } - case ALLCAP: { + case ALLCAP: { ns = pSMgr->suggest_pos_stems(slst, cw, ns); if (ns != 0) break; - + memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); ns = pSMgr->suggest_pos_stems(slst, wspace, ns); @@ -1225,19 +1306,21 @@ int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) return nc; } -int Hunspell::put_word(const char * word) +int Hunspell::add(const char * word) { - if (pHMgr) { - return pHMgr->put_word(word, strlen(word), NULL); - } + if (pHMgr[0]) return (pHMgr[0])->add(word); return 0; } -int Hunspell::put_word_pattern(const char * word, const char * pattern) +int Hunspell::add_with_affix(const char * word, const char * example) { - if (pHMgr) { - return pHMgr->put_word_pattern(word, strlen(word), pattern); - } + if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); + return 0; +} + +int Hunspell::remove(const char * word) +{ + if (pHMgr[0]) return (pHMgr[0])->remove(word); return 0; } @@ -1251,22 +1334,38 @@ struct cs_info * Hunspell::get_csconv() return csconv; } -#ifdef HUNSPELL_EXPERIMENTAL -// XXX need UTF-8 support -char * Hunspell::morph(const char * word) +void Hunspell::cat_result(char * result, char 
* st) { - char cw[MAXWORDUTF8LEN + 4]; - char wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; - int wl = strlen(word); + if (st) { + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); + free(st); + } +} + +int Hunspell::analyze(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + w_char unicw[MAXWORDLEN]; + int wl2 = 0; + *slst = NULL; + if (! pSMgr || maxdic == 0) return 0; + int nc = strlen(word); if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; + if (nc >= MAXWORDUTF8LEN) return 0; } else { - if (wl >= MAXWORDLEN) return 0; + if (nc >= MAXWORDLEN) return 0; } int captype = 0; int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); + int wl = 0; + + // input conversion + RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL; + if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv); + else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + if (wl == 0) { if (abbv) { for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; @@ -1277,7 +1376,7 @@ char * Hunspell::morph(const char * word) char result[MAXLNLEN]; char * st = NULL; - + *result = '\0'; int n = 0; @@ -1287,177 +1386,103 @@ char * Hunspell::morph(const char * word) // test numbers // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) { - while ((n < wl) && + while ((n < wl) && (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { n++; if ((cw[n] == '.') || (cw[n] == ',')) { - if (((n2 == 0) && (n > 3)) || + if (((n2 == 0) && (n > 3)) || ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; n2++; n3 = n; } } - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL; - if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xb0')) && checkword(cw+n, NULL, NULL))) { - strcat(result, cw); + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, 
NULL))) { + mystrcat(result, cw, MAXLNLEN); result[n - 1] = '\0'; - if (n == wl) { - st = pSMgr->suggest_morph(cw + n - 1); - if (st) { - strcat(result, st); - free(st); - } - } else { + if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + else { char sign = cw[n]; cw[n] = '\0'; - st = pSMgr->suggest_morph(cw + n - 1); - if (st) { - strcat(result, st); - free(st); - } - strcat(result, "+"); // XXX SPEC. MORPHCODE + cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE cw[n] = sign; - st = pSMgr->suggest_morph(cw + n); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(cw + n)); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } // END OF LANG_hu section - + switch(captype) { - case NOCAP: { - st = pSMgr->suggest_morph(cw); - if (st) { - strcat(result, st); - free(st); - } - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - } - break; - } - case INITCAP: { + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + } + break; + } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } - st = pSMgr->suggest_morph(cw); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - if (abbv) { - memcpy(wspace,cw,wl); + wl2 = mkinitcap2(cw, unicw, nc); + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - mkallsmall(wspace); - st = 
pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); + + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + + cat_result(result, pSMgr->suggest_morph(wspace)); } break; } - case HUHCAP: { - st = pSMgr->suggest_morph(cw); - if (st) { - strcat(result, st); - free(st); - } -#if 0 - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); + case ALLCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(cw)); } -#endif - break; - } - case ALLCAP: { + wl = mkallsmall2(cw, unicw, nc); memcpy(wspace,cw,(wl+1)); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); + wl2 = mkinitcap2(cw, unicw, nc); + + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + + cat_result(result, pSMgr->suggest_morph(wspace)); } - if (abbv) { - memcpy(wspace,cw,(wl+1)); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - if (*result) strcat(result, "\n"); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); 
- } - mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - } break; } } - if (result && (*result)) { + if (*result) { // word reversing wrapper for complex prefixes if (complexprefixes) { if (utf8) reverseword_utf(result); else reverseword(result); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); + } // compound word with dash (HU) I18n @@ -1466,24 +1491,24 @@ char * Hunspell::morph(const char * word) // LANG_hu section: set dash information for suggestions if (langnum == LANG_hu) dash = (char *) strchr(cw,'-'); if ((langnum == LANG_hu) && dash) { - *dash='\0'; + *dash='\0'; // examine 2 sides of the dash if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) return pSMgr->suggest_morph(cw); + if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC); } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. if (spell(cw) && (spell("-e"))) { st = pSMgr->suggest_morph(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - strcat(result,"+"); // XXX spec. separator in MORPHCODE + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE st = pSMgr->suggest_morph("-e"); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } else { // first word ending with dash: word- XXX ??? @@ -1495,22 +1520,22 @@ char * Hunspell::morph(const char * word) dash[0]='\0'; if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) { - st = morph(cw); + st = pSMgr->suggest_morph(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - strcat(result,"+"); // XXX spec. 
separator in MORPHCODE + mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE } - st = morph(dash+1); + st = pSMgr->suggest_morph(dash+1); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } // affixed number in correct word - if (nresult && (dash > cw) && (((*(dash-1)<='9') && + if (nresult && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)=='.'))) { *dash='-'; n = 1; @@ -1525,195 +1550,338 @@ char * Hunspell::morph(const char * word) // 56-hoz, 6-hoz for(; n >= 1; n--) { if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) { - strcat(result, cw); + mystrcat(result, cw, MAXLNLEN); result[dash - cw - n] = '\0'; st = pSMgr->suggest_morph(dash - n); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } } } - return NULL; + return 0; } +int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) +{ + *slst = NULL; + if (!pSMgr || !pln) return 0; + char **pl2; + int pl2n = analyze(&pl2, word); + int captype = 0; + int abbv = 0; + char cw[MAXWORDUTF8LEN]; + cleanword(cw, word, &captype, &abbv); + char result[MAXLNLEN]; + *result = '\0'; + + for (int i = 0; i < pln; i++) { + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); + } + freelist(&pl2, pl2n); + + if (*result) { + // allcap + if (captype == ALLCAP) mkallcap(result); + + // line split + int linenum = line_tok(result, slst, MSEP_REC); + + // capitalize + if (captype == INITCAP || captype == HUHINITCAP) { + for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); + } + + // temporary filtering of prefix related errors (eg. 
+ // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") + + int r = 0; + for (int j=0; j < linenum; j++) { + if (!spell((*slst)[j])) { + free((*slst)[j]); + (*slst)[j] = NULL; + } else { + if (r < j) (*slst)[r] = (*slst)[j]; + r++; + } + } + if (r > 0) return r; + free(*slst); + *slst = NULL; + } + return 0; +} + +int Hunspell::generate(char*** slst, const char * word, const char * pattern) +{ + char **pl; + int pln = analyze(&pl, pattern); + int n = generate(slst, word, pl, pln); + freelist(&pl, pln); + return uniqlist(*slst, n); +} + +// minimal XML parser functions +int Hunspell::get_xml_par(char * dest, const char * par, int max) +{ + char * d = dest; + if (!par) return 0; + char end = *par; + char * dmax = dest + max; + if (end == '>') end = '<'; + else if (end != '\'' && end != '"') return 0; // bad XML + for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par; + *d = '\0'; + mystrrep(dest, "&lt;", "<"); + mystrrep(dest, "&amp;", "&"); + return d - dest; +} + +// return the beginning of the element (attr == NULL) or the attribute +const char * Hunspell::get_xml_pos(const char * s, const char * attr) +{ + const char * end = strchr(s, '>'); + const char * p = s; + if (attr == NULL) return end; + do { + p = strstr(p, attr); + if (!p || p >= end) return 0; + } while (*(p-1) != ' ' && *(p-1) != '\n'); + return p + strlen(attr); +} + +int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) { + char cw[MAXWORDUTF8LEN]; + if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) && + strcmp(cw, value) == 0) return 1; + return 0; +} + +int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) { + int n = 0; + char * p; + if (!list) return 0; + for (p = list; (p = strstr(p, tag)); p++) n++; + if (n == 0) return 0; + *slst = (char **) malloc(sizeof(char *) * n); + if (!*slst) return 0; + for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) { + int l = strlen(p); + (*slst)[n] = (char *) malloc(l); 
+ if (!(*slst)[n]) return (n > 0 ? n - 1 : 0); + get_xml_par((*slst)[n], p + strlen(tag) - 1, l); + } + return n; +} + +int Hunspell::spellml(char*** slst, const char * word) +{ + char *q, *q2; + char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN]; + q = (char *) strstr(word, "<query"); + if (!q) return 0; // bad XML input + q2 = strchr(q, '>'); + if (!q2) return 0; // bad XML input + q2 = strstr(q2, "<word"); + if (!q2) return 0; // bad XML input + if (check_xml_par(q, "type=", "analyze")) { + int n = 0, s = 0; + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) n = analyze(slst, cw); + if (n == 0) return 0; + // convert the result to <code><a>ana1</a><a>ana2</a></code> format + for (int i = 0; i < n; i++) s+= strlen((*slst)[i]); + char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&amp; + if (!r) return 0; + strcpy(r, "<code>"); + for (int i = 0; i < n; i++) { + int l = strlen(r); + strcpy(r + l, "<a>"); + strcpy(r + l + 3, (*slst)[i]); + mystrrep(r + l + 3, "\t", " "); + mystrrep(r + l + 3, "<", "&lt;"); + mystrrep(r + l + 3, "&", "&amp;"); + strcat(r, "</a>"); + free((*slst)[i]); + } + strcat(r, "</code>"); + (*slst)[0] = r; + return 1; + } else if (check_xml_par(q, "type=", "stem")) { + if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN)) return stem(slst, cw); + } else if (check_xml_par(q, "type=", "generate")) { + int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN); + if (n == 0) return 0; + char * q3 = strstr(q2 + 1, "<word"); + if (q3) { + if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN)) { + return generate(slst, cw, cw2); + } + } else { + char ** slst2; + if ((q2 = strstr(q2 + 1, "<code")) && + (n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) { + int n2 = generate(slst, cw, slst2, n); + freelist(&slst2, n); + return uniqlist(*slst, n2); + } + } + } + return 0; +} + + +#ifdef HUNSPELL_EXPERIMENTAL // XXX need UTF-8 support char * Hunspell::morph_with_correction(const char * word) { - char cw[MAXWORDUTF8LEN + 4]; - char 
wspace[MAXWORDUTF8LEN + 4]; - if (! pSMgr) return 0; + char cw[MAXWORDUTF8LEN]; + char wspace[MAXWORDUTF8LEN]; + if (! pSMgr || maxdic == 0) return NULL; int wl = strlen(word); if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; + if (wl >= MAXWORDUTF8LEN) return NULL; } else { - if (wl >= MAXWORDLEN) return 0; + if (wl >= MAXWORDLEN) return NULL; } int captype = 0; int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; + if (wl == 0) return NULL; char result[MAXLNLEN]; char * st = NULL; - + *result = '\0'; - - + + switch(captype) { - case NOCAP: { + case NOCAP: { st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } } break; } - case INITCAP: { + case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - } - st = pSMgr->suggest_morph_for_spelling_error(cw); + } + st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); - } + } mkinitcap(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if 
(st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); - } + } } break; } - case HUHCAP: { + case HUHCAP: { st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); - } + } break; } - case ALLCAP: { + case ALLCAP: { memcpy(wspace,cw,(wl+1)); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); - } + } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - if (abbv) { + if (abbv) { memcpy(wspace,cw,(wl+1)); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - if (*result) strcat(result, "\n"); + if (*result) mystrcat(result, "\n", MAXLNLEN); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); - free(st); - } + mystrcat(result, st, MAXLNLEN); + free(st); + } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - mkinitcap(wspace); - st = 
pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); + if (*result) mystrcat(result, "\n", MAXLNLEN); + mystrcat(result, st, MAXLNLEN); free(st); } - } + } break; } } - if (result) return mystrdup(result); + if (*result) return mystrdup(result); return NULL; } -/* analyze word - * return line count - * XXX need a better data structure for morphological analysis */ -int Hunspell::analyze(char ***out, const char *word) { - int n = 0; - if (!word) return 0; - char * m = morph(word); - if(!m) return 0; - if (!out) return line_tok(m, out); - - // without memory allocation - /* BUG missing buffer size checking */ - int i, p; - for(p = 0, i = 0; m[i]; i++) { - if(m[i] == '\n' || !m[i+1]) { - n++; - strncpy((*out)[n++], m + p, i - p + 1); - if (m[i] == '\n') (*out)[n++][i - p] = '\0'; - if(!m[i+1]) break; - p = i + 1; - } - } - free(m); - return n; -} - #endif // END OF HUNSPELL_EXPERIMENTAL CODE Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) @@ -1725,6 +1893,17 @@ Hunhandle *Hunspell_create(FILE* aff_handle, FILE* dic_handle) #endif } + +Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key) +{ +#ifdef HUNSPELL_CHROME_CLIENT + return NULL; +#else + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); +#endif +} + void Hunspell_destroy(Hunhandle *pHunspell) { delete (Hunspell*)(pHunspell); @@ -1745,3 +1924,57 @@ int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) return ((Hunspell*)pHunspell)->suggest(slst, word); } +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->analyze(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->stem(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int 
n) +{ + return ((Hunspell*)pHunspell)->stem(slst, desc, n); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, word2); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); +} + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +int Hunspell_add(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->add(word); +} + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, + const char * example) { + return ((Hunspell*)pHunspell)->add_with_affix(word, example); +} + + /* remove word from the run-time dictionary */ + +int Hunspell_remove(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->remove(word); +} + +void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n) { + freelist(slst, n); +} diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.h b/chrome/third_party/hunspell/src/hunspell/hunspell.h index b04b83a..f926052 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.h +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.h @@ -7,15 +7,25 @@ extern "C" { typedef struct Hunhandle Hunhandle; -Hunhandle *Hunspell_create(const char * affpath, const char * dpath); -void Hunspell_destroy(Hunhandle *pHunspell); +#ifdef _MSC_VER +#define DLL __declspec ( dllexport ) +#else +#define DLL +#endif + +DLL Hunhandle *Hunspell_create(const char * affpath, const char * dpath); + +DLL Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key); + +DLL void Hunspell_destroy(Hunhandle *pHunspell); /* spell(word) 
- spellcheck word * output: 0 = bad word, not 0 = good word */ -int Hunspell_spell(Hunhandle *pHunspell, const char *); +DLL int Hunspell_spell(Hunhandle *pHunspell, const char *); -char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); +DLL char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); /* suggest(suggestions, word) - search suggestions * input: pointer to an array of strings pointer and the (bad) word @@ -24,7 +34,63 @@ char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); * a newly allocated array of strings (*slts will be NULL when number * of suggestion equals 0.) */ -int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); +DLL int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); + + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + +DLL int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word); + + /* stem(result, word) - stemmer function */ + +DLL int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); + + /* stem(result, analysis, n) - get stems from a morph. analysis + * example: + * char ** result, result2; + * int n1 = Hunspell_analyze(result, "words"); + * int n2 = Hunspell_stem2(result2, result, n1); + */ + +DLL int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + +DLL int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2); + + /* generate(result, word, desc, n) - generation by morph. 
description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = Hunspell_generate2(result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + +DLL int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +DLL int Hunspell_add(Hunhandle *pHunspell, const char * word); + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +DLL int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, const char * example); + + /* remove word from the run-time dictionary */ + +DLL int Hunspell_remove(Hunhandle *pHunspell, const char * word); + + /* free suggestion lists */ + +DLL void Hunspell_free_list(Hunhandle *pHunspell, char *** slst, int n); #ifdef __cplusplus } diff --git a/chrome/third_party/hunspell/src/hunspell/hunspell.hxx b/chrome/third_party/hunspell/src/hunspell/hunspell.hxx index bc6f7d5..bb26b5b 100644 --- a/chrome/third_party/hunspell/src/hunspell/hunspell.hxx +++ b/chrome/third_party/hunspell/src/hunspell/hunspell.hxx @@ -1,30 +1,20 @@ -#include "license.hunspell" -#include "license.myspell" - #include "hashmgr.hxx" #include "affixmgr.hxx" #include "suggestmgr.hxx" -#include "csutil.hxx" #include "langnum.hxx" #define SPELL_COMPOUND (1 << 0) #define SPELL_FORBIDDEN (1 << 1) +#define SPELL_ALLCAP (1 << 2) +#define SPELL_NOCAP (1 << 3) +#define SPELL_INITCAP (1 << 4) -#define NOCAP 0 -#define INITCAP 1 -#define ALLCAP 2 -#define HUHCAP 3 -#define HUHINITCAP 4 +#define SPELL_XML "<?xml?>" +#define MAXDIC 20 #define MAXSUGGESTION 15 #define MAXSHARPS 5 -#if defined(W32) && defined(LIBRARY) -#define DLLTEST2_API __declspec(dllexport) -#else -#define DLLTEST2_API -#endif - #ifndef 
_MYSPELLMGR_HXX_ #define _MYSPELLMGR_HXX_ @@ -32,15 +22,27 @@ #include "chrome/third_party/hunspell/google/bdict_reader.h" #endif -#ifdef W32 -class DLLTEST2_API Hunspell +#ifdef HUNSPELL_STATIC + #define DLLEXPORT +#else + #ifdef HUNSPELL_EXPORTS + #define DLLEXPORT __declspec( dllexport ) + #else + #define DLLEXPORT __declspec( dllimport ) + #endif +#endif + +#ifdef WIN32 +class DLLEXPORT Hunspell #else class Hunspell #endif { AffixMgr* pAMgr; - HashMgr* pHMgr; + HashMgr* pHMgr[MAXDIC]; + int maxdic; SuggestMgr* pSMgr; + char * affixpath; char * encoding; struct cs_info * csconv; int langnum; @@ -61,11 +63,16 @@ public: #ifdef HUNSPELL_CHROME_CLIENT Hunspell(const unsigned char* bdict_data, size_t bdict_length); #else - Hunspell(FILE* aff_handle, FILE* dic_handle); + Hunspell(FILE* aff_handle, FILE* dic_handle, const char * key = NULL); #endif ~Hunspell(); +#ifndef HUNSPELL_CHROME_CLIENT + /* load extra dictionaries (only dic files) */ + int add_dic(const char * dpath, const char * key = NULL); +#endif + /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word * @@ -87,17 +94,62 @@ public: */ int suggest(char*** slst, const char * word); + + /* deallocate suggestion lists */ + + void free_list(char *** slst, int n); + char * get_dic_encoding(); - /* handling custom dictionary */ + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + + int analyze(char*** slst, const char * word); + + /* stem(result, word) - stemmer function */ - int put_word(const char * word); + int stem(char*** slst, const char * word); + + /* stem(result, analysis, n) - get stems from a morph. 
analysis + * example: + * char ** result, result2; + * int n1 = analyze(&result, "words"); + * int n2 = stem(&result2, result, n1); + */ + + int stem(char*** slst, char ** morph, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + + int generate(char*** slst, const char * word, const char * word2); - /* pattern is a sample dictionary word - * put word into custom dictionary with affix flags of pattern word + /* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = generate(&result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + + int generate(char*** slst, const char * word, char ** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + + int add(const char * word); + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. 
*/ - int put_word_pattern(const char * word, const char * pattern); + int add_with_affix(const char * word, const char * example); + + /* remove word from the run-time dictionary */ + + int remove(const char * word); /* other */ @@ -107,25 +159,14 @@ public: struct cs_info * get_csconv(); const char * get_version(); - - /* experimental functions */ + + /* experimental and deprecated functions */ #ifdef HUNSPELL_EXPERIMENTAL - /* suffix is an affix flag string, similarly in dictionary files */ - + /* suffix is an affix flag string, similarly in dictionary files */ int put_word_suffix(const char * word, const char * suffix); - - /* morphological analysis */ - - char * morph(const char * word); - int analyze(char*** out, const char *word); - char * morph_with_correction(const char * word); - /* stemmer function */ - - int stem(char*** slst, const char * word); - /* spec. suggestions */ int suggest_auto(char*** slst, const char * word); int suggest_pos_stems(char*** slst, const char * word); @@ -146,8 +187,14 @@ private: char * sharps_u8_l1(char * dest, char * source); hentry * spellsharps(char * base, char *, int, int, char * tmp, int * info, char **root); int is_keepcase(const hentry * rv); - int insert_sug(char ***slst, char * word, int *ns); - + int insert_sug(char ***slst, char * word, int ns); + void cat_result(char * result, char * st); + char * stem_description(const char * desc); + int spellml(char*** slst, const char * word); + int get_xml_par(char * dest, const char * par, int maxl); + const char * get_xml_pos(const char * s, const char * attr); + int get_xml_list(char ***slst, char * list, const char * tag); + int check_xml_par(const char * q, const char * attr, const char * value); }; diff --git a/chrome/third_party/hunspell/src/hunspell/hunzip.cxx b/chrome/third_party/hunspell/src/hunspell/hunzip.cxx new file mode 100644 index 0000000..f9091b8 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/hunzip.cxx @@ -0,0 +1,196 @@ +#ifndef MOZILLA_CLIENT 
+#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "hunzip.hxx" + +#define CODELEN 65536 +#define BASEBITREC 5000 + +#define UNCOMPRESSED '\002' +#define MAGIC "hz0" +#define MAGIC_ENCRYPT "hz1" +#define MAGICLEN (sizeof(MAGIC) - 1) + +int Hunzip::fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return -1; +} + +Hunzip::Hunzip(const char * file, const char * key) { + bufsiz = 0; + lastbit = 0; + inc = 0; + outc = 0; + dec = NULL; + filename = (char *) malloc(strlen(file) + 1); + if (filename) strcpy(filename, file); + if (getcode(key) == -1) bufsiz = -1; + else bufsiz = getbuf(); +} + +int Hunzip::getcode(const char * key) { + unsigned char c[2]; + int i, j, n, p; + int allocatedbit = BASEBITREC; + const char * enc = key; + + fin = fopen(filename, "rb"); + if (!fin) return -1; + + // read magic number + if ((fread(in, 1, 3, fin) < MAGICLEN) + || !(strncmp(MAGIC, in, MAGICLEN) == 0 || + strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) { + return fail(MSG_FORMAT, filename); + } + + // check encryption + if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) { + unsigned char cs; + if (!key) return fail(MSG_KEY, filename); + if (fread(&c, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); + for (cs = 0; *enc; enc++) cs ^= *enc; + if (cs != c[0]) return fail(MSG_KEY, filename); + enc = key; + } else key = NULL; + + // read record count + if (fread(&c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); + + if (key) { + c[0] ^= *enc; + if (*(++enc) == '\0') enc = key; + c[1] ^= *enc; + } + + n = ((int) c[0] << 8) + c[1]; + dec = (struct bit *) malloc(BASEBITREC * sizeof(struct bit)); + if (!dec) return fail(MSG_MEMORY, filename); + dec[0].v[0] = 0; + dec[0].v[1] = 0; + + // read codes + for (i = 0; i < n; i++) { + unsigned char l; + if (fread(c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') enc = key; + c[0] ^= *enc; + if 
(*(++enc) == '\0') enc = key; + c[1] ^= *enc; + } + if (fread(&l, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') enc = key; + l ^= *enc; + } + if (fread(in, 1, l/8+1, fin) < (size_t) l/8+1) return fail(MSG_FORMAT, filename); + if (key) for (j = 0; j <= l/8; j++) { + if (*(++enc) == '\0') enc = key; + in[j] ^= *enc; + } + p = 0; + for (j = 0; j < l; j++) { + int b = (in[j/8] & (1 << (7 - (j % 8)))) ? 1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + lastbit++; + if (lastbit == allocatedbit) { + allocatedbit += BASEBITREC; + dec = (struct bit *) realloc(dec, allocatedbit * sizeof(struct bit)); + } + dec[lastbit].v[0] = 0; + dec[lastbit].v[1] = 0; + dec[oldp].v[b] = lastbit; + p = lastbit; + } + } + dec[p].c[0] = c[0]; + dec[p].c[1] = c[1]; + } + return 0; +} + +Hunzip::~Hunzip() +{ + if (dec) free(dec); + if (fin) fclose(fin); + if (filename) free(filename); +} + +int Hunzip::getbuf() { + int p = 0; + int o = 0; + do { + if (inc == 0) inbits = fread(in, 1, BUFSIZE, fin) * 8; + for (; inc < inbits; inc++) { + int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 
1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + if (oldp == lastbit) { + fclose(fin); + fin = NULL; + // add last odd byte + if (dec[lastbit].c[0]) out[o++] = dec[lastbit].c[1]; + return o; + } + out[o++] = dec[oldp].c[0]; + out[o++] = dec[oldp].c[1]; + if (o == BUFSIZE) return o; + p = dec[p].v[b]; + } + } + inc = 0; + } while (inbits == BUFSIZE * 8); + return fail(MSG_FORMAT, filename); +} + +const char * Hunzip::getline() { + char linebuf[BUFSIZE]; + int l = 0, eol = 0, left = 0, right = 0; + if (bufsiz == -1) return NULL; + while (l < bufsiz && !eol) { + linebuf[l++] = out[outc]; + switch (out[outc]) { + case '\t': break; + case 31: { // escape + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + linebuf[l - 1] = out[outc]; + break; + } + case ' ': break; + default: if (((unsigned char) out[outc]) < 47) { + if (out[outc] > 32) { + right = out[outc] - 31; + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + } + if (out[outc] == 30) left = 9; else left = out[outc]; + linebuf[l-1] = '\n'; + eol = 1; + } + } + if (++outc == bufsiz) { + outc = 0; + bufsiz = fin ? 
/* hunzip: file decompression for sorted dictionaries with optional encryption,
 * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */

#ifndef _HUNZIP_HXX_
#define _HUNZIP_HXX_

#include <stdio.h>   // FILE, NULL: the header must not depend on its includer

#define BUFSIZE  65536
#define HZIP_EXTENSION ".hz"

// printf-style messages; %s receives the file name
#define MSG_OPEN   "error: %s: cannot open\n"
#define MSG_FORMAT "error: %s: not in hzip format\n"
#define MSG_MEMORY "error: %s: missing memory\n"
#define MSG_KEY    "error: %s: missing or bad password\n"

// One node of the Huffman decoding table.
struct bit {
    unsigned char c[2];   // the two decoded bytes of a complete code
    int v[2];             // index of the next node for input bit 0 / 1
                          // (0 = no such node: a code ends here)
};

// Line-oriented reader for hzip-compressed (optionally encrypted) files.
class Hunzip
{

protected:
    char * filename;          // heap copy, freed in the destructor
    FILE * fin;               // input stream; NULL after the end of data
    int bufsiz;               // number of valid bytes in `out`; -1 = no more data
    int lastbit;              // index of the last node in `dec`
    int inc;                  // index of the next unread bit in `in`
    int inbits;               // number of valid bits in `in`
    int outc;                 // index of the next unread byte in `out`
    struct bit * dec;         // code table
    char in[BUFSIZE];         // input buffer
    char out[BUFSIZE + 1];    // Huffman-decoded buffer
    char line[BUFSIZE + 50];  // decoded line
    int getcode(const char * key);
    int getbuf();
    int fail(const char * err, const char * par);

public:
    Hunzip(const char * filename, const char * key = NULL);
    ~Hunzip();
    // Returns the next decoded line (stored in `line`, overwritten by the
    // next call) or NULL when no more data is available.
    const char * getline();

private:
    // The object owns a FILE* and malloc'ed memory that the destructor
    // releases; a member-wise copy would close/free them twice.
    // Declared but not defined, so copying fails at compile or link time.
    Hunzip(const Hunzip &);
    Hunzip & operator=(const Hunzip &);
};

#endif
 * Portions created by the Initial Developers are Copyright (C) 2002-2005 * the Initial Developers. All Rights Reserved. * @@ -24,22 +24,22 @@ * Giuseppe Modugno * Gianluca Turconi * Simon Brouwer - * Noll János - * Bíró Árpád - * Goldman Eleonóra - * Sarlós Tamás - * Bencsáth Boldizsár - * Halácsy Péter - * Dvornik László - * Gefferth András + * Noll Janos + * Biro Arpad + * Goldman Eleonora + * Sarlos Tamas + * Bencsath Boldizsar + * Halacsy Peter + * Dvornik Laszlo + * Gefferth Andras * Nagy Viktor - * Varga Dániel + * Varga Daniel * Chris Halls * Rene Engelhard * Bram Moolenaar * Dafydd Jones - * Harri Pitkänen - * András Tímár + * Harri Pitkanen + * Andras Timar * Tor Lillqvist * * Alternatively, the contents of this file may be used under the terms of diff --git a/chrome/third_party/hunspell/src/hunspell/phonet.cxx b/chrome/third_party/hunspell/src/hunspell/phonet.cxx new file mode 100644 index 0000000..ca20796 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/phonet.cxx @@ -0,0 +1,299 @@ +/* phonetic.c - generic replacement algorithms for phonetic transformation + Copyright (C) 2000 Bjoern Jacke + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation; + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; If not, see + <http://www.gnu.org/licenses/>. 
+ + Changelog: + + 2000-01-05 Bjoern Jacke <bjoern at j3e.de> + Initial Release insprired by the article about phonetic + transformations out of c't 25/1999 + + 2007-07-26 Bjoern Jacke <bjoern at j3e.de> + Released under MPL/GPL/LGPL tri-license for Hunspell + + 2007-08-23 Laszlo Nemeth <nemeth at OOo> + Porting from Aspell to Hunspell using C-like structs +*/ + +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#include <cctype> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> +#endif + +#include "csutil.hxx" +#include "phonet.hxx" + +void init_phonet_hash(phonetable & parms) + { + int i, k; + + for (i = 0; i < HASHSIZE; i++) { + parms.hash[i] = -1; + } + + for (i = 0; parms.rules[i][0] != '\0'; i += 2) { + /** set hash value **/ + k = (unsigned char) parms.rules[i][0]; + + if (parms.hash[k] < 0) { + parms.hash[k] = i; + } + } + } + + // like strcpy but safe if the strings overlap + // but only if dest < src + static inline void strmove(char * dest, char * src) { + while (*src) + *dest++ = *src++; + *dest = '\0'; + } + +int myisalpha(char ch) { + if ((unsigned char) ch < 128) return isalpha(ch); + return 1; +} + +/* phonetic transcription algorithm */ +/* see: http://aspell.net/man-html/Phonetic-Code.html */ +/* convert string to uppercase before this call */ +int phonet (const char * inword, char * target, + int len, + phonetable & parms) + { + /** Do phonetic transformation. **/ + /** "len" = length of "inword" incl. '\0'. 
**/ + + /** result: >= 0: length of "target" **/ + /** otherwise: error **/ + + int i,j,k=0,n,p,z; + int k0,n0,p0=-333,z0; + char c, c0; + const char * s; + typedef unsigned char uchar; + char word[MAXPHONETUTF8LEN + 1]; + if (len == -1) len = strlen(inword); + if (len > MAXPHONETUTF8LEN) return 0; + strcpy(word, inword); + + /** check word **/ + i = j = z = 0; + while ((c = word[i]) != '\0') { + n = parms.hash[(uchar) c]; + z0 = 0; + + if (n >= 0) { + /** check all rules for the same letter **/ + while (parms.rules[n][0] == c) { + + /** check whole string **/ + k = 1; /** number of found letters **/ + p = 5; /** default priority **/ + s = parms.rules[n]; + s++; /** important for (see below) "*(s-1)" **/ + + while (*s != '\0' && word[i+k] == *s + && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) { + k++; + s++; + } + if (*s == '(') { + /** check letters in "(..)" **/ + if (myisalpha(word[i+k]) // ...could be implied? + && strchr(s+1, word[i+k]) != NULL) { + k++; + while (*s != ')') + s++; + s++; + } + } + p0 = (int) *s; + k0 = k; + while (*s == '-' && k > 1) { + k--; + s++; + } + if (*s == '<') + s++; + if (isdigit ((unsigned char) *s)) { + /** determine priority **/ + p = *s - '0'; + s++; + } + if (*s == '^' && *(s+1) == '^') + s++; + + if (*s == '\0' + || (*s == '^' + && (i == 0 || ! myisalpha(word[i-1])) + && (*(s+1) != '$' + || (! myisalpha(word[i+k0]) ))) + || (*s == '$' && i > 0 + && myisalpha(word[i-1]) + && (! myisalpha(word[i+k0]) ))) + { + /** search for followup rules, if: **/ + /** parms.followup and k > 1 and NO '-' in searchstring **/ + c0 = word[i+k-1]; + n0 = parms.hash[(uchar) c0]; + +// if (parms.followup && k > 1 && n0 >= 0 + if (k > 1 && n0 >= 0 + && p0 != (int) '-' && word[i+k] != '\0') { + /** test follow-up rule for "word[i+k]" **/ + while (parms.rules[n0][0] == c0) { + + /** check whole string **/ + k0 = k; + p0 = 5; + s = parms.rules[n0]; + s++; + while (*s != '\0' && word[i+k0] == *s + && ! 
isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) { + k0++; + s++; + } + if (*s == '(') { + /** check letters **/ + if (myisalpha(word[i+k0]) + && strchr (s+1, word[i+k0]) != NULL) { + k0++; + while (*s != ')' && *s != '\0') + s++; + if (*s == ')') + s++; + } + } + while (*s == '-') { + /** "k0" gets NOT reduced **/ + /** because "if (k0 == k)" **/ + s++; + } + if (*s == '<') + s++; + if (isdigit ((unsigned char) *s)) { + p0 = *s - '0'; + s++; + } + + if (*s == '\0' + /** *s == '^' cuts **/ + || (*s == '$' && ! myisalpha(word[i+k0]))) + { + if (k0 == k) { + /** this is just a piece of the string **/ + n0 += 2; + continue; + } + + if (p0 < p) { + /** priority too low **/ + n0 += 2; + continue; + } + /** rule fits; stop search **/ + break; + } + n0 += 2; + } /** End of "while (parms.rules[n0][0] == c0)" **/ + + if (p0 >= p && parms.rules[n0][0] == c0) { + n += 2; + continue; + } + } /** end of follow-up stuff **/ + + /** replace string **/ + s = parms.rules[n+1]; + p0 = (parms.rules[n][0] != '\0' + && strchr (parms.rules[n]+1,'<') != NULL) ? 
1:0; + if (p0 == 1 && z == 0) { + /** rule with '<' is used **/ + if (j > 0 && *s != '\0' + && (target[j-1] == c || target[j-1] == *s)) { + j--; + } + z0 = 1; + z = 1; + k0 = 0; + while (*s != '\0' && word[i+k0] != '\0') { + word[i+k0] = *s; + k0++; + s++; + } + if (k > k0) + strmove (&word[0]+i+k0, &word[0]+i+k); + + /** new "actual letter" **/ + c = word[i]; + } + else { /** no '<' rule used **/ + i += k - 1; + z = 0; + while (*s != '\0' + && *(s+1) != '\0' && j < len) { + if (j == 0 || target[j-1] != *s) { + target[j] = *s; + j++; + } + s++; + } + /** new "actual letter" **/ + c = *s; + if (parms.rules[n][0] != '\0' + && strstr (parms.rules[n]+1, "^^") != NULL) { + if (c != '\0') { + target[j] = c; + j++; + } + strmove (&word[0], &word[0]+i+1); + i = 0; + z0 = 1; + } + } + break; + } /** end of follow-up stuff **/ + n += 2; + } /** end of while (parms.rules[n][0] == c) **/ + } /** end of if (n >= 0) **/ + if (z0 == 0) { +// if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' +// && (!parms.collapse_result || j == 0 || target[j-1] != c)){ + if (k && !p0 && j < len && c != '\0' + && (1 || j == 0 || target[j-1] != c)){ + /** condense only double letters **/ + target[j] = c; + ///printf("\n setting \n"); + j++; + } + + i++; + z = 0; + k=0; + } + } /** end of while ((c = word[i]) != '\0') **/ + + target[j] = '\0'; + return (j); + + } /** end of function "phonet" **/ diff --git a/chrome/third_party/hunspell/src/hunspell/phonet.hxx b/chrome/third_party/hunspell/src/hunspell/phonet.hxx new file mode 100644 index 0000000..d1cf995 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/phonet.hxx @@ -0,0 +1,50 @@ +/* phonetic.c - generic replacement aglogithms for phonetic transformation + Copyright (C) 2000 Bjoern Jacke + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation; + + This library is distributed in the hope that it 
will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; If not, see + <http://www.gnu.org/licenses/>. + + Changelog: + + 2000-01-05 Bjoern Jacke <bjoern at j3e.de> + Initial Release insprired by the article about phonetic + transformations out of c't 25/1999 + + 2007-07-26 Bjoern Jacke <bjoern at j3e.de> + Released under MPL/GPL/LGPL tri-license for Hunspell + + 2007-08-23 Laszlo Nemeth <nemeth at OOo> + Porting from Aspell to Hunspell using C-like structs +*/ + +#ifndef __PHONETHXX__ +#define __PHONETHXX__ + +#define HASHSIZE 256 +#define MAXPHONETLEN 256 +#define MAXPHONETUTF8LEN (MAXPHONETLEN * 4) + +struct phonetable { + char utf8; + cs_info * lang; + int num; + char * * rules; + int hash[HASHSIZE]; +}; + +void init_phonet_hash(phonetable & parms); + +int phonet (const char * inword, char * target, + int len, phonetable & phone); + +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/replist.cxx b/chrome/third_party/hunspell/src/hunspell/replist.cxx new file mode 100644 index 0000000..7846470 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/replist.cxx @@ -0,0 +1,95 @@ +#include "license.hunspell" +#include "license.myspell" + +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "replist.hxx" +#include "csutil.hxx" + +RepList::RepList(int n) { + dat = (replentry **) malloc(sizeof(replentry *) * n); + if (dat == 0) size = 0; else size = n; + pos = 0; +} + +RepList::~RepList() +{ + for (int i = 0; i < pos; i++) { + free(dat[i]->pattern); + free(dat[i]->pattern2); + free(dat[i]); + } + free(dat); +} + +int RepList::get_pos() { + return pos; +} + +replentry * RepList::item(int n) { + 
return dat[n]; +} + +int RepList::near(const char * word) { + int p1 = 0; + int p2 = pos; + while ((p2 - p1) > 1) { + int m = (p1 + p2) / 2; +// fprintf(stderr, "m: %d p1: %d p2: %d dat: %s\n", m, p1, p2, dat[m]->pattern); + int c = strcmp(word, dat[m]->pattern); + if (c <= 0) { + if (c < 0) p2 = m; else p1 = p2 = m; + } else p1 = m; + } +// fprintf(stderr, "NEAR: %s (word: %s)\n", dat[p1]->pattern, word); + return p1; +} + +int RepList::match(const char * word, int n) { + if (strncmp(word, dat[n]->pattern, strlen(dat[n]->pattern)) == 0) return strlen(dat[n]->pattern); + return 0; +} + +int RepList::add(char * pat1, char * pat2) { + if (pos >= size || pat1 == NULL || pat2 == NULL) return 1; + replentry * r = (replentry *) malloc(sizeof(replentry)); + if (r == NULL) return 1; + r->pattern = mystrrep(pat1, "_", " "); + r->pattern2 = mystrrep(pat2, "_", " "); + dat[pos++] = r; + for (int i = pos - 1; i > 0; i--) { + r = dat[i]; + if (strcmp(r->pattern, dat[i - 1]->pattern) < 0) { + dat[i] = dat[i - 1]; + dat[i - 1] = r; + } else break; + } + return 0; +} + +int RepList::conv(const char * word, char * dest) { + int stl = 0; + int change = 0; +// for (int i = 0; i < pos; i++) fprintf(stderr, "%d. 
%s\n", i, dat[i]->pattern); + for (int i = 0; i < strlen(word); i++) { + int n = near(word + i); + int l = match(word + i, n); + if (l) { + strcpy(dest + stl, dat[n]->pattern2); + stl += strlen(dat[n]->pattern2); + i += l - 1; + change = 1; + } else dest[stl++] = word[i]; + } + dest[stl] = '\0'; +// fprintf(stderr, "i: %s o: %s change: %d\n", word, dest, change); + return change; +} diff --git a/chrome/third_party/hunspell/src/hunspell/replist.hxx b/chrome/third_party/hunspell/src/hunspell/replist.hxx new file mode 100644 index 0000000..16da313 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/replist.hxx @@ -0,0 +1,29 @@ +/* string replacement list class */ +#ifndef _REPLIST_HXX_ +#define _REPLIST_HXX_ +#ifdef HUNSPELL_CHROME_CLIENT +// Compilation issues in spellchecker.cc think near is a macro, therefore +// removing it here solves that problem. +#undef near +#endif +#include "w_char.hxx" + +class RepList +{ +protected: + replentry ** dat; + int size; + int pos; + +public: + RepList(int n); + ~RepList(); + + int get_pos(); + int add(char * pat1, char * pat2); + replentry * item(int n); + int near(const char * word); + int match(const char * word, int n); + int conv(const char * word, char * dest); +}; +#endif diff --git a/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx b/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx index 222701b..5914b6a 100644 --- a/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx +++ b/chrome/third_party/hunspell/src/hunspell/suggestmgr.cxx @@ -14,13 +14,16 @@ #endif #include "suggestmgr.hxx" +#include "htypes.hxx" +#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif +const w_char W_VLINE = { '\0', '|' }; SuggestMgr::SuggestMgr(const char * tryme, int maxn, AffixMgr * aptr) @@ -30,36 +33,54 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, // try when building candidate suggestions pAMgr = aptr; + ckeyl = 0; + ckey = NULL; + ckey_utf = NULL; + 
ctryl = 0; ctry = NULL; ctry_utf = NULL; + utf8 = 0; + langnum = 0; + complexprefixes = 0; + maxSug = maxn; nosplitsugs = 0; maxngramsugs = MAXNGRAMSUGS; - utf8 = 0; - complexprefixes = 0; - if (pAMgr) { char * enc = pAMgr->get_encoding(); csconv = get_current_cs(enc); free(enc); + langnum = pAMgr->get_langnum(); + ckey = pAMgr->get_key_string(); nosplitsugs = pAMgr->get_nosplitsugs(); if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs(); utf8 = pAMgr->get_utf8(); complexprefixes = pAMgr->get_complexprefixes(); } - if (tryme) { + if (ckey) { if (utf8) { w_char t[MAXSWL]; + ckeyl = u8_u16(t, MAXSWL, ckey); + ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char)); + if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char)); + } else { + ckeyl = strlen(ckey); + } + } + + if (tryme) { + ctry = mystrdup(tryme); + if (ctry) ctryl = strlen(ctry); + if (ctry && utf8) { + w_char t[MAXSWL]; ctryl = u8_u16(t, MAXSWL, tryme); ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char)); - memcpy(ctry_utf, t, ctryl * sizeof(w_char)); - } else { - ctry = mystrdup(tryme); - ctryl = strlen(ctry); + if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char)); + else ctryl = 0; } } } @@ -68,6 +89,11 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, SuggestMgr::~SuggestMgr() { pAMgr = NULL; + if (ckey) free(ckey); + ckey = NULL; + if (ckey_utf) free(ckey_utf); + ckey_utf = NULL; + ckeyl = 0; if (ctry) free(ctry); ctry = NULL; if (ctry_utf) free(ctry_utf); @@ -77,7 +103,7 @@ SuggestMgr::~SuggestMgr() } int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, - int * timer, time_t * timelimit) { + int * timer, clock_t * timelimit) { int cwrd = 1; if (ns == maxSug) return maxSug; for (int k=0; k < ns; k++) { @@ -96,13 +122,15 @@ int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int // generate suggestions for a mispelled word // pass in address of array of char * pointers +// onlycompoundsug: probably bad 
suggestions (need for ngram sugs, too) -int SuggestMgr::suggest(char*** slst, const char * w, int nsug) +int SuggestMgr::suggest(char*** slst, const char * w, int nsug, + int * onlycompoundsug) { - int nocompoundtwowords = 0; - char ** wlst; - w_char word_utf[MAXSWL]; - int wl = 0; + int nocompoundtwowords = 0; + char ** wlst; + w_char word_utf[MAXSWL]; + int wl = 0; char w2[MAXWORDUTF8LEN]; const char * word = w; @@ -141,8 +169,8 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) nsug = replchars(wlst, word, nsug, cpdsuggest); // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) { - nsug = mapchars(wlst, word, nsug); + if ((nsug < maxSug) && (nsug > -1)) { + nsug = mapchars(wlst, word, nsug, cpdsuggest); } // did we swap the order of chars by mistake @@ -157,6 +185,22 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) longswapchar(wlst, word, nsug, cpdsuggest); } + // did we just hit the wrong key in place of a good char (case and keyboard) + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + badcharkey(wlst, word, nsug, cpdsuggest); + } + + // only suggest compound words when no other suggestion + if ((cpdsuggest == 0) && (nsug > 0)) nocompoundtwowords=1; + + // did we add a char that should not be there + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + extrachar(wlst, word, nsug, cpdsuggest); + } + + // did we forgot a char if ((nsug < maxSug) && (nsug > -1)) { nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : @@ -169,12 +213,6 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) movechar(wlst, word, nsug, cpdsuggest); } - // did we add a char that should not be there - if ((nsug < maxSug) && (nsug > -1)) { - nsug = (utf8) ? 
extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : - extrachar(wlst, word, nsug, cpdsuggest); - } - // did we just hit the wrong key in place of a good char if ((nsug < maxSug) && (nsug > -1)) { nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : @@ -187,10 +225,6 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) doubletwochars(wlst, word, nsug, cpdsuggest); } - - // only suggest compound words when no other suggestion - if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; - // perhaps we forgot to hit space and two words ran together if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) { nsug = twowords(wlst, word, nsug, cpdsuggest); @@ -205,6 +239,8 @@ int SuggestMgr::suggest(char*** slst, const char * w, int nsug) free(wlst); wlst = NULL; } + + if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1; *slst = wlst; return nsug; @@ -242,8 +278,8 @@ int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) nsug = replchars(wlst, word, nsug, cpdsuggest); // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) - nsug = mapchars(wlst, word, nsug); + if ((nsug < maxSug) && (nsug > -1)) + nsug = mapchars(wlst, word, nsug, cpdsuggest); if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; @@ -273,7 +309,7 @@ int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, char candidate[MAXSWUTF8L]; w_char candidate_utf[MAXSWL]; memcpy(candidate_utf, word, wl * sizeof(w_char)); - mkallcap_utf(candidate_utf, wl, pAMgr->get_langnum()); + mkallcap_utf(candidate_utf, wl, langnum); u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); } @@ -288,9 +324,9 @@ int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest) } // suggestions for when chose the wrong char out of a related set -int SuggestMgr::mapchars(char** wlst, const char * 
word, int ns) +int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) { - time_t timelimit; + clock_t timelimit; int timer; int wl = strlen(word); @@ -300,18 +336,19 @@ int SuggestMgr::mapchars(char** wlst, const char * word, int ns) struct mapentry* maptable = pAMgr->get_maptable(); if (maptable==NULL) return ns; - timelimit = time(NULL); + timelimit = clock(); timer = MINTIMER; if (utf8) { w_char w[MAXSWL]; int len = u8_u16(w, MAXSWL, word); - ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit); - } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit); + ns = map_related_utf(w, len, 0, cpdsuggest, wlst, ns, maptable, nummap, &timer, &timelimit); + } else ns = map_related(word, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit); return ns; } -int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, - const mapentry* maptable, int nummap, int * timer, time_t * timelimit) +int SuggestMgr::map_related(const char * word, int i, char** wlst, + int cpdsuggest, int ns, + const mapentry* maptable, int nummap, int * timer, clock_t * timelimit) { char c = *(word + i); if (c == 0) { @@ -319,8 +356,7 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, int wl = strlen(word); for (int m=0; m < ns; m++) if (strcmp(word,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && (checkword(word, wl, 0, timer, timelimit) || - checkword(word, wl, 1, timer, timelimit))) { + if ((cwrd) && checkword(word, wl, cpdsuggest, timer, timelimit)) { if (ns < maxSug) { wlst[ns] = mystrdup(word); if (wlst[ns] == NULL) return -1; @@ -334,23 +370,27 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, if (strchr(maptable[j].set,c) != 0) { in_map = 1; char * newword = mystrdup(word); + if (!newword) return -1; for (int k = 0; k < maptable[j].len; k++) { *(newword + i) = *(maptable[j].set + k); - ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, 
timer, timelimit); - if (!(*timelimit)) return ns; + ns = map_related(newword, (i+1), wlst, cpdsuggest, + ns, maptable, nummap, timer, timelimit); + if (!(*timer)) return ns; } free(newword); } } if (!in_map) { i++; - ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit); + ns = map_related(word, i, wlst, cpdsuggest, + ns, maptable, nummap, timer, timelimit); } return ns; } -int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns, - const mapentry* maptable, int nummap, int * timer, time_t * timelimit) +int SuggestMgr::map_related_utf(w_char * word, int len, int i, int cpdsuggest, + char** wlst, int ns, const mapentry* maptable, int nummap, + int * timer, clock_t * timelimit) { if (i == len) { int cwrd = 1; @@ -360,8 +400,7 @@ int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int wl = strlen(s); for (int m=0; m < ns; m++) if (strcmp(s,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && (checkword(s, wl, 0, timer, timelimit) || - checkword(s, wl, 1, timer, timelimit))) { + if ((cwrd) && checkword(s, wl, cpdsuggest, timer, timelimit)) { if (ns < maxSug) { wlst[ns] = mystrdup(s); if (wlst[ns] == NULL) return -1; @@ -377,15 +416,17 @@ int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int in_map = 1; for (int k = 0; k < maptable[j].len; k++) { *(word + i) = *(maptable[j].set_utf16 + k); - ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit); - if (!(*timelimit)) return ns; + ns = map_related_utf(word, len, i + 1, cpdsuggest, + wlst, ns, maptable, nummap, timer, timelimit); + if (!(*timer)) return ns; } *((unsigned short *) word + i) = c; } } if (!in_map) { i++; - ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit); + ns = map_related_utf(word, len, i, cpdsuggest, + wlst, ns, maptable, nummap, timer, timelimit); } return ns; } @@ -416,6 +457,23 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest 
strcpy(candidate+(r-word)+lenr, r+lenp); ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; + // check REP suggestions with space + char * sp = strchr(candidate, ' '); + if (sp) { + *sp = '\0'; + if (checkword(candidate, strlen(candidate), 0, NULL, NULL)) { + int oldns = ns; + *sp = ' '; + ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + if (oldns < ns) { + free(wlst[ns - 1]); + wlst[ns - 1] = mystrdup(candidate); + if (!wlst[ns - 1]) return -1; + } + } + *sp = ' '; + } r++; // search for the next letter } } @@ -454,7 +512,7 @@ int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in int state=0; if (wl < 5 || ! pAMgr) return ns; for (int i=2; i < wl; i++) { - if ((word[i].l==word[i-2].l) && (word[i].h==word[i-2].h)) { + if (w_char_eq(word[i], word[i-2])) { state++; if (state==3) { memcpy(candidate_utf, word, (i - 1) * sizeof(w_char)); @@ -471,25 +529,108 @@ int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, in return ns; } +// error is wrong char in place of correct one (case and keyboard related version) +int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest) +{ + char tmpc; + char candidate[MAXSWUTF8L]; + int wl = strlen(word); + strcpy(candidate, word); + // swap out each char one by one and try uppercase and neighbor + // keyboard chars in its place to see if that makes a good word + + for (int i=0; i < wl; i++) { + tmpc = candidate[i]; + // check with uppercase letters + candidate[i] = csconv[((unsigned char)tmpc)].cupper; + if (tmpc != candidate[i]) { + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + candidate[i] = tmpc; + } + // check neighbor characters in keyboard string + if (!ckey) continue; + char * loc = strchr(ckey, tmpc); + while (loc) { + if ((loc > ckey) && (*(loc - 1) != '|')) { + candidate[i] = *(loc - 1); + ns = testsug(wlst, 
candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) { + candidate[i] = *(loc + 1); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + loc = strchr(loc + 1, tmpc); + } + candidate[i] = tmpc; + } + return ns; +} + +// error is wrong char in place of correct one (case and keyboard related version) +int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) +{ + w_char tmpc; + w_char candidate_utf[MAXSWL]; + char candidate[MAXSWUTF8L]; + memcpy(candidate_utf, word, wl * sizeof(w_char)); + // swap out each char one by one and try all the tryme + // chars in its place to see if that makes a good word + for (int i=0; i < wl; i++) { + tmpc = candidate_utf[i]; + // check with uppercase letters + mkallcap_utf(candidate_utf + i, 1, langnum); + if (!w_char_eq(tmpc, candidate_utf[i])) { + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + candidate_utf[i] = tmpc; + } + // check neighbor characters in keyboard string + if (!ckey) continue; + w_char * loc = ckey_utf; + while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++; + while (loc < (ckey_utf + ckeyl)) { + if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) { + candidate_utf[i] = *(loc - 1); + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) { + candidate_utf[i] = *(loc + 1); + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)); + } + candidate_utf[i] = tmpc; + } + return ns; 
+} + // error is wrong char in place of correct one int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char tmpc; char candidate[MAXSWUTF8L]; - time_t timelimit = time(NULL); + clock_t timelimit = clock(); int timer = MINTIMER; int wl = strlen(word); strcpy(candidate, word); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int i=0; i < wl; i++) { - tmpc = candidate[i]; - for (int j=0; j < ctryl; j++) { + for (int j=0; j < ctryl; j++) { + for (int i=wl-1; i >= 0; i--) { + tmpc = candidate[i]; if (ctry[j] == tmpc) continue; candidate[i] = ctry[j]; ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; + if (!timer) return ns; candidate[i] = tmpc; } } @@ -502,20 +643,20 @@ int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i w_char tmpc; w_char candidate_utf[MAXSWL]; char candidate[MAXSWUTF8L]; - time_t timelimit = time(NULL); + clock_t timelimit = clock(); int timer = MINTIMER; memcpy(candidate_utf, word, wl * sizeof(w_char)); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word - for (int i=0; i < wl; i++) { - tmpc = candidate_utf[i]; - for (int j=0; j < ctryl; j++) { - if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue; + for (int j=0; j < ctryl; j++) { + for (int i=wl-1; i >= 0; i--) { + tmpc = candidate_utf[i]; + if (w_char_eq(tmpc, ctry_utf[j])) continue; candidate_utf[i] = ctry_utf[j]; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; + if (!timer) return ns; candidate_utf[i] = tmpc; } } @@ -525,18 +666,20 @@ int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, i // error is word has an extra letter it does not need int 
SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest) { - char candidate[MAXSWUTF8L]; + char candidate[MAXSWUTF8L]; w_char candidate_utf[MAXSWL]; - const w_char * p; - w_char * r; + w_char * p; + w_char tmpc = W_VLINE; // not used value, only for VCC warning message if (wl < 2) return ns; // try omitting one char of word at a time - memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char)); - for (p = word, r = candidate_utf; p < word + wl; ) { - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); + memcpy(candidate_utf, word, wl * sizeof(w_char)); + for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) { + w_char tmpc2 = *p; + if (p < candidate_utf + wl - 1) *p = tmpc; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - *r++ = *p++; + tmpc = tmpc2; } return ns; } @@ -544,47 +687,41 @@ int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, // error is word has an extra letter it does not need int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest) { + char tmpc = '\0'; char candidate[MAXSWUTF8L]; - const char * p; - char * r; + char * p; int wl = strlen(word); if (wl < 2) return ns; // try omitting one char of word at a time - strcpy (candidate, word + 1); - for (p = word, r = candidate; *p != 0; ) { + strcpy (candidate, word); + for (p = candidate + wl - 1; p >=candidate; p--) { + char tmpc2 = *p; + *p = tmpc; ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; - *r++ = *p++; + tmpc = tmpc2; } return ns; } - // error is missing a letter it needs int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char candidate[MAXSWUTF8L]; - const char * p; - char * q; - time_t timelimit = time(NULL); + char * p; + clock_t timelimit = clock(); int timer = MINTIMER; int wl = strlen(word); - // try inserting a 
tryme character before every letter - strcpy(candidate + 1, word); - for (p = word, q = candidate; *p != 0; ) { - for (int i = 0; i < ctryl; i++) { - *q = ctry[i]; + // try inserting a tryme character before every letter (and the null terminator) + for (int i = 0; i < ctryl; i++) { + strcpy(candidate, word); + for (p = candidate + wl; p >= candidate; p--) { + *(p+1) = *p; + *p = ctry[i]; ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; + if (!timer) return ns; } - *q++ = *p++; - } - // now try adding one to end */ - for (int i = 0; i < ctryl; i++) { - *q = ctry[i]; - ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; } return ns; } @@ -594,31 +731,20 @@ int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns { w_char candidate_utf[MAXSWL]; char candidate[MAXSWUTF8L]; - const w_char * p; - w_char * q; - int cwrd; - time_t timelimit = time(NULL); + w_char * p; + clock_t timelimit = clock(); int timer = MINTIMER; - // try inserting a tryme character before every letter - memcpy (candidate_utf + 1, word, wl * sizeof(w_char)); - for (p = word, q = candidate_utf; p < (word + wl); ) { - for (int i = 0; i < ctryl; i++) { - *q = ctry_utf[i]; - cwrd = 1; + // try inserting a tryme character at the end of the word and before every letter + for (int i = 0; i < ctryl; i++) { + memcpy (candidate_utf, word, wl * sizeof(w_char)); + for (p = candidate_utf + wl; p >= candidate_utf; p--) { + *(p + 1) = *p; + *p = ctry_utf[i]; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit); if (ns == -1) return -1; - if (!timelimit) return ns; - } - *q++ = *p++; - } - // now try adding one to end */ - for (int i = 0; i < ctryl; i++) { - *q = ctry_utf[i]; - cwrd = 1; - u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); - ns = testsug(wlst, candidate, 
strlen(candidate), ns, cpdsuggest, NULL, NULL); - if (ns == -1) return -1; + if (!timer) return ns; + } } return ns; } @@ -636,19 +762,19 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest int wl=strlen(word); if (wl < 3) return ns; - if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl); + if (langnum == LANG_hu) forbidden = check_forbidden(word, wl); strcpy(candidate + 1, word); - // split the string into two pieces after every char // if both pieces are good words make them a suggestion for (p = candidate + 1; p[1] != '\0'; p++) { p[-1] = *p; // go to end of the UTF-8 character while (utf8 && ((p[1] & 0xc0) == 0x80)) { + *p = p[1]; p++; - p[-1] = *p; } + if (utf8 && p[1] == '\0') break; // last UTF-8 character *p = '\0'; c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL); if (c1) { @@ -657,7 +783,7 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest *p = ' '; // spec. Hungarian code (need a better compound word support) - if ((pAMgr->get_langnum() == LANG_hu) && !forbidden && + if ((langnum == LANG_hu) && !forbidden && // if 3 repeating letter, use - instead of space (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || // or multiple compounding, with more, than 6 syllables @@ -673,6 +799,23 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest ns++; } } else return ns; + // add two word suggestion with dash, if TRY string contains + // "a" or "-" + // NOTE: cwrd doesn't modified for REP twoword sugg. 
+ if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) && + mystrlen(p + 1) > 1 && + mystrlen(candidate) - mystrlen(p) > 1) { + *p = '-'; + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if (ns < maxSug) { + if (cwrd) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } + } else return ns; + } } } } @@ -698,6 +841,24 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest p[1] = *p; *p = tmpc; } + // try double swaps for short words + // ahev -> have, owudl -> would + if (wl == 4 || wl == 5) { + candidate[0] = word[1]; + candidate[1] = word[0]; + candidate[2] = word[2]; + candidate[wl - 2] = word[wl - 1]; + candidate[wl - 1] = word[wl - 2]; + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + if (wl == 5) { + candidate[0] = word[0]; + candidate[1] = word[2]; + candidate[2] = word[1]; + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + } return ns; } @@ -708,6 +869,7 @@ int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, char candidate[MAXSWUTF8L]; w_char * p; w_char tmpc; + int len = 0; // try swapping adjacent chars one by one memcpy (candidate_utf, word, wl * sizeof(w_char)); for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) { @@ -715,11 +877,32 @@ int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, *p = p[1]; p[1] = tmpc; u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + if (len == 0) len = strlen(candidate); + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; p[1] = *p; *p = tmpc; } + // try double swaps for short words + // ahev -> have, owudl -> would, suodn -> sound + if (wl == 4 || wl == 5) { + candidate_utf[0] = word[1]; + candidate_utf[1] = word[0]; + candidate_utf[2] = word[2]; + candidate_utf[wl - 2] = word[wl - 
1]; + candidate_utf[wl - 1] = word[wl - 2]; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + if (wl == 5) { + candidate_utf[0] = word[0]; + candidate_utf[1] = word[2]; + candidate_utf[2] = word[1]; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL); + if (ns == -1) return -1; + } + } return ns; } @@ -794,7 +977,7 @@ int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest *(q-1) = *q; *q = tmpc; if ((q-p) < 2) continue; // omit swap char - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } strcpy(candidate, word); @@ -805,7 +988,7 @@ int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest *(q+1) = *q; *q = tmpc; if ((p-q) < 2) continue; // omit swap char - ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } strcpy(candidate, word); @@ -830,7 +1013,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, *q = tmpc; if ((q-p) < 2) continue; // omit swap char u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } memcpy (candidate_utf, word, wl * sizeof(w_char)); @@ -842,7 +1025,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, *q = tmpc; if ((p-q) < 2) continue; // omit swap char u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); - ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL); + ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL); if (ns == -1) return -1; } 
memcpy (candidate_utf, word, wl * sizeof(w_char)); @@ -851,28 +1034,33 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, } // generate a set of suggestions for very poorly spelled words -int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) +int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md) { int i, j; int lval; - int sc; - int lp; + int sc, scphon; + int lp, lpphon; int nonbmp = 0; - if (!pHMgr) return 0; - // exhaustively search through all root words // keeping track of the MAX_ROOTS most similar root words struct hentry * roots[MAX_ROOTS]; + char * rootsphon[MAX_ROOTS]; int scores[MAX_ROOTS]; + int scoresphon[MAX_ROOTS]; for (i = 0; i < MAX_ROOTS; i++) { roots[i] = NULL; scores[i] = -100 * i; + rootsphon[i] = NULL; + scoresphon[i] = -100 * i; } lp = MAX_ROOTS - 1; - + lpphon = MAX_ROOTS - 1; + scphon = scoresphon[MAX_ROOTS-1]; + char w2[MAXWORDUTF8LEN]; + char f[MAXSWUTF8L]; char * word = w; // word reversing wrapper for complex prefixes @@ -896,8 +1084,8 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) struct hentry* hp = NULL; int col = -1; - - #ifdef HUNSPELL_CHROME_CLIENT + +#ifdef HUNSPELL_CHROME_CLIENT // A static array of hentries required for walking the hash table. struct hentry static_hentry[MAX_ROOTS]; @@ -906,31 +1094,61 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) static const int kMaxWordLen = 128; char hentry_word[MAX_ROOTS][kMaxWordLen]; unsigned short hentry_astr[MAX_ROOTS]; - #endif +#endif - while ((hp = pHMgr->walk_hashtable(col, hp))) { + phonetable * ph = (pAMgr) ? 
pAMgr->get_phonetable() : NULL; + char target[MAXSWUTF8L]; + char candidate[MAXSWUTF8L]; + if (ph) { + strcpy(candidate, word); + mkallcap(candidate, csconv); + phonet(candidate, target, n, *ph); + } + + for (i = 0; i < md; i++) { + while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) || + TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) || TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue; - sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE); + + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) + + leftcommonsubstring(word, HENTRY_WORD(hp)); + + // check special pronounciation + if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { + int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) + + leftcommonsubstring(word, f); + if (sc2 > sc) sc = sc2; + } + + if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) { + char target2[MAXSWUTF8L]; + strcpy(candidate, HENTRY_WORD(hp)); + mkallcap(candidate, csconv); + phonet(candidate, target2, -1, *ph); + scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); + } + if (sc > scores[lp]) { scores[lp] = sc; - #ifdef HUNSPELL_CHROME_CLIENT +#ifdef HUNSPELL_CHROME_CLIENT roots[lp] = &static_hentry[lp]; roots[lp]->alen = hp->alen; if (hp->astr) hentry_astr[lp] = *hp->astr; roots[lp]->astr = &hentry_astr[lp]; - roots[lp]->wlen = hp->wlen; - strcpy(&hentry_word[lp][0], hp->word); - roots[lp]->word = &hentry_word[lp][0]; + roots[lp]->blen = hp->blen; + strcpy(&hentry_word[lp][0], &hp->word); + roots[lp]->word = hentry_word[lp][0]; roots[lp]->next = NULL; roots[lp]->next_homonym = NULL; - #else + roots[lp]->var = 0; + roots[lp]->clen = 0; +#else roots[lp] = hp; - #endif - +#endif lval = sc; for (j=0; j < MAX_ROOTS; j++) if (scores[j] < lval) { @@ -938,7 +1156,18 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, 
HashMgr* pHMgr) lval = scores[j]; } } - } + + if (scphon > scoresphon[lpphon]) { + scoresphon[lpphon] = scphon; + rootsphon[lpphon] = HENTRY_WORD(hp); + lval = scphon; + for (j=0; j < MAX_ROOTS; j++) + if (scoresphon[j] < lval) { + lpphon = j; + lval = scoresphon[j]; + } + } + }} // find minimum threshhold for a passable suggestion // mangle original word three differnt ways @@ -948,11 +1177,11 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) if (utf8) { for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; u16_u8(mw, MAXSWUTF8L, u8, n); - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING); } else { strcpy(mw, word); for (int k=sp; k < n; k+=4) *(mw + k) = '*'; - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + NGRAM_LOWERING); } } thresh = thresh / 3; @@ -962,9 +1191,11 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // and use length adjusted ngram scores to select // possible suggestions char * guess[MAX_GUESS]; + char * guessorig[MAX_GUESS]; int gscore[MAX_GUESS]; for(i=0;i<MAX_GUESS;i++) { guess[i] = NULL; + guessorig[i] = NULL; gscore[i] = -100 * i; } @@ -974,31 +1205,46 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword)); if (! glst) { if (nonbmp) utf8 = 1; - return 0; + return ns; } for (i = 0; i < MAX_ROOTS; i++) { - if (roots[i]) { struct hentry * rp = roots[i]; - int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen, - rp->astr, rp->alen, word, nc); + int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, + rp->astr, rp->alen, word, nc, + ((rp->var & H_OPT_PHON) ? 
copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL)); for (int k = 0; k < nw ; k++) { - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH); + sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) + + leftcommonsubstring(word, glst[k].word); + if ((sc > thresh)) { if (sc > gscore[lp]) { - if (guess[lp]) free (guess[lp]); + if (guess[lp]) { + free (guess[lp]); + if (guessorig[lp]) { + free(guessorig[lp]); + guessorig[lp] = NULL; + } + } gscore[lp] = sc; guess[lp] = glst[k].word; + guessorig[lp] = glst[k].orig; lval = sc; for (j=0; j < MAX_GUESS; j++) if (gscore[j] < lval) { lp = j; lval = gscore[j]; } - } else free (glst[k].word); - } else free(glst[k].word); + } else { + free(glst[k].word); + if (glst[k].orig) free(glst[k].orig); + } + } else { + free(glst[k].word); + if (glst[k].orig) free(glst[k].orig); + } } } } @@ -1007,7 +1253,9 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // now we are done generating guesses // sort in order of decreasing score - bubblesort(&guess[0], &gscore[0], MAX_GUESS); + + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); + if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); // weight suggestions with a similarity index, based on // the longest common subsequent algorithm and resort @@ -1021,7 +1269,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) if (utf8) { w_char _w[MAXSWL]; len = u8_u16(_w, MAXSWL, guess[i]); - mkallsmall_utf(_w, len, pAMgr->get_langnum()); + mkallsmall_utf(_w, len, langnum); u16_u8(gl, MAXSWUTF8L, _w, len); } else { strcpy(gl, guess[i]); @@ -1039,10 +1287,10 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // heuristic weigthing of ngram scores gscore[i] += - // length of longest common subsequent minus lenght difference + // length of longest common subsequent minus length difference 2 * _lcs - abs((int) (n - len)) + - // weight equal first letter - equalfirstletter(word, gl) + + // weight length of the 
left common substring + leftcommonsubstring(word, gl) + // weight equal character positions ((_lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) + // swap character (not neighboring) @@ -1050,25 +1298,84 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) } } - bubblesort(&guess[0], &gscore[0], MAX_GUESS); + bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS); + +// phonetic version + if (ph) for (i=0; i < MAX_ROOTS; i++) { + if (rootsphon[i]) { + // lowering rootphon[i] + char gl[MAXSWUTF8L]; + int len; + if (utf8) { + w_char _w[MAXSWL]; + len = u8_u16(_w, MAXSWL, rootsphon[i]); + mkallsmall_utf(_w, len, langnum); + u16_u8(gl, MAXSWUTF8L, _w, len); + } else { + strcpy(gl, rootsphon[i]); + mkallsmall(gl, csconv); + len = strlen(rootsphon[i]); + } + + // heuristic weigthing of ngram scores + scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) + + // weight length of the left common substring + leftcommonsubstring(word, gl); + } + } + + if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS); // copy over + int oldns = ns; - int ns = 0; int same = 0; for (i=0; i < MAX_GUESS; i++) { if (guess[i]) { - if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { + if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { int unique = 1; - // we have excellent suggestion(s) + // leave only excellent suggestions, if exists if (gscore[i] > 1000) same = 1; - for (j=0; j < ns; j++) + for (j = 0; j < ns; j++) { // don't suggest previous suggestions or a previous suggestion with prefixes or affixes - if (strstr(guess[i], wlst[j]) || + if ((!guessorig[i] && strstr(guess[i], wlst[j])) || + (guessorig[i] && strstr(guessorig[i], wlst[j])) || // check forbidden words !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; - if (unique) wlst[ns++] = guess[i]; else free(guess[i]); - } else free(guess[i]); + } + if (unique) { + wlst[ns++] = guess[i]; + if (guessorig[i]) { + 
free(guess[i]); + wlst[ns-1] = guessorig[i]; + } + } else { + free(guess[i]); + if (guessorig[i]) free(guessorig[i]); + } + } else { + free(guess[i]); + if (guessorig[i]) free(guessorig[i]); + } + } + } + + oldns = ns; + if (ph) for (i=0; i < MAX_ROOTS; i++) { + if (rootsphon[i]) { + if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) { + int unique = 1; + for (j = 0; j < ns; j++) { + // don't suggest previous suggestions or a previous suggestion with prefixes or affixes + if (strstr(rootsphon[i], wlst[j]) || + // check forbidden words + !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0; + } + if (unique) { + wlst[ns++] = mystrdup(rootsphon[i]); + if (!wlst[ns - 1]) return ns - 1; + } + } } } @@ -1083,19 +1390,16 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) // obsolote MySpell-HU modifications: // return value 2 and 3 marks compounding with hyphen (-) // `3' marks roots without suffix -int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit) +int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit) { struct hentry * rv=NULL; int nosuffix = 0; - + // check time limit if (timer) { (*timer)--; if (!(*timer) && timelimit) { - if (time(NULL) > *timelimit) { - *timelimit = 0; - return 0; - } + if ((clock() - *timelimit) > TIMELIMIT) return 0; *timer = MAXPLUSTIMER; } } @@ -1103,7 +1407,7 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if (pAMgr) { if (cpdsuggest==1) { if (pAMgr->get_compound()) { - rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1); + rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1); //EXT if (rv) return 3; // XXX obsolote categorisation } return 0; @@ -1114,10 +1418,15 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if (rv) { if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || 
TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; - if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || - TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; + while (rv) { + if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { + rv = rv->next_homonym; + } else break; + } } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX - + if (rv) { nosuffix=1; } else { @@ -1130,8 +1439,9 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time } // check forbidden words - if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) - || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || + TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || + TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; if (rv) { // XXX obsolote @@ -1149,7 +1459,7 @@ int SuggestMgr::check_forbidden(const char * word, int len) if (pAMgr) { rv = pAMgr->lookup(word); - if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; if (!(pAMgr->prefix_check(word,len,1))) rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix @@ -1160,184 +1470,6 @@ int SuggestMgr::check_forbidden(const char * word, int len) } #ifdef HUNSPELL_EXPERIMENTAL -// suggest stems, XXX experimental code -int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug) -{ - char buf[MAXSWUTF8L]; - char ** wlst; - int prevnsug = nsug; - - char w2[MAXWORDUTF8LEN]; - const char * word = w; - - // word reversing wrapper for complex prefixes - if 
(complexprefixes) { - strcpy(w2, w); - if (utf8) reverseword_utf(w2); else reverseword(w2); - word = w2; - } - - if (*slst) { - wlst = *slst; - } else { - wlst = (char **) calloc(maxSug, sizeof(char *)); - if (wlst == NULL) return -1; - } - // perhaps there are a fix stem in the dictionary - if ((nsug < maxSug) && (nsug > -1)) { - - nsug = fixstems(wlst, word, nsug); - if (nsug == prevnsug) { - char * s = mystrdup(word); - char * p = s + strlen(s); - while ((*p != '-') && (p != s)) p--; - if (*p == '-') { - *p = '\0'; - nsug = fixstems(wlst, s, nsug); - if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) { - char * t; - buf[0] = '\0'; - for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number? - if (*t != '\0') strcpy(buf, "# "); - strcat(buf, s); - wlst[nsug] = mystrdup(buf); - if (wlst[nsug] == NULL) return -1; - nsug++; - } - p++; - nsug = fixstems(wlst, p, nsug); - } - - free(s); - } - } - - if (nsug < 0) { - for (int i=0;i<maxSug; i++) - if (wlst[i] != NULL) free(wlst[i]); - free(wlst); - return -1; - } - - *slst = wlst; - return nsug; -} - - -// there are fix stems in dictionary -int SuggestMgr::fixstems(char ** wlst, const char * word, int ns) -{ - char buf[MAXSWUTF8L]; - char prefix[MAXSWUTF8L] = ""; - - int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound - int cpdindex = 0; - struct hentry * rv = NULL; - - int wl = strlen(word); - int cmpdstemnum; - int cmpdstem[MAXCOMPOUND]; - - if (pAMgr) { - rv = pAMgr->lookup(word); - if (rv) { - dicstem = 0; - } else { - // try stripping off affixes - rv = pAMgr->affix_check(word, wl); - - // else try check compound word - if (!rv && pAMgr->get_compound()) { - rv = pAMgr->compound_check(word, wl, - 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1); - - if (rv) { - dicstem = 2; - for (int j = 0; j < cmpdstemnum; j++) { - cpdindex += cmpdstem[j]; - } - if(! 
(pAMgr->lookup(word + cpdindex))) - pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix - } - } - - - if (pAMgr->get_prefix()) { - strcpy(prefix, pAMgr->get_prefix()); - } - - // XXX obsolete, will be a general solution for stemming - if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU) - } - - } - - - - if ((rv) && (ns < maxSug)) { - - // check fixstem flag and not_valid_stem flag - // first word - if ((ns < maxSug) && (dicstem < 2)) { - strcpy(buf, prefix); - if ((dicstem > 0) && pAMgr->get_derived()) { - // XXX obsolote - if (strlen(prefix) == 1) { - strcat(buf, (pAMgr->get_derived()) + 1); - } else { - strcat(buf, pAMgr->get_derived()); - } - } else { - // special stem in affix description - const char * wordchars = pAMgr->get_wordchars(); - if (rv->description && - (strchr(wordchars, *(rv->description)))) { - char * desc = (rv->description) + 1; - while (strchr(wordchars, *desc)) desc++; - strncat(buf, rv->description, desc - (rv->description)); - } else { - strcat(buf, rv->word); - } - } - wlst[ns] = mystrdup(buf); - if (wlst[ns] == NULL) return -1; - ns++; - } - - if (dicstem == 2) { - - // compound stem - -// if (rv->astr && (strchr(rv->astr, '0') == NULL)) { - if (rv->astr) { - strcpy(buf, word); - buf[cpdindex] = '\0'; - if (prefix) strcat(buf, prefix); - if (pAMgr->get_derived()) { - strcat(buf, pAMgr->get_derived()); - } else { - // special stem in affix description - const char * wordchars = pAMgr->get_wordchars(); - if (rv->description && - (strchr(wordchars, *(rv->description)))) { - char * desc = (rv->description) + 1; - while (strchr(wordchars, *desc)) desc++; - strncat(buf, rv->description, desc - (rv->description)); - } else { - strcat(buf, rv->word); - } - } - if (ns < maxSug) { - wlst[ns] = mystrdup(buf); - if (wlst[ns] == NULL) return -1; - ns++; - } - } - } - } - return ns; -} - // suggest possible stems int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) { @@ -1377,6 +1509,7 @@ int 
SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) *slst = wlst; return nsug; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * SuggestMgr::suggest_morph(const char * w) @@ -1405,20 +1538,25 @@ char * SuggestMgr::suggest_morph(const char * w) while (rv) { if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || - TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen))) - strcat(result, word); - if (rv->description) strcat(result, rv->description); - strcat(result, "\n"); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, MORPH_STEM, MAXLNLEN); + mystrcat(result, word, MAXLNLEN); + } + if (HENTRY_DATA(rv)) { + mystrcat(result, " ", MAXLNLEN); + mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN); + } + mystrcat(result, "\n", MAXLNLEN); } rv = rv->next_homonym; } st = pAMgr->affix_check_morph(word,strlen(word)); if (st) { - strcat(result, st); + mystrcat(result, st, MAXLNLEN); free(st); } @@ -1426,28 +1564,177 @@ char * SuggestMgr::suggest_morph(const char * w) pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0,NULL, 0, &r, NULL); - return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL; + return (*result) ? 
mystrdup(line_uniq(result, MSEP_REC)) : NULL; } +#ifdef HUNSPELL_EXPERIMENTAL char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) { char * p = NULL; char ** wlst = (char **) calloc(maxSug, sizeof(char *)); + if (!**wlst) return NULL; // we will use only the first suggestion for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; - int ns = suggest(&wlst, word, maxSug - 1); + int ns = suggest(&wlst, word, maxSug - 1, NULL); if (ns == maxSug) { p = suggest_morph(wlst[maxSug - 1]); free(wlst[maxSug - 1]); } if (wlst) free(wlst); - return p; + return p; } #endif // END OF HUNSPELL_EXPERIMENTAL CODE +/* affixation */ +char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) +{ + char result[MAXLNLEN]; + *result = '\0'; + int sfxcount = get_sfxcount(pattern); + + if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; + + if (HENTRY_DATA(rv)) { + char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, + HENTRY_DATA(rv), pattern, 0); + if (aff) { + mystrcat(result, aff, MAXLNLEN); + mystrcat(result, "\n", MAXLNLEN); + free(aff); + } + } + + // check all allomorphs + char allomorph[MAXLNLEN]; + char * p = NULL; + if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH); + while (p) { + struct hentry * rv2 = NULL; + p += MORPH_TAG_LEN; + int plen = fieldlen(p); + strncpy(allomorph, p, plen); + allomorph[plen] = '\0'; + rv2 = pAMgr->lookup(allomorph); + while (rv2) { +// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) { + if (HENTRY_DATA(rv2)) { + char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM); + if (st && (strncmp(st + MORPH_TAG_LEN, + HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { + char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen, + HENTRY_DATA(rv2), pattern, 0); + if (aff) { + mystrcat(result, aff, MAXLNLEN); + mystrcat(result, "\n", MAXLNLEN); + free(aff); + } + } + } + rv2 = rv2->next_homonym; + } + p = strstr(p + plen, 
MORPH_ALLOMORPH); + } + + return (*result) ? mystrdup(result) : NULL; +} + +char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + char newpattern[MAXLNLEN]; + *newpattern = '\0'; + if (n == 0) return 0; + *result2 = '\0'; + struct hentry * rv = NULL; + if (!pAMgr) return NULL; + +// search affixed forms with and without derivational suffixes + while(1) { + + for (int k = 0; k < n; k++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[k]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } + + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int i = 0; i < pln; i++) { + // remove inflectional and terminal suffixes + char * is = strstr(pl[i], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * ts = strstr(pl[i], MORPH_TERM_SFX); + while (ts) { + *ts = '_'; + ts = strstr(pl[i], MORPH_TERM_SFX); + } + char * st = strstr(s, MORPH_STEM); + if (st) { + copy_field(tok, st, MORPH_STEM); + rv = pAMgr->lookup(tok); + while (rv) { + char newpat[MAXLNLEN]; + strcpy(newpat, pl[i]); + strcat(newpat, pattern); + char * sg = suggest_hentry_gen(rv, newpat); + if (!sg) sg = suggest_hentry_gen(rv, pattern); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + sg = NULL; + for (int j = 0; j < genl; j++) { + if (strstr(pl[i], MORPH_SURF_PFX)) { + int r2l = strlen(result2); + result2[r2l] = MSEP_REC; + strcpy(result2 + r2l + 1, result); + copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); + mystrcat(result2, gen[j], MAXLNLEN); + } else { + sprintf(result2 + strlen(result2), 
"%c%s%s", + MSEP_REC, result, gen[j]); + } + } + freelist(&gen, genl); + } + rv = rv->next_homonym; + } + } + } + freelist(&pl, pln); + } + + if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; + strcpy(newpattern, pattern); + pattern = newpattern; + char * ds = strstr(pattern, MORPH_DERI_SFX); + while (ds) { + strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); + ds = strstr(pattern, MORPH_DERI_SFX); + } + } + return (*result2 ? mystrdup(result2) : NULL); +} + // generate an n-gram score comparing s1 and s2 -int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) +int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) { int nscore = 0; int ns; @@ -1459,13 +1746,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) w_char su2[MAXSWL]; l1 = u8_u16(su1, MAXSWL, s1); l2 = u8_u16(su2, MAXSWL, s2); - if (!l2 || (l1==-1) || (l2==-1)) return 0; - // decapitalize dictionary word - if (complexprefixes) { - mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum()); - } else { - mkallsmall_utf(su2, 1, pAMgr->get_langnum()); - } + if ((l2 <= 0) || (l1 == -1)) return 0; + // lowering dictionary word + if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum); for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { @@ -1489,13 +1772,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) char t[MAXSWUTF8L]; l1 = strlen(s1); l2 = strlen(s2); - if (!l2) return 0; + if (l2 == 0) return 0; strcpy(t, s2); - if (complexprefixes) { - *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; - } else { - mkallsmall(t, csconv); - } + if (opt & NGRAM_LOWERING) mkallsmall(t, csconv); for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1-j); i++) { @@ -1510,13 +1789,14 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) } ns = 0; - if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2; - if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; + if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2; + if (opt & 
NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; ns = (nscore - ((ns > 0) ? ns : 0)); return ns; } -int SuggestMgr::equalfirstletter(char * s1, const char * s2) { +// length of the left common substring of s1 and (decapitalised) s2 +int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) { if (utf8) { w_char su1[MAXSWL]; w_char su2[MAXSWL]; @@ -1526,9 +1806,17 @@ int SuggestMgr::equalfirstletter(char * s1, const char * s2) { int l2 = u8_u16(su2, MAXSWL, s2); if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; } else { + int i; u8_u16(su1, 1, s1); u8_u16(su2, 1, s2); - if (*((short *)su1) == *((short *)su2)) return 1; + unsigned short idx = (su2->h << 8) + su2->l; + if (*((short *)su1) != *((short *)su2) && + (*((unsigned short *)su1) != unicodetolower(idx, langnum))) return 0; + int l1 = u8_u16(su1, MAXSWL, s1); + int l2 = u8_u16(su2, MAXSWL, s2); + for(i = 1; (i < l1) && (i < l2) && + (*((short *)(su1 + i)) == *((short *)(su2 + i))); i++); + return i; } } else { if (complexprefixes) { @@ -1536,7 +1824,13 @@ int SuggestMgr::equalfirstletter(char * s1, const char * s2) { int l2 = strlen(s2); if (*(s2+l1-1) == *(s2+l2-1)) return 1; } else { - if (*s1 == *s2) return 1; + char * olds = s1; + // decapitalise dictionary word + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0; + do { + s1++; s2++; + } while ((*s1 == *s2) && (*s1 != '\0')); + return s1 - olds; } } return 0; @@ -1554,9 +1848,9 @@ int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_sw int l2 = u8_u16(su2, MAXSWL, s2); // decapitalize dictionary word if (complexprefixes) { - mkallsmall_utf(su2+l2-1, 1, pAMgr->get_langnum()); + mkallsmall_utf(su2+l2-1, 1, langnum); } else { - mkallsmall_utf(su2, 1, pAMgr->get_langnum()); + mkallsmall_utf(su2, 1, langnum); } for (int i = 0; (i < l1) && (i < l2); i++) { if (((short *) su1)[i] == ((short *) su2)[i]) { @@ -1603,7 +1897,7 @@ int SuggestMgr::mystrlen(const char * word) { } // sort in decreasing order of score 
-void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) +void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n ) { int m = 1; while (m < n) { @@ -1616,6 +1910,11 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) rword[j-1] = rword[j]; rsc[j] = sctmp; rword[j] = wdtmp; + if (rword2) { + wdtmp = rword2[j-1]; + rword2[j-1] = rword2[j]; + rword2[j] = wdtmp; + } j--; } else break; } @@ -1642,6 +1941,12 @@ void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char * } c = (char *) malloc((m + 1) * (n + 1)); b = (char *) malloc((m + 1) * (n + 1)); + if (!c || !b) { + if (c) free(c); + if (b) free(b); + *result = NULL; + return; + } for (i = 1; i <= m; i++) c[i*(n+1)] = 0; for (j = 0; j <= n; j++) c[j] = 0; for (i = 1; i <= m; i++) { @@ -1673,6 +1978,7 @@ int SuggestMgr::lcslen(const char * s, const char* s2) { char * result; int len = 0; lcs(s, s2, &m, &n, &result); + if (!result) return 0; i = m; j = n; while ((i != 0) && (j != 0)) { @@ -1684,6 +1990,6 @@ int SuggestMgr::lcslen(const char * s, const char* s2) { i--; } else j--; } - if (result) free(result); + free(result); return len; } diff --git a/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx b/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx index 70af7f1..0e61572 100644 --- a/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx +++ b/chrome/third_party/hunspell/src/hunspell/suggestmgr.hxx @@ -5,15 +5,18 @@ #define MAXSWUTF8L (MAXSWL * 4) #define MAX_ROOTS 100 #define MAX_WORDS 100 -#define MAX_GUESS 100 -#define MAXNGRAMSUGS 5 +#define MAX_GUESS 200 +#define MAXNGRAMSUGS 4 +#define MAXPHONSUGS 2 -#define MINTIMER 500 -#define MAXPLUSTIMER 500 +// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function +#define TIMELIMIT (CLOCKS_PER_SEC >> 2) +#define MINTIMER 100 +#define MAXPLUSTIMER 100 -#define NGRAM_IGNORE_LENGTH 0 -#define NGRAM_LONGER_WORSE 1 -#define NGRAM_ANY_MISMATCH 2 +#define NGRAM_LONGER_WORSE (1 << 0) 
+#define NGRAM_ANY_MISMATCH (1 << 1) +#define NGRAM_LOWERING (1 << 2) #include "atypes.hxx" #include "affixmgr.hxx" @@ -25,6 +28,10 @@ enum { LCS_UP, LCS_LEFT, LCS_UPLEFT }; class SuggestMgr { + char * ckey; + int ckeyl; + w_char * ckey_utf; + char * ctry; int ctryl; w_char * ctry_utf; @@ -33,6 +40,7 @@ class SuggestMgr int maxSug; struct cs_info * csconv; int utf8; + int langnum; int nosplitsugs; int maxngramsugs; int complexprefixes; @@ -42,19 +50,20 @@ public: SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr); ~SuggestMgr(); - int suggest(char*** slst, const char * word, int nsug); - int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr); + int suggest(char*** slst, const char * word, int nsug, int * onlycmpdsug); + int ngsuggest(char ** wlst, char * word, int ns, HashMgr** pHMgr, int md); int suggest_auto(char*** slst, const char * word, int nsug); int suggest_stems(char*** slst, const char * word, int nsug); int suggest_pos_stems(char*** slst, const char * word, int nsug); char * suggest_morph(const char * word); + char * suggest_gen(char ** pl, int pln, char * pattern); char * suggest_morph_for_spelling_error(const char * word); private: int testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest, - int * timer, time_t * timelimit); - int checkword(const char *, int, int, int *, time_t *); + int * timer, clock_t * timelimit); + int checkword(const char *, int, int, int *, clock_t *); int check_forbidden(const char *, int); int capchars(char **, const char *, int, int); @@ -65,6 +74,7 @@ private: int longswapchar(char **, const char *, int, int); int movechar(char **, const char *, int, int); int extrachar(char **, const char *, int, int); + int badcharkey(char **, const char *, int, int); int badchar(char **, const char *, int, int); int twowords(char **, const char *, int, int); int fixstems(char **, const char *, int); @@ -73,21 +83,23 @@ private: int doubletwochars_utf(char**, const w_char *, int wl, int, int); int 
forgotchar_utf(char**, const w_char *, int wl, int, int); int extrachar_utf(char**, const w_char *, int wl, int, int); + int badcharkey_utf(char **, const w_char *, int wl, int, int); int badchar_utf(char **, const w_char *, int wl, int, int); int swapchar_utf(char **, const w_char *, int wl, int, int); int longswapchar_utf(char **, const w_char *, int, int, int); int movechar_utf(char **, const w_char *, int, int, int); - int mapchars(char**, const char *, int); - int map_related(const char *, int, char ** wlst, int, const mapentry*, int, int *, time_t *); - int map_related_utf(w_char *, int, int, char ** wlst, int, const mapentry*, int, int *, time_t *); - int ngram(int n, char * s1, const char * s2, int uselen); + int mapchars(char**, const char *, int, int); + int map_related(const char *, int, char ** wlst, int, int, const mapentry*, int, int *, clock_t *); + int map_related_utf(w_char *, int, int, int, char ** wlst, int, const mapentry*, int, int *, clock_t *); + int ngram(int n, char * s1, const char * s2, int opt); int mystrlen(const char * word); - int equalfirstletter(char * s1, const char * s2); + int leftcommonsubstring(char * s1, const char * s2); int commoncharacterpositions(char * s1, const char * s2, int * is_swap); - void bubblesort( char ** rwd, int * rsc, int n); + void bubblesort( char ** rwd, char ** rwd2, int * rsc, int n); void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result); int lcslen(const char * s, const char* s2); + char * suggest_hentry_gen(hentry * rv, char * pattern); }; diff --git a/chrome/third_party/hunspell/src/hunspell/w_char.hxx b/chrome/third_party/hunspell/src/hunspell/w_char.hxx new file mode 100644 index 0000000..99cfe63 --- /dev/null +++ b/chrome/third_party/hunspell/src/hunspell/w_char.hxx @@ -0,0 +1,19 @@ +#ifndef __WCHARHXX__ +#define __WCHARHXX__ + +#ifndef GCC +typedef struct { +#else +typedef struct __attribute__ ((packed)) { +#endif + unsigned char l; + unsigned char h; +} w_char; + +// two 
character arrays +struct replentry { + char * pattern; + char * pattern2; +}; + +#endif |