diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/myspell/Makefile.am | 31 | ||||
-rw-r--r-- | src/myspell/affentry.cxx | 115 | ||||
-rw-r--r-- | src/myspell/affixmgr.cxx | 227 | ||||
-rw-r--r-- | src/myspell/affixmgr.hxx | 28 | ||||
-rw-r--r-- | src/myspell/atypes.hxx | 14 | ||||
-rw-r--r-- | src/myspell/csutil.cxx | 99 | ||||
-rw-r--r-- | src/myspell/csutil.hxx | 24 | ||||
-rw-r--r-- | src/myspell/filemgr.cxx | 38 | ||||
-rw-r--r-- | src/myspell/filemgr.hxx | 19 | ||||
-rw-r--r-- | src/myspell/hashmgr.cxx | 157 | ||||
-rw-r--r-- | src/myspell/hashmgr.hxx | 12 | ||||
-rw-r--r-- | src/myspell/htypes.hxx | 6 | ||||
-rw-r--r-- | src/myspell/hunspell.cxx | 98 | ||||
-rw-r--r-- | src/myspell/hunspell.h | 38 | ||||
-rw-r--r-- | src/myspell/hunspell.hxx | 18 | ||||
-rw-r--r-- | src/myspell/hunzip.cxx | 196 | ||||
-rw-r--r-- | src/myspell/hunzip.hxx | 41 | ||||
-rw-r--r-- | src/myspell/myspell_checker.cpp | 7 | ||||
-rw-r--r-- | src/myspell/phonet.cxx | 2 | ||||
-rw-r--r-- | src/myspell/phonet.hxx | 4 | ||||
-rw-r--r-- | src/myspell/suggestmgr.cxx | 16 | ||||
-rw-r--r-- | src/myspell/suggestmgr.hxx | 2 | ||||
-rw-r--r-- | src/myspell/w_char.hxx | 19 |
23 files changed, 823 insertions, 388 deletions
diff --git a/src/myspell/Makefile.am b/src/myspell/Makefile.am index f2ea5dc..0e529ca 100644 --- a/src/myspell/Makefile.am +++ b/src/myspell/Makefile.am @@ -4,38 +4,45 @@ else target_lib = endif -# copied from hunspell 1.2.1 +# copied from hunspell 1.2.2 COPIED_MYSPELL_FILES= \ + affentry.cxx affentry.hxx \ + affixmgr.cxx \ affixmgr.hxx \ atypes.hxx \ baseaffix.hxx \ + csutil.cxx \ csutil.hxx \ + dictmgr.cxx \ dictmgr.hxx \ + filemgr.cxx \ + filemgr.hxx \ + hashmgr.cxx \ hashmgr.hxx \ htypes.hxx \ + hunspell.cxx \ hunspell.h \ hunspell.hxx \ + hunzip.cxx \ + hunzip.hxx \ langnum.hxx \ - suggestmgr.hxx \ - affentry.cxx \ - affixmgr.cxx \ - csutil.cxx \ - dictmgr.cxx \ - hashmgr.cxx \ - hunspell.cxx \ - suggestmgr.cxx \ phonet.cxx \ - phonet.hxx + phonet.hxx \ + suggestmgr.cxx \ + suggestmgr.hxx \ + utf_info.cxx \ + w_char.hxx if WITH_SYSTEM_MYSPELL EXTRA_MYSPELL_FILES= +EXTRA_MYSPELL_CFLAGS=-DWITH_SYSTEM_MYSPELL else EXTRA_MYSPELL_FILES=$(COPIED_MYSPELL_FILES) +EXTRA_MYSPELL_CFLAGS= endif - -INCLUDES=-I$(top_srcdir)/src $(ENCHANT_CFLAGS) $(MYSPELL_CFLAGS) $(CXX_WARN_CFLAGS) -D_ENCHANT_BUILD=1 +INCLUDES=-I$(top_srcdir)/src $(ENCHANT_CFLAGS) $(MYSPELL_CFLAGS) $(CXX_WARN_CFLAGS) $(EXTRA_MYSPELL_CFLAGS) -D_ENCHANT_BUILD=1 myspell_LTLIBRARIES = $(target_lib) myspelldir= $(libdir)/enchant diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx index 0ffe557..2436fbb 100644 --- a/src/myspell/affentry.cxx +++ b/src/myspell/affentry.cxx @@ -7,9 +7,9 @@ #include <cctype> #include <cstdio> #else -#include <stdlib.h> +#include <stdlib.h> #include <string.h> -#include <stdio.h> +#include <stdio.h> #include <ctype.h> #endif @@ -17,7 +17,7 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -29,8 +29,8 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - - aflag = dp->aflag; // flag + + aflag = dp->aflag; // flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string @@ -70,7 +70,7 @@ char * PfxEntry::add(const char * word, int len) char tword[MAXWORDUTF8LEN + 4]; if ((len > stripl) && (len >= numconds) && test_condition(word) && - (!stripl || (strncmp(word, strip, stripl) == 0)) && + (!stripl || (strncmp(word, strip, stripl) == 0)) && ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add prefix */ char * pp = tword; @@ -81,7 +81,7 @@ char * PfxEntry::add(const char * word, int len) strcpy(pp, (word + stripl)); return mystrdup(tword); } - return NULL; + return NULL; } inline char * PfxEntry::nextchar(char * p) { @@ -149,7 +149,7 @@ inline int PfxEntry::test_condition(const char * st) } } -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -171,7 +171,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound if (stripl) strcpy (tmpword, strip); strcpy ((tmpword + stripl), (word + appndl)); - + // now make sure all of the conditions on characters // are met. Please see the appendix at the end of // this file for more info on exactly what is being @@ -194,14 +194,14 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound he = he->next_homonym; // check homonyms } while (he); } - - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix //if ((opts & aeXPRODUCT) && in_compound) { if ((opts & aeXPRODUCT)) { - he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); if (he) return he; } @@ -210,7 +210,7 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches struct hentry * PfxEntry::check_twosfx(const char * word, int len, char in_compound, const FLAG needflag) { @@ -245,8 +245,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // cross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -258,7 +258,7 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag) { @@ -292,8 +292,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, if (test_condition(tmpword)) { tmpl += stripl; - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { @@ -305,7 +305,7 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, return NULL; } -// check if this prefix entry matches +// check if this prefix entry matches char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) { int tmpl; // length of tmpword @@ -313,7 +313,7 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const char tmpword[MAXWORDUTF8LEN + 4]; char result[MAXLNLEN]; char * st; - + *result = '\0'; // on entry prefix is 0 length or already matches the beginning of the word. @@ -349,7 +349,7 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || - (contclass && TESTAFF(contclass, needflag, contclasslen)))) { + (contclass && TESTAFF(contclass, needflag, contclasslen)))) { if (morphcode) { strcat(result, " "); strcat(result, morphcode); @@ -364,6 +364,13 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const if (HENTRY_DATA(he)) { strcat(result, " "); strcat(result,HENTRY_DATA(he)); + } else { + // return with debug information + char * flag = pmyMgr->encode_flag(getFlag()); + strcat(result, " "); + strcat(result, MORPH_FLAG); + strcat(result, flag); + free(flag); } strcat(result, "\n"); } @@ -371,12 +378,12 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const } while (he); } - // prefix matched but no root word was found - // if aeXPRODUCT is allowed, try again but now + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { - st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, FLAG_NULL, needflag); if (st) { strcat(result, st); @@ -385,7 +392,7 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const } } } - + if (*result) return mystrdup(result); return NULL; } @@ -396,7 +403,7 @@ SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - aflag = dp->aflag; // char flag + aflag = dp->aflag; // char flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string @@ -425,7 +432,7 @@ SfxEntry::~SfxEntry() if (strip) free(strip); pmyMgr = NULL; appnd = NULL; - strip = NULL; + strip = NULL; if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); if (contclass && !(opts & aeALIASF)) free(contclass); @@ -470,14 +477,14 @@ inline int SfxEntry::test_condition(const char * st, const char * beg) if (numconds == 0) return 1; char * p = c.conds; st--; - int c = 1; + int i = 1; while (1) { switch (*p) { case '\0': return 1; case '[': { p = nextchar(p); pos = st; break; } case '^': { p = nextchar(p); neg = true; break; } case ']': { if (!neg && !ingroup) return 0; - c++; + i++; pos = NULL; neg = false; ingroup = false; @@ -515,17 +522,17 @@ inline int SfxEntry::test_condition(const char * st, const char * beg) } if (pos && st != pos) { if (neg) return 0; - else if (c == numconds) return 1; + else if (i == numconds) return 1; ingroup = true; } if (p && *p != '\0') p = nextchar(p); } else if (pos) { if (neg) return 0; - else if (c == numconds) return 1; + else if (i == numconds) return 1; ingroup = true; } if (!pos) { - c++; + i++; st--; if (st < beg && p && *p != '\0') return 0; // word <= condition } @@ -538,12 +545,12 @@ inline int SfxEntry::test_condition(const char * st, const char * beg) } } -// see if this suffix is present in the word +// see if this suffix is present in the word struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, const FLAG badflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -563,7 +570,7 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, tmpl = len - appndl; // the second condition is not enough for UTF-8 strings // it checked in test_condition() - + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { // if (tmpl > 0) { @@ -596,21 +603,21 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, do { // check conditional suffix (enabled by prefix) if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && - TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - (((optflags & aeXPRODUCT) == 0) || + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || TESTAFF(he->astr, ep->getFlag(), he->alen) || // enabled by prefix ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) ) && // handle cont. class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // check only in compound homonyms (bad flags) (!badflag || !TESTAFF(he->astr, badflag, he->alen) - ) && + ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) @@ -618,12 +625,12 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, he = he->next_homonym; // check homonyms } while (he); - // obsolote stemming code (used only by the + // obsolote stemming code (used only by the // experimental SuffixMgr:suggest_pos_stems) // store resulting root in wlst } else if (wlst && (*ns < maxSug)) { int cwrd = 1; - for (int k=0; k < *ns; k++) + for (int k=0; k < *ns; k++) if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; if (cwrd) { wlst[*ns] = mystrdup(tmpword); @@ -640,11 +647,11 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, return NULL; } -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword struct hentry * he; // hash entry pointer unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; @@ -689,7 +696,7 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, if (test_condition((char *) cp, (char *) tmpword)) { if (ppfx) { // handle conditional suffix - if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); else he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); @@ -702,18 +709,18 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, return NULL; } -// see if two-level suffix is present in the word +// see if two-level suffix is present in the word char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) { - int tmpl; // length of tmpword + int tmpl; // length of tmpword unsigned char * cp; char tmpword[MAXWORDUTF8LEN + 4]; PfxEntry* ep = (PfxEntry *) ppfx; char * st; char result[MAXLNLEN]; - + *result = '\0'; // if this suffix is being cross checked with a prefix @@ -788,7 +795,7 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } // get next homonym with same affix -struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, const FLAG cclass, const FLAG needflag) { PfxEntry* ep = (PfxEntry *) ppfx; @@ -796,18 +803,18 @@ struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, Aff while (he->next_homonym) { he = he->next_homonym; - if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && - ((optflags & aeXPRODUCT) == 0 || + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) || // handle conditional suffix ((contclass) && TESTAFF(contclass, eFlag, contclasslen)) ) && // handle cont. class - ((!cclass) || + ((!cclass) || ((contclass) && TESTAFF(contclass, cclass, contclasslen)) ) && // handle required flag - ((!needflag) || + ((!needflag) || (TESTAFF(he->astr, needflag, he->alen) || ((contclass) && TESTAFF(contclass, needflag, contclasslen))) ) diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx index d3e36be..1bcec78 100644 --- a/src/myspell/affixmgr.cxx +++ b/src/myspell/affixmgr.cxx @@ -20,15 +20,17 @@ #include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif -AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) +AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key) { // register hash manager and load affix data from aff file - pHMgr = ptr; + pHMgr = ptr[0]; + alldic = ptr; + maxdic = md; keystring = NULL; trystring = NULL; encoding=NULL; @@ -107,7 +109,7 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) contclasses[j] = 0; } - if (parse_file(affpath)) { + if (parse_file(affpath, key)) { HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath); } @@ -244,14 +246,10 @@ AffixMgr::~AffixMgr() // read in aff file and build up prefix and suffix entry objects -int AffixMgr::parse_file(const char * affpath) +int AffixMgr::parse_file(const char * affpath, const char * key) { - - // io buffers - char line[MAXLNLEN+1]; - - // affix type - char ft; + char * line; // io buffers + char ft; // affix type // checking flag duplication char dupflags[CONTSIZE]; @@ -261,8 +259,7 @@ int AffixMgr::parse_file(const char * affpath) int firstline = 1; // open the affix file - FILE * afflst; - afflst = fopen(affpath,"r"); + FileMgr * afflst = new FileMgr(affpath, key); if (!afflst) { HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath); return 1; @@ -271,10 +268,9 @@ int AffixMgr::parse_file(const char * affpath) // step one is to parse the affix file building up the internal // affix data structures - // read in each line ignoring any that do not // start with a known line type indicator - while (fgets(line,MAXLNLEN,afflst)) { + while ((line = afflst->getline())) { mychomp(line); /* remove byte order mark */ @@ -289,7 +285,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the keyboard string */ if (strncmp(line,"KEY",3) == 0) { if (parse_string(line, &keystring, "KEY")) { - fclose(afflst); + delete afflst; return 1; } } @@ -297,7 +293,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the try string */ if (strncmp(line,"TRY",3) == 0) { if (parse_string(line, &trystring, "TRY")) { - fclose(afflst); + delete afflst; return 1; } } @@ -305,7 +301,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the name of the character set used by the .dict and .aff */ if (strncmp(line,"SET",3) == 0) { if (parse_string(line, &encoding, "SET")) { - fclose(afflst); + delete afflst; return 1; } if (strcmp(encoding, "UTF-8") == 0) { @@ -325,7 +321,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by the controlled compound words */ if (strncmp(line,"COMPOUNDFLAG",12) == 0) { if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) { - fclose(afflst); + delete afflst; return 1; } } @@ -334,12 +330,12 @@ int AffixMgr::parse_file(const char * affpath) if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { if (complexprefixes) { if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) { - fclose(afflst); + delete afflst; return 1; } } else { if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) { - fclose(afflst); + delete afflst; return 1; } } @@ -348,7 +344,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by compound words */ if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) { - fclose(afflst); + delete afflst; return 1; } } @@ -356,12 +352,12 @@ int AffixMgr::parse_file(const char * affpath) if (strncmp(line,"COMPOUNDEND",11) == 0) { if (complexprefixes) { if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) { - fclose(afflst); + delete afflst; return 1; } } else { if (parse_flag(line, &compoundend, "COMPOUNDEND")) { - fclose(afflst); + delete afflst; return 1; } } @@ -370,7 +366,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the data used by compound_check() method */ if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) { - fclose(afflst); + delete afflst; return 1; } } @@ -378,7 +374,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag sign compounds in dictionary */ if (strncmp(line,"COMPOUNDROOT",12) == 0) { if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) { - fclose(afflst); + delete afflst; return 1; } } @@ -386,7 +382,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) { - fclose(afflst); + delete afflst; return 1; } } @@ -394,7 +390,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by compound_check() method */ if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) { - fclose(afflst); + delete afflst; return 1; } } @@ -417,7 +413,7 @@ int AffixMgr::parse_file(const char * affpath) if (strncmp(line,"NOSUGGEST",9) == 0) { if (parse_flag(line, &nosuggest, "NOSUGGEST")) { - fclose(afflst); + delete afflst; return 1; } } @@ -425,7 +421,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by forbidden words */ if (strncmp(line,"FORBIDDENWORD",13) == 0) { if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) { - fclose(afflst); + delete afflst; return 1; } } @@ -433,7 +429,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by forbidden words */ if (strncmp(line,"LEMMA_PRESENT",13) == 0) { if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) { - fclose(afflst); + delete afflst; return 1; } } @@ -441,7 +437,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by circumfixes */ if (strncmp(line,"CIRCUMFIX",9) == 0) { if (parse_flag(line, &circumfix, "CIRCUMFIX")) { - fclose(afflst); + delete afflst; return 1; } } @@ -449,7 +445,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by fogemorphemes */ if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) { - fclose(afflst); + delete afflst; return 1; } } @@ -457,7 +453,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by `needaffixs' */ if (strncmp(line,"PSEUDOROOT",10) == 0) { if (parse_flag(line, &needaffix, "PSEUDOROOT")) { - fclose(afflst); + delete afflst; return 1; } } @@ -465,7 +461,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by `needaffixs' */ if (strncmp(line,"NEEDAFFIX",9) == 0) { if (parse_flag(line, &needaffix, "NEEDAFFIX")) { - fclose(afflst); + delete afflst; return 1; } } @@ -473,7 +469,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the minimal length for words in compounds */ if (strncmp(line,"COMPOUNDMIN",11) == 0) { if (parse_num(line, &cpdmin, "COMPOUNDMIN")) { - fclose(afflst); + delete afflst; return 1; } if (cpdmin < 1) cpdmin = 1; @@ -482,7 +478,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the max. words and syllables in compounds */ if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { if (parse_cpdsyllable(line)) { - fclose(afflst); + delete afflst; return 1; } } @@ -490,7 +486,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by compound_check() method */ if (strncmp(line,"SYLLABLENUM",11) == 0) { if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) { - fclose(afflst); + delete afflst; return 1; } } @@ -503,7 +499,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the extra word characters */ if (strncmp(line,"WORDCHARS",9) == 0) { if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) { - fclose(afflst); + delete afflst; return 1; } } @@ -511,7 +507,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the ignored characters (for example, Arabic optional diacretics charachters */ if (strncmp(line,"IGNORE",6) == 0) { if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { - fclose(afflst); + delete afflst; return 1; } } @@ -519,7 +515,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the typical fault correcting table */ if (strncmp(line,"REP",3) == 0) { if (parse_reptable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -527,7 +523,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the phonetic translation table */ if (strncmp(line,"PHONE",5) == 0) { if (parse_phonetable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -535,7 +531,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the checkcompoundpattern table */ if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { if (parse_checkcpdtable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -543,7 +539,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the defcompound table */ if (strncmp(line,"COMPOUNDRULE",12) == 0) { if (parse_defcpdtable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -551,7 +547,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the related character map table */ if (strncmp(line,"MAP",3) == 0) { if (parse_maptable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -559,7 +555,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the word breakpoints table */ if (strncmp(line,"BREAK",5) == 0) { if (parse_breaktable(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -567,7 +563,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the language for language specific codes */ if (strncmp(line,"LANG",4) == 0) { if (parse_string(line, &lang, "LANG")) { - fclose(afflst); + delete afflst; return 1; } langnum = get_lang_num(lang); @@ -575,14 +571,14 @@ int AffixMgr::parse_file(const char * affpath) if (strncmp(line,"VERSION",7) == 0) { if (parse_string(line, &version, "VERSION")) { - fclose(afflst); + delete afflst; return 1; } } if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) { - fclose(afflst); + delete afflst; return 1; } } @@ -598,7 +594,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by forbidden words */ if (strncmp(line,"KEEPCASE",8) == 0) { if (parse_flag(line, &keepcase, "KEEPCASE")) { - fclose(afflst); + delete afflst; return 1; } } @@ -606,7 +602,7 @@ int AffixMgr::parse_file(const char * affpath) /* parse in the flag used by the affix generator */ if (strncmp(line,"SUBSTANDARD",11) == 0) { if (parse_flag(line, &substandard, "SUBSTANDARD")) { - fclose(afflst); + delete afflst; return 1; } } @@ -625,7 +621,7 @@ int AffixMgr::parse_file(const char * affpath) dupflags_ini = 0; } if (parse_affix(line, ft, afflst, dupflags)) { - fclose(afflst); + delete afflst; process_pfx_tree_to_list(); process_sfx_tree_to_list(); return 1; @@ -633,7 +629,7 @@ int AffixMgr::parse_file(const char * affpath) } } - fclose(afflst); + delete afflst; // convert affix trees to sorted list process_pfx_tree_to_list(); @@ -671,6 +667,23 @@ int AffixMgr::parse_file(const char * affpath) free(enc); enc = NULL; +#ifdef WINSHELL + char expw[MAXLNLEN]; + if (wordchars) { + strcpy(expw, wordchars); + free(wordchars); + } else *expw = '\0'; + + for (int i = 0; i <= 255; i++) { + if ( (csconv[i].cupper != csconv[i].clower) && + (! strchr(expw, (char) i))) { + *(expw + strlen(expw) + 1) = '\0'; + *(expw + strlen(expw)) = (char) i; + } + } + + wordchars = mystrdup(expw); +#endif // temporary BREAK definition for German dash handling (OOo issue 64400) if ((langnum == LANG_de) && (!breaktable)) { breaktable = (char **) malloc(sizeof(char *)); @@ -950,6 +963,15 @@ int AffixMgr::process_sfx_order() return 0; } +// add flags to the result for dictionary debugging +void AffixMgr::debugflag(char * result, unsigned short flag) { + char * st = encode_flag(flag); + strcat(result, " "); + strcat(result, MORPH_FLAG); + strcat(result, st); + free(st); +} + // calculate the character length of the condition int AffixMgr::condlen(char * st) { @@ -969,7 +991,7 @@ int AffixMgr::condlen(char * st) int AffixMgr::encodeit(struct affentry * ptr, char * cs) { if (strcmp(cs,".") != 0) { - ptr->numconds = condlen(cs); + ptr->numconds = (char) condlen(cs); strncpy(ptr->c.conds, cs, MAXCONDLEN); // long condition (end of conds padded by strncpy) if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { @@ -1313,7 +1335,7 @@ int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** ok = 1; btnum[bt - 1]--; pp = btpp[bt - 1]; - wp = btwp[bt - 1] + btnum[bt - 1]; + wp = btwp[bt - 1] + (signed short) btnum[bt - 1]; } while ((btnum[bt - 1] < 0) && --bt); } while (bt); @@ -2415,14 +2437,14 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, if (((PfxEntry *) ppfx)->getMorph()) { strcat(result, ((PfxEntry *) ppfx)->getMorph()); strcat(result, " "); - } + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } strcat(result, st); free(st); if (se->getMorph()) { strcat(result, " "); strcat(result, se->getMorph()); - } + } else debugflag(result, se->getFlag()); strcat(result, "\n"); } } @@ -2457,7 +2479,7 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, if (sptr->getMorph()) { strcat(result3, " "); strcat(result3, sptr->getMorph()); - } + } else debugflag(result3, sptr->getFlag()); strlinecat(result2, result3); strcat(result2, "\n"); strcat(result, result2); @@ -2517,7 +2539,7 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (((PfxEntry *) ppfx)->getMorph()) { strcat(result, ((PfxEntry *) ppfx)->getMorph()); strcat(result, " "); - } + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) strcat(result, HENTRY_DATA(rv)); if (! HENTRY_FIND(rv, MORPH_STEM)) { @@ -2529,13 +2551,13 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); if (!complexprefixes && HENTRY_DATA(rv)) { - strcat(result, " "); - strcat(result, HENTRY_DATA(rv)); + strcat(result, " "); + strcat(result, HENTRY_DATA(rv)); } if (se->getMorph()) { strcat(result, " "); strcat(result, se->getMorph()); - } + } else debugflag(result, se->getFlag()); strcat(result, "\n"); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } @@ -2575,7 +2597,7 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (((PfxEntry *) ppfx)->getMorph()) { strcat(result, ((PfxEntry *) ppfx)->getMorph()); strcat(result, " "); - } + } else debugflag(result, ((PfxEntry *) ppfx)->getFlag()); } if (complexprefixes && HENTRY_DATA(rv)) strcat(result, HENTRY_DATA(rv)); if (! HENTRY_FIND(rv, MORPH_STEM)) { @@ -2603,7 +2625,7 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (sptr->getMorph()) { strcat(result, " "); strcat(result, sptr->getMorph()); - } + } else debugflag(result, sptr->getFlag()); strcat(result, "\n"); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } @@ -2731,7 +2753,7 @@ char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, if (cmp == 0) { char * newword = sptr->add(ts, wl); if (newword) { - hentry * check = pHMgr->lookup(newword); + hentry * check = pHMgr->lookup(newword); // XXX extra dic if (!check || !check->astr || !TESTAFF(check->astr, forbiddenword, check->alen)) { return newword; @@ -2767,7 +2789,7 @@ char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, - char * phone) + char * phon) { int nh=0; // first add root word to list @@ -2778,8 +2800,8 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts wlst[nh].orig = NULL; nh++; // add special phonetic version - if (phone && (nh < maxn)) { - wlst[nh].word = mystrdup(phone); + if (phon && (nh < maxn)) { + wlst[nh].word = mystrdup(phon); wlst[nh].allow = (1 == 0); wlst[nh].orig = mystrdup(ts); nh++; @@ -2809,11 +2831,11 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts wlst[nh].orig = NULL; nh++; // add special phonetic version - if (phone && (nh < maxn)) { + if (phon && (nh < maxn)) { char st[MAXWORDUTF8LEN]; - strcpy(st, phone); + strcpy(st, phon); strcat(st, sptr->getKey()); - reverseword(st + strlen(phone)); + reverseword(st + strlen(phon)); wlst[nh].word = mystrdup(st); wlst[nh].allow = (1 == 0); wlst[nh].orig = mystrdup(newword); @@ -2969,6 +2991,12 @@ int AffixMgr::get_checksharps() return checksharps; } +char * AffixMgr::encode_flag(unsigned short aflag) +{ + return pHMgr->encode_flag(aflag); +} + + // return the preferred ignore string for suggestions char * AffixMgr::get_ignore() { @@ -3097,8 +3125,12 @@ FLAG AffixMgr::get_lemma_present() // utility method to look up root words in hash table struct hentry * AffixMgr::lookup(const char * word) { - if (! pHMgr) return NULL; - return pHMgr->lookup(word); + int i; + struct hentry * he = NULL; + for (i = 0; i < *maxdic && !he; i++) { + he = (alldic[i])->lookup(word); + } + return he; } // return the value of suffix @@ -3203,7 +3235,7 @@ int AffixMgr::parse_cpdsyllable(char * line) } /* parse in the typical fault correcting table */ -int AffixMgr::parse_reptable(char * line, FILE * af) +int AffixMgr::parse_reptable(char * line, FileMgr * af) { if (numrep != 0) { HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n"); @@ -3243,9 +3275,9 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } /* now parse the numrep lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numrep; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3283,7 +3315,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } /* parse in the typical fault correcting table */ -int AffixMgr::parse_phonetable(char * line, FILE * af) +int AffixMgr::parse_phonetable(char * line, FileMgr * af) { if (phone) { HUNSPELL_WARNING(stderr, "error: duplicate PHONE tables used\n"); @@ -3302,7 +3334,7 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) phone = (phonetable *) malloc(sizeof(struct phonetable)); phone->num = atoi(piece); phone->rules = NULL; - phone->utf8 = utf8; + phone->utf8 = (char) utf8; if (!phone) return 1; if (phone->num < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in phonelacement table\n"); @@ -3327,9 +3359,9 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) } /* now parse the phone->num lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < phone->num; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3370,7 +3402,7 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) } /* parse in the checkcompoundpattern table */ -int AffixMgr::parse_checkcpdtable(char * line, FILE * af) +int AffixMgr::parse_checkcpdtable(char * line, FileMgr * af) { if (numcheckcpd != 0) { HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n"); @@ -3410,9 +3442,9 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) } /* now parse the numcheckcpd lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numcheckcpd; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3450,7 +3482,7 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) } /* parse in the compound rule table */ -int AffixMgr::parse_defcpdtable(char * line, FILE * af) +int AffixMgr::parse_defcpdtable(char * line, FileMgr * af) { if (numdefcpd != 0) { HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n"); @@ -3490,9 +3522,9 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) } /* now parse the numdefcpd lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numdefcpd; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3533,7 +3565,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) /* parse in the character map table */ -int AffixMgr::parse_maptable(char * line, FILE * af) +int AffixMgr::parse_maptable(char * line, FileMgr * af) { if (nummap != 0) { HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n"); @@ -3573,9 +3605,9 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } /* now parse the nummap lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < nummap; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3630,7 +3662,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } /* parse in the word breakpoint table */ -int AffixMgr::parse_breaktable(char * line, FILE * af) +int AffixMgr::parse_breaktable(char * line, FileMgr * af) { if (numbreak != 0) { HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n"); @@ -3670,9 +3702,9 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } /* now parse the numbreak lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numbreak; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3734,7 +3766,7 @@ void AffixMgr::reverse_condition(char * piece) { } } -int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags) +int AffixMgr::parse_affix(char * line, const char at, FileMgr * af, char * dupflags) { int numents = 0; // number of affentry structures to parse @@ -3745,7 +3777,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag struct affentry * nptr= NULL; char * tp = line; - char * nl = line; + char * nl = NULL; char * piece; int i = 0; @@ -3773,7 +3805,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl); // return 1; XXX permissive mode for bad dictionaries } - dupflags[aflag] += ((at == 'S') ? dupSFX : dupPFX); + dupflags[aflag] += (char) ((at == 'S') ? dupSFX : dupPFX); break; } // piece 3 - is cross product indicator @@ -3820,7 +3852,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // now parse numents affentries for this affix for (int j=0; j < numents; j++) { - if (!fgets(nl,MAXLNLEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -3835,7 +3867,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag case 0: { np++; if (nptr != ptr) nptr->opts = ptr->opts & - (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); + (char) (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); break; } @@ -3898,6 +3930,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag if (pHMgr->is_aliasf()) { int index = atoi(dash + 1); nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass)); + if (!nptr->contclasslen) HUNSPELL_WARNING(stderr, "error: bad affix flag alias: \"%s\"\n", dash+1); } else { nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1); flag_qsort(nptr->contclass, 0, nptr->contclasslen); diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx index 644d2c9..f045b87 100644 --- a/src/myspell/affixmgr.hxx +++ b/src/myspell/affixmgr.hxx @@ -27,6 +27,8 @@ class AffixMgr AffEntry * pFlag[CONTSIZE]; AffEntry * sFlag[CONTSIZE]; HashMgr * pHMgr; + HashMgr ** alldic; + int * maxdic; char * keystring; char * trystring; char * encoding; @@ -96,8 +98,9 @@ class AffixMgr flag flag_mode; public: - - AffixMgr(const char * affpath, HashMgr * ptr); + + AffixMgr(const char * affpath, HashMgr** ptr, int * md, + const char * key = NULL); ~AffixMgr(); struct hentry * affix_check(const char * word, int len, const unsigned short needflag = (unsigned short) 0, @@ -150,7 +153,7 @@ public: short numsyllable, short maxwordnum, short wnum, hentry ** words, char hu_mov_rule, char ** result, char * partresult); - struct hentry * lookup(const char * word); + struct hentry * lookup(const char * word); int get_numrep(); struct replentry * get_reptable(); struct phonetable * get_phonetable(); @@ -171,7 +174,6 @@ public: FLAG get_compoundbegin(); FLAG get_forbiddenword(); FLAG get_nosuggest(); -// FLAG get_circumfix(); FLAG get_needaffix(); FLAG get_onlyincompound(); FLAG get_compoundroot(); @@ -191,21 +193,23 @@ public: int get_sugswithdots(void); FLAG get_keepcase(void); int get_checksharps(void); + char * encode_flag(unsigned short aflag); private: - int parse_file(const char * affpath); + int parse_file(const char * affpath, const char * key); int parse_flag(char * line, unsigned short * out, const char * name); int parse_num(char * line, int * out, const char * name); int parse_cpdsyllable(char * line); - int parse_reptable(char * line, FILE * af); - int parse_phonetable(char * line, FILE * af); - int parse_maptable(char * line, FILE * af); - int parse_breaktable(char * line, FILE * af); - int parse_checkcpdtable(char * line, FILE * af); - int parse_defcpdtable(char * line, FILE * af); - int parse_affix(char * line, const char at, FILE * af, char * dupflags); + int parse_reptable(char * line, FileMgr * af); + int parse_phonetable(char * line, FileMgr * af); + int parse_maptable(char * line, FileMgr * af); + int parse_breaktable(char * line, FileMgr * af); + int parse_checkcpdtable(char * line, FileMgr * af); + int parse_defcpdtable(char * line, FileMgr * af); + int parse_affix(char * line, const char at, FileMgr * af, char * dupflags); void reverse_condition(char *); + void debugflag(char * result, unsigned short flag); int condlen(char *); int encodeit(struct affentry * ptr, char * cs); int build_pfxtree(AffEntry* pfxptr); diff --git a/src/myspell/atypes.hxx b/src/myspell/atypes.hxx index 0afb345..0d4db14 100644 --- a/src/myspell/atypes.hxx +++ b/src/myspell/atypes.hxx @@ -13,8 +13,8 @@ static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} // HUNSTEM def. #define HUNSTEM -#include "csutil.hxx" #include "hashmgr.hxx" +#include "w_char.hxx" #define SETSIZE 256 #define CONTSIZE 65536 @@ -70,6 +70,12 @@ struct affentry char * morphcode; }; +struct guessword { + char * word; + bool allow; + char * orig; +}; + struct mapentry { char * set; w_char * set_utf16; @@ -81,10 +87,4 @@ struct flagentry { int len; }; -struct guessword { - char * word; - bool allow; - char * orig; -}; - #endif diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx index da9ff9f..e282754 100644 --- a/src/myspell/csutil.cxx +++ b/src/myspell/csutil.cxx @@ -45,7 +45,7 @@ static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID); using namespace std; #endif #else -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -55,8 +55,8 @@ static int utf_tbl_count = 0; // utf_tbl can be used by multiple Hunspell instan /* only UTF-16 (BMP) implementation */ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { - char * u8 = dest; - char * u8_max = u8 + size; + signed char * u8 = (signed char *)dest; + signed char * u8_max = (signed char *)(u8 + size); const w_char * u2 = src; const w_char * u2_max = src + srclen; while ((u2 < u2_max) && (u8 < u8_max)) { @@ -103,7 +103,7 @@ char * u16_u8(char * dest, int size, const w_char * src, int srclen) { /* only UTF-16 (BMP) implementation */ int u8_u16(w_char * dest, int size, const char * src) { - const char * u8 = src; + const signed char * u8 = (const signed char *)src; w_char * u2 = dest; w_char * u2_max = u2 + size; @@ -125,7 +125,7 @@ int u8_u16(w_char * dest, int size, const char * src) { case 0x90: case 0xa0: case 0xb0: { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; break; @@ -137,7 +137,7 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -151,12 +151,12 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - src), src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -350,9 +350,10 @@ char * line_uniq_app(char ** text, char breakchar) { } char ** lines; + int i; int linenum = line_tok(*text, &lines, breakchar); int dup = 0; - for (int i = 0; i < linenum; i++) { + for (i = 0; i < linenum; i++) { for (int j = 0; j < (i - 1); j++) { if (strcmp(lines[i], lines[j]) == 0) { *(lines[i]) = '\0'; @@ -375,7 +376,7 @@ char * line_uniq_app(char ** text, char breakchar) { return *text; } strcpy(*text," ( "); - for (int i = 0; i < linenum; i++) if (*(lines[i])) { + for (i = 0; i < linenum; i++) if (*(lines[i])) { sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); } (*text)[strlen(*text) - 2] = ')'; // " ) " @@ -415,8 +416,8 @@ char * tr(char * text, char oldc, char newc) { // otherwise return -1 int morphcmp(const char * s, const char * t) { - int se; - int te; + int se = 0; + int te = 0; const char * sl; const char * tl; const char * olds; @@ -579,8 +580,9 @@ char * mystrrep(char * word, const char * pat, const char * rep) { } int uniqlist(char ** list, int n) { + int i; if (n < 2) return n; - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { for (int j = 0; j < i; j++) { if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { free(list[i]); @@ -590,7 +592,7 @@ char * mystrrep(char * word, const char * pat, const char * rep) { } } int m = 1; - for (int i = 1; i < n; i++) if (list[i]) { + for (i = 1; i < n; i++) if (list[i]) { list[m] = list[i]; m++; } @@ -681,6 +683,20 @@ void mkallcap_utf(w_char * u, int nc, int langnum) { if (*p != '\0') *d= csconv[((unsigned char)*p)].cupper; } + // conversion function for protected memory + void store_pointer(char * dest, char * source) + { + memcpy(dest, &source, sizeof(char *)); + } + + // conversion function for protected memory + char * get_stored_pointer(char * s) + { + char * p; + memcpy(&p, s, sizeof(char *)); + return p; + } + // these are simple character mappings for the // encodings supported // supplying isupper, tolower, and toupper @@ -941,7 +957,7 @@ struct cs_info iso1_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; @@ -1201,7 +1217,7 @@ struct cs_info iso2_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; @@ -1461,7 +1477,7 @@ struct cs_info iso3_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso4_tbl[] = { @@ -1720,7 +1736,7 @@ struct cs_info iso4_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso5_tbl[] = { @@ -1979,7 +1995,7 @@ struct cs_info iso5_tbl[] = { { 0x00, 0xfc, 0xac }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xae }, -{ 0x00, 0xff, 0xaf }, +{ 0x00, 0xff, 0xaf } }; struct cs_info iso6_tbl[] = { @@ -2238,7 +2254,7 @@ struct cs_info iso6_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso7_tbl[] = { @@ -2497,7 +2513,7 @@ struct cs_info iso7_tbl[] = { { 0x00, 0xfc, 0xbc }, { 0x00, 0xfd, 0xbe }, { 0x00, 0xfe, 0xbf }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso8_tbl[] = { @@ -2756,7 +2772,7 @@ struct cs_info iso8_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso9_tbl[] = { @@ -3015,7 +3031,7 @@ struct cs_info iso9_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0x49 }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso10_tbl[] = { @@ -3274,7 +3290,7 @@ struct cs_info iso10_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info koi8r_tbl[] = { @@ -3533,7 +3549,7 @@ struct cs_info koi8r_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff }, +{ 0x01, 0xdf, 0xff } }; struct cs_info koi8u_tbl[] = { @@ -3792,7 +3808,7 @@ struct cs_info koi8u_tbl[] = { { 0x01, 0xdc, 0xfc }, { 0x01, 0xdd, 0xfd }, { 0x01, 0xde, 0xfe }, -{ 0x01, 0xdf, 0xff }, +{ 0x01, 0xdf, 0xff } }; struct cs_info cp1251_tbl[] = { @@ -4051,7 +4067,7 @@ struct cs_info cp1251_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xdf }, +{ 0x00, 0xff, 0xdf } }; struct cs_info iso13_tbl[] = { @@ -4310,7 +4326,7 @@ struct cs_info iso13_tbl[] = { { 0x00, 0xFC, 0xDC }, { 0x00, 0xFD, 0xDD }, { 0x00, 0xFE, 0xDE }, -{ 0x00, 0xFF, 0xFF }, +{ 0x00, 0xFF, 0xFF } }; @@ -4570,7 +4586,7 @@ struct cs_info iso14_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; struct cs_info iso15_tbl[] = { @@ -4829,7 +4845,7 @@ struct cs_info iso15_tbl[] = { { 0x00, 0xfc, 0xdc }, { 0x00, 0xfd, 0xdd }, { 0x00, 0xfe, 0xde }, -{ 0x00, 0xff, 0xbe }, +{ 0x00, 0xff, 0xbe } }; struct cs_info iscii_devanagari_tbl[] = { @@ -5088,10 +5104,10 @@ struct cs_info iscii_devanagari_tbl[] = { { 0x00, 0xfc, 0xfc }, { 0x00, 0xfd, 0xfd }, { 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xff, 0xff } }; -struct enc_entry encds[] = { +static struct enc_entry encds[] = { {"ISO8859-1",iso1_tbl}, {"ISO8859-2",iso2_tbl}, {"ISO8859-3",iso3_tbl}, @@ -5108,7 +5124,7 @@ struct enc_entry encds[] = { {"ISO8859-13", iso13_tbl}, {"ISO8859-14", iso14_tbl}, {"ISO8859-15", iso15_tbl}, -{"ISCII-DEVANAGARI", iscii_devanagari_tbl}, +{"ISCII-DEVANAGARI", iscii_devanagari_tbl} }; struct cs_info * get_current_cs(const char * es) { @@ -5117,6 +5133,7 @@ struct cs_info * get_current_cs(const char * es) { for (int i = 0; i < n; i++) { if (strcmp(es,encds[i].enc_name) == 0) { ccs = encds[i].cs_table; + break; } } return ccs; @@ -5362,14 +5379,14 @@ int get_captype(char * word, int nl, cs_info * csconv) { int ncap = 0; int nneutral = 0; int firstcap = 0; - - for (char * q = word; *q != '\0'; q++) { - if (csconv[*((unsigned char *)q)].ccase) ncap++; - if (csconv[*((unsigned char *)q)].cupper == csconv[*((unsigned char *)q)].clower) nneutral++; - } - if (ncap) { - firstcap = csconv[*((unsigned char *) word)].ccase; - } + if (csconv == NULL) return NOCAP; + for (char * q = word; *q != '\0'; q++) { + if (csconv[*((unsigned char *)q)].ccase) ncap++; + if (csconv[*((unsigned char *)q)].cupper == csconv[*((unsigned char *)q)].clower) nneutral++; + } + if (ncap) { + firstcap = csconv[*((unsigned char *) word)].ccase; + } // now finally set the captype if (ncap == 0) { diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx index df7979b..2a16538 100644 --- a/src/myspell/csutil.hxx +++ b/src/myspell/csutil.hxx @@ -3,6 +3,8 @@ // First some base level utility routines +#include "w_char.hxx" + #define NOCAP 0 #define INITCAP 1 #define ALLCAP 2 @@ -23,6 +25,7 @@ #define MORPH_PHON "ph:" #define MORPH_HYPH "hy:" #define MORPH_PART "pa:" +#define MORPH_FLAG "fl:" #define MORPH_HENTRY "_H:" #define MORPH_TAG_LEN strlen(MORPH_STEM) @@ -30,16 +33,15 @@ #define MSEP_REC '\n' #define MSEP_ALT '\v' - // default flags #define DEFAULTFLAGS 65510 #define FORBIDDENWORD 65510 #define ONLYUPCASEFLAG 65511 -typedef struct { - unsigned char l; - unsigned char h; -} w_char; +// hash entry macros +#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ + get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : NULL) +#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL) #define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h)) @@ -104,12 +106,6 @@ struct cs_info { unsigned char cupper; }; -// two character arrays -struct replentry { - char * pattern; - char * pattern2; -}; - // Unicode character encoding information struct unicode_info { unsigned short c; @@ -200,4 +196,10 @@ int morphcmp(const char * s, const char * t); int get_sfxcount(const char * morph); +// conversion function for protected memory +void store_pointer(char * dest, char * source); + +// conversion function for protected memory +char * get_stored_pointer(char * s); + #endif diff --git a/src/myspell/filemgr.cxx b/src/myspell/filemgr.cxx new file mode 100644 index 0000000..165fc77 --- /dev/null +++ b/src/myspell/filemgr.cxx @@ -0,0 +1,38 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "filemgr.hxx" + +int FileMgr::fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return -1; +} + +FileMgr::FileMgr(const char * file, const char * key) { + hin = NULL; + fin = fopen(file, "r"); + if (!fin) { + // check hzipped file + char * st = (char *) malloc(strlen(file) + strlen(HZIP_EXTENSION)); + if (st) { + strcpy(st, file); + strcat(st, HZIP_EXTENSION); + hin = new Hunzip(st, key); + } + } + if (!fin && !hin) fail(MSG_OPEN, file); +} + +FileMgr::~FileMgr() +{ + if (fin) fclose(fin); + if (hin) delete hin; +} + +char * FileMgr::getline() { + const char * l; + if (fin) return fgets(in, BUFSIZE - 1, fin); + if (hin && (l = hin->getline())) return strcpy(in, l); + return NULL; +} diff --git a/src/myspell/filemgr.hxx b/src/myspell/filemgr.hxx new file mode 100644 index 0000000..593228d --- /dev/null +++ b/src/myspell/filemgr.hxx @@ -0,0 +1,19 @@ +/* file manager class - read lines of files [filename] OR [filename.hz] */ +#ifndef _FILEMGR_HXX_ +#define _FILEMGR_HXX_ +#include "hunzip.hxx" + +class FileMgr +{ +protected: + FILE * fin; + Hunzip * hin; + char in[BUFSIZE + 50]; // input buffer + int fail(const char * err, const char * par); + +public: + FileMgr(const char * filename, const char * key = NULL); + ~FileMgr(); + char * getline(); +}; +#endif diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx index 08e061c..a1ca329 100644 --- a/src/myspell/hashmgr.cxx +++ b/src/myspell/hashmgr.cxx @@ -22,14 +22,14 @@ using namespace std; #endif #else -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif // build a hash table from a munched word list -HashMgr::HashMgr(const char * tpath, const char * apath) +HashMgr::HashMgr(const char * tpath, const char * apath, const char * key) { tablesize = 0; tableptr = NULL; @@ -48,8 +48,8 @@ HashMgr::HashMgr(const char * tpath, const char * apath) numaliasm = 0; aliasm = NULL; forbiddenword = FORBIDDENWORD; // forbidden word signing flag - load_config(apath); - int ec = load_tables(tpath); + load_config(apath, key); + int ec = load_tables(tpath, key); if (ec) { /* error condition - what should we do here */ HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec); @@ -129,7 +129,7 @@ int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, int al, const char * desc, bool onlyupcase) { bool upcasehomonym = false; - int descl = desc ? (aliasm ? sizeof(char *) : strlen(desc) + 1) : 0; + int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0; // variable-length hash record with word and optional fields struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); @@ -161,7 +161,8 @@ int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, hp->var = H_OPT; if (aliasm) { hp->var += H_OPT_ALIASM; - *((char **) (hpw + wbl + 1)) = get_aliasm(atoi(desc)); +// *((char **) (hpw + wbl + 1)) = get_aliasm(atoi(desc)); + store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc))); } else { strcpy(hpw + wbl + 1, desc); if (complexprefixes) { @@ -236,12 +237,12 @@ int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, if (al) memcpy(flags2, flags, al * sizeof(unsigned short)); flags2[al] = ONLYUPCASEFLAG; if (utf8) { - char st[MAXDELEN]; - w_char w[MAXDELEN]; - int wlen = u8_u16(w, MAXDELEN, word); + char st[BUFSIZE]; + w_char w[BUFSIZE]; + int wlen = u8_u16(w, BUFSIZE, word); mkallsmall_utf(w, wlen, langnum); mkallcap_utf(w, 1, langnum); - u16_u8(st, MAXDELEN, w, wlen); + u16_u8(st, BUFSIZE, w, wlen); return add_word(st,wbl,wcl,flags2,al+1,dp, true); } else { mkallsmall(word, csconv); @@ -256,8 +257,8 @@ int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl, int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { int len; if (utf8) { - w_char dest_utf[MAXDELEN]; - len = u8_u16(dest_utf, MAXDELEN, word); + w_char dest_utf[BUFSIZE]; + len = u8_u16(dest_utf, BUFSIZE, word); *captype = get_captype_utf8(dest_utf, len, langnum); } else { len = wbl; @@ -266,27 +267,47 @@ int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { return len; } -// remove word with FORBIDDENWORD flag (not implemented) +// remove word (personal dictionary function for standalone applications) int HashMgr::remove(const char * word) { - struct hentry * dp = lookup(word); -/* - if (!word || (!dp->astr || !TESTAFF(dp->astr, forbiddenword, pt->alen))) { - int wbl = strlen(word); - int wcl = get_clen_and_captype(word, wbl, &captype); - if (aliasf) { - add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); - } else { - unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short)); - if (flags) { - memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); - add_word(word, wbl, wcl, flags, dp->alen, NULL, false); - } else return 1; + struct hentry * dp = lookup(word); + while (dp) { + if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { + unsigned short * flags = + (unsigned short *) malloc(sizeof(short *) * (dp->alen + 1)); + if (!flags) return 1; + for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i]; + flags[dp->alen] = forbiddenword; + dp->astr = flags; + dp->alen++; + flag_qsort(flags, 0, dp->alen); } - return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype); + dp = dp->next_homonym; } -*/ - return 1; +} + +/* remove forbidden flag to add a personal word to the hash */ +int HashMgr::remove_forbidden_flag(const char * word) { + struct hentry * dp = lookup(word); + if (!dp) return 1; + while (dp) { + if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) { + if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic. + else { + unsigned short * flags2 = + (unsigned short *) malloc(sizeof(short *) * (dp->alen - 1)); + if (!flags2) return 1; + int i, j = 0; + for (i = 0; i < dp->alen; i++) { + if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i]; + } + dp->alen--; + dp->astr = flags2; // XXX allowed forbidden words + } + } + dp = dp->next_homonym; + } + return 0; } // add a custom dic. word to the hash table (public) @@ -301,17 +322,21 @@ int HashMgr::add(const char * word, char * aff) flags = NULL; } - int captype; - int wbl = strlen(word); - int wcl = get_clen_and_captype(word, wbl, &captype); - add_word(word, wbl, wcl, flags, al, NULL, false); - return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); + if (remove_forbidden_flag(word)) { + int captype; + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, wbl, &captype); + add_word(word, wbl, wcl, flags, al, NULL, false); + return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); + } + return 0; } int HashMgr::add_with_affix(const char * word, const char * example) { // detect captype and modify word length for UTF-8 encoding struct hentry * dp = lookup(example); + remove_forbidden_flag(word); if (dp && dp->astr) { int captype; int wbl = strlen(word); @@ -344,22 +369,22 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const } // load a munched word list and build a hash table on the fly -int HashMgr::load_tables(const char * tpath) +int HashMgr::load_tables(const char * tpath, const char * key) { int al; char * ap; char * dp; unsigned short * flags; + char * ts; - // raw dictionary - munched file - FILE * rawdict = fopen(tpath, "r"); - if (rawdict == NULL) return 1; + // open dictionary file + FileMgr * dict = new FileMgr(tpath, key); + if (dict == NULL) return 1; // first read the first line of file to get hash table size */ - char ts[MAXDELEN]; - if (! fgets(ts, MAXDELEN-1,rawdict)) { + if (!(ts = dict->getline())) { HUNSPELL_WARNING(stderr, "error: empty dic file\n"); - fclose(rawdict); + delete dict; return 2; } mychomp(ts); @@ -373,7 +398,7 @@ int HashMgr::load_tables(const char * tpath) if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n"); tablesize = atoi(ts); if (!tablesize) { - fclose(rawdict); + delete dict; return 4; } tablesize = tablesize + 5 + USERWORD; @@ -382,7 +407,7 @@ int HashMgr::load_tables(const char * tpath) // allocate the hash table tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *)); if (! tableptr) { - fclose(rawdict); + delete dict; return 3; } for (int i=0; i<tablesize; i++) tableptr[i] = NULL; @@ -390,7 +415,7 @@ int HashMgr::load_tables(const char * tpath) // loop through all words on much list and add to hash // table and create word and affix strings - while (fgets(ts,MAXDELEN-1,rawdict)) { + while ((ts = dict->getline())) { mychomp(ts); // split each line into word and morphological description dp = strchr(ts,'\t'); @@ -443,16 +468,15 @@ int HashMgr::load_tables(const char * tpath) // add the word and its index plus its capitalized form optionally if (add_word(ts,wbl,wcl,flags,al,dp, false) || add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) { - fclose(rawdict); + delete dict; return 5; } } - fclose(rawdict); + delete dict; return 0; } - // the hash function is a simple load and rotate // algorithm borrowed @@ -506,8 +530,8 @@ int HashMgr::decode_flags(unsigned short ** result, char * flags) { break; } case FLAG_UNI: { // UTF-8 characters - w_char w[MAXDELEN/2]; - len = u8_u16(w, MAXDELEN/2, flags); + w_char w[BUFSIZE/2]; + len = u8_u16(w, BUFSIZE/2, flags); *result = (unsigned short *) malloc(len * sizeof(short)); if (!*result) return -1; memcpy(*result, w, len * sizeof(short)); @@ -566,16 +590,13 @@ char * HashMgr::encode_flag(unsigned short f) { } // read in aff file and set flag mode -int HashMgr::load_config(const char * affpath) +int HashMgr::load_config(const char * affpath, const char * key) { + char * line; // io buffers int firstline = 1; - - // io buffers - char line[MAXDELEN+1]; // open the affix file - FILE * afflst; - afflst = fopen(affpath,"r"); + FileMgr * afflst = new FileMgr(affpath, key); if (!afflst) { HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath); return 1; @@ -584,7 +605,7 @@ int HashMgr::load_config(const char * affpath) // read in each line ignoring any that do not // start with a known line type indicator - while (fgets(line,MAXDELEN,afflst)) { + while ((line = afflst->getline())) { mychomp(line); /* remove byte order mark */ @@ -608,7 +629,7 @@ int HashMgr::load_config(const char * affpath) if (strncmp(line,"FORBIDDENWORD",13) == 0) { char * st = NULL; if (parse_string(line, &st, "FORBIDDENWORD")) { - fclose(afflst); + delete afflst; return 1; } forbiddenword = decode_flag(st); @@ -616,7 +637,7 @@ int HashMgr::load_config(const char * affpath) } if (strncmp(line, "SET", 3) == 0) { if (parse_string(line, &enc, "SET")) { - fclose(afflst); + delete afflst; return 1; } if (strcmp(enc, "UTF-8") == 0) { @@ -630,7 +651,7 @@ int HashMgr::load_config(const char * affpath) } if (strncmp(line, "LANG", 4) == 0) { if (parse_string(line, &lang, "LANG")) { - fclose(afflst); + delete afflst; return 1; } langnum = get_lang_num(lang); @@ -639,21 +660,21 @@ int HashMgr::load_config(const char * affpath) /* parse in the ignored characters (for example, Arabic optional diacritics characters */ if (strncmp(line,"IGNORE",6) == 0) { if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) { - fclose(afflst); + delete afflst; return 1; } } if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { if (parse_aliasf(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { if (parse_aliasm(line, afflst)) { - fclose(afflst); + delete afflst; return 1; } } @@ -662,12 +683,12 @@ int HashMgr::load_config(const char * affpath) if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; } if (csconv == NULL) csconv = get_current_cs("ISO8859-1"); - fclose(afflst); + delete afflst; return 0; } /* parse in the ALIAS table */ -int HashMgr::parse_aliasf(char * line, FILE * af) +int HashMgr::parse_aliasf(char * line, FileMgr * af) { if (numaliasf != 0) { HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n"); @@ -723,9 +744,9 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } /* now parse the numaliasf lines to read in the remainder of the table */ - char * nl = line; + char * nl; for (int j=0; j < numaliasf; j++) { - if (!fgets(nl,MAXDELEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; @@ -788,7 +809,7 @@ int HashMgr::get_aliasf(int index, unsigned short ** fvec) { } /* parse morph alias definitions */ -int HashMgr::parse_aliasm(char * line, FILE * af) +int HashMgr::parse_aliasm(char * line, FileMgr * af) { if (numaliasm != 0) { HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n"); @@ -836,7 +857,7 @@ int HashMgr::parse_aliasm(char * line, FILE * af) /* now parse the numaliasm lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numaliasm; j++) { - if (!fgets(nl,MAXDELEN,af)) return 1; + if (!(nl = af->getline())) return 1; mychomp(nl); tp = nl; i = 0; diff --git a/src/myspell/hashmgr.hxx b/src/myspell/hashmgr.hxx index d88de48..ce23f71 100644 --- a/src/myspell/hashmgr.hxx +++ b/src/myspell/hashmgr.hxx @@ -8,6 +8,7 @@ #endif #include "htypes.hxx" +#include "filemgr.hxx" enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; @@ -35,7 +36,7 @@ class HashMgr public: - HashMgr(const char * tpath, const char * apath); + HashMgr(const char * tpath, const char * apath, const char * key = NULL); ~HashMgr(); struct hentry * lookup(const char *) const; @@ -55,14 +56,15 @@ public: private: int get_clen_and_captype(const char * word, int wbl, int * captype); - int load_tables(const char * tpath); + int load_tables(const char * tpath, const char * key); int add_word(const char * word, int wbl, int wcl, unsigned short * ap, int al, const char * desc, bool onlyupcase); - int load_config(const char * affpath); - int parse_aliasf(char * line, FILE * af); + int load_config(const char * affpath, const char * key); + int parse_aliasf(char * line, FileMgr * af); int add_hidden_capitalized_word(char * word, int wbl, int wcl, unsigned short * flags, int al, char * dp, int captype); - int parse_aliasm(char * line, FILE * af); + int parse_aliasm(char * line, FileMgr * af); + int remove_forbidden_flag(const char * word); }; diff --git a/src/myspell/htypes.hxx b/src/myspell/htypes.hxx index bc078c3..80647f9 100644 --- a/src/myspell/htypes.hxx +++ b/src/myspell/htypes.hxx @@ -1,8 +1,6 @@ #ifndef _HTYPES_HXX_ #define _HTYPES_HXX_ -#define MAXDELEN 8192 - #define ROTATE_LEN 5 #define ROTATE(v,q) \ @@ -13,10 +11,8 @@ #define H_OPT_ALIASM (1 << 1) #define H_OPT_PHON (1 << 2) +// see also csutil.hxx #define HENTRY_WORD(h) &(h->word) -#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ - *((char **) (&(h->word) + h->blen + 1)) : &(h->word) + h->blen + 1) : NULL) -#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL) // approx. number of user defined words #define USERWORD 1000 diff --git a/src/myspell/hunspell.cxx b/src/myspell/hunspell.cxx index 7075c36..e74b34a 100644 --- a/src/myspell/hunspell.cxx +++ b/src/myspell/hunspell.cxx @@ -13,30 +13,31 @@ #include "hunspell.hxx" #include "hunspell.h" - -#ifdef HAVE_CONFIG_H #include "config.h" -#endif +#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif -Hunspell::Hunspell(const char * affpath, const char * dpath) +Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key) { encoding = NULL; csconv = NULL; utf8 = 0; complexprefixes = 0; + affixpath = mystrdup(affpath); + maxdic = 0; /* first set up the hash manager */ - pHMgr = new HashMgr(dpath, affpath); + pHMgr[0] = new HashMgr(dpath, affpath, key); + if (pHMgr[0]) maxdic = 1; /* next set up the affix manager */ /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(affpath,pHMgr); + pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key); /* get the preferred try string and the dictionary */ /* encoding from the Affix Manager for that dictionary */ @@ -51,25 +52,33 @@ Hunspell::Hunspell(const char * affpath, const char * dpath) /* and finally set up the suggestion manager */ pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); if (try_string) free(try_string); - } Hunspell::~Hunspell() { if (pSMgr) delete pSMgr; if (pAMgr) delete pAMgr; - if (pHMgr) delete pHMgr; + for (int i = 0; i < maxdic; i++) delete pHMgr[i]; + maxdic = 0; pSMgr = NULL; pAMgr = NULL; - pHMgr = NULL; #ifdef MOZILLA_CLIENT free(csconv); #endif csconv= NULL; if (encoding) free(encoding); encoding = NULL; + if (affixpath) free(affixpath); + affixpath = NULL; } +// load extra dictionaries +int Hunspell::add_dic(const char * dpath, const char * key) { + if (maxdic == MAXDIC) return 1; + pHMgr[maxdic] = new HashMgr(dpath, affixpath, key); + if (pHMgr[maxdic]) maxdic++; else return 1; + return 0; +} // make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording @@ -337,7 +346,7 @@ int Hunspell::spell(const char * word, int * info, char ** root) int abbv = 0; int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); int info2 = 0; - if (wl == 0) return 1; + if (wl == 0 || maxdic == 0) return 1; if (root) *root = NULL; // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.) @@ -359,7 +368,8 @@ int Hunspell::spell(const char * word, int * info, char ** root) // LANG_hu section: number(s) + (percent or degree) with suffixes if (langnum == LANG_hu) { if ((nstate == NNUM) && ((cw[i] == '%') || ((!utf8 && (cw[i] == '\xB0')) || - (utf8 && (strncmp(cw + i, "\xC2\xB0", 2)==0)))) + (utf8 && (strncmp(cw + i, "\xC2\xB0", 2)==0 || // UTF-8 degree + strncmp(cw + i, "\xE2\x80\xB0", 3)==0)))) // UTF-8 per mille && checkword(cw + i, info, root)) return 1; } // END of LANG_hu section @@ -562,7 +572,7 @@ int Hunspell::spell(const char * word, int * info, char ** root) struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) { struct hentry * he = NULL; - int len; + int len, i; char w2[MAXWORDUTF8LEN]; const char * word; @@ -589,7 +599,8 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) } // look word in hash table - if (pHMgr) he = pHMgr->lookup(word); + for (i = 0; (i < maxdic) && !he; i ++) { + he = (pHMgr[i])->lookup(word); // check forbidden and onlyincompound words if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { @@ -610,6 +621,7 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) )) he = he->next_homonym; + } // check with affixes if (!he && pAMgr) { @@ -671,7 +683,7 @@ int Hunspell::suggest(char*** slst, const char * word) int onlycmpdsug = 0; char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return 0; + if (!pSMgr || maxdic == 0) return 0; w_char unicw[MAXWORDLEN]; int nc = strlen(word); if (utf8) { @@ -823,27 +835,27 @@ int Hunspell::suggest(char*** slst, const char * word) if ((ns == 0 || onlycmpdsug) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { switch(captype) { case NOCAP: { - ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr); + ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic); break; } case HUHCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); break; } case INITCAP: { capwords = 1; memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); break; } case ALLCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall2(wspace, unicw, nc); int oldns = ns; - ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr); + ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic); for (int j = oldns; j < ns; j++) mkallcap((*slst)[j]); break; @@ -936,7 +948,7 @@ int Hunspell::suggest_auto(char*** slst, const char * word) { char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return 0; + if (!pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1059,13 +1071,13 @@ int Hunspell::stem(char*** slst, char ** desc, int n) alt = strstr(alt, " | "); } int pln = line_tok(tok, &pl, MSEP_ALT); - for (int i = 0; i < pln; i++) { + for (int k = 0; k < pln; k++) { // add derivational suffixes - if (strstr(pl[i], MORPH_DERI_SFX)) { + if (strstr(pl[k], MORPH_DERI_SFX)) { // remove inflectional suffixes - char * is = strstr(pl[i], MORPH_INFL_SFX); + char * is = strstr(pl[k], MORPH_INFL_SFX); if (is) *is = '\0'; - char * sg = pSMgr->suggest_gen(&(pl[i]), 1, pl[i]); + char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]); if (sg) { char ** gen; int genl = line_tok(sg, &gen, MSEP_REC); @@ -1078,15 +1090,17 @@ int Hunspell::stem(char*** slst, char ** desc, int n) } } else { sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); - if (strstr(pl[i], MORPH_SURF_PFX)) { - copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); + if (strstr(pl[k], MORPH_SURF_PFX)) { + copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX); } - copy_field(result2 + strlen(result2), pl[i], MORPH_STEM); + copy_field(result2 + strlen(result2), pl[k], MORPH_STEM); } } freelist(&pl, pln); } - return uniqlist(*slst, line_tok(result2, slst, MSEP_REC)); + int sln = line_tok(result2, slst, MSEP_REC); + return uniqlist(*slst, sln); + } int Hunspell::stem(char*** slst, const char * word) @@ -1103,7 +1117,7 @@ int Hunspell::suggest_pos_stems(char*** slst, const char * word) { char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return 0; + if (! pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1223,21 +1237,19 @@ int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) int Hunspell::add(const char * word) { - if (pHMgr) return pHMgr->add(word, NULL); + if (pHMgr[0]) return (pHMgr[0])->add(word, NULL); return 0; } int Hunspell::add_with_affix(const char * word, const char * example) { - if (pHMgr) return pHMgr->add_with_affix(word, example); + if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example); return 0; } -/* XXX not implemented yet */ - int Hunspell::remove(const char * word) { - if (pHMgr) return pHMgr->remove(word); + if (pHMgr[0]) return (pHMgr[0])->remove(word); return 0; } @@ -1251,21 +1263,20 @@ struct cs_info * Hunspell::get_csconv() return csconv; } -char * Hunspell::cat_result(char * result, char * st) +void Hunspell::cat_result(char * result, char * st) { if (st) { if (*result) strcat(result, "\n"); strcat(result, st); free(st); } - return result; } int Hunspell::analyze(char*** slst, const char * word) { char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return 0; + if (! pSMgr || maxdic == 0) return 0; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return 0; @@ -1474,7 +1485,7 @@ int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) { if (!pSMgr || !pln) return 0; char **pl2; - char pl2n = analyze(&pl2, word); + int pl2n = analyze(&pl2, word); int captype = 0; int abbv = 0; char cw[MAXWORDUTF8LEN]; @@ -1522,7 +1533,7 @@ int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) int Hunspell::generate(char*** slst, const char * word, const char * pattern) { char **pl; - char pln = analyze(&pl, pattern); + int pln = analyze(&pl, pattern); int n = generate(slst, word, pl, pln); freelist(&pl, pln); return uniqlist(*slst, n); @@ -1534,7 +1545,7 @@ char * Hunspell::morph_with_correction(const char * word) { char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return NULL; + if (! pSMgr || maxdic == 0) return NULL; int wl = strlen(word); if (utf8) { if (wl >= MAXWORDUTF8LEN) return NULL; @@ -1684,6 +1695,12 @@ Hunhandle *Hunspell_create(const char * affpath, const char * dpath) return (Hunhandle*)(new Hunspell(affpath, dpath)); } +Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key) +{ + return (Hunhandle*)(new Hunspell(affpath, dpath, key)); +} + void Hunspell_destroy(Hunhandle *pHunspell) { delete (Hunspell*)(pHunspell); @@ -1750,7 +1767,6 @@ int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, } /* remove word from the run-time dictionary */ - /* NOTE: not implemented yet */ int Hunspell_remove(Hunhandle *pHunspell, const char * word) { return ((Hunspell*)pHunspell)->remove(word); diff --git a/src/myspell/hunspell.h b/src/myspell/hunspell.h index 452599c..a18cec4 100644 --- a/src/myspell/hunspell.h +++ b/src/myspell/hunspell.h @@ -7,15 +7,25 @@ extern "C" { typedef struct Hunhandle Hunhandle; -Hunhandle *Hunspell_create(const char * affpath, const char * dpath); -void Hunspell_destroy(Hunhandle *pHunspell); +#ifdef _MSC_VER +#define DLL __declspec ( dllexport ) +#else +#define DLL +#endif + +DLL Hunhandle *Hunspell_create(const char * affpath, const char * dpath); + +DLL Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath, + const char * key); + +DLL void Hunspell_destroy(Hunhandle *pHunspell); /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word */ -int Hunspell_spell(Hunhandle *pHunspell, const char *); +DLL int Hunspell_spell(Hunhandle *pHunspell, const char *); -char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); +DLL char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); /* suggest(suggestions, word) - search suggestions * input: pointer to an array of strings pointer and the (bad) word @@ -24,17 +34,17 @@ char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); * a newly allocated array of strings (*slts will be NULL when number * of suggestion equals 0.) */ -int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); +DLL int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); /* morphological functions */ /* analyze(result, word) - morphological analysis of the word */ -int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word); +DLL int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word); /* stem(result, word) - stemmer function */ -int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); +DLL int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); /* stem(result, analysis, n) - get stems from a morph. analysis * example: @@ -43,11 +53,11 @@ int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); * int n2 = Hunspell_stem2(result2, result, n1); */ -int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n); +DLL int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n); /* generate(result, word, word2) - morphological generation by example(s) */ -int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, +DLL int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, const char * word2); /* generate(result, word, desc, n) - generation by morph. description(s) @@ -58,27 +68,25 @@ int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, * for (int i = 0; i < n; i++) printf("%s\n", result[i]); */ -int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, +DLL int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, char** desc, int n); /* functions for run-time modification of the dictionary */ /* add word to the run-time dictionary */ -int Hunspell_add(const char * word); +DLL int Hunspell_add(Hunhandle *pHunspell, const char * word); /* add word to the run-time dictionary with affix flags of * the example (a dictionary word): Hunspell will recognize * affixed forms of the new word, too. */ -int Hunspell_add_with_affix(const char * word, const char * example); +DLL int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, const char * example); /* remove word from the run-time dictionary */ - /* NOTE: not implemented yet */ - -int Hunspell_remove(const char * word); +DLL int Hunspell_remove(Hunhandle *pHunspell, const char * word); #ifdef __cplusplus } diff --git a/src/myspell/hunspell.hxx b/src/myspell/hunspell.hxx index 38c141e..854f354 100644 --- a/src/myspell/hunspell.hxx +++ b/src/myspell/hunspell.hxx @@ -1,7 +1,6 @@ #include "hashmgr.hxx" #include "affixmgr.hxx" #include "suggestmgr.hxx" -#include "csutil.hxx" #include "langnum.hxx" #define SPELL_COMPOUND (1 << 0) @@ -10,6 +9,7 @@ #define SPELL_NOCAP (1 << 3) #define SPELL_INITCAP (1 << 4) +#define MAXDIC 20 #define MAXSUGGESTION 15 #define MAXSHARPS 5 @@ -26,32 +26,37 @@ #endif #endif -#ifdef W32 +#ifdef WIN32 class DLLEXPORT Hunspell #else class Hunspell #endif { AffixMgr* pAMgr; - HashMgr* pHMgr; + HashMgr* pHMgr[MAXDIC]; + int maxdic; SuggestMgr* pSMgr; + char * affixpath; char * encoding; struct cs_info * csconv; int langnum; int utf8; int complexprefixes; char** wordbreak; + char * key; public: /* Hunspell(aff, dic) - constructor of Hunspell class * input: path of affix file and dictionary file */ - - Hunspell(const char * affpath, const char * dpath); + Hunspell(const char * affpath, const char * dpath, const char * key = NULL); ~Hunspell(); + /* load extra dictionaries (only dic files) */ + int add_dic(const char * dpath, const char * key = NULL); + /* spell(word) - spellcheck word * output: 0 = bad word, not 0 = good word * @@ -123,7 +128,6 @@ public: int add_with_affix(const char * word, const char * example); /* remove word from the run-time dictionary */ - /* NOTE: not implemented yet */ int remove(const char * word); @@ -164,7 +168,7 @@ private: hentry * spellsharps(char * base, char *, int, int, char * tmp, int * info, char **root); int is_keepcase(const hentry * rv); int insert_sug(char ***slst, char * word, int ns); - char * cat_result(char * result, char * st); + void cat_result(char * result, char * st); char * stem_description(const char * desc); }; diff --git a/src/myspell/hunzip.cxx b/src/myspell/hunzip.cxx new file mode 100644 index 0000000..f9091b8 --- /dev/null +++ b/src/myspell/hunzip.cxx @@ -0,0 +1,196 @@ +#ifndef MOZILLA_CLIENT +#include <cstdlib> +#include <cstring> +#include <cstdio> +#else +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#endif + +#include "hunzip.hxx" + +#define CODELEN 65536 +#define BASEBITREC 5000 + +#define UNCOMPRESSED '\002' +#define MAGIC "hz0" +#define MAGIC_ENCRYPT "hz1" +#define MAGICLEN (sizeof(MAGIC) - 1) + +int Hunzip::fail(const char * err, const char * par) { + fprintf(stderr, err, par); + return -1; +} + +Hunzip::Hunzip(const char * file, const char * key) { + bufsiz = 0; + lastbit = 0; + inc = 0; + outc = 0; + dec = NULL; + filename = (char *) malloc(strlen(file) + 1); + if (filename) strcpy(filename, file); + if (getcode(key) == -1) bufsiz = -1; + else bufsiz = getbuf(); +} + +int Hunzip::getcode(const char * key) { + unsigned char c[2]; + int i, j, n, p; + int allocatedbit = BASEBITREC; + const char * enc = key; + + fin = fopen(filename, "rb"); + if (!fin) return -1; + + // read magic number + if ((fread(in, 1, 3, fin) < MAGICLEN) + || !(strncmp(MAGIC, in, MAGICLEN) == 0 || + strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) { + return fail(MSG_FORMAT, filename); + } + + // check encryption + if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) { + unsigned char cs; + if (!key) return fail(MSG_KEY, filename); + if (fread(&c, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); + for (cs = 0; *enc; enc++) cs ^= *enc; + if (cs != c[0]) return fail(MSG_KEY, filename); + enc = key; + } else key = NULL; + + // read record count + if (fread(&c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); + + if (key) { + c[0] ^= *enc; + if (*(++enc) == '\0') enc = key; + c[1] ^= *enc; + } + + n = ((int) c[0] << 8) + c[1]; + dec = (struct bit *) malloc(BASEBITREC * sizeof(struct bit)); + if (!dec) return fail(MSG_MEMORY, filename); + dec[0].v[0] = 0; + dec[0].v[1] = 0; + + // read codes + for (i = 0; i < n; i++) { + unsigned char l; + if (fread(c, 1, 2, fin) < 2) return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') enc = key; + c[0] ^= *enc; + if (*(++enc) == '\0') enc = key; + c[1] ^= *enc; + } + if (fread(&l, 1, 1, fin) < 1) return fail(MSG_FORMAT, filename); + if (key) { + if (*(++enc) == '\0') enc = key; + l ^= *enc; + } + if (fread(in, 1, l/8+1, fin) < (size_t) l/8+1) return fail(MSG_FORMAT, filename); + if (key) for (j = 0; j <= l/8; j++) { + if (*(++enc) == '\0') enc = key; + in[j] ^= *enc; + } + p = 0; + for (j = 0; j < l; j++) { + int b = (in[j/8] & (1 << (7 - (j % 8)))) ? 1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + lastbit++; + if (lastbit == allocatedbit) { + allocatedbit += BASEBITREC; + dec = (struct bit *) realloc(dec, allocatedbit * sizeof(struct bit)); + } + dec[lastbit].v[0] = 0; + dec[lastbit].v[1] = 0; + dec[oldp].v[b] = lastbit; + p = lastbit; + } + } + dec[p].c[0] = c[0]; + dec[p].c[1] = c[1]; + } + return 0; +} + +Hunzip::~Hunzip() +{ + if (dec) free(dec); + if (fin) fclose(fin); + if (filename) free(filename); +} + +int Hunzip::getbuf() { + int p = 0; + int o = 0; + do { + if (inc == 0) inbits = fread(in, 1, BUFSIZE, fin) * 8; + for (; inc < inbits; inc++) { + int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0; + int oldp = p; + p = dec[p].v[b]; + if (p == 0) { + if (oldp == lastbit) { + fclose(fin); + fin = NULL; + // add last odd byte + if (dec[lastbit].c[0]) out[o++] = dec[lastbit].c[1]; + return o; + } + out[o++] = dec[oldp].c[0]; + out[o++] = dec[oldp].c[1]; + if (o == BUFSIZE) return o; + p = dec[p].v[b]; + } + } + inc = 0; + } while (inbits == BUFSIZE * 8); + return fail(MSG_FORMAT, filename); +} + +const char * Hunzip::getline() { + char linebuf[BUFSIZE]; + int l = 0, eol = 0, left = 0, right = 0; + if (bufsiz == -1) return NULL; + while (l < bufsiz && !eol) { + linebuf[l++] = out[outc]; + switch (out[outc]) { + case '\t': break; + case 31: { // escape + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + linebuf[l - 1] = out[outc]; + break; + } + case ' ': break; + default: if (((unsigned char) out[outc]) < 47) { + if (out[outc] > 32) { + right = out[outc] - 31; + if (++outc == bufsiz) { + bufsiz = getbuf(); + outc = 0; + } + } + if (out[outc] == 30) left = 9; else left = out[outc]; + linebuf[l-1] = '\n'; + eol = 1; + } + } + if (++outc == bufsiz) { + outc = 0; + bufsiz = fin ? getbuf(): -1; + } + } + if (right) strcpy(linebuf + l - 1, line + strlen(line) - right - 1); + else linebuf[l] = '\0'; + strcpy(line + left, linebuf); + return line; +} diff --git a/src/myspell/hunzip.hxx b/src/myspell/hunzip.hxx new file mode 100644 index 0000000..52109d1 --- /dev/null +++ b/src/myspell/hunzip.hxx @@ -0,0 +1,41 @@ +/* hunzip: file decompression for sorted dictionaries with optional encryption, + * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */ + +#ifndef _HUNZIP_HXX_ +#define _HUNZIP_HXX_ + +#define BUFSIZE 65536 +#define HZIP_EXTENSION ".hz" + +#define MSG_OPEN "error: %s: cannot open\n" +#define MSG_FORMAT "error: %s: not in hzip format\n" +#define MSG_MEMORY "error: %s: missing memory\n" +#define MSG_KEY "error: %s: missing or bad password\n" + +struct bit { + unsigned char c[2]; + int v[2]; +}; + +class Hunzip +{ + +protected: + char * filename; + FILE * fin; + int bufsiz, lastbit, inc, inbits, outc; + struct bit * dec; // code table + char in[BUFSIZE]; // input buffer + char out[BUFSIZE + 1]; // Huffman-decoded buffer + char line[BUFSIZE + 50]; // decoded line + int getcode(const char * key); + int getbuf(); + int fail(const char * err, const char * par); + +public: + Hunzip(const char * filename, const char * key = NULL); + ~Hunzip(); + const char * getline(); +}; + +#endif diff --git a/src/myspell/myspell_checker.cpp b/src/myspell/myspell_checker.cpp index 390d2a2..7b86ca9 100644 --- a/src/myspell/myspell_checker.cpp +++ b/src/myspell/myspell_checker.cpp @@ -38,8 +38,13 @@ #include "enchant.h" #include "enchant-provider.h" -/* built against hunspell 1.1.5 on 2007-03-19 */ +/* built against hunspell 1.2.2 on 2008-04-12 */ + +#ifdef WITH_SYSTEM_MYSPELL +#include <hunspell/hunspell.hxx> +#else #include "hunspell.hxx" +#endif ENCHANT_PLUGIN_DECLARE("Myspell") diff --git a/src/myspell/phonet.cxx b/src/myspell/phonet.cxx index e4ad5a0..91dc419 100644 --- a/src/myspell/phonet.cxx +++ b/src/myspell/phonet.cxx @@ -46,7 +46,7 @@ void init_phonet_hash(phonetable & parms) { int i, k; - for (i = 0; i < parms.hash_size; i++) { + for (i = 0; i < HASHSIZE; i++) { parms.hash[i] = -1; } diff --git a/src/myspell/phonet.hxx b/src/myspell/phonet.hxx index 471ff84..d1cf995 100644 --- a/src/myspell/phonet.hxx +++ b/src/myspell/phonet.hxx @@ -30,6 +30,7 @@ #ifndef __PHONETHXX__ #define __PHONETHXX__ +#define HASHSIZE 256 #define MAXPHONETLEN 256 #define MAXPHONETUTF8LEN (MAXPHONETLEN * 4) @@ -38,8 +39,7 @@ struct phonetable { cs_info * lang; int num; char * * rules; - static const int hash_size = 256; - int hash[hash_size]; + int hash[HASHSIZE]; }; void init_phonet_hash(phonetable & parms); diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx index b1a58f3..c19ba08 100644 --- a/src/myspell/suggestmgr.cxx +++ b/src/myspell/suggestmgr.cxx @@ -15,9 +15,10 @@ #include "suggestmgr.hxx" #include "htypes.hxx" +#include "csutil.hxx" #ifndef MOZILLA_CLIENT -#ifndef W32 +#ifndef WIN32 using namespace std; #endif #endif @@ -1028,7 +1029,7 @@ int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, } // generate a set of suggestions for very poorly spelled words -int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) +int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md) { int i, j; @@ -1037,8 +1038,6 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) int lp, lpphon; int nonbmp = 0; - if (!pHMgr) return ns; - // exhaustively search through all root words // keeping track of the MAX_ROOTS most similar root words struct hentry * roots[MAX_ROOTS]; @@ -1088,8 +1087,9 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) mkallcap(candidate, csconv); phonet(candidate, target, n, *ph); } - - while ((hp = pHMgr->walk_hashtable(col, hp))) { + + for (i = 0; i < md; i++) { + while ((hp = (pHMgr[i])->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) || TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) || @@ -1135,7 +1135,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) lval = scoresphon[j]; } } - } + }} // find minimum threshhold for a passable suggestion // mangle original word three differnt ways @@ -1557,7 +1557,7 @@ char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) *result = '\0'; int sfxcount = get_sfxcount(pattern); -// if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; + if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; if (HENTRY_DATA(rv)) { char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx index d22884f..0e61572 100644 --- a/src/myspell/suggestmgr.hxx +++ b/src/myspell/suggestmgr.hxx @@ -51,7 +51,7 @@ public: ~SuggestMgr(); int suggest(char*** slst, const char * word, int nsug, int * onlycmpdsug); - int ngsuggest(char ** wlst, char * word, int ns, HashMgr* pHMgr); + int ngsuggest(char ** wlst, char * word, int ns, HashMgr** pHMgr, int md); int suggest_auto(char*** slst, const char * word, int nsug); int suggest_stems(char*** slst, const char * word, int nsug); int suggest_pos_stems(char*** slst, const char * word, int nsug); diff --git a/src/myspell/w_char.hxx b/src/myspell/w_char.hxx new file mode 100644 index 0000000..a3d11c3 --- /dev/null +++ b/src/myspell/w_char.hxx @@ -0,0 +1,19 @@ +#ifndef __WCHARHXX__ +#define __WCHARHXX__ + +#ifdef WIN32 +typedef struct { +#else +typedef struct __attribute__ ((packed)) { +#endif + unsigned char l; + unsigned char h; +} w_char; + +// two character arrays +struct replentry { + char * pattern; + char * pattern2; +}; + +#endif |