diff options
Diffstat (limited to 'src/spell.c')
-rw-r--r-- | src/spell.c | 1419 |
1 files changed, 968 insertions, 451 deletions
diff --git a/src/spell.c b/src/spell.c index a12c0cc55..eab725a11 100644 --- a/src/spell.c +++ b/src/spell.c @@ -51,18 +51,28 @@ */ /* - * Use this to let the score depend in how much a suggestion sounds like the - * bad word. It's quite slow and only occasionally makes the sorting better. -#define SOUNDFOLD_SCORE - */ - -/* * Use this to adjust the score after finding suggestions, based on the * suggested word sounding like the bad word. This is much faster than doing * it for every possible suggestion. * Disadvantage: When "the" is typed as "hte" it sounds different and goes * down in the list. -#define RESCORE(word_score, sound_score) ((2 * word_score + sound_score) / 3) + * Used when 'spellsuggest' is set to "best". + */ +#define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) + +/* + * The double scoring mechanism is based on the principle that there are two + * kinds of spelling mistakes: + * 1. You know how to spell the word, but mistype something. This results in + * a small editing distance (character swapped/omitted/inserted) and + * possibly a word that sounds completely different. + * 2. You don't know how to spell the word and type something that sounds + * right. The edit distance can be big but the word is similar after + * sound-folding. + * Since scores for these two mistakes will be very different we use a list + * for each. + * The sound-folding is slow, only do double scoring when 'spellsuggest' is + * "double". */ /* @@ -231,7 +241,7 @@ typedef long idx_T; #define BY_SPECIAL BY_INDEX /* hightest special byte value */ /* Info from "REP" and "SAL" entries in ".aff" file used in si_rep, sl_rep, - * si_sal and sl_sal. + * and si_sal. Not for sl_sal! * One replacement: from "ft_from" to "ft_to". */ typedef struct fromto_S { @@ -239,6 +249,18 @@ typedef struct fromto_S char_u *ft_to; } fromto_T; +/* Info from "SAL" entries in ".aff" file used in sl_sal. + * The info is split for quick processing by spell_soundfold(). + * Note that "sm_oneof" and "sm_rules" point into sm_lead. */ +typedef struct salitem_S +{ + char_u *sm_lead; /* leading letters */ + int sm_leadlen; /* length of "sm_lead" */ + char_u *sm_oneoff; /* letters from () or NULL */ + char_u *sm_rules; /* rules like ^, $, priority */ + char_u *sm_to; /* replacement. */ +} salitem_T; + /* * Structure used to store words and other info for one language, loaded from * a .spl file. @@ -277,7 +299,7 @@ struct slang_S garray_T sl_rep; /* list of fromto_T entries from REP lines */ short sl_rep_first[256]; /* indexes where byte first appears, -1 if there is none */ - garray_T sl_sal; /* list of fromto_T entries from SAL lines */ + garray_T sl_sal; /* list of salitem_T entries from SAL lines */ short sl_sal_first[256]; /* indexes where byte first appears, -1 if there is none */ int sl_followup; /* SAL followup */ @@ -330,16 +352,14 @@ typedef struct langp_S typedef struct suginfo_S { garray_T su_ga; /* suggestions, contains "suggest_T" */ + int su_maxcount; /* max. number of suggestions displayed */ int su_maxscore; /* maximum score for adding to su_ga */ + garray_T su_sga; /* like su_ga, sound-folded scoring */ char_u *su_badptr; /* start of bad word in line */ int su_badlen; /* length of detected bad word in line */ char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ hashtab_T su_banned; /* table with banned words */ -#ifdef SOUNDFOLD_SCORE - slang_T *su_slang; /* currently used slang_T */ - char_u su_salword[MAXWLEN]; /* soundfolded badword */ -#endif } suginfo_T; /* One word suggestion. Used in "si_ga". */ @@ -348,32 +368,27 @@ typedef struct suggest_S char_u *st_word; /* suggested word, allocated string */ int st_orglen; /* length of replaced text */ int st_score; /* lower is better */ -#ifdef RESCORE + int st_altscore; /* used when st_score compares equal */ + int st_salscore; /* st_score is for soundalike */ int st_had_bonus; /* bonus already included in score */ -#endif } suggest_T; -#define SUG(sup, i) (((suggest_T *)(sup)->su_ga.ga_data)[i]) - -/* Number of suggestions displayed. */ -#define SUG_PROMPT_COUNT ((int)Rows - 2) +#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) /* Number of suggestions kept when cleaning up. When rescore_suggestions() is * called the score may change, thus we need to keep more than what is * displayed. */ -#define SUG_CLEAN_COUNT (SUG_PROMPT_COUNT < 25 ? 25 : SUG_PROMPT_COUNT) +#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 25 ? 25 : (su)->su_maxcount) /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots * of suggestions that are not going to be displayed. */ -#define SUG_MAX_COUNT (SUG_PROMPT_COUNT + 50) +#define SUG_MAX_COUNT(su) ((su)->su_maxcount + 50) /* score for various changes */ -#define SCORE_SPLIT 99 /* split bad word */ +#define SCORE_SPLIT 149 /* split bad word */ #define SCORE_ICASE 52 /* slightly different case */ #define SCORE_REGION 70 /* word is for different region */ #define SCORE_RARE 180 /* rare word */ - -/* score for edit distance */ #define SCORE_SWAP 90 /* swap two characters */ #define SCORE_SWAP3 110 /* swap two characters in three */ #define SCORE_REP 87 /* REP replacement */ @@ -384,6 +399,8 @@ typedef struct suggest_S #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. * 350 allows for about three changes. */ + +#define SCORE_BIG SCORE_INS * 3 /* big difference */ #define SCORE_MAXMAX 999999 /* accept any score */ /* @@ -451,25 +468,26 @@ static void write_spell_prefcond __ARGS((FILE *fd, garray_T *gap)); #endif /* - * For finding suggestion: At each node in the tree these states are tried: + * For finding suggestions: At each node in the tree these states are tried: */ typedef enum { - STATE_START = 0, /* At start of node, check if word may end or - * split word. */ - STATE_SPLITUNDO, /* Undo word split. */ + STATE_START = 0, /* At start of node check for NUL bytes (goodword + * ends); if badword ends there is a match, otherwise + * try splitting word. */ + STATE_SPLITUNDO, /* Undo splitting. */ STATE_ENDNUL, /* Past NUL bytes at start of the node. */ STATE_PLAIN, /* Use each byte of the node. */ STATE_DEL, /* Delete a byte from the bad word. */ STATE_INS, /* Insert a byte in the bad word. */ STATE_SWAP, /* Swap two bytes. */ - STATE_UNSWAP, /* Undo swap two bytes. */ - STATE_SWAP3, /* Swap two bytes over three. */ - STATE_UNSWAP3, /* Undo Swap two bytes over three. */ - STATE_ROT3L, /* Rotate three bytes left */ - STATE_UNROT3L, /* Undo rotate three bytes left */ - STATE_ROT3R, /* Rotate three bytes right */ - STATE_UNROT3R, /* Undo rotate three bytes right */ + STATE_UNSWAP, /* Undo swap two characters. */ + STATE_SWAP3, /* Swap two characters over three. */ + STATE_UNSWAP3, /* Undo Swap two characters over three. */ + STATE_ROT3L, /* Rotate three characters left */ + STATE_UNROT3L, /* Undo rotate three characters left */ + STATE_ROT3R, /* Rotate three characters right */ + STATE_UNROT3R, /* Undo rotate three characters right */ STATE_REP_INI, /* Prepare for using REP items. */ STATE_REP, /* Use matching REP items from the .aff file. */ STATE_REP_UNDO, /* Undo a REP item replacement. */ @@ -483,6 +501,7 @@ typedef struct trystate_S { state_T ts_state; /* state at this level, STATE_ */ int ts_score; /* score */ + idx_T ts_arridx; /* index in tree array, start of node */ short ts_curi; /* index in list of child nodes */ char_u ts_fidx; /* index in fword[], case-folded bad word */ char_u ts_fidxtry; /* ts_fidx at which bytes may be changed */ @@ -493,7 +512,6 @@ typedef struct trystate_S char_u ts_isdiff; /* DIFF_ values */ char_u ts_fcharstart; /* index in fword where badword char started */ #endif - idx_T ts_arridx; /* index in tree array, start of node */ char_u ts_save_prewordlen; /* saved "prewordlen" */ char_u ts_save_splitoff; /* su_splitoff saved here */ char_u ts_save_badflags; /* badflags saved here */ @@ -528,30 +546,28 @@ static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp)); static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp)); static void write_spell_chartab __ARGS((FILE *fd)); static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen)); +static void spell_find_suggest __ARGS((char_u *badptr, suginfo_T *su, int maxcount)); +static void spell_find_cleanup __ARGS((suginfo_T *su)); static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper)); +static void allcap_copy __ARGS((char_u *word, char_u *wcopy)); static void spell_try_change __ARGS((suginfo_T *su)); static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add)); static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword)); +static void score_comp_sal __ARGS((suginfo_T *su)); +static void score_combine __ARGS((suginfo_T *su)); static void spell_try_soundalike __ARGS((suginfo_T *su)); static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags)); static void set_map_str __ARGS((slang_T *lp, char_u *map)); static int similar_chars __ARGS((slang_T *slang, int c1, int c2)); -#ifdef RESCORE -static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score, int had_bonus)); -#else -static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score)); -#endif +static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int use_score, int had_bonus)); static void add_banned __ARGS((suginfo_T *su, char_u *word)); static int was_banned __ARGS((suginfo_T *su, char_u *word)); static void free_banned __ARGS((suginfo_T *su)); -#ifdef RESCORE static void rescore_suggestions __ARGS((suginfo_T *su)); -#endif -static void cleanup_suggestions __ARGS((suginfo_T *su, int keep)); +static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep)); static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res)); -#if defined(RESCORE) || defined(SOUNDFOLD_SCORE) static int spell_sound_score __ARGS((slang_T *slang, char_u *goodword, char_u *badsound)); -#endif +static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound)); static int spell_edit_score __ARGS((char_u *badword, char_u *goodword)); /* @@ -617,95 +633,103 @@ spell_check(wp, ptr, attrp) { matchinf_T mi; /* Most things are put in "mi" so that it can be passed to functions quickly. */ + int nrlen = 0; /* found a number first */ /* A word never starts at a space or a control character. Return quickly * then, skipping over the character. */ if (*ptr <= ' ') return 1; - /* A word starting with a number is always OK. Also skip hexadecimal - * numbers 0xFF99 and 0X99FF. */ + /* A number is always OK. Also skip hexadecimal numbers 0xFF99 and + * 0X99FF. But when a word character follows do check spelling. */ if (*ptr >= '0' && *ptr <= '9') { if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) mi.mi_end = skiphex(ptr + 2); else + { mi.mi_end = skipdigits(ptr); + nrlen = mi.mi_end - ptr; + } + if (!SPELL_ISWORDP(mi.mi_end)) + return (int)(mi.mi_end - ptr); } - else - { - /* Find the end of the word. */ - mi.mi_word = ptr; - mi.mi_fend = ptr; - if (SPELL_ISWORDP(mi.mi_fend)) + /* Find the end of the word. */ + mi.mi_word = ptr; + mi.mi_fend = ptr; + + if (SPELL_ISWORDP(mi.mi_fend)) + { + /* Make case-folded copy of the characters until the next non-word + * character. */ + do { - /* Make case-folded copy of the characters until the next non-word - * character. */ - do - { - mb_ptr_adv(mi.mi_fend); - } while (*mi.mi_fend != NUL && SPELL_ISWORDP(mi.mi_fend)); - } + mb_ptr_adv(mi.mi_fend); + } while (*mi.mi_fend != NUL && SPELL_ISWORDP(mi.mi_fend)); + } - /* We always use the characters up to the next non-word character, - * also for bad words. */ - mi.mi_end = mi.mi_fend; + /* We always use the characters up to the next non-word character, + * also for bad words. */ + mi.mi_end = mi.mi_fend; - /* Check caps type later. */ - mi.mi_capflags = 0; - mi.mi_cend = NULL; + /* Check caps type later. */ + mi.mi_capflags = 0; + mi.mi_cend = NULL; - /* Include one non-word character so that we can check for the - * word end. */ - if (*mi.mi_fend != NUL) - mb_ptr_adv(mi.mi_fend); + /* Include one non-word character so that we can check for the + * word end. */ + if (*mi.mi_fend != NUL) + mb_ptr_adv(mi.mi_fend); - (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, - MAXWLEN + 1); - mi.mi_fwordlen = STRLEN(mi.mi_fword); + (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, + MAXWLEN + 1); + mi.mi_fwordlen = STRLEN(mi.mi_fword); - /* The word is bad unless we recognize it. */ - mi.mi_result = SP_BAD; + /* The word is bad unless we recognize it. */ + mi.mi_result = SP_BAD; - /* - * Loop over the languages specified in 'spelllang'. - * We check them all, because a matching word may be longer than an - * already found matching word. - */ - for (mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); - mi.mi_lp->lp_slang != NULL; ++mi.mi_lp) - { - /* Check for a matching word in case-folded words. */ - find_word(&mi, FIND_FOLDWORD); + /* + * Loop over the languages specified in 'spelllang'. + * We check them all, because a matching word may be longer than an + * already found matching word. + */ + for (mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); + mi.mi_lp->lp_slang != NULL; ++mi.mi_lp) + { + /* Check for a matching word in case-folded words. */ + find_word(&mi, FIND_FOLDWORD); - /* Check for a matching word in keep-case words. */ - find_word(&mi, FIND_KEEPWORD); + /* Check for a matching word in keep-case words. */ + find_word(&mi, FIND_KEEPWORD); - /* Check for matching prefixes. */ - find_prefix(&mi); - } + /* Check for matching prefixes. */ + find_prefix(&mi); + } + + if (mi.mi_result != SP_OK) + { + /* If we found a number skip over it. Allows for "42nd". */ + if (nrlen > 0) + return nrlen; - if (mi.mi_result != SP_OK) + /* When we are at a non-word character there is no error, just + * skip over the character (try looking for a word after it). */ + if (!SPELL_ISWORDP(ptr)) { - /* When we are at a non-word character there is no error, just - * skip over the character (try looking for a word after it). */ - if (!SPELL_ISWORDP(ptr)) - { #ifdef FEAT_MBYTE - if (has_mbyte) - return mb_ptr2len_check(ptr); + if (has_mbyte) + return mb_ptr2len_check(ptr); #endif - return 1; - } - - if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) - *attrp = highlight_attr[HLF_SPB]; - else if (mi.mi_result == SP_RARE) - *attrp = highlight_attr[HLF_SPR]; - else - *attrp = highlight_attr[HLF_SPL]; + return 1; } + + if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) + *attrp = highlight_attr[HLF_SPB]; + else if (mi.mi_result == SP_RARE) + *attrp = highlight_attr[HLF_SPR]; + else + *attrp = highlight_attr[HLF_SPL]; } return (int)(mi.mi_end - ptr); @@ -934,8 +958,8 @@ find_word(mip, mode) { regmatch.regprog = rp; regmatch.rm_ic = FALSE; - if (!vim_regexec(®match, mip->mi_fword, - (colnr_T)mip->mi_prefixlen)) + if (!vim_regexec(®match, + mip->mi_fword + mip->mi_prefixlen, 0)) continue; } @@ -1338,7 +1362,7 @@ slang_alloc(lang) { lp->sl_name = vim_strsave(lang); ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); - ga_init2(&lp->sl_sal, sizeof(fromto_T), 10); + ga_init2(&lp->sl_sal, sizeof(salitem_T), 10); } return lp; } @@ -1365,7 +1389,7 @@ slang_clear(lp) { garray_T *gap; fromto_T *ftp; - int round; + salitem_T *smp; int i; vim_free(lp->sl_fbyts); @@ -1382,17 +1406,23 @@ slang_clear(lp) vim_free(lp->sl_pidxs); lp->sl_pidxs = NULL; - for (round = 1; round <= 2; ++round) + gap = &lp->sl_rep; + while (gap->ga_len > 0) { - gap = round == 1 ? &lp->sl_rep : &lp->sl_sal; - while (gap->ga_len > 0) - { - ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; - vim_free(ftp->ft_from); - vim_free(ftp->ft_to); - } - ga_clear(gap); + ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; + vim_free(ftp->ft_from); + vim_free(ftp->ft_to); } + ga_clear(gap); + + gap = &lp->sl_sal; + while (gap->ga_len > 0) + { + smp = &((salitem_T *)gap->ga_data)[--gap->ga_len]; + vim_free(smp->sm_lead); + vim_free(smp->sm_to); + } + ga_clear(gap); for (i = 0; i < lp->sl_prefixcnt; ++i) vim_free(lp->sl_prefprog[i]); @@ -1459,9 +1489,11 @@ spell_load_file(fname, lang, old_lp, silent) slang_T *lp = NULL; garray_T *gap; fromto_T *ftp; + salitem_T *smp; int rr; short *first; idx_T idx; + int c = 0; fd = mch_fopen((char *)fname, "r"); if (fd == NULL) @@ -1598,7 +1630,8 @@ formerr: * compile the regexp program used to check for the condition. */ if (n > 0) { - p = buf; + buf[0] = '^'; /* always match at one position only */ + p = buf + 1; while (n-- > 0) *p++ = getc(fd); /* <condstr> */ *p = NUL; @@ -1612,76 +1645,145 @@ formerr: * <salflags> <salcount> <sal> ... * <maplen> <mapstr> */ - /* - * round 1: rep items - * round 2: sal items - */ - for (round = 1; round <= 2; ++round) + cnt = (getc(fd) << 8) + getc(fd); /* <repcount> */ + if (cnt < 0) + goto formerr; + + gap = &lp->sl_rep; + if (ga_grow(gap, cnt) == FAIL) + goto endFAIL; + + /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */ + for (; gap->ga_len < cnt; ++gap->ga_len) { - if (round == 1) + ftp = &((fromto_T *)gap->ga_data)[gap->ga_len]; + for (rr = 1; rr <= 2; ++rr) { - gap = &lp->sl_rep; - first = lp->sl_rep_first; - } - else - { - gap = &lp->sl_sal; - first = lp->sl_sal_first; - - i = getc(fd); /* <salflags> */ - if (i & SAL_F0LLOWUP) - lp->sl_followup = TRUE; - if (i & SAL_COLLAPSE) - lp->sl_collapse = TRUE; - if (i & SAL_REM_ACCENTS) - lp->sl_rem_accents = TRUE; + ccnt = getc(fd); + if (ccnt < 0) + { + if (rr == 2) + vim_free(ftp->ft_from); + goto formerr; + } + if ((p = alloc(ccnt + 1)) == NULL) + { + if (rr == 2) + vim_free(ftp->ft_from); + goto endFAIL; + } + for (i = 0; i < ccnt; ++i) + p[i] = getc(fd); /* <repfrom> or <repto> */ + p[i] = NUL; + if (rr == 1) + ftp->ft_from = p; + else + ftp->ft_to = p; } + } - cnt = (getc(fd) << 8) + getc(fd); /* <repcount> or <salcount> */ - if (cnt < 0) - goto formerr; + /* Fill the first-index table. */ + first = lp->sl_rep_first; + for (i = 0; i < 256; ++i) + first[i] = -1; + for (i = 0; i < gap->ga_len; ++i) + { + ftp = &((fromto_T *)gap->ga_data)[i]; + if (first[*ftp->ft_from] == -1) + first[*ftp->ft_from] = i; + } + + i = getc(fd); /* <salflags> */ + if (i & SAL_F0LLOWUP) + lp->sl_followup = TRUE; + if (i & SAL_COLLAPSE) + lp->sl_collapse = TRUE; + if (i & SAL_REM_ACCENTS) + lp->sl_rem_accents = TRUE; + + cnt = (getc(fd) << 8) + getc(fd); /* <salcount> */ + if (cnt < 0) + goto formerr; + + gap = &lp->sl_sal; + if (ga_grow(gap, cnt) == FAIL) + goto endFAIL; - if (ga_grow(gap, cnt) == FAIL) + /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */ + for (; gap->ga_len < cnt; ++gap->ga_len) + { + smp = &((salitem_T *)gap->ga_data)[gap->ga_len]; + ccnt = getc(fd); /* <salfromlen> */ + if (ccnt < 0) + goto formerr; + if ((p = alloc(ccnt + 2)) == NULL) goto endFAIL; - for (; gap->ga_len < cnt; ++gap->ga_len) + smp->sm_lead = p; + + /* Read up to the first special char into sm_lead. */ + for (i = 0; i < ccnt; ++i) { - /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */ - /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */ - ftp = &((fromto_T *)gap->ga_data)[gap->ga_len]; - for (rr = 1; rr <= 2; ++rr) + c = getc(fd); /* <salfrom> */ + if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL) + break; + *p++ = c; + } + smp->sm_leadlen = p - smp->sm_lead; + *p++ = NUL; + + /* Put optional chars in sm_oneoff, if any. */ + if (c == '(') + { + smp->sm_oneoff = p; + for (++i; i < ccnt; ++i) { - ccnt = getc(fd); - if (ccnt < 0) - { - if (rr == 2) - vim_free(ftp->ft_from); - goto formerr; - } - if ((p = alloc(ccnt + 1)) == NULL) - { - if (rr == 2) - vim_free(ftp->ft_from); - goto endFAIL; - } - for (i = 0; i < ccnt; ++i) - p[i] = getc(fd); /* <repfrom> or <salfrom> */ - p[i] = NUL; - if (rr == 1) - ftp->ft_from = p; - else - ftp->ft_to = p; + c = getc(fd); /* <salfrom> */ + if (c == ')') + break; + *p++ = c; } + *p++ = NUL; + if (++i < ccnt) + c = getc(fd); } - - /* Fill the first-index table. */ - for (i = 0; i < 256; ++i) - first[i] = -1; - for (i = 0; i < gap->ga_len; ++i) + else + smp->sm_oneoff = NULL; + + /* Any following chars go in sm_rules. */ + smp->sm_rules = p; + if (i < ccnt) + *p++ = c; + for (++i; i < ccnt; ++i) + *p++ = getc(fd); /* <salfrom> */ + *p++ = NUL; + + ccnt = getc(fd); /* <saltolen> */ + if (ccnt < 0) { - ftp = &((fromto_T *)gap->ga_data)[i]; - if (first[*ftp->ft_from] == -1) - first[*ftp->ft_from] = i; + vim_free(smp->sm_lead); + goto formerr; } + if ((p = alloc(ccnt + 1)) == NULL) + { + vim_free(smp->sm_lead); + goto endFAIL; + } + smp->sm_to = p; + + for (i = 0; i < ccnt; ++i) + *p++ = getc(fd); /* <salto> */ + *p++ = NUL; + } + + /* Fill the first-index table. */ + first = lp->sl_sal_first; + for (i = 0; i < 256; ++i) + first[i] = -1; + for (i = 0; i < gap->ga_len; ++i) + { + smp = &((salitem_T *)gap->ga_data)[i]; + if (first[*smp->sm_lead] == -1) + first[*smp->sm_lead] = i; } cnt = (getc(fd) << 8) + getc(fd); /* <maplen> */ @@ -2509,7 +2611,7 @@ spell_read_aff(fname, spin) /* Use a new number in the .spl file later, to be able to * handle multiple .aff files. */ if (aff->af_pfxpostpone) - cur_aff->ah_newID = spin->si_newID++; + cur_aff->ah_newID = ++spin->si_newID; } else tp = &aff->af_suff; @@ -4784,14 +4886,12 @@ spell_casefold(str, len, buf, buflen) /* * "z?": Find badly spelled word under or after the cursor. * Give suggestions for the properly spelled word. - * This is based on the mechanisms of Aspell, but completely reimplemented. */ void spell_suggest() { char_u *line; pos_T prev_cursor = curwin->w_cursor; - int attr; char_u wcopy[MAXWLEN + 2]; char_u *p; int i; @@ -4799,76 +4899,23 @@ spell_suggest() suginfo_T sug; suggest_T *stp; - /* - * Find the start of the badly spelled word. - */ + /* Find the start of the badly spelled word. */ if (spell_move_to(FORWARD, TRUE, TRUE) == FAIL) { beep_flush(); return; } - /* - * Set the info in "sug". - */ - vim_memset(&sug, 0, sizeof(sug)); - ga_init2(&sug.su_ga, (int)sizeof(suggest_T), 10); - hash_init(&sug.su_banned); + /* Get the word and its length. */ line = ml_get_curline(); - sug.su_badptr = line + curwin->w_cursor.col; - sug.su_badlen = spell_check(curwin, sug.su_badptr, &attr); - if (sug.su_badlen >= MAXWLEN) - sug.su_badlen = MAXWLEN - 1; /* just in case */ - vim_strncpy(sug.su_badword, sug.su_badptr, sug.su_badlen); - (void)spell_casefold(sug.su_badptr, sug.su_badlen, - sug.su_fbadword, MAXWLEN); - /* Ban the bad word itself. It may appear in another region. */ - add_banned(&sug, sug.su_badword); - - /* - * 1. Try inserting/deleting/swapping/changing a letter, use REP entries - * from the .aff file and inserting a space (split the word). - * - * Set a maximum score to limit the combination of operations that is - * tried. - */ - sug.su_maxscore = SCORE_MAXINIT; - spell_try_change(&sug); - - /* - * 2. Try finding sound-a-like words. - * - * Only do this when we don't have a lot of suggestions yet, because it's - * very slow and often doesn't find new suggestions. - */ - if (sug.su_ga.ga_len < SUG_CLEAN_COUNT) - { - /* Allow a higher score now. */ - sug.su_maxscore = SCORE_MAXMAX; - spell_try_soundalike(&sug); - } - - /* When CTRL-C was hit while searching do show the results. */ - ui_breakcheck(); - if (got_int) - { - (void)vgetc(); - got_int = FALSE; - } + /* Get the list of suggestions */ + spell_find_suggest(line + curwin->w_cursor.col, &sug, (int)Rows - 2); if (sug.su_ga.ga_len == 0) MSG(_("Sorry, no suggestions")); else { -#ifdef RESCORE - /* Do slow but more accurate computation of the word score. */ - rescore_suggestions(&sug); -#endif - - /* Sort the suggestions and truncate at SUG_PROMPT_COUNT. */ - cleanup_suggestions(&sug, SUG_PROMPT_COUNT); - /* List the suggestions. */ msg_start(); vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"), @@ -4879,7 +4926,7 @@ spell_suggest() msg_scroll = TRUE; for (i = 0; i < sug.su_ga.ga_len; ++i) { - stp = &SUG(&sug, i); + stp = &SUG(sug.su_ga, i); /* The suggested word may replace only part of the bad word, add * the not replaced part. */ @@ -4889,8 +4936,17 @@ spell_suggest() sug.su_badptr + stp->st_orglen, sug.su_badlen - stp->st_orglen); if (p_verbose > 0) - vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\" (%d)"), + { + if (sps_flags & SPS_DOUBLE) + vim_snprintf((char *)IObuff, IOSIZE, + _("%2d \"%s\" (%s%d - %d)"), + i + 1, wcopy, + stp->st_salscore ? "s " : "", + stp->st_score, stp->st_altscore); + else + vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\" (%d)"), i + 1, wcopy, stp->st_score); + } else vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\""), i + 1, wcopy); @@ -4901,10 +4957,10 @@ spell_suggest() /* Ask for choice. */ i = prompt_for_number(); - if (i > 0 && i <= sug.su_ga.ga_len && u_save_cursor()) + if (i > 0 && i <= sug.su_ga.ga_len && u_save_cursor() == OK) { /* Replace the word. */ - stp = &SUG(&sug, i - 1); + stp = &SUG(sug.su_ga, i - 1); p = alloc(STRLEN(line) - stp->st_orglen + STRLEN(stp->st_word) + 1); if (p != NULL) { @@ -4915,19 +4971,169 @@ spell_suggest() ml_replace(curwin->w_cursor.lnum, p, FALSE); curwin->w_cursor.col = c; changed_bytes(curwin->w_cursor.lnum, c); + + /* For redo we use a change-word command. */ + ResetRedobuff(); + AppendToRedobuff((char_u *)"ciw"); + AppendToRedobuff(stp->st_word); + AppendCharToRedobuff(ESC); } } else curwin->w_cursor = prev_cursor; } - /* Free the suggestions. */ + spell_find_cleanup(&sug); +} + +/* + * Find spell suggestions for "word". Return them in the growarray "*gap" as + * a list of allocated strings. + */ + void +spell_suggest_list(gap, word, maxcount) + garray_T *gap; + char_u *word; + int maxcount; /* maximum nr of suggestions */ +{ + suginfo_T sug; + int i; + suggest_T *stp; + char_u *wcopy; + + spell_find_suggest(word, &sug, maxcount); + + /* Make room in "gap". */ + ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); + if (ga_grow(gap, sug.su_ga.ga_len) == FAIL) + return; + for (i = 0; i < sug.su_ga.ga_len; ++i) - vim_free(SUG(&sug, i).st_word); - ga_clear(&sug.su_ga); + { + stp = &SUG(sug.su_ga, i); + + /* The suggested word may replace only part of "word", add the not + * replaced part. */ + wcopy = alloc(STRLEN(stp->st_word) + + STRLEN(sug.su_badptr + stp->st_orglen) + 1); + if (wcopy == NULL) + break; + STRCPY(wcopy, stp->st_word); + STRCAT(wcopy, sug.su_badptr + stp->st_orglen); + ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; + } + + spell_find_cleanup(&sug); +} + +/* + * Find spell suggestions for the word at the start of "badptr". + * Return the suggestions in "su->su_ga". + * The maximum number of suggestions is "maxcount". + * Note: does use info for the current window. + * This is based on the mechanisms of Aspell, but completely reimplemented. + */ + static void +spell_find_suggest(badptr, su, maxcount) + char_u *badptr; + suginfo_T *su; + int maxcount; +{ + int attr; + + /* + * Set the info in "*su". + */ + vim_memset(su, 0, sizeof(suginfo_T)); + ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10); + ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10); + hash_init(&su->su_banned); + + su->su_badptr = badptr; + su->su_badlen = spell_check(curwin, su->su_badptr, &attr); + su->su_maxcount = maxcount; + + if (su->su_badlen >= MAXWLEN) + su->su_badlen = MAXWLEN - 1; /* just in case */ + vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen); + (void)spell_casefold(su->su_badptr, su->su_badlen, + su->su_fbadword, MAXWLEN); + + /* Ban the bad word itself. It may appear in another region. */ + add_banned(su, su->su_badword); + + /* + * 1. Try inserting/deleting/swapping/changing a letter, use REP entries + * from the .aff file and inserting a space (split the word). + * + * Set a maximum score to limit the combination of operations that is + * tried. + */ + su->su_maxscore = SCORE_MAXINIT; + spell_try_change(su); + + /* For the resulting top-scorers compute the sound-a-like score. */ + if (sps_flags & SPS_DOUBLE) + score_comp_sal(su); + + /* + * 2. Try finding sound-a-like words. + * + * Only do this when we don't have a lot of suggestions yet, because it's + * very slow and often doesn't find new suggestions. + */ + if ((sps_flags & SPS_DOUBLE) + || (!(sps_flags & SPS_FAST) + && su->su_ga.ga_len < SUG_CLEAN_COUNT(su))) + { + /* Allow a higher score now. */ + su->su_maxscore = SCORE_MAXMAX; + spell_try_soundalike(su); + } + + /* When CTRL-C was hit while searching do show the results. */ + ui_breakcheck(); + if (got_int) + { + (void)vgetc(); + got_int = FALSE; + } + + if (sps_flags & SPS_DOUBLE) + { + /* Combine the two list of suggestions. */ + score_combine(su); + } + else if (su->su_ga.ga_len != 0) + { + if (sps_flags & SPS_BEST) + /* Adjust the word score for how it sounds like. */ + rescore_suggestions(su); + + /* Sort the suggestions and truncate at "maxcount". */ + (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, maxcount); + } +} + +/* + * Free the info put in "*su" by spell_find_suggest(). + */ + static void +spell_find_cleanup(su) + suginfo_T *su; +{ + int i; + + /* Free the suggestions. */ + for (i = 0; i < su->su_ga.ga_len; ++i) + vim_free(SUG(su->su_ga, i).st_word); + ga_clear(&su->su_ga); + for (i = 0; i < su->su_sga.ga_len; ++i) + vim_free(SUG(su->su_sga, i).st_word); + ga_clear(&su->su_sga); /* Free the banned words. */ - free_banned(&sug); + free_banned(su); } /* @@ -5058,13 +5264,6 @@ spell_try_change(su) for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0); lp->lp_slang != NULL; ++lp) { -#ifdef SOUNDFOLD_SCORE - su->su_slang = lp->lp_slang; - if (lp->lp_slang->sl_sal.ga_len > 0) - /* soundfold the bad word */ - spell_soundfold(lp->lp_slang, su->su_fbadword, su->su_salword); -#endif - /* * Go through the whole case-fold tree, try changes at each node. * "tword[]" contains the word collected from nodes in the tree. @@ -5160,11 +5359,8 @@ spell_try_change(su) if (fword[sp->ts_fidx] == 0) { /* The badword also ends: add suggestions, */ - add_suggestion(su, preword, sp->ts_score + newscore -#ifdef RESCORE - , FALSE -#endif - ); + add_suggestion(su, &su->su_ga, preword, + sp->ts_score + newscore, FALSE); } else if (sp->ts_fidx >= sp->ts_fidxtry #ifdef FEAT_MBYTE @@ -5757,9 +5953,10 @@ spell_try_change(su) default: /* Did all possible states at this level, go up one level. */ --depth; - } - line_breakcheck(); + /* Don't check for CTRL-C too often, it takes time. */ + line_breakcheck(); + } } } } @@ -5950,6 +6147,168 @@ find_keepcap_word(slang, fword, kword) } /* + * Compute the sound-a-like score for suggestions in su->su_ga and add them to + * su->su_sga. + */ + static void +score_comp_sal(su) + suginfo_T *su; +{ + langp_T *lp; + char_u badsound[MAXWLEN]; + int i; + suggest_T *stp; + suggest_T *sstp; + char_u fword[MAXWLEN]; + char_u goodsound[MAXWLEN]; + int score; + + if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL) + return; + + /* Use the sound-folding of the first language that supports it. */ + for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0); + lp->lp_slang != NULL; ++lp) + if (lp->lp_slang->sl_sal.ga_len > 0) + { + /* soundfold the bad word */ + spell_soundfold(lp->lp_slang, su->su_fbadword, badsound); + + for (i = 0; i < su->su_ga.ga_len; ++i) + { + stp = &SUG(su->su_ga, i); + + /* Case-fold the suggested word and sound-fold it. */ + (void)spell_casefold(stp->st_word, STRLEN(stp->st_word), + fword, MAXWLEN); + spell_soundfold(lp->lp_slang, fword, goodsound); + score = soundalike_score(goodsound, badsound); + if (score < SCORE_MAXMAX) + { + /* Add the suggestion. */ + sstp = &SUG(su->su_sga, su->su_sga.ga_len); + sstp->st_word = vim_strsave(stp->st_word); + if (sstp->st_word != NULL) + { + sstp->st_score = score; + sstp->st_altscore = 0; + sstp->st_orglen = stp->st_orglen; + ++su->su_sga.ga_len; + } + } + } + break; + } +} + +/* + * Combine the list of suggestions in su->su_ga and su->su_sga. + * They are intwined. + */ + static void +score_combine(su) + suginfo_T *su; +{ + int i; + int j; + garray_T ga; + garray_T *gap; + langp_T *lp; + suggest_T *stp; + char_u *p; + char_u badsound[MAXWLEN]; + char_u goodsound[MAXWLEN]; + char_u fword[MAXWLEN]; + int round; + + /* Add the alternate score to su_ga. */ + for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0); + lp->lp_slang != NULL; ++lp) + { + if (lp->lp_slang->sl_sal.ga_len > 0) + { + /* soundfold the bad word */ + spell_soundfold(lp->lp_slang, su->su_fbadword, badsound); + + for (i = 0; i < su->su_ga.ga_len; ++i) + { + stp = &SUG(su->su_ga, i); + + /* Case-fold the word, sound-fold the word and compute the + * score for the difference. */ + (void)spell_casefold(stp->st_word, STRLEN(stp->st_word), + fword, MAXWLEN); + spell_soundfold(lp->lp_slang, fword, goodsound); + stp->st_altscore = soundalike_score(goodsound, badsound); + if (stp->st_altscore == SCORE_MAXMAX) + stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; + else + stp->st_score = (stp->st_score * 3 + + stp->st_altscore) / 4; + stp->st_salscore = FALSE; + } + break; + } + } + + /* Add the alternate score to su_sga. */ + for (i = 0; i < su->su_sga.ga_len; ++i) + { + stp = &SUG(su->su_sga, i); + stp->st_altscore = spell_edit_score(su->su_badword, stp->st_word); + if (stp->st_score == SCORE_MAXMAX) + stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; + else + stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; + stp->st_salscore = TRUE; + } + + /* Sort the suggestions and truncate at "maxcount" for both lists. */ + (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); + (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); + + ga_init2(&ga, (int)sizeof(suginfo_T), 1); + if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) + return; + + stp = &SUG(ga, 0); + for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) + { + /* round 1: get a suggestion from su_ga + * round 2: get a suggestion from su_sga */ + for (round = 1; round <= 2; ++round) + { + gap = round == 1 ? &su->su_ga : &su->su_sga; + if (i < gap->ga_len) + { + /* Don't add a word if it's already there. */ + p = SUG(*gap, i).st_word; + for (j = 0; j < ga.ga_len; ++j) + if (STRCMP(stp[j].st_word, p) == 0) + break; + if (j == ga.ga_len) + stp[ga.ga_len++] = SUG(*gap, i); + else + vim_free(p); + } + } + } + + ga_clear(&su->su_ga); + ga_clear(&su->su_sga); + + /* Truncate the list to the number of suggestions that will be displayed. */ + if (ga.ga_len > su->su_maxcount) + { + for (i = su->su_maxcount; i < ga.ga_len; ++i) + vim_free(stp[i].st_word); + ga.ga_len = su->su_maxcount; + } + + su->su_ga = ga; +} + +/* * Find suggestions by comparing the word in a sound-a-like form. */ static void @@ -5970,9 +6329,9 @@ spell_try_soundalike(su) idx_T n; int round; int flags; - int score, sound_score; - char_u *bp, *sp; + int sound_score; + /* Do this for all languages that support sound folding. */ for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0); lp->lp_slang != NULL; ++lp) { @@ -6033,40 +6392,15 @@ spell_try_soundalike(su) tfword, tsalword); } - /* - * Accept the word if the sound-folded words - * are (almost) equal. - */ - for (bp = salword, sp = tsalword; *bp == *sp; - ++bp, ++sp) - if (*bp == NUL) - break; - - if (*bp == *sp) - /* equal */ - sound_score = 0; - else if (*bp != NUL && bp[1] != NUL - && *bp == sp[1] && bp[1] == *sp - && STRCMP(bp + 2, sp + 2) == 0) - /* swap two bytes */ - sound_score = SCORE_SWAP; - else if (STRCMP(bp + 1, sp) == 0) - /* delete byte */ - sound_score = SCORE_DEL; - else if (STRCMP(bp, sp + 1) == 0) - /* insert byte */ - sound_score = SCORE_INS; - else if (STRCMP(bp + 1, sp + 1) == 0) - /* skip one byte */ - sound_score = SCORE_SUBST; - else - /* not equal or similar */ - sound_score = SCORE_MAXMAX; - + /* Compute the edit distance between the + * sound-a-like words. */ + sound_score = soundalike_score(salword, + tsalword); if (sound_score < SCORE_MAXMAX) { char_u cword[MAXWLEN]; char_u *p; + int score; if (round == 1 && flags != 0) { @@ -6078,18 +6412,25 @@ spell_try_soundalike(su) else p = tword; - /* Compute the score. */ - score = spell_edit_score(su->su_badword, p); -#ifdef RESCORE - /* give a bonus for the good word sounding - * the same as the bad word */ - add_suggestion(su, tword, - RESCORE(score, sound_score), + if (sps_flags & SPS_DOUBLE) + add_suggestion(su, &su->su_sga, p, + sound_score, FALSE); + else + { + /* Compute the score. */ + score = spell_edit_score( + su->su_badword, p); + if (sps_flags & SPS_BEST) + /* give a bonus for the good word + * sounding the same as the bad + * word */ + add_suggestion(su, &su->su_ga, p, + RESCORE(score, sound_score), TRUE); -#else - add_suggestion(su, tword, - score + sound_score); -#endif + else + add_suggestion(su, &su->su_ga, p, + score + sound_score, FALSE); + } } } @@ -6275,24 +6616,15 @@ similar_chars(slang, c1, c2) * with spell_edit_score(). */ static void -add_suggestion(su, goodword, score -#ifdef RESCORE - , had_bonus -#endif - ) +add_suggestion(su, gap, goodword, score, had_bonus) suginfo_T *su; + garray_T *gap; char_u *goodword; int score; -#ifdef RESCORE - int had_bonus; /* set st_had_bonus */ -#endif + int had_bonus; /* value for st_had_bonus */ { suggest_T *stp; int i; -#ifdef SOUNDFOLD_SCORE - char_u fword[MAXWLEN]; - char_u salword[MAXWLEN]; -#endif /* Check that the word wasn't banned. */ if (was_banned(su, goodword)) @@ -6300,47 +6632,38 @@ add_suggestion(su, goodword, score if (score <= su->su_maxscore) { -#ifdef SOUNDFOLD_SCORE - /* Add to the score when the word sounds differently. - * This is slow... */ - if (su->su_slang->sl_sal.ga_len > 0) - score += spell_sound_score(su->su_slang, fword, su->su_salword); -#endif - /* Check if the word is already there. */ - stp = &SUG(su, 0); - for (i = su->su_ga.ga_len - 1; i >= 0; --i) + stp = &SUG(*gap, 0); + for (i = gap->ga_len - 1; i >= 0; --i) if (STRCMP(stp[i].st_word, goodword) == 0) { /* Found it. Remember the lowest score. */ if (stp[i].st_score > score) { stp[i].st_score = score; -#ifdef RESCORE stp[i].st_had_bonus = had_bonus; -#endif } break; } - if (i < 0 && ga_grow(&su->su_ga, 1) == OK) + if (i < 0 && ga_grow(gap, 1) == OK) { /* Add a suggestion. */ - stp = &SUG(su, su->su_ga.ga_len); + stp = &SUG(*gap, gap->ga_len); stp->st_word = vim_strsave(goodword); if (stp->st_word != NULL) { stp->st_score = score; -#ifdef RESCORE + stp->st_altscore = 0; stp->st_had_bonus = had_bonus; -#endif stp->st_orglen = su->su_badlen; - ++su->su_ga.ga_len; + ++gap->ga_len; /* If we have too many suggestions now, sort the list and keep * the best suggestions. */ - if (su->su_ga.ga_len > SUG_MAX_COUNT) - cleanup_suggestions(su, SUG_CLEAN_COUNT); + if (gap->ga_len > SUG_MAX_COUNT(su)) + su->su_maxscore = cleanup_suggestions(gap, su->su_maxscore, + SUG_CLEAN_COUNT(su)); } } } @@ -6402,7 +6725,6 @@ free_banned(su) hash_clear(&su->su_banned); } -#ifdef RESCORE /* * Recompute the score if sound-folding is possible. This is slow, * thus only done for the final results. @@ -6427,7 +6749,7 @@ rescore_suggestions(su) for (i = 0; i < su->su_ga.ga_len; ++i) { - stp = &SUG(su, i); + stp = &SUG(su->su_ga, i); if (!stp->st_had_bonus) { score = spell_sound_score(lp->lp_slang, stp->st_word, @@ -6439,7 +6761,6 @@ rescore_suggestions(su) } } } -#endif static int #ifdef __BORLANDC__ @@ -6460,35 +6781,40 @@ sug_compare(s1, s2) { suggest_T *p1 = (suggest_T *)s1; suggest_T *p2 = (suggest_T *)s2; + int n = p1->st_score - p2->st_score; - return p1->st_score - p2->st_score; + if (n == 0) + return p1->st_altscore - p2->st_altscore; + return n; } /* * Cleanup the suggestions: * - Sort on score. * - Remove words that won't be displayed. + * Returns the maximum score in the list or "maxscore" unmodified. */ - static void -cleanup_suggestions(su, keep) - suginfo_T *su; + static int +cleanup_suggestions(gap, maxscore, keep) + garray_T *gap; + int maxscore; int keep; /* nr of suggestions to keep */ { - suggest_T *stp = &SUG(su, 0); + suggest_T *stp = &SUG(*gap, 0); int i; /* Sort the list. */ - qsort(su->su_ga.ga_data, (size_t)su->su_ga.ga_len, - sizeof(suggest_T), sug_compare); + qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); /* Truncate the list to the number of suggestions that will be displayed. */ - if (su->su_ga.ga_len > keep) + if (gap->ga_len > keep) { - for (i = keep; i < su->su_ga.ga_len; ++i) + for (i = keep; i < gap->ga_len; ++i) vim_free(stp[i].st_word); - su->su_ga.ga_len = keep; - su->su_maxscore = stp[keep - 1].st_score; + gap->ga_len = keep; + return stp[keep - 1].st_score; } + return maxscore; } /* @@ -6500,7 +6826,7 @@ spell_soundfold(slang, inword, res) char_u *inword; char_u *res; { - fromto_T *ftp; + salitem_T *smp; char_u word[MAXWLEN]; #ifdef FEAT_MBYTE int l; @@ -6508,7 +6834,9 @@ spell_soundfold(slang, inword, res) #endif char_u *s; char_u *t; + char_u *pf; int i, j, z; + int reslen; int n, k = 0; int z0; int k0; @@ -6526,7 +6854,10 @@ spell_soundfold(slang, inword, res) for (s = inword; *s != NUL; ) { if (vim_iswhite(*s)) - *t++ = *s++; + { + *t++ = ' '; + s = skipwhite(s); + } #ifdef FEAT_MBYTE else if (has_mbyte) { @@ -6574,55 +6905,53 @@ spell_soundfold(slang, inword, res) } #endif - ftp = (fromto_T *)slang->sl_sal.ga_data; + smp = (salitem_T *)slang->sl_sal.ga_data; /* * This comes from Aspell phonet.cpp. Converted from C++ to C. * Changed to keep spaces. * TODO: support for multi-byte chars. */ - i = j = z = 0; + i = reslen = z = 0; while ((c = word[i]) != NUL) { + /* Start with the first rule that has the character in the word. */ n = slang->sl_sal_first[c]; z0 = 0; if (n >= 0) { /* check all rules for the same letter */ - while (ftp[n].ft_from[0] == c) + for (; (s = smp[n].sm_lead)[0] == c; ++n) { - /* check whole string */ - k = 1; /* number of found letters */ - pri = 5; /* default priority */ - s = ftp[n].ft_from; - s++; /* important for (see below) "*(s-1)" */ - - /* Skip over normal letters that match with the word. */ - while (*s != NUL && word[i + k] == *s - && !vim_isdigit(*s) && strchr("(-<^$", *s) == NULL) + /* Quickly skip entries that don't match the word. Most + * entries are less then three chars, optimize for that. */ + k = smp[n].sm_leadlen; + if (k > 1) { - k++; - s++; + if (word[i + 1] != s[1]) + continue; + if (k > 2) + { + for (j = 2; j < k; ++j) + if (word[i + j] != s[j]) + break; + if (j < k) + continue; + } } - if (*s == '(') + if ((pf = smp[n].sm_oneoff) != NULL) { - /* check alternate letters in "(..)" */ - for (t = s + 1; *t != ')' && *t != NUL; ++t) - if (*t == word[i + k]) - { - /* match */ - ++k; - for (s = t + 1; *s != NUL; ++s) - if (*s == ')') - { - ++s; - break; - } - break; - } + /* Check for match with one of the chars in "sm_oneoff". */ + while (*pf != NUL && *pf != word[i + k]) + ++pf; + if (*pf == NUL) + continue; + ++k; } + s = smp[n].sm_rules; + pri = 5; /* default priority */ p0 = *s; k0 = k; @@ -6633,7 +6962,7 @@ spell_soundfold(slang, inword, res) } if (*s == '<') s++; - if (vim_isdigit(*s)) + if (VIM_ISDIGIT(*s)) { /* determine priority */ pri = *s - '0'; @@ -6658,50 +6987,52 @@ spell_soundfold(slang, inword, res) n0 = slang->sl_sal_first[c0]; if (slang->sl_followup && k > 1 && n0 >= 0 - && p0 != '-' && word[i + k] != NUL) + && p0 != '-' && word[i + k] != NUL) { /* test follow-up rule for "word[i + k]" */ - while (ftp[n0].ft_from[0] == c0) + for ( ; (s = smp[n0].sm_lead)[0] == c0; ++n0) { - - /* check whole string */ - k0 = k; - p0 = 5; - s = ftp[n0].ft_from; - s++; - while (*s != NUL && word[i+k0] == *s - && !vim_isdigit(*s) - && strchr("(-<^$",*s) == NULL) + /* Quickly skip entries that don't match the word. + * */ + k0 = smp[n0].sm_leadlen; + if (k0 > 1) { - k0++; - s++; + if (word[i + k] != s[1]) + continue; + if (k0 > 2) + { + pf = word + i + k + 1; + for (j = 2; j < k0; ++j) + if (*pf++ != s[j]) + break; + if (j < k0) + continue; + } } - if (*s == '(') + k0 += k - 1; + + if ((pf = smp[n0].sm_oneoff) != NULL) { - /* check alternate letters in "(..)" */ - for (t = s + 1; *t != ')' && *t != NUL; ++t) - if (*t == word[i + k0]) - { - /* match */ - ++k0; - for (s = t + 1; *s != NUL; ++s) - if (*s == ')') - { - ++s; - break; - } - break; - } + /* Check for match with one of the chars in + * "sm_oneoff". */ + while (*pf != NUL && *pf != word[i + k0]) + ++pf; + if (*pf == NUL) + continue; + ++k0; } + + p0 = 5; + s = smp[n0].sm_rules; while (*s == '-') { - /* "k0" gets NOT reduced */ - /* because "if (k0 == k)" */ + /* "k0" gets NOT reduced because + * "if (k0 == k)" */ s++; } if (*s == '<') s++; - if (vim_isdigit(*s)) + if (VIM_ISDIGIT(*s)) { p0 = *s - '0'; s++; @@ -6713,42 +7044,31 @@ spell_soundfold(slang, inword, res) && !SPELL_ISWORDP(word + i + k0))) { if (k0 == k) - { /* this is just a piece of the string */ - ++n0; continue; - } if (p0 < pri) - { /* priority too low */ - ++n0; continue; - } /* rule fits; stop search */ break; } - ++n0; } - if (p0 >= pri && ftp[n0].ft_from[0] == c0) - { - ++n; + if (p0 >= pri && smp[n0].sm_lead[0] == c0) continue; - } } /* replace string */ - s = ftp[n].ft_to; - p0 = (ftp[n].ft_from[0] != NUL - && vim_strchr(ftp[n].ft_from + 1, - '<') != NULL) ? 1 : 0; + s = smp[n].sm_to; + pf = smp[n].sm_rules; + p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0; if (p0 == 1 && z == 0) { /* rule with '<' is used */ - if (j > 0 && *s != NUL - && (res[j - 1] == c || res[j - 1] == *s)) - j--; + if (reslen > 0 && *s != NUL && (res[reslen - 1] == c + || res[reslen - 1] == *s)) + reslen--; z0 = 1; z = 1; k0 = 0; @@ -6770,25 +7090,23 @@ spell_soundfold(slang, inword, res) /* no '<' rule used */ i += k - 1; z = 0; - while (*s != NUL && s[1] != NUL && j < MAXWLEN) + while (*s != NUL && s[1] != NUL && reslen < MAXWLEN) { - if (j == 0 || res[j - 1] != *s) + if (reslen == 0 || res[reslen - 1] != *s) { - res[j] = *s; - j++; + res[reslen] = *s; + reslen++; } s++; } /* new "actual letter" */ c = *s; - if (ftp[n].ft_from[0] != NUL - && strstr((char *)ftp[n].ft_from + 1, - "^^") != NULL) + if (strstr((char *)pf, "^^") != NULL) { if (c != NUL) { - res[j] = c; - j++; + res[reslen] = c; + reslen++; } mch_memmove(word, word + i + 1, STRLEN(word + i + 1) + 1); @@ -6798,7 +7116,6 @@ spell_soundfold(slang, inword, res) } break; } - ++n; } } else if (vim_iswhite(c)) @@ -6809,12 +7126,13 @@ spell_soundfold(slang, inword, res) if (z0 == 0) { - if (k && !p0 && j < MAXWLEN && c != NUL - && (!slang->sl_collapse || j == 0 || res[j - 1] != c)) + if (k && !p0 && reslen < MAXWLEN && c != NUL + && (!slang->sl_collapse || reslen == 0 + || res[reslen - 1] != c)) { /* condense only double letters */ - res[j] = c; - j++; + res[reslen] = c; + reslen++; } i++; @@ -6823,10 +7141,9 @@ spell_soundfold(slang, inword, res) } } - res[j] = NUL; + res[reslen] = NUL; } -#if defined(RESCORE) || defined(SOUNDFOLD_SCORE) /* * Return the score for how much words sound different. */ @@ -6840,13 +7157,14 @@ spell_sound_score(slang, goodword, badsound) char_u goodsound[MAXWLEN]; int score; - /* Case-fold the word, needed for sound folding. */ + /* Case-fold the goodword, needed for sound folding. */ (void)spell_casefold(goodword, STRLEN(goodword), fword, MAXWLEN); - /* sound-fold the good word */ + /* sound-fold the goodword */ spell_soundfold(slang, fword, goodsound); - /* compute the edit distance-score of the sounds */ + /* Compute the edit distance-score of the sounds. This is slow but we + * only do it for a small number of words. */ score = spell_edit_score(badsound, goodsound); /* Correction: adding/inserting "*" at the start (word starts with vowel) @@ -6857,11 +7175,207 @@ spell_sound_score(slang, goodword, badsound) return score; } -#endif + +/* + * Compute a score for two sound-a-like words. + * This permits up to two inserts/deletes/swaps/etc. to keep things fast. + * Instead of a generic loop we write out the code. That keeps it fast by + * avoiding checks that will not be possible. + */ + static int +soundalike_score(goodsound, badsound) + char_u *goodsound; /* sound-folded good word */ + char_u *badsound; /* sound-folded bad word */ +{ + int goodlen = STRLEN(goodsound); + int badlen = STRLEN(badsound); + int n; + char_u *pl, *ps; + char_u *pl2, *ps2; + + /* Return quickly if the lenghts are too different to be fixed by two + * changes. */ + n = goodlen - badlen; + if (n < -2 || n > 2) + return SCORE_MAXMAX; + + if (n > 0) + { + pl = goodsound; /* longest */ + ps = badsound; + } + else + { + pl = badsound; /* longest */ + ps = goodsound; + } + + /* Skip over the identical part. */ + while (*pl == *ps && *pl != NUL) + { + ++pl; + ++ps; + } + + switch (n) + { + case -2: + case 2: + /* + * Must delete two characters from "pl". + */ + ++pl; /* first delete */ + while (*pl == *ps) + { + ++pl; + ++ps; + } + /* strings must be equal after second delete */ + if (STRCMP(pl + 1, ps) == 0) + return SCORE_DEL * 2; + + /* Failed to compare. */ + break; + + case -1: + case 1: + /* + * Minimal one delete from "pl" required. + */ + + /* 1: delete */ + pl2 = pl + 1; + ps2 = ps; + while (*pl2 == *ps2) + { + if (*pl2 == NUL) /* reached the end */ + return SCORE_DEL; + ++pl2; + ++ps2; + } + + /* 2: delete then swap, then rest must be equal */ + if (pl2[0] == ps2[1] && pl2[1] == ps2[0] + && STRCMP(pl2 + 2, ps2 + 2) == 0) + return SCORE_DEL + SCORE_SWAP; + + /* 3: delete then substitute, then the rest must be equal */ + if (STRCMP(pl2 + 1, ps2 + 1) == 0) + return SCORE_DEL + SCORE_SUBST; + + /* 4: first swap then delete */ + if (pl[0] == ps[1] && pl[1] == ps[0]) + { + pl2 = pl + 2; /* swap, skip two chars */ + ps2 = ps + 2; + while (*pl2 == *ps2) + { + ++pl2; + ++ps2; + } + /* delete a char and then strings must be equal */ + if (STRCMP(pl2 + 1, ps2) == 0) + return SCORE_SWAP + SCORE_DEL; + } + + /* 5: first substitute then delete */ + pl2 = pl + 1; /* substitute, skip one char */ + ps2 = ps + 1; + while (*pl2 == *ps2) + { + ++pl2; + ++ps2; + } + /* delete a char and then strings must be equal */ + if (STRCMP(pl2 + 1, ps2) == 0) + return SCORE_SUBST + SCORE_DEL; + + /* Failed to compare. */ + break; + + case 0: + /* + * Lenghts are equal, thus changes must result in same length: An + * insert is only possible in combination with a delete. + * 1: check if for identical strings + */ + if (*pl == NUL) + return 0; + + /* 2: swap */ + if (pl[0] == ps[1] && pl[1] == ps[0]) + { + pl2 = pl + 2; /* swap, skip two chars */ + ps2 = ps + 2; + while (*pl2 == *ps2) + { + if (*pl2 == NUL) /* reached the end */ + return SCORE_SWAP; + ++pl2; + ++ps2; + } + /* 3: swap and swap again */ + if (pl2[0] == ps2[1] && pl2[1] == ps2[0] + && STRCMP(pl2 + 2, ps2 + 2) == 0) + return SCORE_SWAP + SCORE_SWAP; + + /* 4: swap and substitute */ + if (STRCMP(pl2 + 1, ps2 + 1) == 0) + return SCORE_SWAP + SCORE_SUBST; + } + + /* 5: substitute */ + pl2 = pl + 1; + ps2 = ps + 1; + while (*pl2 == *ps2) + { + if (*pl2 == NUL) /* reached the end */ + return SCORE_SUBST; + ++pl2; + ++ps2; + } + + /* 6: substitute and swap */ + if (pl2[0] == ps2[1] && pl2[1] == ps2[0] + && STRCMP(pl2 + 2, ps2 + 2) == 0) + return SCORE_SUBST + SCORE_SWAP; + + /* 7: substitute and substitute */ + if (STRCMP(pl2 + 1, ps2 + 1) == 0) + return SCORE_SUBST + SCORE_SUBST; + + /* 8: insert then delete */ + pl2 = pl; + ps2 = ps + 1; + while (*pl2 == *ps2) + { + ++pl2; + ++ps2; + } + if (STRCMP(pl2 + 1, ps2) == 0) + return SCORE_INS + SCORE_DEL; + + /* 9: delete then insert */ + pl2 = pl + 1; + ps2 = ps; + while (*pl2 == *ps2) + { + ++pl2; + ++ps2; + } + if (STRCMP(pl2, ps2 + 1) == 0) + return SCORE_INS + SCORE_DEL; + + /* Failed to compare. */ + break; + } + + return SCORE_MAXMAX; +} /* * Compute the "edit distance" to turn "badword" into "goodword". The less - * deletes/inserts/swaps are required the lower the score. + * deletes/inserts/substitutes/swaps are required the lower the score. * * The algorithm comes from Aspell editdist.cpp, edit_distance(). * It has been converted from C++ to C and modified to support multi-byte @@ -6969,7 +7483,10 @@ spell_edit_score(badword, goodword) } } } - return CNT(badlen - 1, goodlen - 1); + + i = CNT(badlen - 1, goodlen - 1); + vim_free(cnt); + return i; } #endif /* FEAT_SYN_HL */ |