diff options
author | Paolo Bonzini <bonzini@gnu.org> | 2011-06-07 10:33:59 +0200 |
---|---|---|
committer | Paolo Bonzini <bonzini@gnu.org> | 2011-06-07 13:24:58 +0200 |
commit | c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968 (patch) | |
tree | 9dba90d74e49b6856af0533f29eb15528d91f1b5 | |
parent | b5cd4788533da574c202219de76f82c5cda9cb20 (diff) | |
download | grep-c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968.tar.gz |
dfa: correct handling of single-byte character ranges
This provides a better fix for the unibyte-bracket-expr and high-bit-range
test cases, and fixes the latent bug tested by bogus-wctob.
* src/dfa.c (setbit_case_fold): Remove, replace with...
(setbit_wc, setbit_c, setbit_case_fold_c): ... these.
(parse_bracket_exp): Use setbit_case_fold_c when iterating over
single-byte sequences. Use setbit_wc for multi-byte character sets,
and setbit_case_fold_c for single-byte character sets.
(lex): Use setbit_case_fold_c for single-byte character sets.
-rw-r--r-- | src/dfa.c | 104 |
1 file changed, 55 insertions, 49 deletions
@@ -536,55 +536,65 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) eolbyte = eol; } -/* Like setbit, but if case is folded, set both cases of a letter. - For MB_CUR_MAX > 1, one or both of the two cases may not be set, - so the resulting charset may only be used as an optimization. */ -static void -setbit_case_fold ( +/* Set a bit in the charclass for the given wchar_t. Do nothing if WC + is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, + this may happen when folding case in weird Turkish locales where + dotless i/dotted I are not included in the chosen character set. + Return whether a bit was set in the charclass. */ #if MBS_SUPPORT - wint_t b, +static bool +setbit_wc (wint_t wc, charclass c) +{ + int b = wctob (wc); + if (b == EOF) + return false; + + setbit (b, c); + return true; +} + +/* Set a bit in the charclass for the given single byte character, + if it is valid in the current character set. */ +static void +setbit_c (int b, charclass c) +{ + /* Do nothing if b is invalid in this character set. */ + if (MB_CUR_MAX > 1 && btowc (b) == EOF) + return; + setbit (b, c); +} #else - unsigned int b, +#define setbit_c setbit #endif - charclass c) + +/* Like setbit_c, but if case is folded, set both cases of a letter. For + MB_CUR_MAX > 1, the resulting charset is only used as an optimization, + and the caller takes care of setting the appropriate field of struct + mb_char_classes. */ +static void +setbit_case_fold_c (int b, charclass c) { - if (case_fold) - { #if MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - wint_t b1 = iswupper(b) ? towlower(b) : b; - wint_t b2 = iswlower(b) ? towupper(b) : b; - if (wctob ((unsigned char)b1) == b1) - setbit (b1, c); - if (b2 != b1 && wctob ((unsigned char)b2) == b2) - setbit (b2, c); - } - else -#endif - { - unsigned char b1 = isupper(b) ? tolower(b) : b; - unsigned char b2 = islower(b) ? 
toupper(b) : b; - setbit (b1, c); - if (b2 != b1) - setbit (b2, c); - } + if (MB_CUR_MAX > 1) + { + wint_t wc = btowc (b); + if (wc == EOF) + return; + setbit (b, c); + if (case_fold && iswalpha (wc)) + setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c); } else - { -#if MBS_SUPPORT - /* Below, note how when b2 != b and we have a uni-byte locale - (MB_CUR_MAX == 1), we set b = b2. I.e., in a uni-byte locale, - we can safely call setbit with a non-EOF value returned by wctob. */ - int b2 = wctob (b); - if (b2 == EOF || b2 == b || (MB_CUR_MAX == 1 ? (b=b2), 1 : 0)) #endif - if (b < NOTCHAR) - setbit (b, c); + { + setbit (b, c); + if (case_fold && isalpha (b)) + setbit_c (isupper (b) ? tolower (b) : toupper (b), c); } } + /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ static inline int @@ -863,7 +873,7 @@ parse_bracket_exp (void) for (c2 = 0; c2 < NOTCHAR; ++c2) if (pred->func(c2)) - setbit_case_fold (c2, ccl); + setbit_case_fold_c (c2, ccl); } #if MBS_SUPPORT @@ -974,7 +984,7 @@ parse_bracket_exp (void) } if (!hard_LC_COLLATE) for (c = c1; c <= c2; c++) - setbit_case_fold (c, ccl); + setbit_case_fold_c (c, ccl); else { /* Defer to the system regex library about the meaning @@ -988,7 +998,7 @@ parse_bracket_exp (void) subject[0] = c; if (!(case_fold && isupper (c)) && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH) - setbit_case_fold (c, ccl); + setbit_case_fold_c (c, ccl); } regfree (&re); } @@ -1002,15 +1012,12 @@ parse_bracket_exp (void) colon_warning_state |= (c == ':') ? 2 : 4; #if MBS_SUPPORT - /* Build normal characters. 
*/ - setbit_case_fold (wc, ccl); if (MB_CUR_MAX > 1) { if (case_fold && iswalpha(wc)) { wc = towlower(wc); - c = wctob(wc); - if (c == EOF || (wint_t)c != (wint_t)wc) + if (!setbit_wc (wc, ccl)) { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); @@ -1020,19 +1027,18 @@ parse_bracket_exp (void) continue; #else wc = towupper(wc); - c = wctob(wc); #endif } - if (c == EOF || (wint_t)c != (wint_t)wc) + if (!setbit_wc (wc, ccl)) { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = wc; } } -#else - setbit_case_fold (c, ccl); + else #endif + setbit_case_fold_c (c, ccl); } while (( #if MBS_SUPPORT @@ -1386,7 +1392,7 @@ lex (void) if (case_fold && isalpha(c)) { zeroset(ccl); - setbit_case_fold (c, ccl); + setbit_case_fold_c (c, ccl); return lasttok = CSET + charclass_index(ccl); } |