summary refs log tree commit diff
diff options
context:
space:
mode:
author Paolo Bonzini <bonzini@gnu.org> 2011-06-07 10:33:59 +0200
committer Paolo Bonzini <bonzini@gnu.org> 2011-06-07 13:24:58 +0200
commit c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968 (patch)
tree 9dba90d74e49b6856af0533f29eb15528d91f1b5
parent b5cd4788533da574c202219de76f82c5cda9cb20 (diff)
download grep-c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968.tar.gz
dfa: correct handling of single-byte character ranges
This provides a better fix for the unibyte-bracket-expr and
high-bit-range testcases, and fixes the latent bug tested by
bogus-wctob.

* src/dfa.c (setbit_case_fold): Remove, replace with...
(setbit_wc, setbit_c, setbit_case_fold_c): ... these.
(parse_bracket_exp): Use setbit_case_fold_c when iterating over
single-byte sequences.  Use setbit_wc for multi-byte character sets,
and setbit_case_fold_c for single-byte character sets.
(lex): Use setbit_case_fold_c for single-byte character sets.
-rw-r--r-- src/dfa.c 104
1 files changed, 55 insertions, 49 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 83386aae..ba3a27cb 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -536,55 +536,65 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
eolbyte = eol;
}
-/* Like setbit, but if case is folded, set both cases of a letter.
- For MB_CUR_MAX > 1, one or both of the two cases may not be set,
- so the resulting charset may only be used as an optimization. */
-static void
-setbit_case_fold (
+/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
+ is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
+ this may happen when folding case in weird Turkish locales where
+ dotless i/dotted I are not included in the chosen character set.
+ Return whether a bit was set in the charclass. */
#if MBS_SUPPORT
- wint_t b,
+static bool
+setbit_wc (wint_t wc, charclass c)
+{
+ int b = wctob (wc);
+ if (b == EOF)
+ return false;
+
+ setbit (b, c);
+ return true;
+}
+
+/* Set a bit in the charclass for the given single byte character,
+ if it is valid in the current character set. */
+static void
+setbit_c (int b, charclass c)
+{
+ /* Do nothing if b is invalid in this character set. */
+ if (MB_CUR_MAX > 1 && btowc (b) == EOF)
+ return;
+ setbit (b, c);
+}
#else
- unsigned int b,
+#define setbit_c setbit
#endif
- charclass c)
+
+/* Like setbit_c, but if case is folded, set both cases of a letter. For
+ MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
+ and the caller takes care of setting the appropriate field of struct
+ mb_char_classes. */
+static void
+setbit_case_fold_c (int b, charclass c)
{
- if (case_fold)
- {
#if MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- wint_t b1 = iswupper(b) ? towlower(b) : b;
- wint_t b2 = iswlower(b) ? towupper(b) : b;
- if (wctob ((unsigned char)b1) == b1)
- setbit (b1, c);
- if (b2 != b1 && wctob ((unsigned char)b2) == b2)
- setbit (b2, c);
- }
- else
-#endif
- {
- unsigned char b1 = isupper(b) ? tolower(b) : b;
- unsigned char b2 = islower(b) ? toupper(b) : b;
- setbit (b1, c);
- if (b2 != b1)
- setbit (b2, c);
- }
+ if (MB_CUR_MAX > 1)
+ {
+ wint_t wc = btowc (b);
+ if (wc == EOF)
+ return;
+ setbit (b, c);
+ if (case_fold && iswalpha (wc))
+ setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
}
else
- {
-#if MBS_SUPPORT
- /* Below, note how when b2 != b and we have a uni-byte locale
- (MB_CUR_MAX == 1), we set b = b2. I.e., in a uni-byte locale,
- we can safely call setbit with a non-EOF value returned by wctob. */
- int b2 = wctob (b);
- if (b2 == EOF || b2 == b || (MB_CUR_MAX == 1 ? (b=b2), 1 : 0))
#endif
- if (b < NOTCHAR)
- setbit (b, c);
+ {
+ setbit (b, c);
+ if (case_fold && isalpha (b))
+ setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
}
}
+
/* UTF-8 encoding allows some optimizations that we can't otherwise
assume in a multibyte encoding. */
static inline int
@@ -863,7 +873,7 @@ parse_bracket_exp (void)
for (c2 = 0; c2 < NOTCHAR; ++c2)
if (pred->func(c2))
- setbit_case_fold (c2, ccl);
+ setbit_case_fold_c (c2, ccl);
}
#if MBS_SUPPORT
@@ -974,7 +984,7 @@ parse_bracket_exp (void)
}
if (!hard_LC_COLLATE)
for (c = c1; c <= c2; c++)
- setbit_case_fold (c, ccl);
+ setbit_case_fold_c (c, ccl);
else
{
/* Defer to the system regex library about the meaning
@@ -988,7 +998,7 @@ parse_bracket_exp (void)
subject[0] = c;
if (!(case_fold && isupper (c))
&& regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
- setbit_case_fold (c, ccl);
+ setbit_case_fold_c (c, ccl);
}
regfree (&re);
}
@@ -1002,15 +1012,12 @@ parse_bracket_exp (void)
colon_warning_state |= (c == ':') ? 2 : 4;
#if MBS_SUPPORT
- /* Build normal characters. */
- setbit_case_fold (wc, ccl);
if (MB_CUR_MAX > 1)
{
if (case_fold && iswalpha(wc))
{
wc = towlower(wc);
- c = wctob(wc);
- if (c == EOF || (wint_t)c != (wint_t)wc)
+ if (!setbit_wc (wc, ccl))
{
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
@@ -1020,19 +1027,18 @@ parse_bracket_exp (void)
continue;
#else
wc = towupper(wc);
- c = wctob(wc);
#endif
}
- if (c == EOF || (wint_t)c != (wint_t)wc)
+ if (!setbit_wc (wc, ccl))
{
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
work_mbc->chars[work_mbc->nchars++] = wc;
}
}
-#else
- setbit_case_fold (c, ccl);
+ else
#endif
+ setbit_case_fold_c (c, ccl);
}
while ((
#if MBS_SUPPORT
@@ -1386,7 +1392,7 @@ lex (void)
if (case_fold && isalpha(c))
{
zeroset(ccl);
- setbit_case_fold (c, ccl);
+ setbit_case_fold_c (c, ccl);
return lasttok = CSET + charclass_index(ccl);
}