summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Norihiro Tanaka <noritnk@kcn.ne.jp> 2014-04-30 11:22:27 +0900
committer: Paul Eggert <eggert@cs.ucla.edu> 2014-05-04 19:35:48 -0700
commit: fb7d53887851476c84f38ecc9a63901d5d620806 (patch)
tree: abdaed7e10c720b67c29bcaaaab9b1396cb06624
parent: a159d14c79c86c441e834a513e7b27ea735c26ff (diff)
download: grep-fb7d53887851476c84f38ecc9a63901d5d620806.tar.gz
grep: make KWset and DFA agree about invalid sequences in patterns
See: http://bugs.gnu.org/17376
* src/dfa.c (dfambcache): Don't cache invalid sequences, because
they can't be represented by wide characters.
(dfambcache, mbs_to_wchar): Return WEOF for invalid sequences.
(ctok): New global variable.
(parse_bracket_exp, atom, match_anychar, match_mb_charset):
Don't allow WEOF.
(lex): Set 'ctok'.
* src/kwsearch.c (Fexecute):
* src/searchutils.c (is_mb_middle): Don't check here.
* tests/invalid-multibyte-infloop: Adjust to fixed behavior.
* tests/prefix-of-multibyte: Add test cases for this bug.
-rw-r--r--  src/dfa.c                        101
-rw-r--r--  src/kwsearch.c                     7
-rw-r--r--  src/searchutils.c                  9
-rwxr-xr-x  tests/invalid-multibyte-infloop   10
-rwxr-xr-x  tests/prefix-of-multibyte         12
5 files changed, 78 insertions, 61 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 45dcc52f..1c220e23 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -456,9 +456,13 @@ dfambcache (struct dfa *d)
wint_t wi;
switch (mbrtowc (&wc, &c, 1, &s))
{
- default: wi = wc; break;
- case (size_t) -2: wi = WEOF; break;
- case (size_t) -1: wi = uc; break;
+ default:
+ wi = wc;
+ break;
+ case (size_t) -1:
+ case (size_t) -2:
+ wi = WEOF;
+ break;
}
d->mbrtowc_cache[uc] = wi;
}
@@ -492,7 +496,6 @@ mbs_to_wchar (wchar_t *pwc, char const *s, size_t n, struct dfa *d)
if (0 < nbytes && nbytes < (size_t) -2)
return nbytes;
memset (&d->mbs, 0, sizeof d->mbs);
- wc = uc;
}
*pwc = wc;
@@ -847,6 +850,8 @@ static int cur_mb_len = 1; /* Length of the multibyte representation of
/* These variables are used only if (MB_CUR_MAX > 1). */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
+static unsigned int ctok; /* Single character representation of the current
+ multibyte character. */
/* Note that characters become unsigned here. */
@@ -1128,19 +1133,22 @@ parse_bracket_exp (void)
to the pair of ranges, [m-z] [M-Z]. Although this code
is wrong in multiple ways, it's never used in practice.
FIXME: Remove this (and related) unused code. */
- work_mbc->ranges
- = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2,
- &ranges_al, sizeof *work_mbc->ranges);
- work_mbc->ranges[work_mbc->nranges].beg
- = case_fold ? towlower (wc) : wc;
- work_mbc->ranges[work_mbc->nranges++].end
- = case_fold ? towlower (wc2) : wc2;
-
- if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ if (wc != WEOF && wc2 != WEOF)
{
- work_mbc->ranges[work_mbc->nranges].beg = towupper (wc);
+ work_mbc->ranges
+ = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2,
+ &ranges_al, sizeof *work_mbc->ranges);
+ work_mbc->ranges[work_mbc->nranges].beg
+ = case_fold ? towlower (wc) : wc;
work_mbc->ranges[work_mbc->nranges++].end
- = towupper (wc2);
+ = case_fold ? towlower (wc2) : wc2;
+
+ if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ {
+ work_mbc->ranges[work_mbc->nranges].beg = towupper (wc);
+ work_mbc->ranges[work_mbc->nranges++].end
+ = towupper (wc2);
+ }
}
}
else if (using_simple_locale ())
@@ -1184,23 +1192,28 @@ parse_bracket_exp (void)
continue;
}
- if (case_fold)
- {
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- int i, n = case_folded_counterparts (wc, folded);
- work_mbc->chars = maybe_realloc (work_mbc->chars,
- work_mbc->nchars + n, &chars_al,
- sizeof *work_mbc->chars);
- for (i = 0; i < n; i++)
- if (!setbit_wc (folded[i], ccl))
- work_mbc->chars[work_mbc->nchars++] = folded[i];
- }
- if (!setbit_wc (wc, ccl))
+ if (wc != WEOF)
{
- work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars,
- &chars_al, sizeof *work_mbc->chars);
- work_mbc->chars[work_mbc->nchars++] = wc;
+ if (case_fold)
+ {
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wc, folded);
+ work_mbc->chars = maybe_realloc (work_mbc->chars,
+ work_mbc->nchars + n, &chars_al,
+ sizeof *work_mbc->chars);
+ for (i = 0; i < n; i++)
+ if (!setbit_wc (folded[i], ccl))
+ work_mbc->chars[work_mbc->nchars++] = folded[i];
+ }
+ else if (!setbit_wc (wc, ccl))
+ {
+ work_mbc->chars = maybe_realloc (work_mbc->chars, work_mbc->nchars,
+ &chars_al, sizeof *work_mbc->chars);
+ work_mbc->chars[work_mbc->nchars++] = wc;
+ }
}
+ else
+ setbit (c, ccl);
}
while ((wc = wc1, (c = c1) != ']'));
@@ -1245,7 +1258,8 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH_WC (c, wctok, NULL);
+ FETCH_WC (ctok, wctok, NULL);
+ c = ctok;
if (c == (unsigned int) EOF)
goto normal_char;
@@ -1776,18 +1790,23 @@ atom (void)
{
if (tok == WCHAR)
{
- addtok_wc (wctok);
-
- if (case_fold)
+ if (wctok != WEOF)
{
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- int i, n = case_folded_counterparts (wctok, folded);
- for (i = 0; i < n; i++)
+ addtok_wc (wctok);
+
+ if (case_fold)
{
- addtok_wc (folded[i]);
- addtok (OR);
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ int i, n = case_folded_counterparts (wctok, folded);
+ for (i = 0; i < n; i++)
+ {
+ addtok_wc (folded[i]);
+ addtok (OR);
+ }
}
}
+ else
+ addtok_mb (ctok, 3);
tok = lex ();
}
@@ -2953,6 +2972,8 @@ match_anychar (struct dfa *d, state_num s, position pos,
if (syntax_bits & RE_DOT_NOT_NULL)
return 0;
}
+ else if (wc == WEOF)
+ return 0;
context = wchar_context (wc);
if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
@@ -2989,6 +3010,8 @@ match_mb_charset (struct dfa *d, state_num s, position pos,
if (syntax_bits & RE_DOT_NOT_NULL)
return 0;
}
+ else if (wc == WEOF)
+ return 0;
context = wchar_context (wc);
if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7c64c86b..46569e9a 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -131,12 +131,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
{
/* The match was a part of multibyte character, advance at least
one byte to ensure no infinite loop happens. */
- mbstate_t s;
- memset (&s, 0, sizeof s);
- size_t mb_len = mbrlen (mb_start, (buf + size) - (beg + offset), &s);
- if (mb_len == (size_t) -2 || mb_len == (size_t) -1)
- goto failure;
- beg = mb_start + mb_len - 1;
+ beg = mb_start;
continue;
}
beg += offset;
diff --git a/src/searchutils.c b/src/searchutils.c
index 6440f073..3c78f31c 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -228,7 +228,6 @@ is_mb_middle (const char **good, const char *buf, const char *end,
size_t match_len)
{
const char *p = *good;
- const char *prev = p;
mbstate_t cur_state;
if (using_utf8 () && buf - p > MB_CUR_MAX)
@@ -250,10 +249,6 @@ is_mb_middle (const char **good, const char *buf, const char *end,
if (mbclen == (size_t) -2)
mbclen = mbrlen (p, end - p, &cur_state);
- /* Store the beginning of the previous complete multibyte character. */
- if (mbclen != (size_t) -2)
- prev = p;
-
if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
{
/* An invalid sequence, or a truncated multibyte character.
@@ -264,11 +259,11 @@ is_mb_middle (const char **good, const char *buf, const char *end,
p += mbclen;
}
- *good = prev;
+ *good = p;
if (p > buf)
return true;
/* P == BUF here. */
- return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
+ return false;
}
diff --git a/tests/invalid-multibyte-infloop b/tests/invalid-multibyte-infloop
index 487fef05..e98c1707 100755
--- a/tests/invalid-multibyte-infloop
+++ b/tests/invalid-multibyte-infloop
@@ -7,14 +7,14 @@ require_en_utf8_locale_
require_compiled_in_MB_support
require_timeout_
-printf '\202' > search-str || framework_failure_
-cat search-str search-str > input || framework_failure_
+encode() { echo "$1" | tr A '\202'; }
+
+encode AA > input
fail=0
# Before 2.15, this would infloop.
-LC_ALL=en_US.UTF-8 timeout 3 grep -F -f search-str input > out
-test $? = 1 || fail=1
-test -s out && fail=1
+LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out || fail=1
+compare input out || fail=1
Exit $fail
diff --git a/tests/prefix-of-multibyte b/tests/prefix-of-multibyte
index b15fa9b3..70a924e6 100755
--- a/tests/prefix-of-multibyte
+++ b/tests/prefix-of-multibyte
@@ -1,5 +1,5 @@
#!/bin/sh
-# This would mistakenly print a line prior to grep-2.6.2.
+# This would mistakenly print a line prior to grep-2.18.
. "${srcdir=.}/init.sh"; path_prepend_ ../src
require_en_utf8_locale_
@@ -7,14 +7,18 @@ require_compiled_in_MB_support
encode() { echo "$1" | tr ABC '\357\274\241'; }
+encode ABC >exp1
+encode aABC >exp2
+
fail=0
for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do
for opt in '' '-F'; do
out=out-$opt-$LOC
- encode ABC | LC_ALL=$LOC grep $opt "$(encode A)" > $out 2>&1
- test $? = 1 || fail=1
- compare /dev/null $out || fail=1
+ LC_ALL=$LOC grep $opt "$(encode A)" exp1 >$out || fail=1
+ compare exp1 $out || fail=1
+ LC_ALL=$LOC grep $opt "$(encode aA)" exp2 >$out || fail=1
+ compare exp2 $out || fail=1
done
done