summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2012-03-17 18:04:50 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2012-03-18 16:03:20 -0700
commita48ed5b6953d6d40f6797f7e151e324924039b78 (patch)
tree761dffa269aa3a2aae2d714db4f5449bea475aa9
parentc6e3ea61d9f08aa0128a0eb13d31a2fbad376f99 (diff)
downloadgrep-a48ed5b6953d6d40f6797f7e151e324924039b78.tar.gz
grep: report overflow for ERE a{1000000000}
* NEWS: Document this. * src/dfa.c (MIN): New macro. (lex): Lexically analyze the repeat-count operator once, not twice; the double-scan complicated the code and made it harder to understand and fix. Adjust the repeat-count parsing so that it better matches the behavior of the regex code, in three ways: 1. Diagnose too-large repeat counts rather than treating them as literal characters. 2. Use RE_INVALID_INTERVAL_ORD, not RE_NO_BK_BRACES, to decide whether to treat invalid-syntax {...}s as literals. 3. Use the same wording for {...}-related diagnostics that the regex code uses. * tests/bre.tests, tests/ere.tests, tests/repetition-overflow: Adjust to match new behavior, and add a few tests. * cfg.mk (exclude_file_name_regexp--sc_error_message_uppercase): New macro, since the diagnostics start with uppercase letters.
-rw-r--r--NEWS3
-rw-r--r--cfg.mk1
-rw-r--r--src/dfa.c127
-rw-r--r--tests/bre.tests6
-rw-r--r--tests/ere.tests2
-rwxr-xr-xtests/repetition-overflow4
6 files changed, 59 insertions, 84 deletions
diff --git a/NEWS b/NEWS
index 6dad6086..b219b65e 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,9 @@ GNU grep NEWS -*- outline -*-
name too long", and it can run much faster when dealing with large
directory hierarchies.
+ grep -E 'a{1000000000}' now reports an overflow error rather than
+ silently acting like grep -E 'a\{1000000000}'.
+
** New features
The -R option now has a long-option alias --dereference-recursive.
diff --git a/cfg.mk b/cfg.mk
index 84115c20..329af439 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -88,3 +88,4 @@ exclude_file_name_regexp--sc_prohibit_xalloc_without_use = ^src/kwset\.c$$
exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \
(Makefile|\.(am|mk)$$|^gl/lib/.*\.c\.diff$$)
exclude_file_name_regexp--sc_space_tab = ^gl/lib/.*\.c\.diff$$
+exclude_file_name_regexp--sc_error_message_uppercase = ^src/dfa\.c$$
diff --git a/src/dfa.c b/src/dfa.c
index 6eb4a119..613f5484 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -861,6 +861,10 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
#endif /* MBS_SUPPORT */
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -1328,90 +1332,53 @@ lex (void)
if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
goto normal_char;
- if (syntax_bits & RE_NO_BK_BRACES)
- {
- /* Scan ahead for a valid interval; if it's not valid,
- treat it as a literal '{'. */
- int lo = -1, hi = -1;
- char const *p = lexptr;
- char const *lim = p + lexleft;
- for (; p != lim && ISASCIIDIGIT (*p); p++)
- {
- if (lo < 0)
- lo = *p - '0';
- else
- {
- lo = lo * 10 + *p - '0';
- if (RE_DUP_MAX < lo)
- goto normal_char;
- }
- }
- if (p != lim && *p == ',')
- while (++p != lim && ISASCIIDIGIT (*p))
- {
- if (hi < 0)
- hi = *p - '0';
- else
- {
- hi = hi * 10 + *p - '0';
- if (RE_DUP_MAX < hi)
- goto normal_char;
- }
- }
- else
- hi = lo;
- if (p == lim || *p != '}' || lo < 0 || (0 <= hi && hi < lo))
- goto normal_char;
- }
-
- minrep = 0;
/* Cases:
{M} - exact count
{M,} - minimum count, maximum is infinity
+ {,N} - 0 through N
+ {,} - 0 to infinity (same as '*')
{M,N} - M through N */
- FETCH (c, _("unfinished repeat count"));
- if (ISASCIIDIGIT (c))
- {
- minrep = c - '0';
- for (;;)
- {
- FETCH (c, _("unfinished repeat count"));
- if (!ISASCIIDIGIT (c))
- break;
- minrep = 10 * minrep + c - '0';
- }
- }
- else
- dfaerror (_("malformed repeat count"));
- if (c == ',')
- {
- FETCH (c, _("unfinished repeat count"));
- if (!ISASCIIDIGIT (c))
- maxrep = -1;
- else
- {
- maxrep = c - '0';
- for (;;)
- {
- FETCH (c, _("unfinished repeat count"));
- if (!ISASCIIDIGIT (c))
- break;
- maxrep = 10 * maxrep + c - '0';
- }
- if (0 <= maxrep && maxrep < minrep)
- dfaerror (_("malformed repeat count"));
- }
- }
- else
- maxrep = minrep;
- if (!(syntax_bits & RE_NO_BK_BRACES))
- {
- if (c != '\\')
- dfaerror (_("malformed repeat count"));
- FETCH (c, _("unfinished repeat count"));
- }
- if (c != '}')
- dfaerror (_("malformed repeat count"));
+ {
+ char const *p = lexptr;
+ char const *lim = p + lexleft;
+ minrep = maxrep = -1;
+ for (; p != lim && ISASCIIDIGIT (*p); p++)
+ {
+ if (minrep < 0)
+ minrep = *p - '0';
+ else
+ minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
+ }
+ if (p != lim)
+ {
+ if (*p != ',')
+ maxrep = minrep;
+ else
+ {
+ if (minrep < 0)
+ minrep = 0;
+ while (++p != lim && ISASCIIDIGIT (*p))
+ {
+ if (maxrep < 0)
+ maxrep = *p - '0';
+ else
+ maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
+ }
+ }
+ }
+ if (! ((! backslash || (p != lim && *p++ == '\\'))
+ && p != lim && *p++ == '}'
+ && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
+ {
+ if (syntax_bits & RE_INVALID_INTERVAL_ORD)
+ goto normal_char;
+ dfaerror (_("Invalid content of \\{\\}"));
+ }
+ if (RE_DUP_MAX < maxrep)
+ dfaerror (_("Regular expression too big"));
+ lexptr = p;
+ lexleft = lim - p;
+ }
laststart = 0;
return lasttok = REPMN;
diff --git a/tests/bre.tests b/tests/bre.tests
index 60ff1b54..9d01a3c3 100644
--- a/tests/bre.tests
+++ b/tests/bre.tests
@@ -42,8 +42,9 @@
2@a\{1@EBRACE
2@a\{1a@EBRACE
2@a\{1a\}@BADBR
-2@a\{,2\}@BADBR
-2@a\{,\}@BADBR
+0@a\{,2\}@a\{,2\}
+0@a\{,\}@a\{,\}
+2@a\{\}@BADBR
2@a\{1,x\}@BADBR
2@a\{1,x@EBRACE
2@a\{32768\}@BADBR
@@ -60,3 +61,4 @@
2@a\{1\}*@BADRPT@TO CORRECT
1@a\(b\)?c\1d@acd
0@-\{0,1\}[0-9]*$@-5
+2@b\{1000000000\}@ESIZE
diff --git a/tests/ere.tests b/tests/ere.tests
index 08b3dbae..e0aad2a8 100644
--- a/tests/ere.tests
+++ b/tests/ere.tests
@@ -76,6 +76,7 @@
0@a{1a}@a{1a}
0@a{,2}@a{,2}
0@a{,}@a{,}
+2@a{}@BADBR
0@a{1,*}@a{1,,,}
2@a{1,x@EBRACE@TO CORRECT
2@a{300}@BADBR@TO CORRECT
@@ -213,3 +214,4 @@
0@abcdefghijklmnopqrstuv@abcdefghijklmnopqrstuv
0@CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a@CC11
0@a?b@ab
+2@b{1000000000}@ESIZE
diff --git a/tests/repetition-overflow b/tests/repetition-overflow
index c92de237..66a44a6b 100755
--- a/tests/repetition-overflow
+++ b/tests/repetition-overflow
@@ -11,9 +11,9 @@ fail=0
# range of "unsigned int" would silently wrap around. Hence, 2^32+1
# would be treated just like "1", and both of these would mistakenly match.
-echo abc | grep -E "b{$xp1}" > out 2>&1; test $? = 1 || fail=1
+echo abc | grep -E "b{$xp1}" > out 2> /dev/null; test $? = 2 || fail=1
compare out /dev/null || fail=1
-echo abbc | grep -E "b{1,$xp2}" > out 2>&1; test $? = 1 || fail=1
+echo abbc | grep -E "b{1,$xp2}" > out 2> /dev/null; test $? = 2 || fail=1
compare out /dev/null || fail=1
Exit $fail