grep: report overflow for ERE a{1000000000}

* NEWS: Document this. * src/dfa.c (MIN): New macro. (lex): Lexically analyze the repeat-count operator once, not twice; the double-scan complicated the code and made it harder to understand and fix. Adjust the repeat-count parsing so that it better matches the behavior of the regex code, in three ways: 1. Diagnose too-large repeat counts rather than treating them as literal characters. 2. Use RE_INVALID_INTERVAL_ORD, not RE_NO_BK_BRACES, to decide whether to treat invalid-syntax {...}s as literals. 3. Use the same wording for {...}-related diagnostics that the regex code uses. * tests/bre.tests, tests/ere.tests, tests/repetition-overflow: Adjust to match new behavior, and add a few tests. * cfg.mk (exclude_file_name_regexp--sc_error_message_uppercase): New macro, since the diagnostics start with uppercase letters.
author: Paul Eggert <eggert@cs.ucla.edu> 2012-03-17 18:04:50 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2012-03-18 16:03:20 -0700
commit: a48ed5b6953d6d40f6797f7e151e324924039b78 (patch)
tree: 761dffa269aa3a2aae2d714db4f5449bea475aa9
parent: c6e3ea61d9f08aa0128a0eb13d31a2fbad376f99 (diff)
download: grep-a48ed5b6953d6d40f6797f7e151e324924039b78.tar.gz
6 files changed, 59 insertions, 84 deletions
diff --git a/NEWS b/NEWS
index 6dad6086..b219b65e 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,9 @@ GNU grep NEWS                                    -*- outline -*-
   name too long", and it can run much faster when dealing with large
   directory hierarchies.
 
+  grep -E 'a{1000000000}' now reports an overflow error rather than
+  silently acting like grep -E 'a\{1000000000}'.
+
 ** New features
 
   The -R option now has a long-option alias --dereference-recursive.
diff --git a/cfg.mk b/cfg.mk
index 84115c20..329af439 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -88,3 +88,4 @@ exclude_file_name_regexp--sc_prohibit_xalloc_without_use = ^src/kwset\.c$$
 exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \
   (Makefile|\.(am|mk)$$|^gl/lib/.*\.c\.diff$$)
 exclude_file_name_regexp--sc_space_tab = ^gl/lib/.*\.c\.diff$$
+exclude_file_name_regexp--sc_error_message_uppercase = ^src/dfa\.c$$
diff --git a/src/dfa.c b/src/dfa.c
index 6eb4a119..613f5484 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -861,6 +861,10 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec().  */
 
 #endif /* MBS_SUPPORT */
 
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1328,90 +1332,53 @@ lex (void)
           if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
             goto normal_char;
 
-          if (syntax_bits & RE_NO_BK_BRACES)
-            {
-              /* Scan ahead for a valid interval; if it's not valid,
-                 treat it as a literal '{'.  */
-              int lo = -1, hi = -1;
-              char const *p = lexptr;
-              char const *lim = p + lexleft;
-              for (; p != lim && ISASCIIDIGIT (*p); p++)
-                {
-                  if (lo < 0)
-                    lo = *p - '0';
-                  else
-                    {
-                      lo = lo * 10 + *p - '0';
-                      if (RE_DUP_MAX < lo)
-                        goto normal_char;
-                    }
-                }
-              if (p != lim && *p == ',')
-                while (++p != lim && ISASCIIDIGIT (*p))
-                  {
-                    if (hi < 0)
-                      hi = *p - '0';
-                    else
-                      {
-                        hi = hi * 10 + *p - '0';
-                        if (RE_DUP_MAX < hi)
-                          goto normal_char;
-                      }
-                  }
-              else
-                hi = lo;
-              if (p == lim || *p != '}' || lo < 0 || (0 <= hi && hi < lo))
-                goto normal_char;
-            }
-
-          minrep = 0;
           /* Cases:
              {M} - exact count
              {M,} - minimum count, maximum is infinity
+             {,N} - 0 through N
+             {,} - 0 to infinity (same as '*')
              {M,N} - M through N */
-          FETCH (c, _("unfinished repeat count"));
-          if (ISASCIIDIGIT (c))
-            {
-              minrep = c - '0';
-              for (;;)
-                {
-                  FETCH (c, _("unfinished repeat count"));
-                  if (!ISASCIIDIGIT (c))
-                    break;
-                  minrep = 10 * minrep + c - '0';
-                }
-            }
-          else
-            dfaerror (_("malformed repeat count"));
-          if (c == ',')
-            {
-              FETCH (c, _("unfinished repeat count"));
-              if (!ISASCIIDIGIT (c))
-                maxrep = -1;
-              else
-                {
-                  maxrep = c - '0';
-                  for (;;)
-                    {
-                      FETCH (c, _("unfinished repeat count"));
-                      if (!ISASCIIDIGIT (c))
-                        break;
-                      maxrep = 10 * maxrep + c - '0';
-                    }
-                  if (0 <= maxrep && maxrep < minrep)
-                    dfaerror (_("malformed repeat count"));
-                }
-            }
-          else
-            maxrep = minrep;
-          if (!(syntax_bits & RE_NO_BK_BRACES))
-            {
-              if (c != '\\')
-                dfaerror (_("malformed repeat count"));
-              FETCH (c, _("unfinished repeat count"));
-            }
-          if (c != '}')
-            dfaerror (_("malformed repeat count"));
+          {
+            char const *p = lexptr;
+            char const *lim = p + lexleft;
+            minrep = maxrep = -1;
+            for (; p != lim && ISASCIIDIGIT (*p); p++)
+              {
+                if (minrep < 0)
+                  minrep = *p - '0';
+                else
+                  minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
+              }
+            if (p != lim)
+              {
+                if (*p != ',')
+                  maxrep = minrep;
+                else
+                  {
+                    if (minrep < 0)
+                      minrep = 0;
+                    while (++p != lim && ISASCIIDIGIT (*p))
+                      {
+                        if (maxrep < 0)
+                          maxrep = *p - '0';
+                        else
+                          maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
+                      }
+                  }
+              }
+            if (! ((! backslash || (p != lim && *p++ == '\\'))
+                   && p != lim && *p++ == '}'
+                   && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
+              {
+                if (syntax_bits & RE_INVALID_INTERVAL_ORD)
+                  goto normal_char;
+                dfaerror (_("Invalid content of \\{\\}"));
+              }
+            if (RE_DUP_MAX < maxrep)
+              dfaerror (_("Regular expression too big"));
+            lexptr = p;
+            lexleft = lim - p;
+          }
           laststart = 0;
           return lasttok = REPMN;
 
diff --git a/tests/bre.tests b/tests/bre.tests
index 60ff1b54..9d01a3c3 100644
--- a/tests/bre.tests
+++ b/tests/bre.tests
@@ -42,8 +42,9 @@
 2@a\{1@EBRACE
 2@a\{1a@EBRACE
 2@a\{1a\}@BADBR
-2@a\{,2\}@BADBR
-2@a\{,\}@BADBR
+0@a\{,2\}@a\{,2\}
+0@a\{,\}@a\{,\}
+2@a\{\}@BADBR
 2@a\{1,x\}@BADBR
 2@a\{1,x@EBRACE
 2@a\{32768\}@BADBR
@@ -60,3 +61,4 @@
 2@a\{1\}*@BADRPT@TO CORRECT
 1@a\(b\)?c\1d@acd
 0@-\{0,1\}[0-9]*$@-5
+2@b\{1000000000\}@ESIZE
diff --git a/tests/ere.tests b/tests/ere.tests
index 08b3dbae..e0aad2a8 100644
--- a/tests/ere.tests
+++ b/tests/ere.tests
@@ -76,6 +76,7 @@
 0@a{1a}@a{1a}
 0@a{,2}@a{,2}
 0@a{,}@a{,}
+2@a{}@BADBR
 0@a{1,*}@a{1,,,}
 2@a{1,x@EBRACE@TO CORRECT
 2@a{300}@BADBR@TO CORRECT
@@ -213,3 +214,4 @@
 0@abcdefghijklmnopqrstuv@abcdefghijklmnopqrstuv
 0@CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a@CC11
 0@a?b@ab
+2@b{1000000000}@ESIZE
diff --git a/tests/repetition-overflow b/tests/repetition-overflow
index c92de237..66a44a6b 100755
--- a/tests/repetition-overflow
+++ b/tests/repetition-overflow
@@ -11,9 +11,9 @@ fail=0
 # range of "unsigned int" would silently wrap around.   Hence, 2^32+1
 # would be treated just like "1", and both of these would mistakenly match.
 
-echo abc  | grep -E "b{$xp1}"   > out 2>&1; test $? = 1 || fail=1
+echo abc  | grep -E "b{$xp1}"   > out 2> /dev/null; test $? = 2 || fail=1
 compare out /dev/null || fail=1
-echo abbc | grep -E "b{1,$xp2}" > out 2>&1; test $? = 1 || fail=1
+echo abbc | grep -E "b{1,$xp2}" > out 2> /dev/null; test $? = 2 || fail=1
 compare out /dev/null || fail=1
 
 Exit $fail
author	Paul Eggert <eggert@cs.ucla.edu>	2012-03-17 18:04:50 -0700
committer	Paul Eggert <eggert@cs.ucla.edu>	2012-03-18 16:03:20 -0700
commit	a48ed5b6953d6d40f6797f7e151e324924039b78 (patch)
tree	761dffa269aa3a2aae2d714db4f5449bea475aa9
parent	c6e3ea61d9f08aa0128a0eb13d31a2fbad376f99 (diff)
download	grep-a48ed5b6953d6d40f6797f7e151e324924039b78.tar.gz