Sync dfa with grep, update a test.

author: Arnold D. Robbins <arnold@skeeve.com> 2014-02-28 14:22:43 +0200
committer: Arnold D. Robbins <arnold@skeeve.com> 2014-02-28 14:22:43 +0200
commit: 0b27d28fad7db725bb6a72c025c91ac39f5eeaf3 (patch)
tree: eac9f9b9af6debd10bc6f0e8fa78e06393adef41
parent: 4cea49ca8f817354ffd513c6ec808152e9299f21 (diff)
download: gawk-0b27d28fad7db725bb6a72c025c91ac39f5eeaf3.tar.gz
4 files changed, 83 insertions, 70 deletions
diff --git a/ChangeLog b/ChangeLog
index 507cd3e7..965cd1f7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-02-28         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* dfa.c: Sync with grep. Looks like a good improvement with
+	respect to bracket expressions.
+
 2014-02-27         Arnold D. Robbins     <arnold@skeeve.com>
 
 	Fixes for enum/int mismatches as warned by some compilers.
diff --git a/dfa.c b/dfa.c
index 03a61878..2e9d2fd0 100644
--- a/dfa.c
+++ b/dfa.c
@@ -219,7 +219,8 @@ enum
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>; it
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
                                    is not completely handled.  If the scanner
                                    detects a transition on backref, it returns
                                    a kind of "semi-success" indicating that
@@ -811,6 +812,45 @@ using_utf8 (void)
   return utf8;
 }
 
+/* Return true if the current locale is known to be a unibyte locale
+   without multicharacter collating sequences and where range
+   comparisons simply use the native encoding.  These locales can be
+   processed more efficiently.  */
+
+static bool
+using_simple_locale (void)
+{
+  /* True if the native character set is known to be compatible with
+     the C locale.  The following test isn't perfect, but it's good
+     enough in practice, as only ASCII and EBCDIC are in common use
+     and this test correctly accepts ASCII and rejects EBCDIC.  */
+  enum { native_c_charset =
+    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+     && '}' == 125 && '~' == 126)
+  };
+
+  if (! native_c_charset || MB_CUR_MAX > 1)  /* evaluated on every call */
+    return false;
+  else
+    {
+      static int unibyte_c = -1;  /* -1: not yet computed; else cached 0/1 */
+      if (unibyte_c < 0)  /* first call that gets this far: fill the cache */
+        {
+          char *locale = setlocale (LC_ALL, 0);
+          unibyte_c = (locale && (STREQ (locale, "C")  /* only these two */
+                                  || STREQ (locale, "POSIX")));  /* names */
+        }
+      return unibyte_c;  /* cached: the locale name is not looked up again */
+    }
+}
+
 /* Lexical analyzer.  All the dross that deals with the obnoxious
    GNU Regex syntax bits is located here.  The poor, suffering
    reader is referred to the GNU Regex documentation for the
@@ -928,7 +968,7 @@ static const struct dfa_ctype prednames[] = {
   {"upper", isupper, false},
   {"lower", islower, false},
   {"digit", isdigit, true},
-  {"xdigit", isxdigit, true},
+  {"xdigit", isxdigit, false},
   {"space", isspace, false},
   {"punct", ispunct, false},
   {"alnum", isalnum, false},
@@ -959,6 +999,10 @@ parse_bracket_exp (void)
   int c, c1, c2;
   charclass ccl;
 
+  /* True if this is a bracket expression that dfaexec is known to
+     process correctly.  */
+  bool known_bracket_exp = true;
+
   /* Used to warn about [:space:].
      Bit 0 = first character is a colon.
      Bit 1 = last character is a colon.
@@ -1000,6 +1044,7 @@ parse_bracket_exp (void)
     {
       FETCH_WC (c, wc, _("unbalanced ["));
       invert = 1;
+      known_bracket_exp = using_simple_locale ();
     }
   else
     invert = 0;
@@ -1014,16 +1059,14 @@ parse_bracket_exp (void)
          we just treat it as a bunch of ordinary characters.  We can do
          this because we assume regex has checked for syntax errors before
          dfa is ever called.  */
-      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+      if (c == '[')
         {
 #define MAX_BRACKET_STRING_LEN 32
           char str[MAX_BRACKET_STRING_LEN + 1];
           FETCH_WC (c1, wc1, _("unbalanced ["));
 
-          /* If pattern contains '[[:', '[[.', or '[[='.  */
-          if (c1 == ':'
-              /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1.  */
-              || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+          if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
+              || c1 == '.' || c1 == '=')
             {
               size_t len = 0;
               for (;;)
@@ -1042,7 +1085,10 @@ parse_bracket_exp (void)
               /* Fetch bracket.  */
               FETCH_WC (c, wc, _("unbalanced ["));
               if (c1 == ':')
-                /* build character class.  */
+                /* Build character class.  POSIX allows character
+                   classes to match multicharacter collating elements,
+                   but the regex code does not support that, so do not
+                   worry about that possibility.  */
                 {
                   char const *class
                     = (case_fold && (STREQ (str, "upper")
@@ -1066,28 +1112,9 @@ parse_bracket_exp (void)
                     if (pred->func (c2))
                       setbit_case_fold_c (c2, ccl);
                 }
+              else
+                known_bracket_exp = false;
 
-              else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
-                {
-                  char *elem = xmemdup (str, len + 1);
-
-                  if (c1 == '=')
-                    /* build equivalence class.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->equivs,
-                                            equivs_al, work_mbc->nequivs + 1);
-                      work_mbc->equivs[work_mbc->nequivs++] = elem;
-                    }
-
-                  if (c1 == '.')
-                    /* build collating element.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->coll_elems,
-                                            coll_elems_al,
-                                            work_mbc->ncoll_elems + 1);
-                      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
-                    }
-                }
               colon_warning_state |= 8;
 
               /* Fetch new lookahead character.  */
@@ -1109,6 +1136,16 @@ parse_bracket_exp (void)
         /* build range characters.  */
         {
           FETCH_WC (c2, wc2, _("unbalanced ["));
+
+          /* A bracket expression like [a-[.aa.]] matches an unknown set.
+             Treat it like [-a[.aa.]] while parsing it, and
+             remember that the set is unknown.  */
+          if (c2 == '[' && *lexptr == '.')
+            {
+              known_bracket_exp = false;
+              c2 = ']';
+            }
+
           if (c2 == ']')
             {
               /* In the case [x-], the - is an ordinary hyphen,
@@ -1146,47 +1183,11 @@ parse_bracket_exp (void)
                   work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
                 }
             }
+          else if (using_simple_locale ())
+            for (; c <= c2; c++)
+              setbit_case_fold_c (c, ccl);
           else
-            {
-#ifdef GAWK
-              c1 = c;
-              if (case_fold)
-                {
-                  c1 = tolower (c1);
-                  c2 = tolower (c2);
-                }
-              for (c = c1; c <= c2; c++)
-                setbit_case_fold_c (c, ccl);
-#else
-              /* Defer to the system regex library about the meaning
-                 of range expressions.  */
-              struct re_pattern_buffer re = { 0 };
-              char const *compile_msg;
-#if 199901 <= __STDC_VERSION__
-              char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
-#else
-              char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
-              pattern[2] = c;
-              pattern[5] = c2;
-#endif
-              re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
-              compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
-              if (compile_msg)
-                dfaerror (compile_msg);
-              for (c = 0; c < NOTCHAR; c++)
-                {
-                  char subject = c;
-                  switch (re_match (&re, &subject, 1, 0, NULL))
-                    {
-                    case 1: setbit (c, ccl); break;
-                    case -1: break;
-                    default: xalloc_die ();
-                    }
-                }
-              regfree (&re);
-              re_set_syntax (syntax_bits);
-#endif
-            }
+            known_bracket_exp = false;
 
           colon_warning_state |= 8;
           FETCH_WC (c1, wc1, _("unbalanced ["));
@@ -1224,6 +1225,9 @@ parse_bracket_exp (void)
   if (colon_warning_state == 7)
     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
 
+  if (! known_bracket_exp)
+    return BACKREF;
+
   if (MB_CUR_MAX > 1)
     {
       static charclass zeroclass;
diff --git a/test/ChangeLog b/test/ChangeLog
index 18b912dd..ab7ff0f2 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,7 @@
+2014-02-28         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* regrange.ok: Update after code improvements.
+
 2014-02-03         Stepan Kasal         <kasal@ucw.cz>
 
 	* strftime.awk: the default format uses %e, not %d (Introduced on
diff --git a/test/regrange.ok b/test/regrange.ok
index 1fa00c70..ae8c6499 100644
--- a/test/regrange.ok
+++ b/test/regrange.ok
@@ -3,4 +3,4 @@
 "c" ~ /[[a-d]/ --> 1
 "\" ~ /[\[-\]]/ --> 1
 "[.c.]" ~ /[a-[.e.]]/ --> 1
-"[.d.]" ~ /[[.c.]-[.z.]]/ --> 0
+"[.d.]" ~ /[[.c.]-[.z.]]/ --> 1
author	Arnold D. Robbins <arnold@skeeve.com>	2014-02-28 14:22:43 +0200
committer	Arnold D. Robbins <arnold@skeeve.com>	2014-02-28 14:22:43 +0200
commit	0b27d28fad7db725bb6a72c025c91ac39f5eeaf3 (patch)
tree	eac9f9b9af6debd10bc6f0e8fa78e06393adef41
parent	4cea49ca8f817354ffd513c6ec808152e9299f21 (diff)
download	gawk-0b27d28fad7db725bb6a72c025c91ac39f5eeaf3.tar.gz