dfa: speed up handling of brackets

This patch has two sides.  One is to merge the parsing of brackets for
the single- and multi-byte cases into one function.  The second is to
leverage this change and use a bitset to test for single-byte characters
in the charset.  Splitting the two would be very hard.

Testcase:
  yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
    time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]

Before: 59ms (best of three runs); after: 51ms (best of three runs).
Nice, but mostly providing infrastructure for the next patch.

* src/dfa.c (setbit_case_fold): Try applying towlower/towupper.
(looking_at): Remove.
(FETCH_WC): New.
(fetch_wc): Merge into FETCH_WC [MBS_SUPPORT].
(FETCH) [MBS_SUPPORT]: Call FETCH_WC.
(prednames, find_pred, is_blank and other predicates): Move above,
remove K&R syntax support.
(parse_bracket_exp): New name of parse_bracket_exp_mb, rewritten to
include single-byte character set parsing of brackets.
(lex): Adjust for fetch_wc->FETCH_WC change, remove single-byte
character set parsing of brackets.
(match_mb_charset): Test against work_mbc->cset.
* src/dfa.h (struct mb_char_classes): Add cset.
author: Paolo Bonzini <bonzini@gnu.org> 2010-03-07 11:22:00 +0100
committer: Paolo Bonzini <bonzini@gnu.org> 2010-03-17 15:32:54 +0100
commit: 8f9106c419d18759f767da351b3b6913f022c8f8 (patch)
tree: 598d251c073b0c65c5b10927692dddbc2613a2fb
parent: 3cba8f98be7f791a55af6433863349e742054dd0 (diff)
download: grep-8f9106c419d18759f767da351b3b6913f022c8f8.tar.gz
2 files changed, 305 insertions, 305 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 352782a5..3b0d8610 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -238,17 +238,40 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
   eolbyte = eol;
 }
 
-/* Like setbit, but if case is folded, set both cases of a letter.  */
+/* Like setbit, but if case is folded, set both cases of a letter.
+   For MB_CUR_MAX > 1, one or both of the two cases may not be set,
+   so the resulting charset may only be used as an optimization.  */
 static void
 setbit_case_fold (unsigned b, charclass c)
 {
-  setbit (b, c);
   if (case_fold)
     {
-      if (ISUPPER (b))
-	setbit (tolower (b), c);
-      else if (ISLOWER (b))
-	setbit (toupper (b), c);
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1)
+        {
+          wint_t b1 = iswupper(b) ? towlower(b) : b;
+          wint_t b2 = iswlower(b) ? towupper(b) : b;
+          if (wctob ((unsigned char)b1) == b1)
+            setbit (b1, c);
+          if (b2 != b1 && wctob ((unsigned char)b2) == b2)
+            setbit (b2, c);
+        }
+      else
+        {
+#endif
+          unsigned char b1 = ISUPPER(b) ? tolower(b) : b;
+          unsigned char b2 = ISLOWER(b) ? toupper(b) : b;
+	  setbit (b1, c);
+          if (b2 != b1)
+            setbit (b2, c);
+        }
+    }
+  else
+    {
+#ifdef MBS_SUPPORT
+      if (wctob ((unsigned char)b) == b)
+#endif
+        setbit (b, c);
     }
 }
 
@@ -315,43 +338,39 @@ static unsigned char const *buf_end;	/* reference to end in dfaexec().  */
 
 #ifdef MBS_SUPPORT
 /* Note that characters become unsigned here. */
-# define FETCH(c, eoferr)			\
+# define FETCH_WC(c, wc, eoferr)		\
   do {						\
     if (! lexleft)				\
-     {						\
-	if (eoferr != 0)			\
+      {						\
+        if (eoferr != 0)			\
 	  dfaerror (eoferr);			\
-	else					\
+        else					\
 	  return lasttok = END;			\
       }						\
-    (c) = (unsigned char) *lexptr++;		\
-    --lexleft;					\
+    else					\
+      {						\
+        cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); \
+        if (cur_mb_len <= 0)			\
+          {					\
+            cur_mb_len = 1;			\
+            --lexleft;				\
+            wc = c = (unsigned char) *lexptr++;	\
+          }					\
+        else					\
+          {					\
+            lexptr += cur_mb_len;		\
+            lexleft -= cur_mb_len;		\
+            (c) = wctob(wc);			\
+          }					\
+      }						\
   } while(0)
 
-/* This function fetch a wide character, and update cur_mb_len,
-   used only if the current locale is a multibyte environment.  */
-static wint_t
-fetch_wc (char const *eoferr)
-{
-  wchar_t wc;
-  if (! lexleft)
-    {
-      if (eoferr != 0)
-	dfaerror (eoferr);
-      else
-	return WEOF;
-    }
+# define FETCH(c, eoferr)			\
+  do {						\
+    wint_t _wc;					\
+    FETCH_WC(c, _wc, eoferr);			\
+  } while(0)
 
-  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
-  if (cur_mb_len <= 0)
-   {
-      cur_mb_len = 1;
-      wc = (unsigned char) *lexptr;
-    }
-  lexptr += cur_mb_len;
-  lexleft -= cur_mb_len;
-  return wc;
-}
 #else
 /* Note that characters become unsigned here. */
 # define FETCH(c, eoferr)	      \
@@ -366,6 +385,10 @@ fetch_wc (char const *eoferr)
     (c) = (unsigned char) *lexptr++;  \
     --lexleft;			      \
   } while(0)
+
+# define FETCH_WC(c, unused, eoferr)		\
+  FETCH(c, eoferr)
+
 #endif /* MBS_SUPPORT */
 
 static int
@@ -375,13 +398,70 @@ in_coll_range (char ch, char from, char to)
   return strcoll (&c[0], &c[2]) <= 0 && 0 <= strcoll (&c[2], &c[4]);
 }
 
-#ifdef MBS_SUPPORT
+static int is_alpha(int c) { return ISALPHA(c); }
+static int is_upper(int c) { return ISUPPER(c); }
+static int is_lower(int c) { return ISLOWER(c); }
+static int is_digit(int c) { return ISDIGIT(c); }
+static int is_xdigit(int c) { return ISXDIGIT(c); }
+static int is_space(int c) { return ISSPACE(c); }
+static int is_punct(int c) { return ISPUNCT(c); }
+static int is_alnum(int c) { return ISALNUM(c); }
+static int is_print(int c) { return ISPRINT(c); }
+static int is_graph(int c) { return ISGRAPH(c); }
+static int is_cntrl(int c) { return ISCNTRL(c); }
+
+static int
+is_blank (int c)
+{
+   return (c == ' ' || c == '\t');
+}
+
+typedef int predicate (int);
+
+/* The following list maps the names of the Posix named character classes
+   to predicate functions that determine whether a given character is in
+   the class.  The leading [ has already been eaten by the lexical analyzer. */
+static struct {
+  const char *name;
+  predicate *pred;
+} const prednames[] = {
+  { "alpha", is_alpha },
+  { "upper", is_upper },
+  { "lower", is_lower },
+  { "digit", is_digit },
+  { "xdigit", is_xdigit },
+  { "space", is_space },
+  { "punct", is_punct },
+  { "alnum", is_alnum },
+  { "print", is_print },
+  { "graph", is_graph },
+  { "cntrl", is_cntrl },
+  { "blank", is_blank },
+  { NULL, NULL }
+};
+
+static predicate *
+find_pred (const char *str)
+{
+  unsigned int i;
+  for (i = 0; prednames[i].name; ++i)
+    if (!strcmp(str, prednames[i].name))
+      break;
+
+  return prednames[i].pred;
+}
+
 /* Multibyte character handling sub-routine for lex.
    This function  parse a bracket expression and build a struct
    mb_char_classes.  */
 static token
-parse_bracket_exp_mb (void)
+parse_bracket_exp (void)
 {
+  int invert;
+  int c, c1, c2;
+  charclass ccl;
+
+#ifdef MBS_SUPPORT
   wint_t wc, wc1, wc2;
 
   /* Work area to build a mb_char_classes.  */
@@ -389,63 +469,68 @@ parse_bracket_exp_mb (void)
   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
     equivs_al, coll_elems_al;
 
-  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
-		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
-  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
-     We will update dfa->multibyte_prop[] in addtok(), because we can't
-     decide the index in dfa->tokens[].  */
-
-  /* Initialize work are */
-  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
-
   chars_al = 1;
   range_sts_al = range_ends_al = 0;
   ch_classes_al = equivs_al = coll_elems_al = 0;
+  if (MB_CUR_MAX > 1)
+    {
+      REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
+                           dfa->mbcsets_alloc, dfa->nmbcsets + 1);
+
+      /* dfa->multibyte_prop[] holds the index of dfa->mbcsets.
+         We will update dfa->multibyte_prop[] in addtok(), because we can't
+         decide the index in dfa->tokens[].  */
+
+      /* Initialize work area.  */
+      work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
+      work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
+      work_mbc->nequivs = work_mbc->ncoll_elems = 0;
+      work_mbc->chars = NULL;
+      work_mbc->ch_classes = NULL;
+      work_mbc->range_sts = work_mbc->range_ends = NULL;
+      work_mbc->equivs = work_mbc->coll_elems = NULL;
+    }
+  else
+    work_mbc = NULL;
+#endif
 
-  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
-  work_mbc->nequivs = work_mbc->ncoll_elems = 0;
-  work_mbc->chars = NULL;
-  work_mbc->ch_classes = NULL;
-  work_mbc->range_sts = work_mbc->range_ends = NULL;
-  work_mbc->equivs = work_mbc->coll_elems = NULL;
-
-  wc = fetch_wc(_("unbalanced ["));
-  if (wc == L'^')
+  memset (ccl, 0, sizeof(ccl));
+  FETCH_WC (c, wc, _("unbalanced ["));
+  if (c == '^')
     {
-      wc = fetch_wc(_("unbalanced ["));
-      work_mbc->invert = 1;
+      FETCH_WC (c, wc, _("unbalanced ["));
+      invert = 1;
     }
   else
-    work_mbc->invert = 0;
+    invert = 0;
+
   do
     {
-      wc1 = WEOF; /* mark wc1 is not initialized".  */
+      c1 = EOF; /* Mark c1 as not yet initialized.  */
 
       /* Note that if we're looking at some other [:...:] construct,
 	 we just treat it as a bunch of ordinary characters.  We can do
 	 this because we assume regex has checked for syntax errors before
 	 dfa is ever called. */
-      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
+      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
 	{
 #define BRACKET_BUFFER_SIZE 128
 	  char str[BRACKET_BUFFER_SIZE];
-	  wc1 = wc;
-	  wc = fetch_wc(_("unbalanced ["));
+	  FETCH_WC (c1, wc1, _("unbalanced ["));
 
 	  /* If pattern contains `[[:', `[[.', or `[[='.  */
-	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
+	  if (c1 == ':'
+#ifdef MBS_SUPPORT
+              /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1.  */
+	      || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
+#endif
+	      )
 	    {
-	      unsigned char c;
-	      unsigned char delim = (unsigned char)wc;
 	      int len = 0;
 	      for (;;)
 		{
-		  if (! lexleft)
-		    dfaerror (_("unbalanced ["));
-		  c = (unsigned char) *lexptr++;
-		  --lexleft;
-
-		  if ((c == delim && *lexptr == ']') || lexleft == 0)
+		  FETCH (c, _("unbalanced ["));
+		  if ((c == c1 && *lexptr == ']') || lexleft == 0)
 		    break;
 		  if (len < BRACKET_BUFFER_SIZE)
 		    str[len++] = c;
@@ -455,18 +540,9 @@ parse_bracket_exp_mb (void)
 		}
 	      str[len] = '\0';
 
-	      if (lexleft == 0)
-		{
-		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
-				       work_mbc->nchars + 2);
-		  work_mbc->chars[work_mbc->nchars++] = L'[';
-		  work_mbc->chars[work_mbc->nchars++] = delim;
-		  break;
-		}
-
-	      if (--lexleft, *lexptr++ != ']')
-		dfaerror (_("unbalanced ["));
-	      if (delim == ':')
+              /* Fetch bracket.  */
+	      FETCH (c, _("unbalanced ["));
+	      if (c1 == ':')
 		/* build character class.  */
 		{
 		  char const *class
@@ -474,24 +550,39 @@ parse_bracket_exp_mb (void)
 				     || !strcmp (str, "lower"))
 				       ? "alpha"
 				       : str);
-		  /* Query the character class as wctype_t.  */
-		  wctype_t wt = wctype (class);
+#ifdef MBS_SUPPORT
+                  if (MB_CUR_MAX > 1)
+                    {
+		      /* Store the character class as wctype_t.  */
+                      wctype_t wt = wctype (class);
+
+                      if (ch_classes_al == 0)
+                        MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
+                      REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
+                                           ch_classes_al,
+                                           work_mbc->nch_classes + 1);
+                      work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
+                    }
+#endif
 
-		  if (ch_classes_al == 0)
-		    MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
-		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
-				       ch_classes_al,
-				       work_mbc->nch_classes + 1);
-		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
+                  {
+                    predicate *pred = find_pred (class);
+                    if (!pred)
+                      dfaerror(_("invalid character class"));
+                    for (c2 = 0; c2 < NOTCHAR; ++c2)
+                      if ((*pred)(c2))
+                        setbit_case_fold (c2, ccl);
+                  }
+                }
 
-		}
-	      else if (delim == '=' || delim == '.')
+#ifdef MBS_SUPPORT
+	      else if (c1 == '=' || c1 == '.')
 		{
 		  char *elem;
 		  MALLOC(elem, char, len + 1);
 		  strncpy(elem, str, len + 1);
 
-		  if (delim == '=')
+		  if (c1 == '=')
 		    /* build equivalent class.  */
 		    {
 		      if (equivs_al == 0)
@@ -502,7 +593,7 @@ parse_bracket_exp_mb (void)
 		      work_mbc->equivs[work_mbc->nequivs++] = elem;
 		    }
 
-		  if (delim == '.')
+		  if (c1 == '.')
 		    /* build collating element.  */
 		    {
 		      if (coll_elems_al == 0)
@@ -513,158 +604,157 @@ parse_bracket_exp_mb (void)
 		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
 		    }
 		}
-	      wc1 = wc = WEOF;
-	    }
-	  else
-	    /* We treat '[' as a normal character here.  */
-	    {
-	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
+#endif
+
+              /* Fetch new lookahead character.  */
+	      FETCH_WC (c1, wc1, _("unbalanced ["));
+              continue;
 	    }
-	}
-      else
-	{
-	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-	    wc = fetch_wc(("unbalanced ["));
+
+          /* We treat '[' as a normal character here.  c/c1/wc/wc1
+             are already set up.  */
 	}
 
-      if (wc1 == WEOF)
-	wc1 = fetch_wc(_("unbalanced ["));
+      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+        FETCH_WC(c, wc, _("unbalanced ["));
 
-      if (wc1 == L'-')
+      if (c1 == EOF)
+	FETCH_WC(c1, wc1, _("unbalanced ["));
+
+      if (c1 == '-')
 	/* build range characters.  */
 	{
-	  wc2 = fetch_wc(_("unbalanced ["));
-	  if (wc2 == L']')
+	  FETCH_WC(c2, wc2, _("unbalanced ["));
+	  if (c2 == ']')
 	    {
 	      /* In the case [x-], the - is an ordinary hyphen,
 		 which is left in c1, the lookahead character. */
 	      lexptr -= cur_mb_len;
 	      lexleft += cur_mb_len;
-	      wc2 = wc;
-	    }
-	  else
-	    {
-	      if (wc2 == L'\\'
-		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-		wc2 = fetch_wc(_("unbalanced ["));
-	      wc1 = fetch_wc(_("unbalanced ["));
-	    }
+            }
+        }
 
-	  /* When case folding map a range, say [m-z] (or even [M-z]) to the
-	     pair of ranges, [m-z] [M-Z].  */
-	  if (range_sts_al == 0)
-	    {
-	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
-	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
-	    }
-	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
-			       range_sts_al, work_mbc->nranges + 1);
-	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
-			       range_ends_al, work_mbc->nranges + 1);
-	  work_mbc->range_sts[work_mbc->nranges] =
-            case_fold ? towlower(wc) : (wchar_t)wc;
-	  work_mbc->range_ends[work_mbc->nranges++] =
-            case_fold ? towlower(wc2) : (wchar_t)wc2;
+      if (c1 == '-' && c2 != ']')
+        {
+          if (c2 == '\\'
+              && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+            FETCH_WC(c2, wc2, _("unbalanced ["));
 
-#ifndef GREP
-	  if (case_fold)
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1)
             {
+	      /* When case folding, map a range, say [m-z] (or even [M-z]),
+		 to the pair of ranges, [m-z] [M-Z].  */
+	      if (range_sts_al == 0)
+                {
+                  MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
+                  MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
+                }
               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
                                    range_sts_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
                                    range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+              work_mbc->range_sts[work_mbc->nranges] =
+                case_fold ? towlower(wc) : (wchar_t)wc;
+              work_mbc->range_ends[work_mbc->nranges++] =
+                case_fold ? towlower(wc2) : (wchar_t)wc2;
+
+#ifndef GREP
+              if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
+                {
+                  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                                       range_sts_al, work_mbc->nranges + 1);
+                  work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
+                  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                                       range_ends_al, work_mbc->nranges + 1);
+                  work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+                }
+#endif
             }
+          else
 #endif
+            {
+              c1 = c;
+              if (case_fold)
+                {
+                  c1 = tolower (c1);
+                  c2 = tolower (c2);
+                }
+              if (!hard_LC_COLLATE)
+                for (c = c1; c <= c2; c++)
+                  setbit_case_fold (c, ccl);
+              else
+                for (c = 0; c < NOTCHAR; ++c)
+                  if (!(case_fold && ISUPPER (c))
+                      && in_coll_range (c, c1, c2))
+                    setbit_case_fold (c, ccl);
+            }
+
+          FETCH_WC(c1, wc1, _("unbalanced ["));
+	  continue;
 	}
-      else if (wc != WEOF)
-	/* build normal characters.  */
-	{
-	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
-			       work_mbc->nchars + 1);
-	  work_mbc->chars[work_mbc->nchars++] =
-		(wchar_t) (case_fold ? towlower(wc) : wc);
-#ifndef GREP
-	  if (case_fold)
+
+      setbit_case_fold (c, ccl);
+#ifdef MBS_SUPPORT
+      /* Build normal characters.  */
+      if (MB_CUR_MAX > 1)
+        {
+          if (case_fold && iswalpha(wc))
+            {
+              wc = towlower(wc);
+              c = wctob(wc);
+              if (c == EOF || (wint_t)c != (wint_t)wc)
+                {
+                  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+                                       work_mbc->nchars + 1);
+                  work_mbc->chars[work_mbc->nchars++] = wc;
+                }
+#ifdef GREP
+	      continue;
+#else
+              wc = towupper(wc);
+              c = wctob(wc);
+#endif
+            }
+          if (c == EOF || (wint_t)c != (wint_t)wc)
             {
               REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = towupper(wc);
+              work_mbc->chars[work_mbc->nchars++] = wc;
             }
 #endif
         }
     }
-  while ((wc = wc1) != L']');
-  return MBCSET;
-}
-#endif /* MBS_SUPPORT */
+  while ((wc = wc1, (c = c1) != L']'));
 
-#ifdef __STDC__
-#define FUNC(F, P) static int F(int c) { return P(c); }
-#else
-#define FUNC(F, P) static int F(c) int c; { return P(c); }
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      static charclass zeroclass;
+      work_mbc->invert = invert;
+      work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
+      return MBCSET;
+    }
 #endif
 
-FUNC(is_alpha, ISALPHA)
-FUNC(is_upper, ISUPPER)
-FUNC(is_lower, ISLOWER)
-FUNC(is_digit, ISDIGIT)
-FUNC(is_xdigit, ISXDIGIT)
-FUNC(is_space, ISSPACE)
-FUNC(is_punct, ISPUNCT)
-FUNC(is_alnum, ISALNUM)
-FUNC(is_print, ISPRINT)
-FUNC(is_graph, ISGRAPH)
-FUNC(is_cntrl, ISCNTRL)
+  if (invert)
+    {
+      notset(ccl);
+      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
+        clrbit(eolbyte, ccl);
+    }
 
-static int
-is_blank (int c)
-{
-   return (c == ' ' || c == '\t');
+  return CSET + charclass_index(ccl);
 }
 
-/* The following list maps the names of the Posix named character classes
-   to predicate functions that determine whether a given character is in
-   the class.  The leading [ has already been eaten by the lexical analyzer. */
-static struct {
-  const char *name;
-  int (*pred) (int);
-} const prednames[] = {
-  { ":alpha:]", is_alpha },
-  { ":upper:]", is_upper },
-  { ":lower:]", is_lower },
-  { ":digit:]", is_digit },
-  { ":xdigit:]", is_xdigit },
-  { ":space:]", is_space },
-  { ":punct:]", is_punct },
-  { ":alnum:]", is_alnum },
-  { ":print:]", is_print },
-  { ":graph:]", is_graph },
-  { ":cntrl:]", is_cntrl },
-  { ":blank:]", is_blank },
-  { 0, 0 }
-};
-
 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
 
-static int
-looking_at (char const *s)
-{
-  size_t len;
-
-  len = strlen(s);
-  if (lexleft < len)
-    return 0;
-  return strncmp(s, lexptr, len) == 0;
-}
-
 static token
 lex (void)
 {
-  unsigned c, c1, c2;
-  int backslash = 0, invert;
+  unsigned c, c2;
+  int backslash = 0;
   charclass ccl;
   int i;
 
@@ -679,10 +769,7 @@ lex (void)
 #ifdef MBS_SUPPORT
       if (MB_CUR_MAX > 1)
         {
-          wint_t wi = fetch_wc (NULL);
-          if (wi == WEOF)
-            return lasttok = EOF;
-          wctok = wi, c = wctob (wi);
+          FETCH_WC (c, wctok, NULL);
           if ((int)c == EOF)
             goto normal_char;
         }
@@ -963,100 +1050,7 @@ lex (void)
 	  if (backslash)
 	    goto normal_char;
 	  laststart = 0;
-#ifdef MBS_SUPPORT
-	  if (MB_CUR_MAX > 1)
-	    {
-	      /* In multibyte environment a bracket expression may contain
-		 multibyte characters, which must be treated as characters
-		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
-	      return lasttok = parse_bracket_exp_mb();
-	    }
-#endif
-	  zeroset(ccl);
-	  FETCH(c, _("unbalanced ["));
-	  if (c == '^')
-	    {
-	      FETCH(c, _("unbalanced ["));
-	      invert = 1;
-	    }
-	  else
-	    invert = 0;
-	  do
-	    {
-	      /* Nobody ever said this had to be fast. :-)
-		 Note that if we're looking at some other [:...:]
-		 construct, we just treat it as a bunch of ordinary
-		 characters.  We can do this because we assume
-		 regex has checked for syntax errors before
-		 dfa is ever called. */
-	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
-		for (c1 = 0; prednames[c1].name; ++c1)
-		  if (looking_at(prednames[c1].name))
-		    {
-		      int (*pred) (int) = prednames[c1].pred;
-
-		      for (c2 = 0; c2 < NOTCHAR; ++c2)
-			if ((*pred)(c2))
-			  setbit_case_fold (c2, ccl);
-		      lexptr += strlen(prednames[c1].name);
-		      lexleft -= strlen(prednames[c1].name);
-		      FETCH(c1, _("unbalanced ["));
-		      goto skip;
-		    }
-	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-		FETCH(c, _("unbalanced ["));
-	      FETCH(c1, _("unbalanced ["));
-	      if (c1 == '-')
-		{
-		  FETCH(c2, _("unbalanced ["));
-		  if (c2 == ']')
-		    {
-		      /* In the case [x-], the - is an ordinary hyphen,
-			 which is left in c1, the lookahead character. */
-		      --lexptr;
-		      ++lexleft;
-		    }
-		  else
-		    {
-		      if (c2 == '\\'
-			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-			FETCH(c2, _("unbalanced ["));
-
-                      c1 = c;
-		      if (!hard_LC_COLLATE)
-		        for (c = c1; c <= c2; c++)
-			  setbit_case_fold (c, ccl);
-		      else
-                        {
-                          if (case_fold)
-                            {
-                              c1 = tolower (c1);
-                              c2 = tolower (c2);
-                            }
-                          for (c = 0; c < NOTCHAR; ++c)
-                            if (!(case_fold && ISUPPER (c))
-                                && in_coll_range (c, c1, c2))
-                              setbit_case_fold (c, ccl);
-                        }
-
-		      FETCH(c1, _("unbalanced ["));
-		      continue;
-		    }
-		}
-
-	      setbit_case_fold (c, ccl);
-
-	    skip:
-	      ;
-	    }
-	  while ((c = c1) != ']');
-	  if (invert)
-	    {
-	      notset(ccl);
-	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
-		clrbit(eolbyte, ccl);
-	    }
-	  return lasttok = CSET + charclass_index(ccl);
+	  return lasttok = parse_bracket_exp();
 
 	default:
 	normal_char:
@@ -2499,6 +2493,11 @@ match_mb_charset (struct dfa *d, int s, position pos, int idx)
   match = !work_mbc->invert;
   match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
 
+  /* Match in range 0-255?  */
+  if (wc < NOTCHAR && work_mbc->cset != -1
+      && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
+    goto charset_matched;
+
   /* match with a character class?  */
   for (i = 0; i<work_mbc->nch_classes; i++)
     {
diff --git a/src/dfa.h b/src/dfa.h
index 4928d822..594e25cf 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -243,6 +243,7 @@ struct dfamust
    e.g. [a-c], [[:alpha:]], etc.  */
 struct mb_char_classes
 {
+  int cset;
   int invert;
   wchar_t *chars;		/* Normal characters.  */
   int nchars;
author	Paolo Bonzini <bonzini@gnu.org>	2010-03-07 11:22:00 +0100
committer	Paolo Bonzini <bonzini@gnu.org>	2010-03-17 15:32:54 +0100
commit	8f9106c419d18759f767da351b3b6913f022c8f8 (patch)
tree	598d251c073b0c65c5b10927692dddbc2613a2fb
parent	3cba8f98be7f791a55af6433863349e742054dd0 (diff)
download	grep-8f9106c419d18759f767da351b3b6913f022c8f8.tar.gz