From 368d38f6c4118cfb7db1fd6500c45db1f53c0216 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Wed, 31 Aug 2016 20:16:32 -0700
Subject: dfa: make dfa.c fully thread-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This follows up on Zev Weiss’s recent patches to make the DFA code
thread-safe (Bug#24249).  It removes the remaining static
variables used by dfa.c.  These variables are locale-dependent, so
they would cause problems in multithreaded code where different
threads are in different locales (e.g., via uselocale).  I
abstracted most of the variables into a new localeinfo module.
* src/Makefile.am (grep_SOURCES): Add localeinfo.c.
(noinst_HEADERS): Add localeinfo.h.
* src/dfa.c: Include localeinfo.h.
(struct dfa): Remove multibyte member, as it is now part of
localeinfo.  New members simple_locale and localeinfo.
Put locale-related members at the end.
(mbrtowc_cache): Remove; now part of dfa->localeinfo.
(charclass_index): Rename back from dfa_charclass_index,
since it's private.
(unibyte_word_constituent): New arg DFA; use its sbctowc member.
(using_utf8, dfa_using_utf8, init_mbrtowc_cache, check_utf8):
Remove; now done by localeinfo members.  All uses changed.
(dfasyntax): New localeinfo arg.  Move to end to avoid forward decls.
Initialize the entire DFA.
(unibyte_c, check_unibyte_c): Remove; now in simple_locale member.
(using_simple_locale): Now takes bool instead of DFA.
Do the locale check here, rather than in the caller,
as the result is now cached in dfa->simple_locale.
(dfaalloc): Just allocate the DFA.  dfasyntax now initializes it.
* src/dfa.h: Add forward decl of struct localeinfo.
Adjust to new dfa.c API.
* src/dfasearch.c (localeinfo): New var, replacing former static
vars like mbrtowc_cache.
* src/localeinfo.c, src/localeinfo.h: New files.
* src/search.h: Include localeinfo.h.
(localeinfo): New decl.
* src/searchutils.c (mbclen_cache, build_mbclen_cache):
Remove.  All uses changed to localeinfo.
* tests/Makefile.am (dfa_match_aux_LDADD): Add localeinfo.o.
* tests/dfa-match-aux.c: Include localeinfo.h.
(main): Adjust to changes in DFA API.
---
 src/Makefile.am       |   4 +-
 src/dfa.c             | 269 ++++++++++++++++++++++----------------------------
 src/dfa.h             |  22 +++--
 src/dfasearch.c       |   6 +-
 src/grep.c            |   5 +-
 src/kwsearch.c        |   2 +-
 src/localeinfo.c      |  66 +++++++++++++
 src/localeinfo.h      |  47 +++++++++
 src/pcresearch.c      |   4 +-
 src/search.h          |   7 +-
 src/searchutils.c     |  18 ----
 tests/Makefile.am     |   2 +-
 tests/dfa-match-aux.c |   7 +-
 13 files changed, 262 insertions(+), 197 deletions(-)
 create mode 100644 src/localeinfo.c
 create mode 100644 src/localeinfo.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 941384ea..2b0ba0f1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,9 +25,9 @@ bin_PROGRAMS = grep
 bin_SCRIPTS = egrep fgrep
 grep_SOURCES = grep.c searchutils.c \
           dfa.c dfasearch.c \
-          kwset.c kwsearch.c \
+          kwset.c kwsearch.c localeinfo.c \
           pcresearch.c
-noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h
+noinst_HEADERS = grep.h dfa.h kwset.h localeinfo.h search.h system.h
 
 # Sometimes, the expansion of $(LIBINTL) includes -lc which may
 # include modules defining variables like 'optind', so libgreputils.a
diff --git a/src/dfa.c b/src/dfa.c
index 8451c818..bf8c5463 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -24,6 +24,8 @@
 
 #include "dfa.h"
 
+#include "localeinfo.h"
+
 #include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
@@ -418,14 +420,9 @@ struct dfa
   size_t nregexps;              /* Count of parallel regexps being built
                                    with dfaparse.  */
   bool fast;			/* The DFA is fast.  */
-  bool multibyte;		/* MB_CUR_MAX > 1.  */
   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;		/* Multibyte conversion state.  */
 
-  /* dfaexec implementation.  */
-  char *(*dfaexec) (struct dfa *, char const *, char *,
-                    bool, size_t *, bool *);
-
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
   /* The value of multibyte_prop[i] is defined by following rule.
@@ -511,6 +508,21 @@ struct dfa
   state_num **mb_trans;      /* Transition tables for states with ANYCHAR.  */
   state_num mb_trcount;         /* Number of transition tables for states with
                                    ANYCHAR that have actually been built.  */
+
+  /* Information derived from the locale.  This is at the end so that
+     dfasyntax can zero every earlier member with one memset.  */
+
+  /* dfaexec implementation.  */
+  char *(*dfaexec) (struct dfa *, char const *, char *,
+                    bool, size_t *, bool *);
+
+  /* The locale is simple, like the C locale.  These locales can be
+     processed more efficiently, e.g., the relationship between lower-
+     and upper-case letters is 1-1.  */
+  bool simple_locale;
+
+  /* Other cached information derived from the locale.  */
+  struct localeinfo localeinfo;
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -524,13 +536,8 @@ struct dfa
 
 static void regexp (struct dfa *dfa);
 
-/* A table indexed by byte values that contains the corresponding wide
-   character (if any) for that byte.  WEOF means the byte is not a
-   valid single-byte character.  */
-static wint_t mbrtowc_cache[NOTCHAR];
-
 /* Store into *PWC the result of converting the leading bytes of the
-   multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+   multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
    and updating the conversion state in *D.  On conversion error,
    convert just a single byte, to WEOF.  Return the number of bytes
    converted.
@@ -539,7 +546,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
 
    * PWC points to wint_t, not to wchar_t.
    * The last arg is a dfa *D instead of merely a multibyte conversion
-     state D->mbs.  D also contains an mbrtowc_cache for speed.
+     state D->mbs.
    * N must be at least 1.
    * S[N - 1] must be a sentinel byte.
    * Shift encodings are not supported.
@@ -550,7 +557,7 @@ static size_t
 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 {
   unsigned char uc = s[0];
-  wint_t wc = mbrtowc_cache[uc];
+  wint_t wc = d->localeinfo.sbctowc[uc];
 
   if (wc == WEOF)
     {
@@ -727,7 +734,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
 
 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
 {
   size_t i;
 
@@ -742,9 +749,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
 }
 
 static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 {
-  return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+  return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 }
 
 static int
@@ -752,68 +759,11 @@ char_context (struct dfa const *dfa, unsigned char c)
 {
   if (c == dfa->syntax.eolbyte)
     return CTX_NEWLINE;
-  if (unibyte_word_constituent (c))
+  if (unibyte_word_constituent (dfa, c))
     return CTX_LETTER;
   return CTX_NONE;
 }
 
-/* UTF-8 encoding allows some optimizations that we can't otherwise
-   assume in a multibyte encoding.  */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
-  return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
-  int i;
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      char c = i;
-      unsigned char uc = i;
-      mbstate_t s = { 0 };
-      wchar_t wc;
-      mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
-    }
-}
-
-/* Entry point to set syntax options.  */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
-  int i;
-  dfa->syntax.syntax_bits_set = true;
-  dfa->syntax.syntax_bits = bits;
-  dfa->syntax.case_fold = fold;
-  dfa->syntax.eolbyte = eol;
-
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      unsigned char uc = i;
-
-      /* Use mbrtowc_cache to calculate sbit.  */
-      dfa->syntax.sbit[uc] = char_context (dfa, uc);
-      switch (dfa->syntax.sbit[uc])
-        {
-        case CTX_LETTER:
-          setbit (uc, dfa->syntax.letters);
-          break;
-        case CTX_NEWLINE:
-          setbit (uc, dfa->syntax.newline);
-          break;
-        }
-
-      /* POSIX requires that the five bytes in "\n\r./" (including the
-         terminating NUL) cannot occur inside a multibyte character.  */
-      dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
-                                     : strchr ("\n\r./", uc) != NULL);
-    }
-}
-
 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
    this may happen when folding case in weird Turkish locales where
@@ -842,30 +792,10 @@ setbit_case_fold_c (int b, charclass c)
       setbit (i, c);
 }
 
-static void check_utf8 (void)
-{
-  wchar_t wc;
-  mbstate_t mbs = { 0 };
-  using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
-  char const *locale = setlocale (LC_ALL, NULL);
-  unibyte_c = (!locale
-               || STREQ (locale, "C")
-               || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
-   without multicharacter collating sequences and where range
-   comparisons simply use the native encoding.  These locales can be
-   processed more efficiently.  */
+/* Return true if the locale is compatible with the C locale.  */
 
 static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
 {
   /* The native character set is known to be compatible with
      the C locale.  The following test isn't perfect, but it's good
@@ -883,7 +813,15 @@ using_simple_locale (struct dfa const *dfa)
      && '}' == 125 && '~' == 126)
   };
 
-  return (native_c_charset & !dfa->multibyte) | unibyte_c;
+  if (native_c_charset && !multibyte)
+    return true;
+  else
+    {
+      /* Treat C and POSIX locales as being compatible.  Also, treat
+         errors as compatible, as these are invariably from stubs.  */
+      char const *loc = setlocale (LC_ALL, NULL);
+      return !loc || strcmp (loc, "C") == 0 || strcmp (loc, "POSIX") == 0;
+    }
 }
 
 /* Fetch the next lexical input character.  Set C (of type int) to the
@@ -1034,7 +972,7 @@ parse_bracket_exp (struct dfa *dfa)
   size_t chars_al;
 
   chars_al = 0;
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
                                     &dfa->mbcsets_alloc,
@@ -1057,7 +995,7 @@ parse_bracket_exp (struct dfa *dfa)
     {
       FETCH_WC (dfa, c, wc, _("unbalanced ["));
       invert = true;
-      known_bracket_exp = using_simple_locale (dfa);
+      known_bracket_exp = dfa->simple_locale;
     }
   else
     invert = false;
@@ -1112,7 +1050,7 @@ parse_bracket_exp (struct dfa *dfa)
                   if (!pred)
                     dfaerror (_("invalid character class"));
 
-                  if (dfa->multibyte && !pred->single_byte_only)
+                  if (dfa->localeinfo.multibyte && !pred->single_byte_only)
                     known_bracket_exp = false;
                   else
                     for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1172,9 +1110,9 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->multibyte)
+                  if (dfa->localeinfo.multibyte)
                     known_bracket_exp = false;
-                  else if (using_simple_locale (dfa))
+                  else if (dfa->simple_locale)
                     {
                       int ci;
                       for (ci = c; ci <= c2; ci++)
@@ -1201,7 +1139,7 @@ parse_bracket_exp (struct dfa *dfa)
 
       colon_warning_state |= (c == ':') ? 2 : 4;
 
-      if (!dfa->multibyte)
+      if (!dfa->localeinfo.multibyte)
         {
           if (dfa->syntax.case_fold)
             setbit_case_fold_c (c, ccl);
@@ -1238,22 +1176,22 @@ parse_bracket_exp (struct dfa *dfa)
   if (! known_bracket_exp)
     return BACKREF;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       work_mbc->invert = invert;
-      work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
       return MBCSET;
     }
 
   if (invert)
     {
-      assert (!dfa->multibyte);
+      assert (!dfa->localeinfo.multibyte);
       notset (ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit ('\n', ccl);
     }
 
-  return CSET + dfa_charclass_index (dfa, ccl);
+  return CSET + charclass_index (dfa, ccl);
 }
 
 struct lexptr
@@ -1508,7 +1446,7 @@ lex (struct dfa *dfa)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             {
               /* In multibyte environment period must match with a single
                  character not a byte.  So we use ANYCHAR.  */
@@ -1522,13 +1460,13 @@ lex (struct dfa *dfa)
           if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
             clrbit ('\0', ccl);
           dfa->lex.laststart = false;
-          return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+          return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
 
         case 's':
         case 'S':
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1537,7 +1475,7 @@ lex (struct dfa *dfa)
               if (c == 'S')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1561,16 +1499,16 @@ lex (struct dfa *dfa)
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
 
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (unibyte_word_constituent (c2))
+                if (unibyte_word_constituent (dfa, c2))
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1600,14 +1538,14 @@ lex (struct dfa *dfa)
           dfa->lex.laststart = false;
           /* For multibyte character sets, folding is done in atom.  Always
              return WCHAR.  */
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             return dfa->lex.lasttok = WCHAR;
 
           if (dfa->syntax.case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           return dfa->lex.lasttok = c;
@@ -1627,11 +1565,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
     {
       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
                                 sizeof *dfa->tokens);
-      if (dfa->multibyte)
+      if (dfa->localeinfo.multibyte)
         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
                                          sizeof *dfa->multibyte_prop);
     }
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     dfa->multibyte_prop[dfa->tindex] = mbprop;
   dfa->tokens[dfa->tindex++] = t;
 
@@ -1668,7 +1606,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
 static void
 addtok (struct dfa *dfa, token t)
 {
-  if (dfa->multibyte && t == MBCSET)
+  if (dfa->localeinfo.multibyte && t == MBCSET)
     {
       bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1767,7 +1705,7 @@ add_utf8_anychar (struct dfa *dfa)
             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
               clrbit ('\0', c);
           }
-        dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
       }
 
   /* A valid UTF-8 character is
@@ -1851,7 +1789,7 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && using_utf8)
+  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
     {
       /* For UTF-8 expand the period to a series of CSETs that define a valid
          UTF-8 character.  This avoids using the slow multibyte path.  I'm
@@ -1912,7 +1850,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
 {
   size_t i;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     for (i = 0; i < ntokens; ++i)
       addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
   else
@@ -1998,7 +1936,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   d->lex.lasttok = END;
   d->lex.laststart = true;
   d->lex.parens = 0;
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     {
       d->lex.cur_mb_len = 0;
       memset (&d->mbs, 0, sizeof d->mbs);
@@ -2187,7 +2125,7 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2664,7 +2602,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
         {
           /* ANYCHAR must match a single character, so put it to
              D->states[s].mbps which contains the positions which can
@@ -2810,7 +2748,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+        trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
       trans[d->syntax.eolbyte] = state_newline;
     }
   else
@@ -2827,7 +2765,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
           insert (d->follows[grps[i].elems[j]].elems[k], &follows);
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           /* If a token in follows.elems is not 1st byte of a multibyte
              character, or the states of follows must accept the bytes
@@ -2860,7 +2798,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well.  */
-      if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+      if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
         {
           merge (&d->states[0].elems, &follows, &tmp);
           copy (&tmp, &follows);
@@ -2916,7 +2854,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
               if (c == d->syntax.eolbyte)
                 trans[c] = state_newline;
-              else if (unibyte_word_constituent (c))
+              else if (unibyte_word_constituent (d, c))
                 trans[c] = state_letter;
               else if (c < NOTCHAR)
                 trans[c] = state;
@@ -2957,7 +2895,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2969,7 +2907,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
         {
           d->trans[oldalloc] = NULL;
           d->fails[oldalloc] = NULL;
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             d->mb_trans[oldalloc] = NULL;
         }
     }
@@ -3003,7 +2941,7 @@ build_state (state_num s, struct dfa *d)
         }
       d->trcount = d->min_trcount;
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           for (i = d->min_trcount; i < d->tralloc; i++)
             {
@@ -3454,7 +3392,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
   return (char *) begin;
 }
 
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
    but faster and set *BACKREF if the DFA code does not support this
    regexp usage.  */
 
@@ -3512,7 +3450,7 @@ dfa_supported (struct dfa const *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (!d->multibyte)
+          if (!d->localeinfo.multibyte)
             continue;
           /* fallthrough */
 
@@ -3530,7 +3468,7 @@ dfaoptimize (struct dfa *d)
   size_t i;
   bool have_backref = false;
 
-  if (!using_utf8)
+  if (!d->localeinfo.using_utf8)
     return;
 
   for (i = 0; i < d->tindex; ++i)
@@ -3560,7 +3498,7 @@ dfaoptimize (struct dfa *d)
     }
 
   free_mbdata (d);
-  d->multibyte = false;
+  d->localeinfo.multibyte = false;
   d->dfaexec = dfaexec_sb;
   d->fast = true;
 }
@@ -3575,7 +3513,7 @@ dfassbuild (struct dfa *d)
   struct dfa *sup = dfaalloc ();
 
   *sup = *d;
-  sup->multibyte = false;
+  sup->localeinfo.multibyte = false;
   sup->dfaexec = dfaexec_sb;
   sup->multibyte_prop = NULL;
   sup->mbcsets = NULL;
@@ -3608,7 +3546,7 @@ dfassbuild (struct dfa *d)
         case BACKREF:
           zeroset (ccl);
           notset (ccl);
-          sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+          sup->tokens[j++] = CSET + charclass_index (sup, ccl);
           sup->tokens[j++] = STAR;
           if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
               || d->tokens[i + 1] == PLUS)
@@ -3619,7 +3557,7 @@ dfassbuild (struct dfa *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             {
               /* These constraints aren't supported in a multibyte locale.
                  Ignore them in the superset DFA.  */
@@ -3636,7 +3574,7 @@ dfassbuild (struct dfa *d)
     }
   sup->tindex = j;
 
-  if (have_nchar && (have_achar || d->multibyte))
+  if (have_nchar && (have_achar || d->localeinfo.multibyte))
     d->superset = sup;
   else
     {
@@ -3678,7 +3616,7 @@ dfafree (struct dfa *d)
   free (d->charclasses);
   free (d->tokens);
 
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     free_mbdata (d);
 
   for (i = 0; i < d->sindex; ++i)
@@ -4200,20 +4138,49 @@ dfamustfree (struct dfamust *dm)
 struct dfa *
 dfaalloc (void)
 {
-  struct dfa *d = xzalloc (sizeof *d);
-  d->multibyte = MB_CUR_MAX > 1;
-  d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
-  d->fast = !d->multibyte;
-  d->lex.cur_mb_len = 1;
-  return d;
+  return xmalloc (sizeof (struct dfa));
 }
 
+/* Initialize DFA.  */
 void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+           reg_syntax_t bits, bool fold, unsigned char eol)
 {
-  check_utf8 ();
-  check_unibyte_c ();
-  init_mbrtowc_cache ();
+  int i;
+  memset (dfa, 0, offsetof (struct dfa, dfaexec));
+  dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+  dfa->simple_locale = using_simple_locale (linfo->multibyte);
+  dfa->localeinfo = *linfo;
+
+  dfa->fast = !dfa->localeinfo.multibyte;
+
+  dfa->lex.cur_mb_len = 1;
+  dfa->syntax.syntax_bits_set = true;
+  dfa->syntax.syntax_bits = bits;
+  dfa->syntax.case_fold = fold;
+  dfa->syntax.eolbyte = eol;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      unsigned char uc = i;
+
+      dfa->syntax.sbit[uc] = char_context (dfa, uc);
+      switch (dfa->syntax.sbit[uc])
+        {
+        case CTX_LETTER:
+          setbit (uc, dfa->syntax.letters);
+          break;
+        case CTX_NEWLINE:
+          setbit (uc, dfa->syntax.newline);
+          break;
+        }
+
+      /* POSIX requires that the five bytes in "\n\r./" (including the
+         terminating NUL) cannot occur inside a multibyte character.  */
+      dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+                                     ? (uc & 0xc0) != 0x80
+                                     : strchr ("\n\r./", uc) != NULL);
+    }
 }
 
 /* vim:set shiftwidth=2: */
diff --git a/src/dfa.h b/src/dfa.h
index 585390a7..31baf7a1 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -24,6 +24,8 @@
 
 #include "xalloc.h" /* for _GL_ATTRIBUTE_MALLOC */
 
+struct localeinfo; /* See localeinfo.h.  */
+
 /* Element of a list of strings, at least one of which is known to
    appear in any R.E. matching the DFA. */
 struct dfamust
@@ -44,17 +46,22 @@ struct dfa;
    calling dfafree() on it. */
 extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
 
+/* Initialize or reinitialize a DFA.  This must be called before
+   any of the routines below.  The arguments are:
+   1. The DFA to operate on.
+   2. Information about the current locale.
+   3. The syntax bits described earlier in this file.
+   4. The case-folding flag.
+   5. The line terminator.  */
+extern void dfasyntax (struct dfa *, struct localeinfo const *,
+                       reg_syntax_t, bool, unsigned char);
+
 /* Build and return the struct dfamust from the given struct dfa. */
 extern struct dfamust *dfamust (struct dfa const *);
 
 /* Free the storage held by the components of a struct dfamust. */
 extern void dfamustfree (struct dfamust *);
 
-/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
-   second sets the syntax bits described earlier in this file, the third sets
-   the case-folding flag, and the fourth specifies the line terminator. */
-extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
-
 /* Compile the given string of the given length into the given struct dfa.
    Final argument is a flag specifying whether to build a searching or an
    exact matcher. */
@@ -99,8 +106,3 @@ extern void dfawarn (const char *);
    takes a single argument, a NUL-terminated string describing the error.
    The user must supply a dfaerror.  */
 extern _Noreturn void dfaerror (const char *);
-
-extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
-
-/* This must be called before calling any of the above dfa*() functions. */
-extern void dfa_init (void);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 10c4f51b..c2e0177b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -22,6 +22,8 @@
 #include "intprops.h"
 #include "search.h"
 
+struct localeinfo localeinfo;
+
 /* Whether -w considers WC to be a word constituent.  */
 static bool
 wordchar (wint_t wc)
@@ -128,7 +130,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
   if (match_icase)
     syntax_bits |= RE_ICASE;
   re_set_syntax (syntax_bits);
-  dfasyntax (dfa, syntax_bits, match_icase, eolbyte);
+  dfasyntax (dfa, &localeinfo, syntax_bits, match_icase, eolbyte);
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
@@ -277,7 +279,7 @@ EGexecute (char *buf, size_t size, size_t *match_size,
 
               if (exact_kwset_match)
                 {
-                  if (MB_CUR_MAX == 1 || dfa_using_utf8 ())
+                  if (MB_CUR_MAX == 1 || localeinfo.using_utf8)
                     goto success;
                   if (mb_start < beg)
                     mb_start = beg;
diff --git a/src/grep.c b/src/grep.c
index 0c84b2a3..fc22c7b6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -642,7 +642,7 @@ initialize_unibyte_mask (void)
   unsigned char mask = 0;
   int ms1b = 1;
   for (int i = 1; i <= UCHAR_MAX; i++)
-    if ((mbclen_cache[i] != 1) & ! (mask & i))
+    if ((localeinfo.sbclen[i] != 1) & ! (mask & i))
       {
         while (ms1b * 2 <= i)
           ms1b *= 2;
@@ -2344,7 +2344,7 @@ main (int argc, char **argv)
   textdomain (PACKAGE);
 #endif
 
-  dfa_init ();
+  init_localeinfo (&localeinfo);
 
   atexit (clean_up_stdout);
 
@@ -2726,7 +2726,6 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
-  build_mbclen_cache ();
   initialize_unibyte_mask ();
 
   /* In a unibyte locale, switch from fgrep to grep if
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 57fd4d77..508ebc5e 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -93,7 +93,7 @@ Fexecute (char *buf, size_t size, size_t *match_size,
     mb_check = longest = false;
   else
     {
-      mb_check = MB_CUR_MAX > 1 && !dfa_using_utf8 ();
+      mb_check = MB_CUR_MAX > 1 && !localeinfo.using_utf8;
       longest = mb_check | !!start_ptr | match_words;
     }
 
diff --git a/src/localeinfo.c b/src/localeinfo.c
new file mode 100644
index 00000000..329d4314
--- /dev/null
+++ b/src/localeinfo.c
@@ -0,0 +1,66 @@
+/* locale information
+
+   Copyright 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written by Paul Eggert.  */
+
+#include <config.h>
+
+#include <localeinfo.h>
+
+#include <verify.h>
+
+#include <limits.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* The sbclen implementation relies on this.  */
+verify (MB_LEN_MAX <= SCHAR_MAX);
+
+/* Return true if the locale uses UTF-8.  */
+
+static bool
+is_using_utf8 (void)
+{
+  wchar_t wc;
+  mbstate_t mbs = {0};
+  return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+
+/* Initialize *LOCALEINFO from the current locale.  */
+
+void
+init_localeinfo (struct localeinfo *localeinfo)
+{
+  int i;
+
+  localeinfo->multibyte = MB_CUR_MAX > 1;
+  localeinfo->using_utf8 = is_using_utf8 ();
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t s = {0};
+      wchar_t wc;
+      size_t len = mbrtowc (&wc, &c, 1, &s);
+      localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
+      localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
+    }
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
new file mode 100644
index 00000000..70b55a8d
--- /dev/null
+++ b/src/localeinfo.h
@@ -0,0 +1,47 @@
+/* locale information
+
+   Copyright 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written by Paul Eggert.  */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct localeinfo
+{
+  /* True if MB_CUR_MAX > 1 in the locale.  */
+  bool multibyte;
+
+  /* True if the locale uses UTF-8.  */
+  bool using_utf8;
+
+  /* An array indexed by byte values B that contains 1 if B is a
+     single-byte character (the null byte included), -1 if B is an
+     encoding error, and -2 if B is the leading byte of a multibyte
+     character that contains more than one byte.  */
+  signed char sbclen[UCHAR_MAX + 1];
+
+  /* An array indexed by byte values B that contains the wide
+     character corresponding to B when sbclen[B] == 1.  WEOF means B
+     is not a valid single-byte character, i.e., sbclen[B] == -1
+     or -2.  */
+  wint_t sbctowc[UCHAR_MAX + 1];
+};
+
+extern void init_localeinfo (struct localeinfo *);
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3f76603d..9ffa22a3 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -114,7 +114,7 @@ Pcompile (char const *pattern, size_t size)
 
   if (1 < MB_CUR_MAX)
     {
-      if (! dfa_using_utf8 ())
+      if (! localeinfo.using_utf8)
         error (EXIT_TROUBLE, 0,
                _("-P supports only unibyte and UTF-8 locales"));
       multibyte_locale = true;
@@ -254,7 +254,7 @@ Pexecute (char *buf, size_t size, size_t *match_size,
           /* Skip past bytes that are easily determined to be encoding
              errors, treating them as data that cannot match.  This is
              faster than having pcre_exec check them.  */
-          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+          while (localeinfo.sbclen[to_uchar (*p)] == -1)
             {
               p++;
               subject = p;
diff --git a/src/search.h b/src/search.h
index 7dc19408..431a67da 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,7 @@
 #include "dfa.h"
 #include "kwset.h"
 #include "xalloc.h"
+#include "localeinfo.h"
 
 _GL_INLINE_HEADER_BEGIN
 #ifndef SEARCH_INLINE
@@ -47,14 +48,12 @@ typedef signed char mb_len_map_t;
 
 /* searchutils.c */
 extern void kwsinit (kwset_t *);
-
-extern void build_mbclen_cache (void);
-extern size_t mbclen_cache[];
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
 extern wint_t mb_prev_wc (char const *, char const *, char const *);
 extern wint_t mb_next_wc (char const *, char const *);
 
 /* dfasearch.c */
+extern struct localeinfo localeinfo;
 extern void GEAcompile (char const *, size_t, reg_syntax_t);
 extern size_t EGexecute (char *, size_t, size_t *, char const *);
 
@@ -73,7 +72,7 @@ extern size_t Pexecute (char *, size_t, size_t *, char const *);
 SEARCH_INLINE size_t
 mb_clen (char const *s, size_t n, mbstate_t *mbs)
 {
-  size_t len = mbclen_cache[to_uchar (*s)];
+  size_t len = localeinfo.sbclen[to_uchar (*s)];
   return len == (size_t) -2 ? mbrlen (s, n, mbs) : len;
 }
 
diff --git a/src/searchutils.c b/src/searchutils.c
index d25e5f83..8081d418 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,8 +24,6 @@
 
 #define NCHAR (UCHAR_MAX + 1)
 
-size_t mbclen_cache[NCHAR];
-
 void
 kwsinit (kwset_t *kwset)
 {
@@ -46,22 +44,6 @@ kwsinit (kwset_t *kwset)
     xalloc_die ();
 }
 
-/* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
-void
-build_mbclen_cache (void)
-{
-  int i;
-
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      char c = i;
-      unsigned char uc = i;
-      mbstate_t mbs = { 0 };
-      size_t len = mbrlen (&c, 1, &mbs);
-      mbclen_cache[uc] = len ? len : 1;
-    }
-}
-
 /* In the buffer *MB_START, return the number of bytes needed to go
    back from CUR to the previous boundary, where a "boundary" is the
    start of a multibyte character or is an error-encoding byte.  The
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 77502ca2..355f44e2 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -42,7 +42,7 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
 # Tell the linker to omit references to unused shared libraries.
 AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
 LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a
-dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) $(LDADD)
+dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) ../src/localeinfo.$(OBJEXT) $(LDADD)
 
 # The triple-backref test is expected to fail with both the system
 # matcher (i.e., with glibc) and with the included matcher.
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c
index e6517352..e001b7de 100644
--- a/tests/dfa-match-aux.c
+++ b/tests/dfa-match-aux.c
@@ -24,6 +24,7 @@
 #include <string.h>
 #include <regex.h>
 #include <dfa.h>
+#include <localeinfo.h>
 
 #include "progname.h"
 
@@ -47,17 +48,17 @@ main (int argc, char **argv)
   struct dfa *dfa;
   char *beg, *end, *p;
   int allow_nl;
+  struct localeinfo localeinfo;
 
   set_program_name (argv[0]);
   if (argc < 3)
     exit (EXIT_FAILURE);
 
   setlocale (LC_ALL, "");
-
-  dfa_init ();
+  init_localeinfo (&localeinfo);
 
   dfa = dfaalloc ();
-  dfasyntax (dfa, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
+  dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
   dfacomp (argv[1], strlen (argv[1]), dfa, 0);
 
   beg = argv[2];
-- 
cgit v1.2.1