summary refs log tree commit diff
diff options
context:
space:
mode:
author Paul Eggert <eggert@cs.ucla.edu> 2016-08-31 20:16:32 -0700
committer Paul Eggert <eggert@cs.ucla.edu> 2016-08-31 20:18:13 -0700
commit 368d38f6c4118cfb7db1fd6500c45db1f53c0216 (patch)
tree b9e57607ba41ac7c3591b0e3c271f3a6d2395a81
parent e3c694ca3729d04baa2320e58753d10bc1cf867a (diff)
download grep-368d38f6c4118cfb7db1fd6500c45db1f53c0216.tar.gz
dfa: make dfa.c fully thread-safe
This follows up on Zev Weiss’s recent patches to make the DFA code thread-safe (Bug#24249). It removes the remaining static variables used by dfa.c. These variables are locale-dependent, so they would cause problems in multithreaded code where different threads are in different locales (e.g., via uselocale). I abstracted most of the variables into a new localeinfo module.

* src/Makefile.am (grep_SOURCES): Add localeinfo.c.
(noinst_HEADERS): Add localeinfo.h.
* src/dfa.c: Include localeinfo.h.
(struct dfa): Remove multibyte member, as it is now part of localeinfo. New members simple_locale and localeinfo. Put locale-related members at the end.
(mbrtowc_cache): Remove; now part of dfa->localeinfo.
(charclass_index): Rename back from dfa_charclass_index, since it's private.
(unibyte_word_constituent): New arg DFA; use its sbctowc member.
(using_utf8, dfa_using_utf8, init_mbrtowc_cache, check_utf8): Remove; now done by localeinfo members. All uses changed.
(dfasyntax): New localeinfo arg. Move to end to avoid forward decls. Initialize the entire DFA.
(unibyte_c, check_unibyte_c): Remove; now in simple_locale member.
(using_simple_locale): Now takes bool instead of DFA. Do the locale check here, rather than in the caller, as the result is now cached in dfa->simple_locale.
(dfaalloc): Just allocate the DFA. dfasyntax now initializes it.
* src/dfa.h: Add forward decl of struct localeinfo. Adjust to new dfa.c API.
* src/dfasearch.c (localeinfo): New var, replacing former static vars like mbrtowc_cache.
* src/localeinfo.c, src/localeinfo.h: New files.
* src/search.h: Include localeinfo.h.
(localeinfo): New decl.
* src/searchutils.c (mbclen_cache, build_mbclen_cache): Remove. All uses changed to localeinfo.
* tests/Makefile.am (dfa_match_aux_LDADD): Add localeinfo.o.
* tests/dfa-match-aux.c: Include localeinfo.h.
(main): Adjust to changes in DFA API.
-rw-r--r-- src/Makefile.am 4
-rw-r--r-- src/dfa.c 269
-rw-r--r-- src/dfa.h 22
-rw-r--r-- src/dfasearch.c 6
-rw-r--r-- src/grep.c 5
-rw-r--r-- src/kwsearch.c 2
-rw-r--r-- src/localeinfo.c 66
-rw-r--r-- src/localeinfo.h 47
-rw-r--r-- src/pcresearch.c 4
-rw-r--r-- src/search.h 7
-rw-r--r-- src/searchutils.c 18
-rw-r--r-- tests/Makefile.am 2
-rw-r--r-- tests/dfa-match-aux.c 7
13 files changed, 262 insertions, 197 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 941384ea..2b0ba0f1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,9 +25,9 @@ bin_PROGRAMS = grep
bin_SCRIPTS = egrep fgrep
grep_SOURCES = grep.c searchutils.c \
dfa.c dfasearch.c \
- kwset.c kwsearch.c \
+ kwset.c kwsearch.c localeinfo.c \
pcresearch.c
-noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h
+noinst_HEADERS = grep.h dfa.h kwset.h localeinfo.h search.h system.h
# Sometimes, the expansion of $(LIBINTL) includes -lc which may
# include modules defining variables like 'optind', so libgreputils.a
diff --git a/src/dfa.c b/src/dfa.c
index 8451c818..bf8c5463 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -24,6 +24,8 @@
#include "dfa.h"
+#include "localeinfo.h"
+
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
@@ -418,14 +420,9 @@ struct dfa
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
bool fast; /* The DFA is fast. */
- bool multibyte; /* MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
- /* dfaexec implementation. */
- char *(*dfaexec) (struct dfa *, char const *, char *,
- bool, size_t *, bool *);
-
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -511,6 +508,21 @@ struct dfa
state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
+
+ /* Information derived from the locale. This is at the end so that
+ a quick memset need not clear it specially. */
+
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *,
+ bool, size_t *, bool *);
+
+ /* The locale is simple, like the C locale. These locales can be
+ processed more efficiently, e.g., the relationship between lower-
+ and upper-case letters is 1-1. */
+ bool simple_locale;
+
+ /* Other cached information derived from the locale. */
+ struct localeinfo localeinfo;
};
/* Some macros for user access to dfa internals. */
@@ -524,13 +536,8 @@ struct dfa
static void regexp (struct dfa *dfa);
-/* A table indexed by byte values that contains the corresponding wide
- character (if any) for that byte. WEOF means the byte is not a
- valid single-byte character. */
-static wint_t mbrtowc_cache[NOTCHAR];
-
/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
and updating the conversion state in *D. On conversion error,
convert just a single byte, to WEOF. Return the number of bytes
converted.
@@ -539,7 +546,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
* PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ state D->mbs.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
@@ -550,7 +557,7 @@ static size_t
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
- wint_t wc = mbrtowc_cache[uc];
+ wint_t wc = d->localeinfo.sbctowc[uc];
if (wc == WEOF)
{
@@ -727,7 +734,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
{
size_t i;
@@ -742,9 +749,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
}
static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
{
- return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+ return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
}
static int
@@ -752,68 +759,11 @@ char_context (struct dfa const *dfa, unsigned char c)
{
if (c == dfa->syntax.eolbyte)
return CTX_NEWLINE;
- if (unibyte_word_constituent (c))
+ if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
return CTX_NONE;
}
-/* UTF-8 encoding allows some optimizations that we can't otherwise
- assume in a multibyte encoding. */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
- return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
- int i;
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t s = { 0 };
- wchar_t wc;
- mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
- }
-}
-
-/* Entry point to set syntax options. */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
- int i;
- dfa->syntax.syntax_bits_set = true;
- dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- unsigned char uc = i;
-
- /* Use mbrtowc_cache to calculate sbit. */
- dfa->syntax.sbit[uc] = char_context (dfa, uc);
- switch (dfa->syntax.sbit[uc])
- {
- case CTX_LETTER:
- setbit (uc, dfa->syntax.letters);
- break;
- case CTX_NEWLINE:
- setbit (uc, dfa->syntax.newline);
- break;
- }
-
- /* POSIX requires that the five bytes in "\n\r./" (including the
- terminating NUL) cannot occur inside a multibyte character. */
- dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
- : strchr ("\n\r./", uc) != NULL);
- }
-}
-
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
this may happen when folding case in weird Turkish locales where
@@ -842,30 +792,10 @@ setbit_case_fold_c (int b, charclass c)
setbit (i, c);
}
-static void check_utf8 (void)
-{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
- char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (!locale
- || STREQ (locale, "C")
- || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
- without multicharacter collating sequences and where range
- comparisons simply use the native encoding. These locales can be
- processed more efficiently. */
+/* Return true if the locale is compatible with the C locale. */
static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
@@ -883,7 +813,15 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (native_c_charset & !dfa->multibyte) | unibyte_c;
+ if (native_c_charset && !multibyte)
+ return true;
+ else
+ {
+ /* Treat C and POSIX locales as being compatible. Also, treat
+ errors as compatible, as these are invariably from stubs. */
+ char const *loc = setlocale (LC_ALL, NULL);
+ return !loc || strcmp (loc, "C") == 0 || strcmp (loc, "POSIX") == 0;
+ }
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -1034,7 +972,7 @@ parse_bracket_exp (struct dfa *dfa)
size_t chars_al;
chars_al = 0;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
&dfa->mbcsets_alloc,
@@ -1057,7 +995,7 @@ parse_bracket_exp (struct dfa *dfa)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
invert = true;
- known_bracket_exp = using_simple_locale (dfa);
+ known_bracket_exp = dfa->simple_locale;
}
else
invert = false;
@@ -1112,7 +1050,7 @@ parse_bracket_exp (struct dfa *dfa)
if (!pred)
dfaerror (_("invalid character class"));
- if (dfa->multibyte && !pred->single_byte_only)
+ if (dfa->localeinfo.multibyte && !pred->single_byte_only)
known_bracket_exp = false;
else
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1172,9 +1110,9 @@ parse_bracket_exp (struct dfa *dfa)
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
known_bracket_exp = false;
- else if (using_simple_locale (dfa))
+ else if (dfa->simple_locale)
{
int ci;
for (ci = c; ci <= c2; ci++)
@@ -1201,7 +1139,7 @@ parse_bracket_exp (struct dfa *dfa)
colon_warning_state |= (c == ':') ? 2 : 4;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
if (dfa->syntax.case_fold)
setbit_case_fold_c (c, ccl);
@@ -1238,22 +1176,22 @@ parse_bracket_exp (struct dfa *dfa)
if (! known_bracket_exp)
return BACKREF;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
work_mbc->invert = invert;
- work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
return MBCSET;
}
if (invert)
{
- assert (!dfa->multibyte);
+ assert (!dfa->localeinfo.multibyte);
notset (ccl);
if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', ccl);
}
- return CSET + dfa_charclass_index (dfa, ccl);
+ return CSET + charclass_index (dfa, ccl);
}
struct lexptr
@@ -1508,7 +1446,7 @@ lex (struct dfa *dfa)
case '.':
if (backslash)
goto normal_char;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
@@ -1522,13 +1460,13 @@ lex (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
case 's':
case 'S':
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1537,7 +1475,7 @@ lex (struct dfa *dfa)
if (c == 'S')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1561,16 +1499,16 @@ lex (struct dfa *dfa)
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (unibyte_word_constituent (c2))
+ if (unibyte_word_constituent (dfa, c2))
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1600,14 +1538,14 @@ lex (struct dfa *dfa)
dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
return dfa->lex.lasttok = c;
@@ -1627,11 +1565,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
{
dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
sizeof *dfa->tokens);
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
sizeof *dfa->multibyte_prop);
}
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t;
@@ -1668,7 +1606,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
static void
addtok (struct dfa *dfa, token t)
{
- if (dfa->multibyte && t == MBCSET)
+ if (dfa->localeinfo.multibyte && t == MBCSET)
{
bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1767,7 +1705,7 @@ add_utf8_anychar (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', c);
}
- dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
}
/* A valid UTF-8 character is
@@ -1851,7 +1789,7 @@ atom (struct dfa *dfa)
dfa->parse.tok = lex (dfa);
}
- else if (dfa->parse.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1912,7 +1850,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
{
size_t i;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
for (i = 0; i < ntokens; ++i)
addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
@@ -1998,7 +1936,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
d->lex.lasttok = END;
d->lex.laststart = true;
d->lex.parens = 0;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
@@ -2187,7 +2125,7 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+ if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
{
int acceptable
= ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2664,7 +2602,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
{
/* ANYCHAR must match a single character, so put it to
D->states[s].mbps which contains the positions which can
@@ -2810,7 +2748,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_letter = state;
for (i = 0; i < NOTCHAR; ++i)
- trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+ trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
trans[d->syntax.eolbyte] = state_newline;
}
else
@@ -2827,7 +2765,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
insert (d->follows[grps[i].elems[j]].elems[k], &follows);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
@@ -2860,7 +2798,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
- if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+ if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
{
merge (&d->states[0].elems, &follows, &tmp);
copy (&tmp, &follows);
@@ -2916,7 +2854,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (c == d->syntax.eolbyte)
trans[c] = state_newline;
- else if (unibyte_word_constituent (c))
+ else if (unibyte_word_constituent (d, c))
trans[c] = state_letter;
else if (c < NOTCHAR)
trans[c] = state;
@@ -2957,7 +2895,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2969,7 +2907,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
{
d->trans[oldalloc] = NULL;
d->fails[oldalloc] = NULL;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
d->mb_trans[oldalloc] = NULL;
}
}
@@ -3003,7 +2941,7 @@ build_state (state_num s, struct dfa *d)
}
d->trcount = d->min_trcount;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
for (i = d->min_trcount; i < d->tralloc; i++)
{
@@ -3454,7 +3392,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
return (char *) begin;
}
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
but faster and set *BACKREF if the DFA code does not support this
regexp usage. */
@@ -3512,7 +3450,7 @@ dfa_supported (struct dfa const *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (!d->multibyte)
+ if (!d->localeinfo.multibyte)
continue;
/* fallthrough */
@@ -3530,7 +3468,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!using_utf8)
+ if (!d->localeinfo.using_utf8)
return;
for (i = 0; i < d->tindex; ++i)
@@ -3560,7 +3498,7 @@ dfaoptimize (struct dfa *d)
}
free_mbdata (d);
- d->multibyte = false;
+ d->localeinfo.multibyte = false;
d->dfaexec = dfaexec_sb;
d->fast = true;
}
@@ -3575,7 +3513,7 @@ dfassbuild (struct dfa *d)
struct dfa *sup = dfaalloc ();
*sup = *d;
- sup->multibyte = false;
+ sup->localeinfo.multibyte = false;
sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
@@ -3608,7 +3546,7 @@ dfassbuild (struct dfa *d)
case BACKREF:
zeroset (ccl);
notset (ccl);
- sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = CSET + charclass_index (sup, ccl);
sup->tokens[j++] = STAR;
if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
|| d->tokens[i + 1] == PLUS)
@@ -3619,7 +3557,7 @@ dfassbuild (struct dfa *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* These constraints aren't supported in a multibyte locale.
Ignore them in the superset DFA. */
@@ -3636,7 +3574,7 @@ dfassbuild (struct dfa *d)
}
sup->tindex = j;
- if (have_nchar && (have_achar || d->multibyte))
+ if (have_nchar && (have_achar || d->localeinfo.multibyte))
d->superset = sup;
else
{
@@ -3678,7 +3616,7 @@ dfafree (struct dfa *d)
free (d->charclasses);
free (d->tokens);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
free_mbdata (d);
for (i = 0; i < d->sindex; ++i)
@@ -4200,20 +4138,49 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xzalloc (sizeof *d);
- d->multibyte = MB_CUR_MAX > 1;
- d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
- d->fast = !d->multibyte;
- d->lex.cur_mb_len = 1;
- return d;
+ return xmalloc (sizeof (struct dfa));
}
+/* Initialize DFA. */
void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+ reg_syntax_t bits, bool fold, unsigned char eol)
{
- check_utf8 ();
- check_unibyte_c ();
- init_mbrtowc_cache ();
+ int i;
+ memset (dfa, 0, offsetof (struct dfa, dfaexec));
+ dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+ dfa->simple_locale = using_simple_locale (linfo->multibyte);
+ dfa->localeinfo = *linfo;
+
+ dfa->fast = !dfa->localeinfo.multibyte;
+
+ dfa->lex.cur_mb_len = 1;
+ dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.syntax_bits = bits;
+ dfa->syntax.case_fold = fold;
+ dfa->syntax.eolbyte = eol;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+
+ dfa->syntax.sbit[uc] = char_context (dfa, uc);
+ switch (dfa->syntax.sbit[uc])
+ {
+ case CTX_LETTER:
+ setbit (uc, dfa->syntax.letters);
+ break;
+ case CTX_NEWLINE:
+ setbit (uc, dfa->syntax.newline);
+ break;
+ }
+
+ /* POSIX requires that the five bytes in "\n\r./" (including the
+ terminating NUL) cannot occur inside a multibyte character. */
+ dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+ ? (uc & 0xc0) != 0x80
+ : strchr ("\n\r./", uc) != NULL);
+ }
}
/* vim:set shiftwidth=2: */
diff --git a/src/dfa.h b/src/dfa.h
index 585390a7..31baf7a1 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -24,6 +24,8 @@
#include "xalloc.h" /* for _GL_ATTRIBUTE_MALLOC */
+struct localeinfo; /* See localeinfo.h. */
+
/* Element of a list of strings, at least one of which is known to
appear in any R.E. matching the DFA. */
struct dfamust
@@ -44,17 +46,22 @@ struct dfa;
calling dfafree() on it. */
extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
+/* Initialize or reinitialize a DFA. This must be called before
+ any of the routines below. The arguments are:
+ 1. The DFA to operate on.
+ 2. Information about the current locale.
+ 3. The syntax bits described earlier in this file.
+ 4. The case-folding flag.
+ 5. The line terminator. */
+extern void dfasyntax (struct dfa *, struct localeinfo const *,
+ reg_syntax_t, bool, unsigned char);
+
/* Build and return the struct dfamust from the given struct dfa. */
extern struct dfamust *dfamust (struct dfa const *);
/* Free the storage held by the components of a struct dfamust. */
extern void dfamustfree (struct dfamust *);
-/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
- second sets the syntax bits described earlier in this file, the third sets
- the case-folding flag, and the fourth specifies the line terminator. */
-extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
-
/* Compile the given string of the given length into the given struct dfa.
Final argument is a flag specifying whether to build a searching or an
exact matcher. */
@@ -99,8 +106,3 @@ extern void dfawarn (const char *);
takes a single argument, a NUL-terminated string describing the error.
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
-
-extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
-
-/* This must be called before calling any of the above dfa*() functions. */
-extern void dfa_init (void);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 10c4f51b..c2e0177b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -22,6 +22,8 @@
#include "intprops.h"
#include "search.h"
+struct localeinfo localeinfo;
+
/* Whether -w considers WC to be a word constituent. */
static bool
wordchar (wint_t wc)
@@ -128,7 +130,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
if (match_icase)
syntax_bits |= RE_ICASE;
re_set_syntax (syntax_bits);
- dfasyntax (dfa, syntax_bits, match_icase, eolbyte);
+ dfasyntax (dfa, &localeinfo, syntax_bits, match_icase, eolbyte);
/* For GNU regex, pass the patterns separately to detect errors like
"[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
@@ -277,7 +279,7 @@ EGexecute (char *buf, size_t size, size_t *match_size,
if (exact_kwset_match)
{
- if (MB_CUR_MAX == 1 || dfa_using_utf8 ())
+ if (MB_CUR_MAX == 1 || localeinfo.using_utf8)
goto success;
if (mb_start < beg)
mb_start = beg;
diff --git a/src/grep.c b/src/grep.c
index 0c84b2a3..fc22c7b6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -642,7 +642,7 @@ initialize_unibyte_mask (void)
unsigned char mask = 0;
int ms1b = 1;
for (int i = 1; i <= UCHAR_MAX; i++)
- if ((mbclen_cache[i] != 1) & ! (mask & i))
+ if ((localeinfo.sbclen[i] != 1) & ! (mask & i))
{
while (ms1b * 2 <= i)
ms1b *= 2;
@@ -2344,7 +2344,7 @@ main (int argc, char **argv)
textdomain (PACKAGE);
#endif
- dfa_init ();
+ init_localeinfo (&localeinfo);
atexit (clean_up_stdout);
@@ -2726,7 +2726,6 @@ main (int argc, char **argv)
else
usage (EXIT_TROUBLE);
- build_mbclen_cache ();
initialize_unibyte_mask ();
/* In a unibyte locale, switch from fgrep to grep if
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 57fd4d77..508ebc5e 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -93,7 +93,7 @@ Fexecute (char *buf, size_t size, size_t *match_size,
mb_check = longest = false;
else
{
- mb_check = MB_CUR_MAX > 1 && !dfa_using_utf8 ();
+ mb_check = MB_CUR_MAX > 1 && !localeinfo.using_utf8;
longest = mb_check | !!start_ptr | match_words;
}
diff --git a/src/localeinfo.c b/src/localeinfo.c
new file mode 100644
index 00000000..329d4314
--- /dev/null
+++ b/src/localeinfo.c
@@ -0,0 +1,66 @@
+/* locale information
+
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <config.h>
+
+#include <localeinfo.h>
+
+#include <verify.h>
+
+#include <limits.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* The sbclen implementation relies on this. */
+verify (MB_LEN_MAX <= SCHAR_MAX);
+
+/* Return true if the locale uses UTF-8. */
+
+static bool
+is_using_utf8 (void)
+{
+ wchar_t wc;
+ mbstate_t mbs = {0};
+ return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+
+/* Initialize *LOCALEINFO from the current locale. */
+
+void
+init_localeinfo (struct localeinfo *localeinfo)
+{
+ int i;
+
+ localeinfo->multibyte = MB_CUR_MAX > 1;
+ localeinfo->using_utf8 = is_using_utf8 ();
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+ {
+ char c = i;
+ unsigned char uc = i;
+ mbstate_t s = {0};
+ wchar_t wc;
+ size_t len = mbrtowc (&wc, &c, 1, &s);
+ localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
+ localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
+ }
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
new file mode 100644
index 00000000..70b55a8d
--- /dev/null
+++ b/src/localeinfo.h
@@ -0,0 +1,47 @@
+/* locale information
+
+ Copyright 2016 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct localeinfo
+{
+ /* MB_CUR_MAX > 1. */
+ bool multibyte;
+
+ /* The locale uses UTF-8. */
+ bool using_utf8;
+
+ /* An array indexed by byte values B that contains 1 if B is a
+ single-byte character, -1 if B is an encoding error, and -2 if B
+ is the leading byte of a multibyte character that contains more
+ than one byte. */
+ signed char sbclen[UCHAR_MAX + 1];
+
+ /* An array indexed by byte values B that contains the corresponding
+ wide character (if any) for B if sbclen[B] == 1. WEOF means the
+ byte is not a valid single-byte character, i.e., sbclen[B] == -1
+ or -2. */
+ wint_t sbctowc[UCHAR_MAX + 1];
+};
+
+extern void init_localeinfo (struct localeinfo *);
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3f76603d..9ffa22a3 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -114,7 +114,7 @@ Pcompile (char const *pattern, size_t size)
if (1 < MB_CUR_MAX)
{
- if (! dfa_using_utf8 ())
+ if (! localeinfo.using_utf8)
error (EXIT_TROUBLE, 0,
_("-P supports only unibyte and UTF-8 locales"));
multibyte_locale = true;
@@ -254,7 +254,7 @@ Pexecute (char *buf, size_t size, size_t *match_size,
/* Skip past bytes that are easily determined to be encoding
errors, treating them as data that cannot match. This is
faster than having pcre_exec check them. */
- while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+ while (localeinfo.sbclen[to_uchar (*p)] == -1)
{
p++;
subject = p;
diff --git a/src/search.h b/src/search.h
index 7dc19408..431a67da 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,7 @@
#include "dfa.h"
#include "kwset.h"
#include "xalloc.h"
+#include "localeinfo.h"
_GL_INLINE_HEADER_BEGIN
#ifndef SEARCH_INLINE
@@ -47,14 +48,12 @@ typedef signed char mb_len_map_t;
/* searchutils.c */
extern void kwsinit (kwset_t *);
-
-extern void build_mbclen_cache (void);
-extern size_t mbclen_cache[];
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
extern wint_t mb_prev_wc (char const *, char const *, char const *);
extern wint_t mb_next_wc (char const *, char const *);
/* dfasearch.c */
+extern struct localeinfo localeinfo;
extern void GEAcompile (char const *, size_t, reg_syntax_t);
extern size_t EGexecute (char *, size_t, size_t *, char const *);
@@ -73,7 +72,7 @@ extern size_t Pexecute (char *, size_t, size_t *, char const *);
SEARCH_INLINE size_t
mb_clen (char const *s, size_t n, mbstate_t *mbs)
{
- size_t len = mbclen_cache[to_uchar (*s)];
+ size_t len = localeinfo.sbclen[to_uchar (*s)];
return len == (size_t) -2 ? mbrlen (s, n, mbs) : len;
}
diff --git a/src/searchutils.c b/src/searchutils.c
index d25e5f83..8081d418 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,8 +24,6 @@
#define NCHAR (UCHAR_MAX + 1)
-size_t mbclen_cache[NCHAR];
-
void
kwsinit (kwset_t *kwset)
{
@@ -46,22 +44,6 @@ kwsinit (kwset_t *kwset)
xalloc_die ();
}
-/* Initialize a cache of mbrlen values for each of its 1-byte inputs. */
-void
-build_mbclen_cache (void)
-{
- int i;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t mbs = { 0 };
- size_t len = mbrlen (&c, 1, &mbs);
- mbclen_cache[uc] = len ? len : 1;
- }
-}
-
/* In the buffer *MB_START, return the number of bytes needed to go
back from CUR to the previous boundary, where a "boundary" is the
start of a multibyte character or is an error-encoding byte. The
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 77502ca2..355f44e2 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -42,7 +42,7 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
# Tell the linker to omit references to unused shared libraries.
AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a
-dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) $(LDADD)
+dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) ../src/localeinfo.$(OBJEXT) $(LDADD)
# The triple-backref test is expected to fail with both the system
# matcher (i.e., with glibc) and with the included matcher.
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c
index e6517352..e001b7de 100644
--- a/tests/dfa-match-aux.c
+++ b/tests/dfa-match-aux.c
@@ -24,6 +24,7 @@
#include <string.h>
#include <regex.h>
#include <dfa.h>
+#include <localeinfo.h>
#include "progname.h"
@@ -47,17 +48,17 @@ main (int argc, char **argv)
struct dfa *dfa;
char *beg, *end, *p;
int allow_nl;
+ struct localeinfo localeinfo;
set_program_name (argv[0]);
if (argc < 3)
exit (EXIT_FAILURE);
setlocale (LC_ALL, "");
-
- dfa_init ();
+ init_localeinfo (&localeinfo);
dfa = dfaalloc ();
- dfasyntax (dfa, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
+ dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n');
dfacomp (argv[1], strlen (argv[1]), dfa, 0);
beg = argv[2];