From 368d38f6c4118cfb7db1fd6500c45db1f53c0216 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 31 Aug 2016 20:16:32 -0700 Subject: dfa: make dfa.c fully thread-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This follows up on Zev Weiss’s recent patches to make the DFA code thread-safe (Bug#24249). It removes the remaining static variables used by dfa.c. These variables are locale-dependent, so they would cause problems in multithreaded code where different threads are in different locales (e.g., via uselocale). I abstracted most of the variables into a new localeinfo module. * src/Makefile.am (grep_SOURCES): Add localeinfo.c. (noinst_HEADERS): Add localeinfo.h. * src/dfa.c: Include localeinfo.h. (struct dfa): Remove multibyte member, as it is now part of localeinfo. New members simple_locale and localeinfo. Put locale-related members at the end. (mbrtowc_cache): Remove; now part of dfa->localeinfo. (charclass_index): Rename back from dfa_charclass_index, since it's private. (unibyte_word_constituent): New arg DFA; use its sbctowc member. (using_utf8, dfa_using_utf8, init_mbrtowc_cache, check_utf8): Remove; now done by localeinfo members. All uses changed. (dfasyntax): New localeinfo arg. Move to end to avoid forward decls. Initialize the entire DFA. (unibyte_c, check_unibyte_c): Remove; now in simple_locale member. (using_simple_locale): Now takes bool instead of DFA. Do the locale check here, rather than in the caller, as the result is now cached in dfa->simple_locale. (dfaalloc): Just allocate the DFA. dfasyntax now initializes it. * src/dfa.h: Add forward decl of struct localeinfo. Adjust to new dfa.c API. * src/dfasearch.c (localeinfo): New var, replacing former static vars like mbrtowc_cache. * src/localeinfo.c, src/localeinfo.h: New files. * src/search.h: Include localeinfo.h. (localeinfo): New decl. * src/searchutils.c (mbclen_cache, build_mbclen_cache): Remove. All uses changed to localeinfo. 
* tests/Makefile.am (dfa_match_aux_LDADD): Add localeinfo.o. * tests/dfa-match-aux.c: Include localeinfo.h. (main): Adjust to changes in DFA API. --- src/Makefile.am | 4 +- src/dfa.c | 269 ++++++++++++++++++++++---------------------------- src/dfa.h | 22 +++-- src/dfasearch.c | 6 +- src/grep.c | 5 +- src/kwsearch.c | 2 +- src/localeinfo.c | 66 +++++++++++++ src/localeinfo.h | 47 +++++++++ src/pcresearch.c | 4 +- src/search.h | 7 +- src/searchutils.c | 18 ---- tests/Makefile.am | 2 +- tests/dfa-match-aux.c | 7 +- 13 files changed, 262 insertions(+), 197 deletions(-) create mode 100644 src/localeinfo.c create mode 100644 src/localeinfo.h diff --git a/src/Makefile.am b/src/Makefile.am index 941384ea..2b0ba0f1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,9 +25,9 @@ bin_PROGRAMS = grep bin_SCRIPTS = egrep fgrep grep_SOURCES = grep.c searchutils.c \ dfa.c dfasearch.c \ - kwset.c kwsearch.c \ + kwset.c kwsearch.c localeinfo.c \ pcresearch.c -noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h +noinst_HEADERS = grep.h dfa.h kwset.h localeinfo.h search.h system.h # Sometimes, the expansion of $(LIBINTL) includes -lc which may # include modules defining variables like 'optind', so libgreputils.a diff --git a/src/dfa.c b/src/dfa.c index 8451c818..bf8c5463 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -24,6 +24,8 @@ #include "dfa.h" +#include "localeinfo.h" + #include #include #include @@ -418,14 +420,9 @@ struct dfa size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ bool fast; /* The DFA is fast. */ - bool multibyte; /* MB_CUR_MAX > 1. */ token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ - /* dfaexec implementation. */ - char *(*dfaexec) (struct dfa *, char const *, char *, - bool, size_t *, bool *); - /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. 
@@ -511,6 +508,21 @@ struct dfa state_num **mb_trans; /* Transition tables for states with ANYCHAR. */ state_num mb_trcount; /* Number of transition tables for states with ANYCHAR that have actually been built. */ + + /* Information derived from the locale. This is at the end so that + a quick memset need not clear it specially. */ + + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, + bool, size_t *, bool *); + + /* The locale is simple, like the C locale. These locales can be + processed more efficiently, e.g., the relationship between lower- + and upper-case letters is 1-1. */ + bool simple_locale; + + /* Other cached information derived from the locale. */ + struct localeinfo localeinfo; }; /* Some macros for user access to dfa internals. */ @@ -524,13 +536,8 @@ struct dfa static void regexp (struct dfa *dfa); -/* A table indexed by byte values that contains the corresponding wide - character (if any) for that byte. WEOF means the byte is not a - valid single-byte character. */ -static wint_t mbrtowc_cache[NOTCHAR]; - /* Store into *PWC the result of converting the leading bytes of the - multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + multibyte buffer S of length N bytes, using D->localeinfo.sbctowc and updating the conversion state in *D. On conversion error, convert just a single byte, to WEOF. Return the number of bytes converted. @@ -539,7 +546,7 @@ static wint_t mbrtowc_cache[NOTCHAR]; * PWC points to wint_t, not to wchar_t. * The last arg is a dfa *D instead of merely a multibyte conversion - state D->mbs. D also contains an mbrtowc_cache for speed. + state D->mbs. * N must be at least 1. * S[N - 1] must be a sentinel byte. * Shift encodings are not supported. 
@@ -550,7 +557,7 @@ static size_t mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; - wint_t wc = mbrtowc_cache[uc]; + wint_t wc = d->localeinfo.sbctowc[uc]; if (wc == WEOF) { @@ -727,7 +734,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) /* In DFA D, find the index of charclass S, or allocate a new one. */ static size_t -dfa_charclass_index (struct dfa *d, charclass const s) +charclass_index (struct dfa *d, charclass const s) { size_t i; @@ -742,9 +749,9 @@ dfa_charclass_index (struct dfa *d, charclass const s) } static bool -unibyte_word_constituent (unsigned char c) +unibyte_word_constituent (struct dfa const *dfa, unsigned char c) { - return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); + return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_'); } static int @@ -752,68 +759,11 @@ char_context (struct dfa const *dfa, unsigned char c) { if (c == dfa->syntax.eolbyte) return CTX_NEWLINE; - if (unibyte_word_constituent (c)) + if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; return CTX_NONE; } -/* UTF-8 encoding allows some optimizations that we can't otherwise - assume in a multibyte encoding. */ -static bool using_utf8; - -bool -dfa_using_utf8 (void) -{ - return using_utf8; -} - -static void -init_mbrtowc_cache (void) -{ - int i; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - } -} - -/* Entry point to set syntax options. */ -void -dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) -{ - int i; - dfa->syntax.syntax_bits_set = true; - dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; - - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - unsigned char uc = i; - - /* Use mbrtowc_cache to calculate sbit. 
*/ - dfa->syntax.sbit[uc] = char_context (dfa, uc); - switch (dfa->syntax.sbit[uc]) - { - case CTX_LETTER: - setbit (uc, dfa->syntax.letters); - break; - case CTX_NEWLINE: - setbit (uc, dfa->syntax.newline); - break; - } - - /* POSIX requires that the five bytes in "\n\r./" (including the - terminating NUL) cannot occur inside a multibyte character. */ - dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80 - : strchr ("\n\r./", uc) != NULL); - } -} - /* Set a bit in the charclass for the given wchar_t. Do nothing if WC is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, this may happen when folding case in weird Turkish locales where @@ -842,30 +792,10 @@ setbit_case_fold_c (int b, charclass c) setbit (i, c); } -static void check_utf8 (void) -{ - wchar_t wc; - mbstate_t mbs = { 0 }; - using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; -} - -static bool unibyte_c; - -static void check_unibyte_c (void) -{ - char const *locale = setlocale (LC_ALL, NULL); - unibyte_c = (!locale - || STREQ (locale, "C") - || STREQ (locale, "POSIX")); -} - -/* The current locale is known to be a unibyte locale - without multicharacter collating sequences and where range - comparisons simply use the native encoding. These locales can be - processed more efficiently. */ +/* Return true if the locale is compatible with the C locale. */ static bool -using_simple_locale (struct dfa const *dfa) +using_simple_locale (bool multibyte) { /* The native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good @@ -883,7 +813,15 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (native_c_charset & !dfa->multibyte) | unibyte_c; + if (native_c_charset && !multibyte) + return true; + else + { + /* Treat C and POSIX locales as being compatible. Also, treat + errors as compatible, as these are invariably from stubs. 
*/ + char const *loc = setlocale (LC_ALL, NULL); + return !loc || strcmp (loc, "C") == 0 || strcmp (loc, "POSIX") == 0; + } } /* Fetch the next lexical input character. Set C (of type int) to the @@ -1034,7 +972,7 @@ parse_bracket_exp (struct dfa *dfa) size_t chars_al; chars_al = 0; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, &dfa->mbcsets_alloc, @@ -1057,7 +995,7 @@ parse_bracket_exp (struct dfa *dfa) { FETCH_WC (dfa, c, wc, _("unbalanced [")); invert = true; - known_bracket_exp = using_simple_locale (dfa); + known_bracket_exp = dfa->simple_locale; } else invert = false; @@ -1112,7 +1050,7 @@ parse_bracket_exp (struct dfa *dfa) if (!pred) dfaerror (_("invalid character class")); - if (dfa->multibyte && !pred->single_byte_only) + if (dfa->localeinfo.multibyte && !pred->single_byte_only) known_bracket_exp = false; else for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1172,9 +1110,9 @@ parse_bracket_exp (struct dfa *dfa) /* Treat [x-y] as a range if x != y. */ if (wc != wc2 || wc == WEOF) { - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) known_bracket_exp = false; - else if (using_simple_locale (dfa)) + else if (dfa->simple_locale) { int ci; for (ci = c; ci <= c2; ci++) @@ -1201,7 +1139,7 @@ parse_bracket_exp (struct dfa *dfa) colon_warning_state |= (c == ':') ? 2 : 4; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { if (dfa->syntax.case_fold) setbit_case_fold_c (c, ccl); @@ -1238,22 +1176,22 @@ parse_bracket_exp (struct dfa *dfa) if (! known_bracket_exp) return BACKREF; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { work_mbc->invert = invert; - work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl); + work_mbc->cset = emptyset (ccl) ? 
-1 : charclass_index (dfa, ccl); return MBCSET; } if (invert) { - assert (!dfa->multibyte); + assert (!dfa->localeinfo.multibyte); notset (ccl); if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit ('\n', ccl); } - return CSET + dfa_charclass_index (dfa, ccl); + return CSET + charclass_index (dfa, ccl); } struct lexptr @@ -1508,7 +1446,7 @@ lex (struct dfa *dfa) case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ @@ -1522,13 +1460,13 @@ lex (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); case 's': case 'S': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1537,7 +1475,7 @@ lex (struct dfa *dfa) if (c == 'S') notset (ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1561,16 +1499,16 @@ lex (struct dfa *dfa) if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (unibyte_word_constituent (dfa, c2)) setbit (c2, ccl); if (c == 'W') notset (ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1600,14 +1538,14 @@ lex (struct dfa *dfa) dfa->lex.laststart = 
false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } return dfa->lex.lasttok = c; @@ -1627,11 +1565,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) { dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, sizeof *dfa->tokens); - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, sizeof *dfa->multibyte_prop); } - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->tokens[dfa->tindex++] = t; @@ -1668,7 +1606,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc); static void addtok (struct dfa *dfa, token t) { - if (dfa->multibyte && t == MBCSET) + if (dfa->localeinfo.multibyte && t == MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1]; @@ -1767,7 +1705,7 @@ add_utf8_anychar (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', c); } - dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c); + dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c); } /* A valid UTF-8 character is @@ -1851,7 +1789,7 @@ atom (struct dfa *dfa) dfa->parse.tok = lex (dfa); } - else if (dfa->parse.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. 
I'm @@ -1912,7 +1850,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) { size_t i; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else @@ -1998,7 +1936,7 @@ dfaparse (char const *s, size_t len, struct dfa *d) d->lex.lasttok = END; d->lex.laststart = true; d->lex.parens = 0; - if (d->multibyte) + if (d->localeinfo.multibyte) { d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); @@ -2187,7 +2125,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) @@ -2664,7 +2602,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) + else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR) { /* ANYCHAR must match a single character, so put it to D->states[s].mbps which contains the positions which can @@ -2810,7 +2748,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state; for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; + trans[i] = unibyte_word_constituent (d, i) ? 
state_letter : state; trans[d->syntax.eolbyte] = state_newline; } else @@ -2827,7 +2765,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) insert (d->follows[grps[i].elems[j]].elems[k], &follows); - if (d->multibyte) + if (d->localeinfo.multibyte) { /* If a token in follows.elems is not 1st byte of a multibyte character, or the states of follows must accept the bytes @@ -2860,7 +2798,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ - if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte)) + if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte)) { merge (&d->states[0].elems, &follows, &tmp); copy (&tmp, &follows); @@ -2916,7 +2854,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (c == d->syntax.eolbyte) trans[c] = state_newline; - else if (unibyte_word_constituent (c)) + else if (unibyte_word_constituent (d, c)) trans[c] = state_letter; else if (c < NOTCHAR) trans[c] = state; @@ -2957,7 +2895,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); d->success = xnrealloc (d->success, newalloc, sizeof *d->success); d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); - if (d->multibyte) + if (d->localeinfo.multibyte) { realtrans = d->mb_trans ? 
d->mb_trans - 1 : NULL; realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); @@ -2969,7 +2907,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) { d->trans[oldalloc] = NULL; d->fails[oldalloc] = NULL; - if (d->multibyte) + if (d->localeinfo.multibyte) d->mb_trans[oldalloc] = NULL; } } @@ -3003,7 +2941,7 @@ build_state (state_num s, struct dfa *d) } d->trcount = d->min_trcount; - if (d->multibyte) + if (d->localeinfo.multibyte) { for (i = d->min_trcount; i < d->tralloc; i++) { @@ -3454,7 +3392,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end, return (char *) begin; } -/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte), +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), but faster and set *BACKREF if the DFA code does not support this regexp usage. */ @@ -3512,7 +3450,7 @@ dfa_supported (struct dfa const *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (!d->multibyte) + if (!d->localeinfo.multibyte) continue; /* fallthrough */ @@ -3530,7 +3468,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!using_utf8) + if (!d->localeinfo.using_utf8) return; for (i = 0; i < d->tindex; ++i) @@ -3560,7 +3498,7 @@ dfaoptimize (struct dfa *d) } free_mbdata (d); - d->multibyte = false; + d->localeinfo.multibyte = false; d->dfaexec = dfaexec_sb; d->fast = true; } @@ -3575,7 +3513,7 @@ dfassbuild (struct dfa *d) struct dfa *sup = dfaalloc (); *sup = *d; - sup->multibyte = false; + sup->localeinfo.multibyte = false; sup->dfaexec = dfaexec_sb; sup->multibyte_prop = NULL; sup->mbcsets = NULL; @@ -3608,7 +3546,7 @@ dfassbuild (struct dfa *d) case BACKREF: zeroset (ccl); notset (ccl); - sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); + sup->tokens[j++] = CSET + charclass_index (sup, ccl); sup->tokens[j++] = STAR; if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR || d->tokens[i + 1] == PLUS) @@ -3619,7 +3557,7 @@ dfassbuild (struct dfa *d) case ENDWORD: 
case LIMWORD: case NOTLIMWORD: - if (d->multibyte) + if (d->localeinfo.multibyte) { /* These constraints aren't supported in a multibyte locale. Ignore them in the superset DFA. */ @@ -3636,7 +3574,7 @@ dfassbuild (struct dfa *d) } sup->tindex = j; - if (have_nchar && (have_achar || d->multibyte)) + if (have_nchar && (have_achar || d->localeinfo.multibyte)) d->superset = sup; else { @@ -3678,7 +3616,7 @@ dfafree (struct dfa *d) free (d->charclasses); free (d->tokens); - if (d->multibyte) + if (d->localeinfo.multibyte) free_mbdata (d); for (i = 0; i < d->sindex; ++i) @@ -4200,20 +4138,49 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xzalloc (sizeof *d); - d->multibyte = MB_CUR_MAX > 1; - d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; - d->fast = !d->multibyte; - d->lex.cur_mb_len = 1; - return d; + return xmalloc (sizeof (struct dfa)); } +/* Initialize DFA. */ void -dfa_init (void) +dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, + reg_syntax_t bits, bool fold, unsigned char eol) { - check_utf8 (); - check_unibyte_c (); - init_mbrtowc_cache (); + int i; + memset (dfa, 0, offsetof (struct dfa, dfaexec)); + dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; + dfa->simple_locale = using_simple_locale (linfo->multibyte); + dfa->localeinfo = *linfo; + + dfa->fast = !dfa->localeinfo.multibyte; + + dfa->lex.cur_mb_len = 1; + dfa->syntax.syntax_bits_set = true; + dfa->syntax.syntax_bits = bits; + dfa->syntax.case_fold = fold; + dfa->syntax.eolbyte = eol; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + + dfa->syntax.sbit[uc] = char_context (dfa, uc); + switch (dfa->syntax.sbit[uc]) + { + case CTX_LETTER: + setbit (uc, dfa->syntax.letters); + break; + case CTX_NEWLINE: + setbit (uc, dfa->syntax.newline); + break; + } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. 
*/ + dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 + ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); + } } /* vim:set shiftwidth=2: */ diff --git a/src/dfa.h b/src/dfa.h index 585390a7..31baf7a1 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -24,6 +24,8 @@ #include "xalloc.h" /* for _GL_ATTRIBUTE_MALLOC */ +struct localeinfo; /* See localeinfo.h. */ + /* Element of a list of strings, at least one of which is known to appear in any R.E. matching the DFA. */ struct dfamust @@ -44,17 +46,22 @@ struct dfa; calling dfafree() on it. */ extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC; +/* Initialize or reinitialize a DFA. This must be called before + any of the routines below. The arguments are: + 1. The DFA to operate on. + 2. Information about the current locale. + 3. The syntax bits described earlier in this file. + 4. The case-folding flag. + 5. The line terminator. */ +extern void dfasyntax (struct dfa *, struct localeinfo const *, + reg_syntax_t, bool, unsigned char); + /* Build and return the struct dfamust from the given struct dfa. */ extern struct dfamust *dfamust (struct dfa const *); /* Free the storage held by the components of a struct dfamust. */ extern void dfamustfree (struct dfamust *); -/* dfasyntax() takes four arguments; the first is the dfa to operate on, the - second sets the syntax bits described earlier in this file, the third sets - the case-folding flag, and the fourth specifies the line terminator. */ -extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char); - /* Compile the given string of the given length into the given struct dfa. Final argument is a flag specifying whether to build a searching or an exact matcher. */ @@ -99,8 +106,3 @@ extern void dfawarn (const char *); takes a single argument, a NUL-terminated string describing the error. The user must supply a dfaerror. 
*/ extern _Noreturn void dfaerror (const char *); - -extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE; - -/* This must be called before calling any of the above dfa*() functions. */ -extern void dfa_init (void); diff --git a/src/dfasearch.c b/src/dfasearch.c index 10c4f51b..c2e0177b 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -22,6 +22,8 @@ #include "intprops.h" #include "search.h" +struct localeinfo localeinfo; + /* Whether -w considers WC to be a word constituent. */ static bool wordchar (wint_t wc) @@ -128,7 +130,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) if (match_icase) syntax_bits |= RE_ICASE; re_set_syntax (syntax_bits); - dfasyntax (dfa, syntax_bits, match_icase, eolbyte); + dfasyntax (dfa, &localeinfo, syntax_bits, match_icase, eolbyte); /* For GNU regex, pass the patterns separately to detect errors like "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and @@ -277,7 +279,7 @@ EGexecute (char *buf, size_t size, size_t *match_size, if (exact_kwset_match) { - if (MB_CUR_MAX == 1 || dfa_using_utf8 ()) + if (MB_CUR_MAX == 1 || localeinfo.using_utf8) goto success; if (mb_start < beg) mb_start = beg; diff --git a/src/grep.c b/src/grep.c index 0c84b2a3..fc22c7b6 100644 --- a/src/grep.c +++ b/src/grep.c @@ -642,7 +642,7 @@ initialize_unibyte_mask (void) unsigned char mask = 0; int ms1b = 1; for (int i = 1; i <= UCHAR_MAX; i++) - if ((mbclen_cache[i] != 1) & ! (mask & i)) + if ((localeinfo.sbclen[i] != 1) & ! 
(mask & i)) { while (ms1b * 2 <= i) ms1b *= 2; @@ -2344,7 +2344,7 @@ main (int argc, char **argv) textdomain (PACKAGE); #endif - dfa_init (); + init_localeinfo (&localeinfo); atexit (clean_up_stdout); @@ -2726,7 +2726,6 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); - build_mbclen_cache (); initialize_unibyte_mask (); /* In a unibyte locale, switch from fgrep to grep if diff --git a/src/kwsearch.c b/src/kwsearch.c index 57fd4d77..508ebc5e 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -93,7 +93,7 @@ Fexecute (char *buf, size_t size, size_t *match_size, mb_check = longest = false; else { - mb_check = MB_CUR_MAX > 1 && !dfa_using_utf8 (); + mb_check = MB_CUR_MAX > 1 && !localeinfo.using_utf8; longest = mb_check | !!start_ptr | match_words; } diff --git a/src/localeinfo.c b/src/localeinfo.c new file mode 100644 index 00000000..329d4314 --- /dev/null +++ b/src/localeinfo.c @@ -0,0 +1,66 @@ +/* locale information + + Copyright 2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written by Paul Eggert. */ + +#include + +#include + +#include + +#include +#include +#include +#include + +/* The sbclen implementation relies on this. */ +verify (MB_LEN_MAX <= SCHAR_MAX); + +/* Return true if the locale uses UTF-8. 
*/ + +static bool +is_using_utf8 (void) +{ + wchar_t wc; + mbstate_t mbs = {0}; + return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; +} + +/* Initialize *LOCALEINFO from the current locale. */ + +void +init_localeinfo (struct localeinfo *localeinfo) +{ + int i; + + localeinfo->multibyte = MB_CUR_MAX > 1; + localeinfo->using_utf8 = is_using_utf8 (); + + for (i = CHAR_MIN; i <= CHAR_MAX; i++) + { + char c = i; + unsigned char uc = i; + mbstate_t s = {0}; + wchar_t wc; + size_t len = mbrtowc (&wc, &c, 1, &s); + localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len; + localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF; + } +} diff --git a/src/localeinfo.h b/src/localeinfo.h new file mode 100644 index 00000000..70b55a8d --- /dev/null +++ b/src/localeinfo.h @@ -0,0 +1,47 @@ +/* locale information + + Copyright 2016 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written by Paul Eggert. */ + +#include +#include +#include + +struct localeinfo +{ + /* MB_CUR_MAX > 1. */ + bool multibyte; + + /* The locale uses UTF-8. */ + bool using_utf8; + + /* An array indexed by byte values B that contains 1 if B is a + single-byte character, -1 if B is an encoding error, and -2 if B + is the leading byte of a multibyte character that contains more + than one byte. 
*/ + signed char sbclen[UCHAR_MAX + 1]; + + /* An array indexed by byte values B that contains the corresponding + wide character (if any) for B if sbclen[B] == 1. WEOF means the + byte is not a valid single-byte character, i.e., sbclen[B] == -1 + or -2. */ + wint_t sbctowc[UCHAR_MAX + 1]; +}; + +extern void init_localeinfo (struct localeinfo *); diff --git a/src/pcresearch.c b/src/pcresearch.c index 3f76603d..9ffa22a3 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -114,7 +114,7 @@ Pcompile (char const *pattern, size_t size) if (1 < MB_CUR_MAX) { - if (! dfa_using_utf8 ()) + if (! localeinfo.using_utf8) error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); multibyte_locale = true; @@ -254,7 +254,7 @@ Pexecute (char *buf, size_t size, size_t *match_size, /* Skip past bytes that are easily determined to be encoding errors, treating them as data that cannot match. This is faster than having pcre_exec check them. */ - while (mbclen_cache[to_uchar (*p)] == (size_t) -1) + while (localeinfo.sbclen[to_uchar (*p)] == -1) { p++; subject = p; diff --git a/src/search.h b/src/search.h index 7dc19408..431a67da 100644 --- a/src/search.h +++ b/src/search.h @@ -33,6 +33,7 @@ #include "dfa.h" #include "kwset.h" #include "xalloc.h" +#include "localeinfo.h" _GL_INLINE_HEADER_BEGIN #ifndef SEARCH_INLINE @@ -47,14 +48,12 @@ typedef signed char mb_len_map_t; /* searchutils.c */ extern void kwsinit (kwset_t *); - -extern void build_mbclen_cache (void); -extern size_t mbclen_cache[]; extern ptrdiff_t mb_goback (char const **, char const *, char const *); extern wint_t mb_prev_wc (char const *, char const *, char const *); extern wint_t mb_next_wc (char const *, char const *); /* dfasearch.c */ +extern struct localeinfo localeinfo; extern void GEAcompile (char const *, size_t, reg_syntax_t); extern size_t EGexecute (char *, size_t, size_t *, char const *); @@ -73,7 +72,7 @@ extern size_t Pexecute (char *, size_t, size_t *, char const *); SEARCH_INLINE size_t 
mb_clen (char const *s, size_t n, mbstate_t *mbs) { - size_t len = mbclen_cache[to_uchar (*s)]; + size_t len = localeinfo.sbclen[to_uchar (*s)]; return len == (size_t) -2 ? mbrlen (s, n, mbs) : len; } diff --git a/src/searchutils.c b/src/searchutils.c index d25e5f83..8081d418 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -24,8 +24,6 @@ #define NCHAR (UCHAR_MAX + 1) -size_t mbclen_cache[NCHAR]; - void kwsinit (kwset_t *kwset) { @@ -46,22 +44,6 @@ kwsinit (kwset_t *kwset) xalloc_die (); } -/* Initialize a cache of mbrlen values for each of its 1-byte inputs. */ -void -build_mbclen_cache (void) -{ - int i; - - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t mbs = { 0 }; - size_t len = mbrlen (&c, 1, &mbs); - mbclen_cache[uc] = len ? len : 1; - } -} - /* In the buffer *MB_START, return the number of bytes needed to go back from CUR to the previous boundary, where a "boundary" is the start of a multibyte character or is an error-encoding byte. The diff --git a/tests/Makefile.am b/tests/Makefile.am index 77502ca2..355f44e2 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -42,7 +42,7 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS) # Tell the linker to omit references to unused shared libraries. AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS) LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a -dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) $(LDADD) +dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) ../src/localeinfo.$(OBJEXT) $(LDADD) # The triple-backref test is expected to fail with both the system # matcher (i.e., with glibc) and with the included matcher. 
diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c index e6517352..e001b7de 100644 --- a/tests/dfa-match-aux.c +++ b/tests/dfa-match-aux.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "progname.h" @@ -47,17 +48,17 @@ main (int argc, char **argv) struct dfa *dfa; char *beg, *end, *p; int allow_nl; + struct localeinfo localeinfo; set_program_name (argv[0]); if (argc < 3) exit (EXIT_FAILURE); setlocale (LC_ALL, ""); - - dfa_init (); + init_localeinfo (&localeinfo); dfa = dfaalloc (); - dfasyntax (dfa, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n'); + dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n'); dfacomp (argv[1], strlen (argv[1]), dfa, 0); beg = argv[2]; -- cgit v1.2.1