summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNorihiro Tanaka <noritnk@kcn.ne.jp>2016-09-01 11:45:18 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2016-09-01 11:45:53 -0700
commit2d65987386aebcb47fa339930e1c7850c92c2b5e (patch)
treed5274419488e876a9ec864cdeb8c5b2f802c756c
parent1e166fdaa69902793057ed2a63408f51c3b4273b (diff)
downloadgrep-2d65987386aebcb47fa339930e1c7850c92c2b5e.tar.gz
dfa: use single-byte algorithm even in non-UTF-8
Even in non-UTF8 locales, if the current input character is single byte, we can use CSET to match ANYCHAR. * src/dfa.c (struct dfa): New member canychar. Cache index of CSET for ANYCHAR. (lex): Make CSET for ANYCHAR. (state_index): Simplify. (dfastate): Consider CSET for ANYCHAR. (transit_state_singlebyte, transit_state): Remove handling for eolbyte, as we assume that eolbyte does not appear at current position. (dfaexec_main): Use algorithm for single byte character to any single byte character in input text always. (dfasyntax): Initialize canychar.
-rw-r--r--src/dfa.c115
1 files changed, 45 insertions, 70 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 0f5109e2..e39d82a7 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -401,6 +401,7 @@ struct dfa
charclass *charclasses; /* Array of character sets for CSET tokens. */
size_t cindex; /* Index for adding new charclasses. */
size_t calloc; /* Number of charclasses allocated. */
+ size_t canychar; /* Index of anychar class, or (size_t) -1. */
/* Scanner state */
struct lexer_state lex;
@@ -1398,21 +1399,24 @@ lex (struct dfa *dfa)
case '.':
if (backslash)
goto normal_char;
- if (dfa->localeinfo.multibyte)
+ if (dfa->canychar == (size_t) -1)
{
- /* In multibyte environment period must match with a single
- character not a byte. So we use ANYCHAR. */
- dfa->lex.laststart = false;
- return dfa->lex.lasttok = ANYCHAR;
+ zeroset (ccl);
+ notset (ccl);
+ if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
+ clrbit ('\n', ccl);
+ if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
+ clrbit ('\0', ccl);
+ if (dfa->localeinfo.multibyte)
+ for (c2 = 0; c2 < NOTCHAR; c2++)
+ if (dfa->localeinfo.sbctowc[c2] == WEOF)
+ clrbit (c2, ccl);
+ dfa->canychar = charclass_index (dfa, ccl);
}
- zeroset (ccl);
- notset (ccl);
- if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
- clrbit ('\n', ccl);
- if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
- clrbit ('\0', ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = (dfa->localeinfo.multibyte
+ ? ANYCHAR
+ : CSET + dfa->canychar);
case 's':
case 'S':
@@ -2077,7 +2081,7 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+ if (d->tokens[s->elems[j].index] == ANYCHAR)
{
int acceptable
= ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2554,13 +2558,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->tokens[pos.index] == ANYCHAR)
{
- /* ANYCHAR must match a single character, so put it to
- D->states[s].mbps which contains the positions which can
- match with a single character not a byte. If all
- positions with ANYCHAR do not depend on the context of
- the next character, put its follows instead to
+ copyset (d->charclasses[d->canychar], matches);
+
+ /* ANYCHAR must match with a single character, so we must put
+ it to D->states[s].mbps which contains the positions which
+ can match with a single character not a byte. If all
+ positions which has ANYCHAR does not depend on context of
+ next character, we put the follows instead of it to
D->states[s].mbps to optimize. */
if (d->states[s].curr_dependent)
{
@@ -2572,7 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
d->states[s].context, CTX_ANY))
{
if (d->states[s].mbps.nelem == 0)
- alloc_position_set (&d->states[s].mbps, 1);
+ alloc_position_set (&d->states[s].mbps,
+ d->follows[pos.index].nelem);
for (j = 0; j < d->follows[pos.index].nelem; j++)
insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
}
@@ -2950,16 +2957,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
{
state_num *t;
- if (**pp == d->syntax.eolbyte)
- {
- /* S is always an initial state in transit_state, so the
- transition table for the state must have been built already. */
- assert (d->trans[s] || d->fails[s]);
-
- ++*pp;
- return d->newlines[s];
- }
-
if (d->trans[s])
t = d->trans[s];
else if (d->fails[s])
@@ -2986,15 +2983,12 @@ static state_num
transit_state (struct dfa *d, state_num s, unsigned char const **pp,
unsigned char const *end)
{
- state_num s1;
+ state_num s1, s2;
wint_t wc;
int separate_contexts;
- state_num state, state_newline, mb_index;
size_t i, j;
int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
- int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE;
- bool context_newline = context == CTX_NEWLINE;
/* This state has some operators which can match a multibyte character. */
d->mb_follows.nelem = 0;
@@ -3016,7 +3010,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
for (i = 0; i < d->states[s1].mbps.nelem; i++)
{
if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
- d->states[s1].context, context))
+ d->states[s1].context, CTX_NONE))
continue;
for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
j++)
@@ -3025,10 +3019,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
}
separate_contexts = state_separate_contexts (&d->mb_follows);
- if (context_newline && separate_contexts & CTX_NEWLINE)
- s = state_index (d, &d->mb_follows, CTX_NEWLINE);
- else
- s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+ s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
realloc_trans_if_necessary (d, s);
return s;
@@ -3041,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
{
if (MAX_TRCOUNT <= d->mb_trcount)
{
- state_num s2;
- for (s2 = -1; s2 < d->tralloc; s2++)
+ state_num s3;
+ for (s3 = -1; s3 < d->tralloc; s3++)
{
- free (d->mb_trans[s2]);
- d->mb_trans[s2] = NULL;
+ free (d->mb_trans[s3]);
+ d->mb_trans[s3] = NULL;
}
for (i = 0; i < d->sindex; i++)
@@ -3055,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
d->states[s1].mb_trindex = d->mb_trcount++;
}
- mb_index = d->states[s1].mb_trindex * 2;
-
if (! d->mb_trans[s])
{
enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
- enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+ enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
- for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+ for (i = 0; i < MAX_TRCOUNT; i++)
d->mb_trans[s][i] = -1;
}
- else
- {
- state = d->mb_trans[s][mb_index + context_newline];
- if (0 <= state)
- return state;
- }
+ else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+ return d->mb_trans[s][d->states[s1].mb_trindex];
if (s < 0)
copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3078,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
separate_contexts = state_separate_contexts (&d->mb_follows);
- state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
- if (separate_contexts & CTX_NEWLINE)
- state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
- else
- state_newline = state;
- realloc_trans_if_necessary (d, state_newline);
+ s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+ realloc_trans_if_necessary (d, s2);
- d->mb_trans[s][mb_index] = state;
- d->mb_trans[s][mb_index + 1] = state_newline;
+ d->mb_trans[s][d->states[s1].mb_trindex] = s2;
- return context_newline ? state_newline : state;
+ return s2;
}
/* The initial state may encounter a byte which is not a single byte character
@@ -3215,10 +3195,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
}
- if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
- || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
- || (char *) p >= end)
+ if (d->states[s].mbps.nelem == 0
+ || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
{
/* If an input character does not match ANYCHAR, do it
like a single-byte character. */
@@ -3227,8 +3205,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
else
{
s = transit_state (d, s, &p, (unsigned char *) end);
- if (s >= 0 && p[-1] == eol)
- nlcount++;
mbp = p;
trans = d->trans;
}
@@ -3297,8 +3273,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
else
{
s = transit_state (d, s, &p, (unsigned char *) end);
- if (s >= 0 && p[-1] == eol)
- nlcount++;
mbp = p;
trans = d->trans;
}
@@ -4106,6 +4080,7 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
dfa->fast = !dfa->localeinfo.multibyte;
+ dfa->canychar = -1;
dfa->lex.cur_mb_len = 1;
dfa->syntax.syntax_bits_set = true;
dfa->syntax.syntax_bits = bits;