diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2016-05-01 22:56:39 -0700 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2016-05-01 22:56:39 -0700 |
commit | 724ac5564bc8431b28a37e556a2adf27d7b3c74d (patch) | |
tree | 231f0b0cb99d5b6a2628728caeb86f22e0fc4ede /src | |
parent | af6af288eac28951b5eee1eaaf373e22b2193b7b (diff) | |
download | grep-724ac5564bc8431b28a37e556a2adf27d7b3c74d.tar.gz |
dfa: speed up checking for character boundary
This should improve performance in gawk; the effect on grep is expected to be small.
Suggested by Norihiro Tanaka in: http://bugs.gnu.org/18777
* src/dfa.c (never_trail): New static var.
(dfasyntax): Initialize it.
(skip_remains_mb): Use it to speed up a common case in gawk.
Diffstat (limited to 'src')
-rw-r--r-- | src/dfa.c | 20 |
1 file changed, 17 insertions, 3 deletions
@@ -651,6 +651,10 @@ static unsigned char eolbyte; /* Cache of char-context values. */ static int sbit[NOTCHAR]; +/* If never_trail[B], the byte B cannot be a non-initial byte in a + multibyte character. */ +static bool never_trail[NOTCHAR]; + /* Set of characters considered letters. */ static charclass letters; @@ -712,6 +716,11 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) setbit (uc, newline); break; } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. */ + never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); } } @@ -3159,15 +3168,20 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, that are not a single byte character nor the first byte of a multibyte character. - Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or - exceeds P. If WCP is non-NULL, set *WCP to the final wide character - processed, or if no wide character is processed, set it to WEOF. + Given DFA state d, use mbs_to_wchar to advance MBP until it reaches + or exceeds P, and return the advanced MBP. If WCP is non-NULL and + the result is greater than P, set *WCP to the final wide character + processed, or to WEOF if no wide character is processed. Otherwise, + if WCP is non-NULL, *WCP may or may not be updated. + Both P and MBP must be no larger than END. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, unsigned char const *mbp, char const *end, wint_t *wcp) { wint_t wc = WEOF; + if (never_trail[*p]) + return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); |