diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2016-12-23 12:25:24 -0800 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2016-12-23 17:22:54 -0800 |
commit | 4fa1971d98c79b56b466eff57117351dc395ee2a (patch) | |
tree | 1b08b69d48e5fa9b2f2837c60eacdd956cbc27b1 | |
parent | 4dd5274d6a8519d08ede792baafb0f9415cf4f9f (diff) | |
download | grep-4fa1971d98c79b56b466eff57117351dc395ee2a.tar.gz |
grep: specialize word-finding functions
This improves performance a bit.
* src/dfasearch.c, src/kwsearch.c (wordchar):
Remove; now in searchutils.c.
* src/grep.c (main): Call wordinit if -w.
* src/search.h: Adjust.
* src/searchutils.c: Include verify.h.
(word_start): New static var.
(wordchar): Move here from dfasearch.c and kwsearch.c.
(wordinit, wordchars_count, wordchar_next, wordchar_prev):
New functions.
(mb_prev_wc, mb_next_wc): Remove.
All callers changed to use the new functions instead.
-rw-r--r-- | src/dfasearch.c | 11 | ||||
-rw-r--r-- | src/grep.c | 1 | ||||
-rw-r--r-- | src/kwsearch.c | 11 | ||||
-rw-r--r-- | src/search.h | 5 | ||||
-rw-r--r-- | src/searchutils.c | 91 |
5 files changed, 80 insertions, 39 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c index 24a36cd5..87e1f7e5 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -26,13 +26,6 @@ struct localeinfo localeinfo; -/* Whether -w considers WC to be a word constituent. */ -static bool -wordchar (wint_t wc) -{ - return wc == L'_' || iswalnum (wc); -} - /* KWset compiled pattern. For Ecompile and Gcompile, we compile a list of strings, at least one of which is known to occur in any string matching the regexp. */ @@ -394,8 +387,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size, while (match <= best_match) { regoff_t shorter_len = 0; - if (!wordchar (mb_prev_wc (beg, match, end - 1)) - && !wordchar (mb_next_wc (match + len, end - 1))) + if (! wordchar_next (match + len, end - 1) + && ! wordchar_prev (beg, match, end - 1)) goto assess_pattern_match; if (len > 0) { @@ -2651,6 +2651,7 @@ main (int argc, char **argv) break; case 'w': + wordinit (); match_words = true; break; diff --git a/src/kwsearch.c b/src/kwsearch.c index 5596ebdd..b30dfd06 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -21,13 +21,6 @@ #include <config.h> #include "search.h" -/* Whether -w considers WC to be a word constituent. */ -static bool -wordchar (wint_t wc) -{ - return wc == L'_' || iswalnum (wc); -} - /* KWset compiled pattern. For Ecompile and Gcompile, we compile a list of strings, at least one of which is known to occur in any string matching the regexp. */ @@ -140,10 +133,10 @@ Fexecute (char const *buf, size_t size, size_t *match_size, char const *bol = memrchr (mb_start, eol, beg - mb_start); if (bol) mb_start = bol + 1; - if (! wordchar (mb_prev_wc (mb_start, beg, buf + size))) + if (! wordchar_prev (mb_start, beg, buf + size)) for (;;) { - if (! wordchar (mb_next_wc (beg + len, buf + size))) + if (! 
wordchar_next (beg + len, buf + size)) { if (start_ptr) goto success_in_beg_and_len; diff --git a/src/search.h b/src/search.h index 1ff5be22..6fe17975 100644 --- a/src/search.h +++ b/src/search.h @@ -46,10 +46,11 @@ _GL_INLINE_HEADER_BEGIN typedef signed char mb_len_map_t; /* searchutils.c */ +extern void wordinit (void); extern kwset_t kwsinit (bool); +extern size_t wordchar_next (char const *, char const *); +extern bool wordchar_prev (char const *, char const *, char const *); extern ptrdiff_t mb_goback (char const **, char const *, char const *); -extern wint_t mb_prev_wc (char const *, char const *, char const *); -extern wint_t mb_next_wc (char const *, char const *); /* dfasearch.c */ extern struct localeinfo localeinfo; diff --git a/src/searchutils.c b/src/searchutils.c index deaab609..e0a1db33 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -22,6 +22,30 @@ #define SYSTEM_INLINE _GL_EXTERN_INLINE #include "search.h" +#include <verify.h> + +/* For each byte B, word_start[B] is 1 if B is a single-byte character + that is a word constituent, 0 if B cannot start a word constituent, + and -1 if B might be or might not be the start of a word + constituent. */ +static wint_t word_start[NCHAR]; +verify (WEOF != 0 && WEOF != 1); + +/* Whether -w considers WC to be a word constituent. */ +static bool +wordchar (wint_t wc) +{ + return wc == L'_' || iswalnum (wc); +} + +void +wordinit (void) +{ + for (int i = 0; i < NCHAR; i++) + word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF + : wordchar (localeinfo.sbctowc[i])); +} + kwset_t kwsinit (bool mb_trans) { @@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end) return p == cur ? 0 : cur - p0; } -/* In the buffer BUF, return the wide character that is encoded just - before CUR. The buffer ends at END. Return WEOF if there is no - wide character just before CUR. 
*/ -wint_t -mb_prev_wc (char const *buf, char const *cur, char const *end) +/* Examine the start of BUF (which ends at END) for word constituents. + If COUNTALL, examine as many as possible; otherwise, examine at most one. + Return the total number of bytes in the examined characters. */ +static size_t +wordchars_count (char const *buf, char const *end, bool countall) { - if (cur == buf) - return WEOF; - char const *p = buf; - cur--; - cur -= mb_goback (&p, cur, end); - return mb_next_wc (cur, end); + size_t n = 0; + mbstate_t mbs = { 0 }; + while (n < end - buf) + { + wint_t ws = word_start[to_uchar (buf[n])]; + if (ws == 0) + break; + else if (ws == 1) + n++; + else + { + wchar_t wc = 0; + size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs); + if (!wordchar (wc)) + break; + n += wcbytes + !wcbytes; + } + if (!countall) + break; + } + return n; } -/* Return the wide character that is encoded at CUR. The buffer ends - at END. Return WEOF if there is no wide character encoded at CUR. */ -wint_t -mb_next_wc (char const *cur, char const *end) +/* If BUF starts with a word constituent, return the number of bytes + used to represent it; otherwise, return zero. The buffer ends at END. */ +size_t +wordchar_next (char const *buf, char const *end) { - wchar_t wc; - mbstate_t mbs = { 0 }; - return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 - ? wc : WEOF); + return wordchars_count (buf, end, false); +} + +/* In the buffer BUF, return true if the character whose encoding + contains the byte before CUR is a word constituent. The buffer + ends at END. */ +bool +wordchar_prev (char const *buf, char const *cur, char const *end) +{ + if (buf == cur) + return false; + cur--; + wint_t ws = word_start[to_uchar (*cur)]; + if (! localeinfo.multibyte) + return ws == 1; + char const *p = buf; + cur -= mb_goback (&p, cur, end); + return wordchar_next (cur, end) != 0; } |