diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2016-12-23 12:25:24 -0800 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2016-12-23 17:22:54 -0800 |
commit | 4fa1971d98c79b56b466eff57117351dc395ee2a (patch) | |
tree | 1b08b69d48e5fa9b2f2837c60eacdd956cbc27b1 /src/searchutils.c | |
parent | 4dd5274d6a8519d08ede792baafb0f9415cf4f9f (diff) | |
download | grep-4fa1971d98c79b56b466eff57117351dc395ee2a.tar.gz |
grep: specialize word-finding functions
This improves performance a bit.
* src/dfasearch.c, src/kwsearch.c (wordchar):
Remove; now in searchutils.c.
* src/grep.c (main): Call wordinit if -w.
* src/search.h: Adjust.
* src/searchutils.c: Include verify.h.
(word_start): New static var.
(wordchar): Move here from dfasearch.c and kwsearch.c.
(wordinit, wordchars_count, wordchar_next, wordchar_prev):
New functions.
(mb_prev_wc, mb_next_wc): Remove.
All callers changed to use the new functions instead.
Diffstat (limited to 'src/searchutils.c')
-rw-r--r-- | src/searchutils.c | 91 |
1 file changed, 72 insertions, 19 deletions
diff --git a/src/searchutils.c b/src/searchutils.c index deaab609..e0a1db33 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -22,6 +22,30 @@ #define SYSTEM_INLINE _GL_EXTERN_INLINE #include "search.h" +#include <verify.h> + +/* For each byte B, word_start[B] is 1 if B is a single-byte character + that is a word constituent, 0 if B cannot start a word constituent, + and -1 if B might be or might not be the start of a word + constituent. */ +static wint_t word_start[NCHAR]; +verify (WEOF != 0 && WEOF != 1); + +/* Whether -w considers WC to be a word constituent. */ +static bool +wordchar (wint_t wc) +{ + return wc == L'_' || iswalnum (wc); +} + +void +wordinit (void) +{ + for (int i = 0; i < NCHAR; i++) + word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF + : wordchar (localeinfo.sbctowc[i])); +} + kwset_t kwsinit (bool mb_trans) { @@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end) return p == cur ? 0 : cur - p0; } -/* In the buffer BUF, return the wide character that is encoded just - before CUR. The buffer ends at END. Return WEOF if there is no - wide character just before CUR. */ -wint_t -mb_prev_wc (char const *buf, char const *cur, char const *end) +/* Examine the start of BUF (of size SIZE) for word constituents. + If COUNTALL, examine as many as possible; otherwise, examine at most one. + Return the total number of bytes in the examined characters. 
*/ +static size_t +wordchars_count (char const *buf, char const *end, bool countall) { - if (cur == buf) - return WEOF; - char const *p = buf; - cur--; - cur -= mb_goback (&p, cur, end); - return mb_next_wc (cur, end); + size_t n = 0; + mbstate_t mbs = { 0 }; + while (n < end - buf) + { + wint_t ws = word_start[to_uchar (buf[n])]; + if (ws == 0) + break; + else if (ws == 1) + n++; + else + { + wchar_t wc = 0; + size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs); + if (!wordchar (wc)) + break; + n += wcbytes + !wcbytes; + } + if (!countall) + break; + } + return n; } -/* Return the wide character that is encoded at CUR. The buffer ends - at END. Return WEOF if there is no wide character encoded at CUR. */ -wint_t -mb_next_wc (char const *cur, char const *end) +/* If BUF starts with a word constituent, return the number of bytes + used to represent it; otherwise, return zero. The buffer ends at END. */ +size_t +wordchar_next (char const *buf, char const *end) { - wchar_t wc; - mbstate_t mbs = { 0 }; - return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 - ? wc : WEOF); + return wordchars_count (buf, end, false); +} + +/* In the buffer BUF, return true if the character whose encoding + contains the byte before CUR is a word constituent. The buffer + ends at END. */ +bool +wordchar_prev (char const *buf, char const *cur, char const *end) +{ + if (buf == cur) + return false; + cur--; + wint_t ws = word_start[to_uchar (*cur)]; + if (! localeinfo.multibyte) + return ws == 1; + char const *p = buf; + cur -= mb_goback (&p, cur, end); + return wordchar_next (cur, end) != 0; } |