summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2016-12-23 12:25:24 -0800
committerPaul Eggert <eggert@cs.ucla.edu>2016-12-23 17:22:54 -0800
commit4fa1971d98c79b56b466eff57117351dc395ee2a (patch)
tree1b08b69d48e5fa9b2f2837c60eacdd956cbc27b1
parent4dd5274d6a8519d08ede792baafb0f9415cf4f9f (diff)
downloadgrep-4fa1971d98c79b56b466eff57117351dc395ee2a.tar.gz
grep: specialize word-finding functions
This improves performance a bit. * src/dfasearch.c, src/kwsearch.c (wordchar): Remove; now in searchutils.c. * src/grep.c (main): Call wordinit if -w. * src/search.h: Adjust. * src/searchutils.c: Include verify.h. (word_start): New static var. (wordchar): Move here from dfasearch.c and kwsearch.c. (wordinit, wordchars_count, wordchar_next, wordchar_prev): New functions. (mb_prev_wc, mb_next_wc): Remove. All callers changed to use the new functions instead.
-rw-r--r--src/dfasearch.c11
-rw-r--r--src/grep.c1
-rw-r--r--src/kwsearch.c11
-rw-r--r--src/search.h5
-rw-r--r--src/searchutils.c91
5 files changed, 80 insertions, 39 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 24a36cd5..87e1f7e5 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -26,13 +26,6 @@
struct localeinfo localeinfo;
-/* Whether -w considers WC to be a word constituent. */
-static bool
-wordchar (wint_t wc)
-{
- return wc == L'_' || iswalnum (wc);
-}
-
/* KWset compiled pattern. For Ecompile and Gcompile, we compile
a list of strings, at least one of which is known to occur in
any string matching the regexp. */
@@ -394,8 +387,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
while (match <= best_match)
{
regoff_t shorter_len = 0;
- if (!wordchar (mb_prev_wc (beg, match, end - 1))
- && !wordchar (mb_next_wc (match + len, end - 1)))
+ if (! wordchar_next (match + len, end - 1)
+ && ! wordchar_prev (beg, match, end - 1))
goto assess_pattern_match;
if (len > 0)
{
diff --git a/src/grep.c b/src/grep.c
index 3729ae0e..f9d1d86c 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2651,6 +2651,7 @@ main (int argc, char **argv)
break;
case 'w':
+ wordinit ();
match_words = true;
break;
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5596ebdd..b30dfd06 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,13 +21,6 @@
#include <config.h>
#include "search.h"
-/* Whether -w considers WC to be a word constituent. */
-static bool
-wordchar (wint_t wc)
-{
- return wc == L'_' || iswalnum (wc);
-}
-
/* KWset compiled pattern. For Ecompile and Gcompile, we compile
a list of strings, at least one of which is known to occur in
any string matching the regexp. */
@@ -140,10 +133,10 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
char const *bol = memrchr (mb_start, eol, beg - mb_start);
if (bol)
mb_start = bol + 1;
- if (! wordchar (mb_prev_wc (mb_start, beg, buf + size)))
+ if (! wordchar_prev (mb_start, beg, buf + size))
for (;;)
{
- if (! wordchar (mb_next_wc (beg + len, buf + size)))
+ if (! wordchar_next (beg + len, buf + size))
{
if (start_ptr)
goto success_in_beg_and_len;
diff --git a/src/search.h b/src/search.h
index 1ff5be22..6fe17975 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,10 +46,11 @@ _GL_INLINE_HEADER_BEGIN
typedef signed char mb_len_map_t;
/* searchutils.c */
+extern void wordinit (void);
extern kwset_t kwsinit (bool);
+extern size_t wordchar_next (char const *, char const *);
+extern bool wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
-extern wint_t mb_prev_wc (char const *, char const *, char const *);
-extern wint_t mb_next_wc (char const *, char const *);
/* dfasearch.c */
extern struct localeinfo localeinfo;
diff --git a/src/searchutils.c b/src/searchutils.c
index deaab609..e0a1db33 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,6 +22,30 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
+#include <verify.h>
+
+/* For each byte B, word_start[B] is 1 if B is a single-byte character
+ that is a word constituent, 0 if B cannot start a word constituent,
+ and WEOF if B might be or might not be the start of a word
+ constituent. */
+static wint_t word_start[NCHAR];
+verify (WEOF != 0 && WEOF != 1);
+
+/* Whether -w considers WC to be a word constituent. */
+static bool
+wordchar (wint_t wc)
+{
+ return wc == L'_' || iswalnum (wc);
+}
+
+void
+wordinit (void)
+{
+ for (int i = 0; i < NCHAR; i++)
+ word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
+ : wordchar (localeinfo.sbctowc[i]));
+}
+
kwset_t
kwsinit (bool mb_trans)
{
@@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
return p == cur ? 0 : cur - p0;
}
-/* In the buffer BUF, return the wide character that is encoded just
- before CUR. The buffer ends at END. Return WEOF if there is no
- wide character just before CUR. */
-wint_t
-mb_prev_wc (char const *buf, char const *cur, char const *end)
+/* Examine the start of BUF (which goes to END) for word constituents.
+ If COUNTALL, examine as many as possible; otherwise, examine at most one.
+ Return the total number of bytes in the examined characters. */
+static size_t
+wordchars_count (char const *buf, char const *end, bool countall)
{
- if (cur == buf)
- return WEOF;
- char const *p = buf;
- cur--;
- cur -= mb_goback (&p, cur, end);
- return mb_next_wc (cur, end);
+ size_t n = 0;
+ mbstate_t mbs = { 0 };
+ while (n < end - buf)
+ {
+ wint_t ws = word_start[to_uchar (buf[n])];
+ if (ws == 0)
+ break;
+ else if (ws == 1)
+ n++;
+ else
+ {
+ wchar_t wc = 0;
+ size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
+ if (!wordchar (wc))
+ break;
+ n += wcbytes + !wcbytes;
+ }
+ if (!countall)
+ break;
+ }
+ return n;
}
-/* Return the wide character that is encoded at CUR. The buffer ends
- at END. Return WEOF if there is no wide character encoded at CUR. */
-wint_t
-mb_next_wc (char const *cur, char const *end)
+/* If BUF starts with a word constituent, return the number of bytes
+ used to represent it; otherwise, return zero. The buffer ends at END. */
+size_t
+wordchar_next (char const *buf, char const *end)
{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2
- ? wc : WEOF);
+ return wordchars_count (buf, end, false);
+}
+
+/* In the buffer BUF, return true if the character whose encoding
+ contains the byte before CUR is a word constituent. The buffer
+ ends at END. */
+bool
+wordchar_prev (char const *buf, char const *cur, char const *end)
+{
+ if (buf == cur)
+ return false;
+ cur--;
+ wint_t ws = word_start[to_uchar (*cur)];
+ if (! localeinfo.multibyte)
+ return ws == 1;
+ char const *p = buf;
+ cur -= mb_goback (&p, cur, end);
+ return wordchar_next (cur, end) != 0;
}