summary | refs | log | tree | commit | diff
path: root/src/searchutils.c
diff options
context:
space:
mode:
author: Norihiro Tanaka <noritnk@kcn.ne.jp> 2014-01-30 12:56:04 -0800
committer: Jim Meyering <meyering@fb.com> 2014-02-09 21:00:07 -0800
commit: b75ce6f7c611cb98549dc736947198e812b587c4 (patch)
tree: 268283bcc9b69b7bd62209593b48404b79da8507 /src/searchutils.c
parent: c5cb52ecb97af4bf052e1c1366b8eb93a54ba6a0 (diff)
download: grep-b75ce6f7c611cb98549dc736947198e812b587c4.tar.gz
speed up mb-boundary-detection after each preliminary match
After each kwsexec or dfaexec match, we must determine whether
the tentative match falls in the middle of a multi-byte character.
That is what our is_mb_middle function does, but it was expensive,
even when most input consisted of single-byte characters.
The main cost was for each call to mbrlen. This change constructs
and uses a cache of the lengths returned by mbrlen for unibyte values.
The largest speed-up (3x to 7x, CPU-dependent) is when most lines
contain a match, yet few are printed, e.g., when using
grep -v common-pattern ... to filter out all but a few lines.

* src/search.h (build_mbclen_cache): Declare it.
* src/main.c: Include "search.h".
[MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
* src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
(mbclen_cache): New global.
(build_mbclen_cache): New function.
(is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
* NEWS (Improvements): Mention it.
Diffstat (limited to 'src/searchutils.c')
-rw-r--r-- src/searchutils.c 46
1 files changed, 42 insertions, 4 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad0..34784178 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,14 @@
#include <config.h>
#include <assert.h>
#include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
#define NCHAR (UCHAR_MAX + 1)
+static size_t mbclen_cache[NCHAR];
+
void
kwsinit (kwset_t *kwset)
{
@@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
return out;
}
+/* Initialize a cache of mbrlen values for each of its 1-byte inputs. */
+void
+build_mbclen_cache (void)
+{
+ int i;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ char c = i;
+ unsigned char uc = i;
+ mbstate_t mbs = { 0 };
+ mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+ }
+}
bool
is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end,
const char *p = *good;
const char *prev = p;
mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+ static int is_utf8 = -1;
+
+ if (is_utf8 == -1)
+ is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+
+ if (is_utf8 && buf - p > MB_CUR_MAX)
+ {
+ for (p = buf; buf - p > MB_CUR_MAX; p--)
+ if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
+ break;
+
+ if (buf - p == MB_CUR_MAX)
+ p = buf;
+ }
+#endif
+
+ memset (&cur_state, 0, sizeof cur_state);
- /* TODO: can be optimized for UTF-8. */
- memset(&cur_state, 0, sizeof(mbstate_t));
while (p < buf)
{
- size_t mbclen = mbrlen(p, end - p, &cur_state);
+ size_t mbclen = mbclen_cache[to_uchar (*p)];
+
+ if (mbclen == (size_t) -2)
+ mbclen = mbrlen (p, end - p, &cur_state);
/* Store the beginning of the previous complete multibyte character. */
if (mbclen != (size_t) -2)
@@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end,
/* An invalid sequence, or a truncated multibyte character.
We treat it as a single byte character. */
mbclen = 1;
- memset(&cur_state, 0, sizeof cur_state);
+ memset (&cur_state, 0, sizeof cur_state);
}
p += mbclen;
}