From b75ce6f7c611cb98549dc736947198e812b587c4 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 30 Jan 2014 12:56:04 -0800 Subject: speed up mb-boundary-detection after each preliminary match After each kwsexec or dfaexec match, we must determine whether the tentative match falls in the middle of a multi-byte character. That is what our is_mb_middle function does, but it was expensive, even when most input consisted of single-byte characters. The main cost was for each call to mbrlen. This change constructs and uses a cache of the lengths returned by mbrlen for unibyte values. The largest speed-up (3x to 7x, CPU-dependent) is when most lines contain a match, yet few are printed, e.g., when using grep -v common-pattern ... to filter out all but a few lines. * src/search.h (build_mbclen_cache): Declare it. * src/main.c: Include "search.h". [MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale. * src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>. (mbclen_cache): New global. (build_mbclen_cache): New function. (is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it. * NEWS (Improvements): Mention it. --- src/searchutils.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'src/searchutils.c') diff --git a/src/searchutils.c b/src/searchutils.c index 778f4ad0..34784178 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -19,9 +19,14 @@ #include <config.h> #include <assert.h> #include "search.h" +#if HAVE_LANGINFO_CODESET +# include <langinfo.h> +#endif #define NCHAR (UCHAR_MAX + 1) +static size_t mbclen_cache[NCHAR]; + void kwsinit (kwset_t *kwset) { @@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) return out; } +/* Initialize a cache of mbrlen values for each of its 1-byte inputs. 
*/ +void +build_mbclen_cache (void) +{ + int i; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t mbs = { 0 }; + mbclen_cache[uc] = mbrlen (&c, 1, &mbs); + } +} bool is_mb_middle (const char **good, const char *buf, const char *end, @@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; +#if HAVE_LANGINFO_CODESET + static int is_utf8 = -1; + + if (is_utf8 == -1) + is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); + + if (is_utf8 && buf - p > MB_CUR_MAX) + { + for (p = buf; buf - p > MB_CUR_MAX; p--) + if (mbclen_cache[to_uchar (*p)] != (size_t) -1) + break; + + if (buf - p == MB_CUR_MAX) + p = buf; + } +#endif + + memset (&cur_state, 0, sizeof cur_state); - /* TODO: can be optimized for UTF-8. */ - memset(&cur_state, 0, sizeof(mbstate_t)); while (p < buf) { - size_t mbclen = mbrlen(p, end - p, &cur_state); + size_t mbclen = mbclen_cache[to_uchar (*p)]; + + if (mbclen == (size_t) -2) + mbclen = mbrlen (p, end - p, &cur_state); /* Store the beginning of the previous complete multibyte character. */ if (mbclen != (size_t) -2) @@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end, /* An invalid sequence, or a truncated multibyte character. We treat it as a single byte character. */ mbclen = 1; - memset(&cur_state, 0, sizeof cur_state); + memset (&cur_state, 0, sizeof cur_state); } p += mbclen; } -- cgit v1.2.1