diff options
author | Norihiro Tanaka <noritnk@kcn.ne.jp> | 2014-01-30 12:56:04 -0800 |
---|---|---|
committer | Jim Meyering <meyering@fb.com> | 2014-02-09 21:00:07 -0800 |
commit | b75ce6f7c611cb98549dc736947198e812b587c4 (patch) | |
tree | 268283bcc9b69b7bd62209593b48404b79da8507 /src/searchutils.c | |
parent | c5cb52ecb97af4bf052e1c1366b8eb93a54ba6a0 (diff) | |
download | grep-b75ce6f7c611cb98549dc736947198e812b587c4.tar.gz |
speed up mb-boundary-detection after each preliminary match
After each kwsexec or dfaexec match, we must determine whether
the tentative match falls in the middle of a multi-byte character.
That is what our is_mb_middle function does, but it was expensive,
even when most input consisted of single-byte characters. The main
cost was the call to mbrlen made for each character. This change builds
and uses a cache of the lengths mbrlen returns for each 1-byte input.
The largest speed-up (3x to 7x, CPU-dependent) is when most
lines contain a match, yet few are printed, e.g., when using
grep -v common-pattern ... to filter out all but a few lines.
* src/search.h (build_mbclen_cache): Declare it.
* src/main.c: Include "search.h".
[MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
* src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
(mbclen_cache): New global.
(build_mbclen_cache): New function.
(is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
* NEWS (Improvements): Mention it.
Diffstat (limited to 'src/searchutils.c')
-rw-r--r-- | src/searchutils.c | 46 |
1 file changed, 42 insertions, 4 deletions
diff --git a/src/searchutils.c b/src/searchutils.c index 778f4ad0..34784178 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -19,9 +19,14 @@ #include <config.h> #include <assert.h> #include "search.h" +#if HAVE_LANGINFO_CODESET +# include <langinfo.h> +#endif #define NCHAR (UCHAR_MAX + 1) +static size_t mbclen_cache[NCHAR]; + void kwsinit (kwset_t *kwset) { @@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) return out; } +/* Initialize a cache of mbrlen values for each of its 1-byte inputs. */ +void +build_mbclen_cache (void) +{ + int i; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t mbs = { 0 }; + mbclen_cache[uc] = mbrlen (&c, 1, &mbs); + } +} bool is_mb_middle (const char **good, const char *buf, const char *end, @@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; +#if HAVE_LANGINFO_CODESET + static int is_utf8 = -1; + + if (is_utf8 == -1) + is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); + + if (is_utf8 && buf - p > MB_CUR_MAX) + { + for (p = buf; buf - p > MB_CUR_MAX; p--) + if (mbclen_cache[to_uchar (*p)] != (size_t) -1) + break; + + if (buf - p == MB_CUR_MAX) + p = buf; + } +#endif + + memset (&cur_state, 0, sizeof cur_state); - /* TODO: can be optimized for UTF-8. */ - memset(&cur_state, 0, sizeof(mbstate_t)); while (p < buf) { - size_t mbclen = mbrlen(p, end - p, &cur_state); + size_t mbclen = mbclen_cache[to_uchar (*p)]; + + if (mbclen == (size_t) -2) + mbclen = mbrlen (p, end - p, &cur_state); /* Store the beginning of the previous complete multibyte character. */ if (mbclen != (size_t) -2) @@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end, /* An invalid sequence, or a truncated multibyte character. We treat it as a single byte character. 
*/ mbclen = 1; - memset(&cur_state, 0, sizeof cur_state); + memset (&cur_state, 0, sizeof cur_state); } p += mbclen; } |