speed up mb-boundary-detection after each preliminary match

After each kwsexec or dfaexec match, we must determine whether the tentative match falls in the middle of a multi-byte character. That is what our is_mb_middle function does, but it was expensive, even when most input consisted of single-byte characters. The main cost was for each call to mbrlen. This change constructs and uses a cache of the lengths returned by mbrlen for unibyte values. The largest speed-up (3x to 7x, CPU-dependent) is when most lines contain a match, yet few are printed, e.g., when using grep -v common-pattern ... to filter out all but a few lines.
* src/search.h (build_mbclen_cache): Declare it.
* src/main.c: Include "search.h".
[MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
* src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
(mbclen_cache): New global.
(build_mbclen_cache): New function.
(is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
* NEWS (Improvements): Mention it.
author: Norihiro Tanaka <noritnk@kcn.ne.jp> 2014-01-30 12:56:04 -0800
committer: Jim Meyering <meyering@fb.com> 2014-02-09 21:00:07 -0800
commit: b75ce6f7c611cb98549dc736947198e812b587c4 (patch)
tree: 268283bcc9b69b7bd62209593b48404b79da8507 /src/searchutils.c
parent: c5cb52ecb97af4bf052e1c1366b8eb93a54ba6a0 (diff)
download: grep-b75ce6f7c611cb98549dc736947198e812b587c4.tar.gz
1 files changed, 42 insertions, 4 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad0..34784178 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,14 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
+static size_t mbclen_cache[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
   return out;
 }
 
+/* Cache in mbclen_cache what mbrlen returns for each possible single byte.  */
+void
+build_mbclen_cache (void)
+{
+  int i;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)  /* every value a char can hold */
+    {
+      char c = i;
+      unsigned char uc = i;  /* i converted to unsigned char: the cache index */
+      mbstate_t mbs = { 0 };  /* fresh zeroed state for each byte */
+      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);  /* result for the 1-byte input C */
+    }
+}
 
 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  static int is_utf8 = -1;
+
+  if (is_utf8 == -1)
+    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+
+  if (is_utf8 && buf - p > MB_CUR_MAX)
+    {
+      for (p = buf; buf - p > MB_CUR_MAX; p--)
+        if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
+          break;
+
+      if (buf - p == MB_CUR_MAX)
+        p = buf;
+    }
+#endif
+
+  memset (&cur_state, 0, sizeof cur_state);
 
-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      size_t mbclen = mbclen_cache[to_uchar (*p)];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);
 
       /* Store the beginning of the previous complete multibyte character.  */
       if (mbclen != (size_t) -2)
@@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end,
           /* An invalid sequence, or a truncated multibyte character.
              We treat it as a single byte character.  */
           mbclen = 1;
-          memset(&cur_state, 0, sizeof cur_state);
+          memset (&cur_state, 0, sizeof cur_state);
         }
       p += mbclen;
     }
author	Norihiro Tanaka <noritnk@kcn.ne.jp>	2014-01-30 12:56:04 -0800
committer	Jim Meyering <meyering@fb.com>	2014-02-09 21:00:07 -0800
commit	b75ce6f7c611cb98549dc736947198e812b587c4 (patch)
tree	268283bcc9b69b7bd62209593b48404b79da8507 /src/searchutils.c
parent	c5cb52ecb97af4bf052e1c1366b8eb93a54ba6a0 (diff)
download	grep-b75ce6f7c611cb98549dc736947198e812b587c4.tar.gz