From b75ce6f7c611cb98549dc736947198e812b587c4 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Thu, 30 Jan 2014 12:56:04 -0800
Subject: speed up mb-boundary-detection after each preliminary match

After each kwsexec or dfaexec match, we must determine whether
the tentative match falls in the middle of a multi-byte character.
That is what our is_mb_middle function does, but it was expensive,
even when most input consisted of single-byte characters.  The main
cost was the call to mbrlen for each character.  This change builds
and uses a cache of the values mbrlen returns for each 1-byte input.
The largest speed-up (3x to 7x, CPU-dependent) is when most
lines contain a match, yet few are printed, e.g., when using
grep -v common-pattern ... to filter out all but a few lines.

* src/search.h (build_mbclen_cache): Declare it.
* src/main.c: Include "search.h".
[MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
* src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
(mbclen_cache): New global.
(build_mbclen_cache): New function.
(is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
* NEWS (Improvements): Mention it.
---
 src/searchutils.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

(limited to 'src/searchutils.c')

diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad0..34784178 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,14 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
+static size_t mbclen_cache[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
   return out;
 }
 
+/* Cache in mbclen_cache the mbrlen result for each possible 1-byte input.  */
+void
+build_mbclen_cache (void)
+{
+  int i;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      char c = i;
+      unsigned char uc = i;  /* Index by unsigned value; char may be signed.  */
+      mbstate_t mbs = { 0 };  /* Start each probe from a zeroed state.  */
+      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+    }
+}
 
 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  static int is_utf8 = -1;
+
+  if (is_utf8 == -1)
+    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+
+  if (is_utf8 && buf - p > MB_CUR_MAX)
+    {
+      for (p = buf; buf - p > MB_CUR_MAX; p--)
+        if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
+          break;
+
+      if (buf - p == MB_CUR_MAX)
+        p = buf;
+    }
+#endif
+
+  memset (&cur_state, 0, sizeof cur_state);
 
-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      size_t mbclen = mbclen_cache[to_uchar (*p)];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);
 
       /* Store the beginning of the previous complete multibyte character.  */
       if (mbclen != (size_t) -2)
@@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end,
           /* An invalid sequence, or a truncated multibyte character.
              We treat it as a single byte character.  */
           mbclen = 1;
-          memset(&cur_state, 0, sizeof cur_state);
+          memset (&cur_state, 0, sizeof cur_state);
         }
       p += mbclen;
     }
-- 
cgit v1.2.1