grep: scan back through UTF-8 a bit faster

* src/searchutils.c (mb_goback): When scanning backward through UTF-8, check the length implied by the putative first byte (the candidate lead byte at CUR[-I]) before bothering to invoke mb_clen. This length check also lets us use mbrlen directly rather than calling mb_clen, which would eventually defer to mbrlen anyway.
author: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 00:37:01 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 00:43:28 -0700
commit: b7d83f46d81a304e188c82877430765c29a75610 (patch)
tree: 19f3085e5ee197bd8b5120ef3a86e8aca6fdd8cc /src
parent: 643e5573888c49ec7022c2cee04a312b70166d0d (diff)
download: grep-b7d83f46d81a304e188c82877430765c29a75610.tar.gz
1 files changed, 13 insertions, 6 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index f16dd846..0080dd75 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -107,13 +107,20 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
         for (int i = 1; i <= 3; i++)
           if ((cur[-i] & 0xc0) != 0x80)
             {
-              mbstate_t mbs = { 0 };
-              size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
-              if (i < clen && clen <= MB_LEN_MAX)
+              /* True if the length implied by the putative byte 1 at
+                 CUR[-I] extends at least through *CUR.  */
+              bool long_enough = (~cur[-i] & 0xff) >> (7 - i) == 0;
+
+              if (long_enough)
                 {
-                  /* This multibyte character contains *CUR.  */
-                  p0 = cur - i;
-                  p = p0 + clen;
+                  mbstate_t mbs = { 0 };
+                  size_t clen = mbrlen (cur - i, end - (cur - i), &mbs);
+                  if (clen <= MB_LEN_MAX)
+                    {
+                      /* This multibyte character contains *CUR.  */
+                      p0 = cur - i;
+                      p = p0 + clen;
+                    }
                 }
               break;
             }
author	Paul Eggert <eggert@cs.ucla.edu>	2021-08-24 00:37:01 -0700
committer	Paul Eggert <eggert@cs.ucla.edu>	2021-08-24 00:43:28 -0700
commit	b7d83f46d81a304e188c82877430765c29a75610 (patch)
tree	19f3085e5ee197bd8b5120ef3a86e8aca6fdd8cc /src
parent	643e5573888c49ec7022c2cee04a312b70166d0d (diff)
download	grep-b7d83f46d81a304e188c82877430765c29a75610.tar.gz