summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 00:37:01 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 00:43:28 -0700
commit: b7d83f46d81a304e188c82877430765c29a75610 (patch)
tree: 19f3085e5ee197bd8b5120ef3a86e8aca6fdd8cc /src
parent: 643e5573888c49ec7022c2cee04a312b70166d0d (diff)
download: grep-b7d83f46d81a304e188c82877430765c29a75610.tar.gz
grep: scan back thru UTF-8 a bit faster
* src/searchutils.c (mb_goback): When scanning backward through UTF-8, check the length implied by the putative byte 1 before bothering to invoke mb_clen. This length check also lets us use mbrlen directly rather than calling mb_clen, which would eventually defer to mbrlen anyway.
Diffstat (limited to 'src')
-rw-r--r-- src/searchutils.c 19
1 files changed, 13 insertions, 6 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index f16dd846..0080dd75 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -107,13 +107,20 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{
- mbstate_t mbs = { 0 };
- size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
- if (i < clen && clen <= MB_LEN_MAX)
+ /* True if the length implied by the putative byte 1 at
+ CUR[-I] extends at least through *CUR. */
+ bool long_enough = (~cur[-i] & 0xff) >> (7 - i) == 0;
+
+ if (long_enough)
{
- /* This multibyte character contains *CUR. */
- p0 = cur - i;
- p = p0 + clen;
+ mbstate_t mbs = { 0 };
+ size_t clen = mbrlen (cur - i, end - (cur - i), &mbs);
+ if (clen <= MB_LEN_MAX)
+ {
+ /* This multibyte character contains *CUR. */
+ p0 = cur - i;
+ p = p0 + clen;
+ }
}
break;
}