diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2021-08-24 00:37:01 -0700 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2021-08-24 00:43:28 -0700 |
commit | b7d83f46d81a304e188c82877430765c29a75610 (patch) | |
tree | 19f3085e5ee197bd8b5120ef3a86e8aca6fdd8cc /src | |
parent | 643e5573888c49ec7022c2cee04a312b70166d0d (diff) | |
download | grep-b7d83f46d81a304e188c82877430765c29a75610.tar.gz |
grep: scan back thru UTF-8 a bit faster
* src/searchutils.c (mb_goback): When scanning backward through
UTF-8, check the length implied by the putative byte 1 before
bothering to invoke mb_clen. This length check also lets us use
mbrlen directly rather than calling mb_clen, which would
eventually defer to mbrlen anyway.
Diffstat (limited to 'src')
-rw-r--r-- | src/searchutils.c | 19 |
1 files changed, 13 insertions, 6 deletions
```diff
diff --git a/src/searchutils.c b/src/searchutils.c
index f16dd846..0080dd75 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -107,13 +107,20 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
       for (int i = 1; i <= 3; i++)
         if ((cur[-i] & 0xc0) != 0x80)
           {
-            mbstate_t mbs = { 0 };
-            size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
-            if (i < clen && clen <= MB_LEN_MAX)
+            /* True if the length implied by the putative byte 1 at
+               CUR[-I] extends at least through *CUR. */
+            bool long_enough = (~cur[-i] & 0xff) >> (7 - i) == 0;
+
+            if (long_enough)
               {
-                /* This multibyte character contains *CUR. */
-                p0 = cur - i;
-                p = p0 + clen;
+                mbstate_t mbs = { 0 };
+                size_t clen = mbrlen (cur - i, end - (cur - i), &mbs);
+                if (clen <= MB_LEN_MAX)
+                  {
+                    /* This multibyte character contains *CUR. */
+                    p0 = cur - i;
+                    p = p0 + clen;
+                  }
               }
             break;
           }
```