summary | refs | log | tree | commit | diff
path: root/src/searchutils.c
diff options
context:
space:
mode:
author: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 00:34:20 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 00:43:28 -0700
commit: 643e5573888c49ec7022c2cee04a312b70166d0d (patch)
tree: a15b538b1a6c8138c0e51c294fb2bf89e5b67e72 /src/searchutils.c
parent: 869989fa834c34ca2d5602555111c11f179ec8e4 (diff)
download: grep-643e5573888c49ec7022c2cee04a312b70166d0d.tar.gz
grep: tweak mb_goback performance
* src/searchutils.c (mb_goback): Set *MBCLEN only in non-UTF-8 encodings, since that’s the only time it’s needed, and this lets us see more clearly that the UTF-8 clen value is not useful to the caller.
Diffstat (limited to 'src/searchutils.c')
-rw-r--r-- src/searchutils.c | 16
1 file changed, 11 insertions, 5 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index 03b4c593..f16dd846 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -93,24 +93,25 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
{
const char *p = *mb_start;
const char *p0 = p;
- size_t clen;
if (cur <= p)
return cur - p;
if (localeinfo.using_utf8)
{
+ /* UTF-8 permits scanning backward to the previous character.
+ Start by assuming CUR is at a character boundary. */
p = cur;
- clen = 1;
if ((*cur & 0xc0) == 0x80)
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{
mbstate_t mbs = { 0 };
- clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
if (i < clen && clen <= MB_LEN_MAX)
{
+ /* This multibyte character contains *CUR. */
p0 = cur - i;
p = p0 + clen;
}
@@ -119,7 +120,11 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
}
else
{
+ /* In non-UTF-8 encodings, to find character boundaries one must
+ in general scan forward from the start of the buffer. */
mbstate_t mbs = { 0 };
+ size_t clen;
+
do
{
clen = mb_clen (p, end - p, &mbs);
@@ -135,11 +140,12 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
p += clen;
}
while (p < cur);
+
+ if (mbclen)
+ *mbclen = clen;
}
*mb_start = p;
- if (mbclen)
- *mbclen = clen;
return p == cur ? 0 : cur - p0;
}