summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2021-08-23 07:43:36 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2021-08-24 00:43:27 -0700
commit70b84b9294480c8b5f12d7a0cd95e54584b08288 (patch)
tree06b3307cc94b7b9868ae4b64daacac5342618e89
parent01b7b13f8376187bc870f4b5c6d91ded35a151d0 (diff)
downloadgrep-70b84b9294480c8b5f12d7a0cd95e54584b08288.tar.gz
grep: tweak mb_goback and comment it better
* src/searchutils.c (mb_goback): Improve the comment to better describe this confusing function. And remove an unnecessary test of cur vs end.
-rw-r--r--src/searchutils.c43
1 files changed, 30 insertions, 13 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index b63990a9..aad0fcc1 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -58,18 +58,35 @@ kwsinit (bool mb_trans)
return kwsalloc (trans);
}
-/* In the buffer *MB_START, return the number of bytes needed to go
- back from CUR to the previous boundary, where a "boundary" is the
- start of a multibyte character or is an error-encoding byte. The
- buffer ends at END (i.e., one past the address of the buffer's last
- byte). If CUR is already at a boundary, return 0. If CUR is no
- larger than *MB_START, return CUR - *MB_START without modifying
- *MB_START or *MBCLEN.
-
- When returning zero, set *MB_START to CUR. When returning a
- positive value, set *MB_START to the next boundary after CUR,
- or to END if there is no such boundary, and if MBCLEN is nonnull
- set *MBCLEN to the length of the preceding character. */
+/* Return the number of bytes needed to go back to the start of a
+ multibyte character in a buffer. The buffer starts at *MB_START.
+ (See below for MBCLEN's role.) The multibyte character contains
+ the byte addressed by CUR. The buffer ends just before END, which
+ must not be less than CUR.
+
+ If CUR is no larger than *MB_START, return CUR - *MB_START without
+ modifying *MB_START or dealing with MBCLEN. Otherwise, update
+ *MB_START to point to the first multibyte character starting on or
+ after CUR, and if MBCLEN is nonnull then deal with MBCLEN as follows:
+
+ - If this function returns 0 and the locale is multibyte and is
+ not UTF-8, set *MBCLEN to the number of bytes in the multibyte
+ character containing the byte addressed by (CUR - 1).
+
+ - Otherwise, possibly set *MBCLEN to an unspecified value.
+
+ *MB_START should point to the start of a multibyte character, or to
+ an encoding-error byte.
+
+ *END should be a sentinel byte - one of '\0', '\r', '\n', '.', '/',
+ which POSIX says cannot be part of any other character. Also,
+ there should be a byte string immediately before *MB_START that
+ contains a sentinel byte. This means it is OK to scan backwards
+ before *MB_START as long as the scan stops at a sentinel byte, and
+ similarly it is OK to scan forwards from CUR (without checking END)
+ so long as the scan stops at a sentinel byte.
+
+ Treat encoding errors as if they were single-byte characters. */
ptrdiff_t
mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
char const *end)
@@ -86,7 +103,7 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
p = cur;
clen = 1;
- if (cur < end && (*cur & 0xc0) == 0x80)
+ if ((*cur & 0xc0) == 0x80)
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{