From e3694e90b4789ccafaf022a29d9ce08ff11375c2 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Tue, 24 Aug 2021 17:19:22 -0700
Subject: grep: prefer signed to unsigned integers

This improves runtime checking for integer overflow when compiling
with gcc -fsanitize=undefined and the like.  It also avoids
the need for some integer casts, which can be error-prone.
* bootstrap.conf (gnulib_modules): Add idx.
* src/dfasearch.c (struct dfa_comp, kwsmusts)
(possible_backrefs_in_pattern, regex_compile, GEAcompile)
(EGexecute):
* src/grep.c (struct patloc, patlocs_allocated, patlocs_used)
(n_patterns, update_patterns, pattern_file_name, poison_len)
(asan_poison, fwrite_errno, compile_fp_t, execute_fp_t)
(buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls)
(bufalloc, pagesize, all_zeros, fillbuf, nlscan)
(print_line_head, print_line_middle, print_line_tail, grepbuf)
(grep, contains_encoding_error, fgrep_icase_available)
(fgrep_icase_charlen, fgrep_to_grep_pattern, try_fgrep_pattern)
(main):
* src/kwsearch.c (struct kwsearch, Fcompile, Fexecute):
* src/kwset.c (struct trie, struct kwset, kwsalloc, kwsincr)
(kwswords, treefails, memchr_kwset, acexec_trans, kwsexec)
(treedelta, kwsprep, bm_delta2_search, bmexec_trans, bmexec)
(acexec):
* src/kwset.h (struct kwsmatch):
* src/pcresearch.c (Pcompile, Pexecute):
* src/search.h (mb_clen):
* src/searchutils.c (kwsinit, mb_goback, wordchars_count)
(wordchars_size, wordchar_next, wordchar_prev):
Prefer idx_t to size_t or ptrdiff_t for nonnegative sizes,
and prefer ptrdiff_t to size_t for sizes plus error values.
* src/grep.c (uword_size): New constant, used for signed
size calculations.
(totalnl, add_count, totalcc, print_offset, print_line_head, grep):
Prefer intmax_t to uintmax_t for wide integer calculations.
(fgrep_icase_charlen): Prefer ptrdiff_t to int for size offsets.
* src/grep.h: Include idx.h.
* src/search.h (imbrlen): New function, like mbrlen except
that it uses idx_t and ptrdiff_t in place of size_t.
---
 src/searchutils.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'src/searchutils.c')

diff --git a/src/searchutils.c b/src/searchutils.c
index 0080dd75..ebc4a115 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -47,7 +47,7 @@ kwsinit (bool mb_trans)
 
   if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
     {
-      trans = xmalloc (NCHAR);
+      trans = ximalloc (NCHAR);
       /* If I is a single-byte character that becomes a different
          single-byte character when uppercased, set trans[I]
          to that character.  Otherwise, set trans[I] to I.  */
@@ -88,7 +88,7 @@ kwsinit (bool mb_trans)
 
    Treat encoding errors as if they were single-byte characters.  */
 ptrdiff_t
-mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
+mb_goback (char const **mb_start, idx_t *mbclen, char const *cur,
            char const *end)
 {
   const char *p = *mb_start;
@@ -114,8 +114,8 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
               if (long_enough)
                 {
                   mbstate_t mbs = { 0 };
-                  size_t clen = mbrlen (cur - i, end - (cur - i), &mbs);
-                  if (clen <= MB_LEN_MAX)
+                  ptrdiff_t clen = imbrlen (cur - i, end - (cur - i), &mbs);
+                  if (0 <= clen)
                     {
                       /* This multibyte character contains *CUR.  */
                       p0 = cur - i;
@@ -130,13 +130,13 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
       /* In non-UTF-8 encodings, to find character boundaries one must
          in general scan forward from the start of the buffer.  */
       mbstate_t mbs = { 0 };
-      size_t clen;
+      ptrdiff_t clen;
 
       do
         {
           clen = mb_clen (p, end - p, &mbs);
 
-          if (MB_LEN_MAX < clen)
+          if (clen < 0)
             {
               /* An invalid sequence, or a truncated multibyte character.
                  Treat it as a single byte character.  */
@@ -159,10 +159,10 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
 /* Examine the start of BUF (which goes to END) for word constituents.
    If COUNTALL, examine as many as possible; otherwise, examine at most one.
    Return the total number of bytes in the examined characters.  */
-static size_t
+static idx_t
 wordchars_count (char const *buf, char const *end, bool countall)
 {
-  size_t n = 0;
+  idx_t n = 0;
   mbstate_t mbs = { 0 };
   while (n < end - buf)
     {
@@ -188,7 +188,7 @@ wordchars_count (char const *buf, char const *end, bool countall)
 /* Examine the start of BUF for the longest prefix containing just
    word constituents.  Return the total number of bytes in the prefix.
    The buffer ends at END.  */
-size_t
+idx_t
 wordchars_size (char const *buf, char const *end)
 {
   return wordchars_count (buf, end, true);
@@ -196,7 +196,7 @@ wordchars_size (char const *buf, char const *end)
 
 /* If BUF starts with a word constituent, return the number of bytes
    used to represent it; otherwise, return zero.  The buffer ends at END.  */
-size_t
+idx_t
 wordchar_next (char const *buf, char const *end)
 {
   return wordchars_count (buf, end, false);
@@ -205,7 +205,7 @@ wordchar_next (char const *buf, char const *end)
 /* In the buffer BUF, return nonzero if the character whose encoding
    contains the byte before CUR is a word constituent.  The buffer
    ends at END.  */
-size_t
+idx_t
 wordchar_prev (char const *buf, char const *cur, char const *end)
 {
   if (buf == cur)
-- 
cgit v1.2.1