grep: prefer signed to unsigned integers

This improves runtime checking for integer overflow when compiling
with gcc -fsanitize=undefined and the like.  It also avoids the need
for some integer casts, which can be error-prone.
* bootstrap.conf (gnulib_modules): Add idx.
* src/dfasearch.c (struct dfa_comp, kwsmusts):
(possible_backrefs_in_pattern, regex_compile, GEAcompile)
(EGexecute):
* src/grep.c (struct patloc, patlocs_allocated, patlocs_used)
(n_patterns, update_patterns, pattern_file_name, poison_len)
(asan_poison, fwrite_errno, compile_fp_t, execute_fp_t)
(buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls)
(bufalloc, pagesize, all_zeros, fillbuf, nlscan)
(print_line_head, print_line_middle, print_line_tail, grepbuf)
(grep, contains_encoding_error, fgrep_icase_available)
(fgrep_icase_charlen, fgrep_to_grep_pattern, try_fgrep_pattern)
(main):
* src/kwsearch.c (struct kwsearch, Fcompile, Fexecute):
* src/kwset.c (struct trie, struct kwset, kwsalloc, kwsincr)
(kwswords, treefails, memchr_kwset, acexec_trans, kwsexec)
(treedelta, kwsprep, bm_delta2_search, bmexec_trans, bmexec)
(acexec):
* src/kwset.h (struct kwsmatch):
* src/pcresearch.c (Pcompile, Pexecute):
* src/search.h (mb_clen):
* src/searchutils.c (kwsinit, mb_goback, wordchars_count)
(wordchars_size, wordchar_next, wordchar_prev):
Prefer idx_t to size_t or ptrdiff_t for nonnegative sizes,
and prefer ptrdiff_t to size_t for sizes plus error values.
* src/grep.c (uword_size): New constant, used for signed size
calculations.
(totalnl, add_count, totalcc, print_offset, print_line_head, grep):
Prefer intmax_t to uintmax_t for wide integer calculations.
(fgrep_icase_charlen): Prefer ptrdiff_t to int for size offsets.
* src/grep.h: Include idx.h.
* src/search.h (imbrlen): New function, like mbrlen except
with idx_t and ptrdiff_t.
author: Paul Eggert <eggert@cs.ucla.edu> 2021-08-24 17:19:22 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2021-08-25 12:11:27 -0700
commit: e3694e90b4789ccafaf022a29d9ce08ff11375c2 (patch)
tree: b09bc151ec222cfe5e6757a0cd85fe05ec3125a5 /src/searchutils.c
parent: b7d83f46d81a304e188c82877430765c29a75610 (diff)
download: grep-e3694e90b4789ccafaf022a29d9ce08ff11375c2.tar.gz
1 files changed, 11 insertions, 11 deletions
diff --git a/src/searchutils.c b/src/searchutils.c
index 0080dd75..ebc4a115 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -47,7 +47,7 @@ kwsinit (bool mb_trans)
 
   if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
     {
-      trans = xmalloc (NCHAR);
+      trans = ximalloc (NCHAR);
       /* If I is a single-byte character that becomes a different
          single-byte character when uppercased, set trans[I]
          to that character.  Otherwise, set trans[I] to I.  */
@@ -88,7 +88,7 @@ kwsinit (bool mb_trans)
 
    Treat encoding errors as if they were single-byte characters.  */
 ptrdiff_t
-mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
+mb_goback (char const **mb_start, idx_t *mbclen, char const *cur,
            char const *end)
 {
   const char *p = *mb_start;
@@ -114,8 +114,8 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
               if (long_enough)
                 {
                   mbstate_t mbs = { 0 };
-                  size_t clen = mbrlen (cur - i, end - (cur - i), &mbs);
-                  if (clen <= MB_LEN_MAX)
+                  ptrdiff_t clen = imbrlen (cur - i, end - (cur - i), &mbs);
+                  if (0 <= clen)
                     {
                       /* This multibyte character contains *CUR.  */
                       p0 = cur - i;
@@ -130,13 +130,13 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
       /* In non-UTF-8 encodings, to find character boundaries one must
          in general scan forward from the start of the buffer.  */
       mbstate_t mbs = { 0 };
-      size_t clen;
+      ptrdiff_t clen;
 
       do
         {
           clen = mb_clen (p, end - p, &mbs);
 
-          if (MB_LEN_MAX < clen)
+          if (clen < 0)
             {
               /* An invalid sequence, or a truncated multibyte character.
                  Treat it as a single byte character.  */
@@ -159,10 +159,10 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
 /* Examine the start of BUF (which goes to END) for word constituents.
    If COUNTALL, examine as many as possible; otherwise, examine at most one.
    Return the total number of bytes in the examined characters.  */
-static size_t
+static idx_t
 wordchars_count (char const *buf, char const *end, bool countall)
 {
-  size_t n = 0;
+  idx_t n = 0;
   mbstate_t mbs = { 0 };
   while (n < end - buf)
     {
@@ -188,7 +188,7 @@ wordchars_count (char const *buf, char const *end, bool countall)
 /* Examine the start of BUF for the longest prefix containing just
    word constituents.  Return the total number of bytes in the prefix.
    The buffer ends at END.  */
-size_t
+idx_t
 wordchars_size (char const *buf, char const *end)
 {
   return wordchars_count (buf, end, true);
@@ -196,7 +196,7 @@ wordchars_size (char const *buf, char const *end)
 
 /* If BUF starts with a word constituent, return the number of bytes
    used to represent it; otherwise, return zero.  The buffer ends at END.  */
-size_t
+idx_t
 wordchar_next (char const *buf, char const *end)
 {
   return wordchars_count (buf, end, false);
@@ -205,7 +205,7 @@ wordchar_next (char const *buf, char const *end)
 /* In the buffer BUF, return nonzero if the character whose encoding
    contains the byte before CUR is a word constituent.  The buffer
    ends at END.  */
-size_t
+idx_t
 wordchar_prev (char const *buf, char const *cur, char const *end)
 {
   if (buf == cur)
author	Paul Eggert <eggert@cs.ucla.edu>	2021-08-24 17:19:22 -0700
committer	Paul Eggert <eggert@cs.ucla.edu>	2021-08-25 12:11:27 -0700
commit	e3694e90b4789ccafaf022a29d9ce08ff11375c2 (patch)
tree	b09bc151ec222cfe5e6757a0cd85fe05ec3125a5 /src/searchutils.c
parent	b7d83f46d81a304e188c82877430765c29a75610 (diff)
download	grep-e3694e90b4789ccafaf022a29d9ce08ff11375c2.tar.gz