From e3694e90b4789ccafaf022a29d9ce08ff11375c2 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Tue, 24 Aug 2021 17:19:22 -0700
Subject: grep: prefer signed to unsigned integers

This improves runtime checking for integer overflow when compiling
with gcc -fsanitize=undefined and the like. It also avoids the need
for some integer casts, which can be error-prone.
* bootstrap.conf (gnulib_modules): Add idx.
* src/dfasearch.c (struct dfa_comp, kwsmusts):
(possible_backrefs_in_pattern, regex_compile, GEAcompile)
(EGexecute):
* src/grep.c (struct patloc, patlocs_allocated, patlocs_used)
(n_patterns, update_patterns, pattern_file_name, poison_len)
(asan_poison, fwrite_errno, compile_fp_t, execute_fp_t)
(buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls)
(bufalloc, pagesize, all_zeros, fillbuf, nlscan)
(print_line_head, print_line_middle, print_line_tail, grepbuf)
(grep, contains_encoding_error, fgrep_icase_available)
(fgrep_icase_charlen, fgrep_to_grep_pattern, try_fgrep_pattern)
(main):
* src/kwsearch.c (struct kwsearch, Fcompile, Fexecute):
* src/kwset.c (struct trie, struct kwset, kwsalloc, kwsincr)
(kwswords, treefails, memchr_kwset, acexec_trans, kwsexec)
(treedelta, kwsprep, bm_delta2_search, bmexec_trans, bmexec)
(acexec):
* src/kwset.h (struct kwsmatch):
* src/pcresearch.c (Pcompile, Pexecute):
* src/search.h (mb_clen):
* src/searchutils.c (kwsinit, mb_goback, wordchars_count)
(wordchars_size, wordchar_next, wordchar_prev):
Prefer idx_t to size_t or ptrdiff_t for nonnegative sizes,
and prefer ptrdiff_t to size_t for sizes plus error values.
* src/grep.c (uword_size): New constant, used for signed size
calculations.
(totalnl, add_count, totalcc, print_offset, print_line_head, grep):
Prefer intmax_t to uintmax_t for wide integer calculations.
(fgrep_icase_charlen): Prefer ptrdiff_t to int for size offsets.
* src/grep.h: Include idx.h.
* src/search.h (imbrlen): New function, like mbrlen except
with idx_t and ptrdiff_t.
--- src/searchutils.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'src/searchutils.c') diff --git a/src/searchutils.c b/src/searchutils.c index 0080dd75..ebc4a115 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -47,7 +47,7 @@ kwsinit (bool mb_trans) if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) { - trans = xmalloc (NCHAR); + trans = ximalloc (NCHAR); /* If I is a single-byte character that becomes a different single-byte character when uppercased, set trans[I] to that character. Otherwise, set trans[I] to I. */ @@ -88,7 +88,7 @@ kwsinit (bool mb_trans) Treat encoding errors as if they were single-byte characters. */ ptrdiff_t -mb_goback (char const **mb_start, size_t *mbclen, char const *cur, +mb_goback (char const **mb_start, idx_t *mbclen, char const *cur, char const *end) { const char *p = *mb_start; @@ -114,8 +114,8 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur, if (long_enough) { mbstate_t mbs = { 0 }; - size_t clen = mbrlen (cur - i, end - (cur - i), &mbs); - if (clen <= MB_LEN_MAX) + ptrdiff_t clen = imbrlen (cur - i, end - (cur - i), &mbs); + if (0 <= clen) { /* This multibyte character contains *CUR. */ p0 = cur - i; @@ -130,13 +130,13 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur, /* In non-UTF-8 encodings, to find character boundaries one must in general scan forward from the start of the buffer. */ mbstate_t mbs = { 0 }; - size_t clen; + ptrdiff_t clen; do { clen = mb_clen (p, end - p, &mbs); - if (MB_LEN_MAX < clen) + if (clen < 0) { /* An invalid sequence, or a truncated multibyte character. Treat it as a single byte character. */ @@ -159,10 +159,10 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur, /* Examine the start of BUF (which goes to END) for word constituents. If COUNTALL, examine as many as possible; otherwise, examine at most one. Return the total number of bytes in the examined characters. 
*/ -static size_t +static idx_t wordchars_count (char const *buf, char const *end, bool countall) { - size_t n = 0; + idx_t n = 0; mbstate_t mbs = { 0 }; while (n < end - buf) { @@ -188,7 +188,7 @@ wordchars_count (char const *buf, char const *end, bool countall) /* Examine the start of BUF for the longest prefix containing just word constituents. Return the total number of bytes in the prefix. The buffer ends at END. */ -size_t +idx_t wordchars_size (char const *buf, char const *end) { return wordchars_count (buf, end, true); @@ -196,7 +196,7 @@ wordchars_size (char const *buf, char const *end) /* If BUF starts with a word constituent, return the number of bytes used to represent it; otherwise, return zero. The buffer ends at END. */ -size_t +idx_t wordchar_next (char const *buf, char const *end) { return wordchars_count (buf, end, false); @@ -205,7 +205,7 @@ wordchar_next (char const *buf, char const *end) /* In the buffer BUF, return nonzero if the character whose encoding contains the byte before CUR is a word constituent. The buffer ends at END. */ -size_t +idx_t wordchar_prev (char const *buf, char const *cur, char const *end) { if (buf == cur) -- cgit v1.2.1