diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2021-11-13 13:52:23 -0800 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2021-11-14 12:13:28 -0800 |
commit | 6e1450408a7921771c41973761995e06445ba18b (patch) | |
tree | 1ce7af13818f196ccd5216fda4ea2a3d5de4bd7a /src/pcresearch.c | |
parent | 3935b2a4f656e8435812df25cf7aab9f7e61b406 (diff) | |
download | grep-6e1450408a7921771c41973761995e06445ba18b.tar.gz |
grep: speed up, fix bad-UTF8 check with -P
* src/pcresearch.c (bad_utf8_from_pcre2): New function. Fix bug
where PCRE2_ERROR_UTF8_ERR1 was not treated as an encoding error.
Improve performance when PCRE2_MATCH_INVALID_UTF is defined.
(Pexecute): Use it.
Diffstat (limited to 'src/pcresearch.c')
-rw-r--r-- | src/pcresearch.c | 16 |
1 file changed, 14 insertions, 2 deletions
```diff
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 286e1dc3..953aca24 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -104,6 +104,18 @@ jit_exec (struct pcre_comp *pc, char const *subject, PCRE2_SIZE search_bytes,
     }
 }
 
+/* Return true if E is an error code for bad UTF-8, and if pcre2_match
+   could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF. */
+static bool
+bad_utf8_from_pcre2 (int e)
+{
+#ifdef PCRE2_MATCH_INVALID_UTF
+  return false;
+#else
+  return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
+#endif
+}
+
 /* Compile the -P style PATTERN, containing SIZE bytes that are
    followed by '\n'. Return a description of the compiled pattern. */
 
@@ -248,9 +260,9 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
 
           e = jit_exec (pc, subject, line_end - subject,
                         search_offset, options);
-          /* PCRE2 provides 22 different error codes for bad UTF-8 */
-          if (! (PCRE2_ERROR_UTF8_ERR21 <= e && e < PCRE2_ERROR_UTF8_ERR1))
+          if (!bad_utf8_from_pcre2 (e))
             break;
+
           PCRE2_SIZE valid_bytes = pcre2_get_startchar (pc->data);
 
           if (search_offset <= valid_bytes)
```