summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Paul Eggert <eggert@cs.ucla.edu>, 2021-11-13 13:52:23 -0800
committer: Paul Eggert <eggert@cs.ucla.edu>, 2021-11-14 12:13:28 -0800
commit: 6e1450408a7921771c41973761995e06445ba18b (patch)
tree: 1ce7af13818f196ccd5216fda4ea2a3d5de4bd7a /src
parent: 3935b2a4f656e8435812df25cf7aab9f7e61b406 (diff)
download: grep-6e1450408a7921771c41973761995e06445ba18b.tar.gz
grep: speed up, fix bad-UTF8 check with -P
* src/pcresearch.c (bad_utf8_from_pcre2): New function. Fix bug where PCRE2_ERROR_UTF8_ERR1 was not treated as an encoding error. Improve performance when PCRE2_MATCH_INVALID_UTF is defined. (Pexecute): Use it.
Diffstat (limited to 'src')
-rw-r--r-- src/pcresearch.c 16
1 file changed, 14 insertions, 2 deletions
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 286e1dc3..953aca24 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -104,6 +104,18 @@ jit_exec (struct pcre_comp *pc, char const *subject, PCRE2_SIZE search_bytes,
}
}
+/* Return true if E is an error code for bad UTF-8, and if pcre2_match
+ could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF. */
+static bool
+bad_utf8_from_pcre2 (int e)
+{
+#ifdef PCRE2_MATCH_INVALID_UTF
+ return false;
+#else
+ return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1;
+#endif
+}
+
/* Compile the -P style PATTERN, containing SIZE bytes that are
followed by '\n'. Return a description of the compiled pattern. */
@@ -248,9 +260,9 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
e = jit_exec (pc, subject, line_end - subject,
search_offset, options);
- /* PCRE2 provides 22 different error codes for bad UTF-8 */
- if (! (PCRE2_ERROR_UTF8_ERR21 <= e && e < PCRE2_ERROR_UTF8_ERR1))
+ if (!bad_utf8_from_pcre2 (e))
break;
+
PCRE2_SIZE valid_bytes = pcre2_get_startchar (pc->data);
if (search_offset <= valid_bytes)