diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2023-03-19 01:50:00 -0700 |
---|---|---|
committer | Jim Meyering <meyering@meta.com> | 2023-03-19 08:43:01 -0700 |
commit | 99330c2b1dc8b619dff8a5a6a35f524d382508c8 (patch) | |
tree | dba7c6cf4aa7081208994e4ce5c5f2d4c36329de /src | |
parent | 373b4434ebc15f447ca6f96007ed6181c9a2a496 (diff) | |
download | grep-99330c2b1dc8b619dff8a5a6a35f524d382508c8.tar.gz |
grep: forward port to PCRE2 10.43
* doc/grep.texi: Document this.
* src/grep.c: Move recent changes into pcresearch.c.
(P_MATCHER_INDEX): Remove.
(pcre_pattern_expand_backslash_d): Move from here ...
* src/pcresearch.c: ... to here.
(PCRE2_EXTRA_ASCII_BSD): Default to 0.
(Pcompile): Use PCRE2_EXTRA_ASCII_BSD if available,
and expand \d to [0-9] otherwise.
Diffstat (limited to 'src')
-rw-r--r-- | src/grep.c | 82 |
-rw-r--r-- | src/pcresearch.c | 90 |
2 files changed, 88 insertions, 84 deletions
@@ -2089,8 +2089,7 @@ static struct #endif }; /* Keep these in sync with the 'matchers' table. */ -enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0, - P_MATCHER_INDEX = 6 }; +enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 }; /* Return the index of the matcher corresponding to M if available. MATCHER is the index of the previous matcher, or -1 if none. @@ -2379,80 +2378,6 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p) *len_p = p - new_keys; } -/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII - digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise - match non-ASCII digits in some locales. Use \p{Nd} if you require to match - those. */ -static void -pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) -{ - idx_t len = *len_p; - char *keys = *keys_p; - mbstate_t mb_state = { 0 }; - char *new_keys = xnmalloc (len / 2 + 1, 5); - char *p = new_keys; - bool prev_backslash = false; - - for (ptrdiff_t n; len; keys += n, len -= n) - { - n = mb_clen (keys, len, &mb_state); - switch (n) - { - case -2: - n = len; - FALLTHROUGH; - default: - if (prev_backslash) - { - prev_backslash = false; - *p++ = '\\'; - } - p = mempcpy (p, keys, n); - break; - - case -1: - if (prev_backslash) - { - prev_backslash = false; - *p++ = '\\'; - } - memset (&mb_state, 0, sizeof mb_state); - n = 1; - FALLTHROUGH; - case 1: - if (prev_backslash) - { - prev_backslash = false; - switch (*keys) - { - case 'd': - p = mempcpy (p, "[0-9]", 5); - break; - default: - *p++ = '\\'; - *p++ = *keys; - break; - } - } - else - { - if (*keys == '\\') - prev_backslash = true; - else - *p++ = *keys; - } - break; - } - } - - if (prev_backslash) - *p++ = '\\'; - *p = '\n'; - free (*keys_p); - *keys_p = new_keys; - *len_p = p - new_keys; -} - /* If it is easy, convert the MATCHER-style patterns KEYS (of size *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and return F_MATCHER_INDEX. 
If not, leave KEYS and *LEN_P alone and @@ -3045,11 +2970,6 @@ main (int argc, char **argv) matcher = try_fgrep_pattern (matcher, keys, &keycc); } - /* If -P, replace each \d with [0-9]. - Those who want to match non-ASCII digits must use \p{Nd}. */ - if (matcher == P_MATCHER_INDEX) - pcre_pattern_expand_backslash_d (&keys, &keycc); - execute = matchers[matcher].execute; compiled_pattern = matchers[matcher].compile (keys, keycc, matchers[matcher].syntax, diff --git a/src/pcresearch.c b/src/pcresearch.c index 5b111bea..d3701816 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -35,6 +35,9 @@ # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT # define pcre2_set_depth_limit pcre2_set_recursion_limit #endif +#ifndef PCRE2_EXTRA_ASCII_BSD +# define PCRE2_EXTRA_ASCII_BSD 0 +#endif struct pcre_comp { @@ -130,12 +133,89 @@ bad_utf8_from_pcre2 (int e) #endif } +/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII + digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise + match non-ASCII digits in some locales. Use \p{Nd} if you require to match + those. 
*/ +static void +pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) +{ + idx_t len = *len_p; + char *keys = *keys_p; + mbstate_t mb_state = { 0 }; + char *new_keys = xnmalloc (len / 2 + 1, 5); + char *p = new_keys; + bool prev_backslash = false; + + for (ptrdiff_t n; len; keys += n, len -= n) + { + n = mb_clen (keys, len, &mb_state); + switch (n) + { + case -2: + n = len; + FALLTHROUGH; + default: + if (prev_backslash) + { + prev_backslash = false; + *p++ = '\\'; + } + p = mempcpy (p, keys, n); + break; + + case -1: + if (prev_backslash) + { + prev_backslash = false; + *p++ = '\\'; + } + memset (&mb_state, 0, sizeof mb_state); + n = 1; + FALLTHROUGH; + case 1: + if (prev_backslash) + { + prev_backslash = false; + switch (*keys) + { + case 'd': + p = mempcpy (p, "[0-9]", 5); + break; + default: + *p++ = '\\'; + *p++ = *keys; + break; + } + } + else + { + if (*keys == '\\') + prev_backslash = true; + else + *p++ = *keys; + } + break; + } + } + + if (prev_backslash) + *p++ = '\\'; + *p = '\n'; + free (*keys_p); + *keys_p = new_keys; + *len_p = p - new_keys; +} + /* Compile the -P style PATTERN, containing SIZE bytes that are followed by '\n'. Return a description of the compiled pattern. */ void * Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) { + if (! PCRE2_EXTRA_ASCII_BSD) + pcre_pattern_expand_backslash_d (&pattern, &size); + PCRE2_SIZE e; int ec; int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0); @@ -168,12 +248,16 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) if (rawmemchr (pattern, '\n') != patlim) die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); +#ifdef PCRE2_EXTRA_MATCH_LINE + uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD + | (match_lines ? 
PCRE2_EXTRA_MATCH_LINE : 0)); + pcre2_set_compile_extra_options (ccontext, extra_options); +#endif + void *re_storage = NULL; if (match_lines) { -#ifdef PCRE2_EXTRA_MATCH_LINE - pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE); -#else +#ifndef PCRE2_EXTRA_MATCH_LINE static char const /* These sizes omit trailing NUL. */ xprefix[4] = "^(?:", xsuffix[2] = ")$"; idx_t re_size = size + sizeof xprefix + sizeof xsuffix; |