diff options
author | Jim Meyering <meyering@fb.com> | 2023-03-18 08:28:36 -0700 |
---|---|---|
committer | Jim Meyering <meyering@meta.com> | 2023-03-18 17:08:09 -0700 |
commit | c83ffc197ec483c6f44f907346f34127ec044ef0 (patch) | |
tree | d3b01f6a00fe5a9573f596e45c4f5ad8b8a856b5 /src | |
parent | 7979ea7ddbf83f3203d53b6351c3717ce0af91c4 (diff) | |
download | grep-c83ffc197ec483c6f44f907346f34127ec044ef0.tar.gz |
grep: -P (--perl-regexp) \d: match only ASCII digits
Prior to grep-3.9, the PCRE matcher had always treated \d just
like [0-9]. grep-3.9's fix for \w and \b mistakenly relaxed \d
to also match multibyte digits.
* src/grep.c (P_MATCHER_INDEX): Define enum.
(pcre_pattern_expand_backslash_d): New function.
(main): Call it for -P.
* NEWS (Bug fixes): Mention it.
* doc/grep.texi: Document it: with -P, \d matches only ASCII digits.
Provide a PCRE documentation URL and an example of how
to use (?s) with -z.
* tests/pcre-ascii-digits: New test.
* tests/Makefile.am (TESTS): Add that file name.
Reported as https://bugs.gnu.org/62267
Diffstat (limited to 'src')
-rw-r--r-- | src/grep.c | 82 |
1 files changed, 81 insertions, 1 deletions
```diff
@@ -2089,7 +2089,8 @@ static struct
 #endif
 };
 /* Keep these in sync with the 'matchers' table. */
-enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
+enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0,
+       P_MATCHER_INDEX = 6 };
 
 /* Return the index of the matcher corresponding to M if available.
    MATCHER is the index of the previous matcher, or -1 if none.
@@ -2378,6 +2379,80 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p)
   *len_p = p - new_keys;
 }
 
+/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
+   digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
+   match non-ASCII digits in some locales. Use \p{Nd} if you require to match
+   those. */
+static void
+pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
+{
+  idx_t len = *len_p;
+  char *keys = *keys_p;
+  mbstate_t mb_state = { 0 };
+  char *new_keys = xnmalloc (len / 2 + 1, 5);
+  char *p = new_keys;
+  bool prev_backslash = false;
+
+  for (ptrdiff_t n; len; keys += n, len -= n)
+    {
+      n = mb_clen (keys, len, &mb_state);
+      switch (n)
+        {
+        case -2:
+          n = len;
+          FALLTHROUGH;
+        default:
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              *p++ = '\\';
+            }
+          p = mempcpy (p, keys, n);
+          break;
+
+        case -1:
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              *p++ = '\\';
+            }
+          memset (&mb_state, 0, sizeof mb_state);
+          n = 1;
+          FALLTHROUGH;
+        case 1:
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              switch (*keys)
+                {
+                case 'd':
+                  p = mempcpy (p, "[0-9]", 5);
+                  break;
+                default:
+                  *p++ = '\\';
+                  *p++ = *keys;
+                  break;
+                }
+            }
+          else
+            {
+              if (*keys == '\\')
+                prev_backslash = true;
+              else
+                *p++ = *keys;
+            }
+          break;
+        }
+    }
+
+  if (prev_backslash)
+    *p++ = '\\';
+  *p = '\n';
+  free (*keys_p);
+  *keys_p = new_keys;
+  *len_p = p - new_keys;
+}
+
 /* If it is easy, convert the MATCHER-style patterns KEYS (of size
    *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and
    return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and
@@ -2970,6 +3045,11 @@ main (int argc, char **argv)
       matcher = try_fgrep_pattern (matcher, keys, &keycc);
     }
 
+  /* If -P, replace each \d with [0-9].
+     Those who want to match non-ASCII digits must use \p{Nd}. */
+  if (matcher == P_MATCHER_INDEX)
+    pcre_pattern_expand_backslash_d (&keys, &keycc);
+
   execute = matchers[matcher].execute;
   compiled_pattern =
     matchers[matcher].compile (keys, keycc, matchers[matcher].syntax,
```