summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Jim Meyering <meyering@fb.com> 2023-03-18 08:28:36 -0700
committer: Jim Meyering <meyering@meta.com> 2023-03-18 17:08:09 -0700
commit: c83ffc197ec483c6f44f907346f34127ec044ef0 (patch)
tree: d3b01f6a00fe5a9573f596e45c4f5ad8b8a856b5 /src
parent: 7979ea7ddbf83f3203d53b6351c3717ce0af91c4 (diff)
download: grep-c83ffc197ec483c6f44f907346f34127ec044ef0.tar.gz
grep: -P (--perl-regexp) \d: match only ASCII digits
Prior to grep-3.9, the PCRE matcher had always treated \d just like [0-9].
grep-3.9's fix for \w and \b mistakenly relaxed \d to also match multibyte digits.
* src/grep.c (P_MATCHER_INDEX): Define enum.
(pcre_pattern_expand_backslash_d): New function.
(main): Call it for -P.
* NEWS (Bug fixes): Mention it.
* doc/grep.texi: Document it: with -P, \d matches only ASCII digits.
Provide a PCRE documentation URL and an example of how to use (?s) with -z.
* tests/pcre-ascii-digits: New test.
* tests/Makefile.am (TESTS): Add that file name.
Reported as https://bugs.gnu.org/62267
Diffstat (limited to 'src')
-rw-r--r-- src/grep.c 82
1 files changed, 81 insertions, 1 deletions
diff --git a/src/grep.c b/src/grep.c
index 7547b641..6ba881ef 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2089,7 +2089,8 @@ static struct
#endif
};
/* Keep these in sync with the 'matchers' table. */
-enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
+enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0,
+ P_MATCHER_INDEX = 6 }; /* P_MATCHER_INDEX lets main recognize -P so it can preprocess \d in the pattern. */
/* Return the index of the matcher corresponding to M if available.
MATCHER is the index of the previous matcher, or -1 if none.
@@ -2378,6 +2379,80 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p)
*len_p = p - new_keys;
}
+/* If the LEN bytes at S, which start with a '[' found inside a bracket
+   expression, open a POSIX-style element ("[:digit:]", "[.x.]" or
+   "[=x=]"), return the element's length in bytes; otherwise return 0.
+   The element counts only if its closing ":]", ".]" or "=]" appears
+   before any unescaped ']' -- this is meant to mirror the check PCRE2
+   itself performs, so that the ']' ending such an element is not
+   mistaken for the end of the enclosing bracket expression.  */
+static idx_t
+pcre_posix_element_len (char const *s, idx_t len)
+{
+  if (len < 4)
+    return 0;
+  char term = s[1];
+  if (term != ':' && term != '.' && term != '=')
+    return 0;
+  for (idx_t i = 2; i + 1 < len; i++)
+    {
+      if (s[i] == '\\' && (s[i + 1] == ']' || s[i + 1] == '\\'))
+        i++;
+      else if (s[i] == ']' || (s[i] == '[' && s[i + 1] == term))
+        return 0;
+      else if (s[i] == term && s[i + 1] == ']')
+        return i + 2;
+    }
+  return 0;
+}
+
+/* Rewrite the -P pattern *KEYS_P (of size *LEN_P) so that \d matches
+   only ASCII digits.  Now that we enable PCRE2_UCP for pcre regexps,
+   \d would otherwise match non-ASCII digits in some locales.
+
+   Outside a bracket expression each \d becomes [0-9].  Inside a
+   bracket expression it becomes 0-9, because "[[0-9]]" does not mean
+   what "[\d]" means; a raw '-' directly before it is escaped so that
+   e.g. "[+-\d]" cannot turn into the range "+-0".  Text quoted with
+   \Q...\E, "(?#...)" comments, POSIX elements such as [:digit:], and
+   the byte following \c are copied unchanged.  '#' comments of
+   extended (?x) mode are not recognized.
+
+   Use \p{Nd} if you need to match non-ASCII digits.
+
+   On return, *KEYS_P points to a newly allocated buffer (the old one
+   is freed) and *LEN_P is its length; the buffer is followed by a
+   newline byte that is not counted in *LEN_P.  */
+static void
+pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
+{
+  /* Position within a bracket expression: ']' is literal while no
+     member has been seen yet, and '^' negates only right after '['.  */
+  enum { CLASS_BODY, CLASS_AFTER_CARET, CLASS_AFTER_OPEN };
+
+  idx_t len = *len_p;
+  char *keys = *keys_p;
+  mbstate_t mb_state = { 0 };
+  /* No rewrite yields more than 5 output bytes per 2 input bytes
+     ("\d" -> "[0-9]"); the "+ 1" covers an odd trailing byte and the
+     final newline.  */
+  char *new_keys = xnmalloc (len / 2 + 1, 5);
+  char *p = new_keys;
+  bool prev_backslash = false;   /* A '\' was read but not yet emitted.  */
+  bool literal_next = false;     /* Previous escape was \c.  */
+  bool in_quote = false;         /* Inside \Q...\E.  */
+  bool quote_backslash = false;  /* In a quote, previous byte was '\'.  */
+  bool in_class = false;         /* Inside [...].  */
+  int class_start = CLASS_BODY;
+  char *class_hyphen = NULL;     /* Where the last raw '-' of a class went.  */
+
+  for (ptrdiff_t n; len; keys += n, len -= n)
+    {
+      n = mb_clen (keys, len, &mb_state);
+      bool single = n == 1 || n == -1;
+
+      if (n == -2)
+        n = len;                 /* Incomplete character: copy the rest.  */
+      else if (n == -1)
+        {
+          /* Invalid byte: restart decoding and treat it as one byte
+             that cannot be the target of a pending backslash.  */
+          memset (&mb_state, 0, sizeof mb_state);
+          n = 1;
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              class_start = CLASS_BODY;
+              *p++ = '\\';
+            }
+        }
+
+      /* Quoted text is literal; just watch for the closing \E.  */
+      if (in_quote)
+        {
+          p = mempcpy (p, keys, n);
+          if (single && *keys == '\\')
+            quote_backslash = true;
+          else
+            {
+              if (single && quote_backslash && *keys == 'E')
+                in_quote = false;
+              quote_backslash = false;
+            }
+          continue;
+        }
+
+      /* A multibyte character is never special.  */
+      if (!single)
+        {
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              *p++ = '\\';
+            }
+          p = mempcpy (p, keys, n);
+          literal_next = false;
+          class_start = CLASS_BODY;
+          continue;
+        }
+
+      char c = *keys;
+
+      /* The byte after \c is data, even if it is '[' or '\'.  */
+      if (literal_next)
+        {
+          literal_next = false;
+          *p++ = c;
+          continue;
+        }
+
+      if (prev_backslash)
+        {
+          prev_backslash = false;
+          class_start = CLASS_BODY;
+          if (c != 'd')
+            {
+              *p++ = '\\';
+              *p++ = c;
+              if (c == 'Q')
+                in_quote = true;
+              else if (c == 'c')
+                literal_next = true;
+            }
+          else if (!in_class)
+            p = mempcpy (p, "[0-9]", 5);
+          else
+            {
+              /* Turn an immediately preceding raw '-' into "\-" so it
+                 cannot pair with the '0' emitted next.  */
+              if (class_hyphen && class_hyphen == p - 1)
+                {
+                  p[-1] = '\\';
+                  *p++ = '-';
+                }
+              p = mempcpy (p, "0-9", 3);
+            }
+          continue;
+        }
+
+      if (c == '\\')
+        {
+          prev_backslash = true;
+          continue;
+        }
+
+      if (in_class)
+        {
+          if (c == ']' && class_start == CLASS_BODY)
+            in_class = false;
+          else if (c == '[')
+            {
+              idx_t elen = pcre_posix_element_len (keys, len);
+              if (elen)
+                {
+                  p = mempcpy (p, keys, elen);
+                  n = elen;
+                  class_start = CLASS_BODY;
+                  continue;
+                }
+            }
+          else if (c == '-')
+            class_hyphen = p;
+          class_start = (c == '^' && class_start == CLASS_AFTER_OPEN
+                         ? CLASS_AFTER_CARET : CLASS_BODY);
+          *p++ = c;
+          continue;
+        }
+
+      if (c == '[')
+        {
+          in_class = true;
+          class_start = CLASS_AFTER_OPEN;
+        }
+      else if (c == '(' && 3 <= len && keys[1] == '?' && keys[2] == '#')
+        {
+          /* A "(?#" comment runs to the next ')'; copy it whole so a
+             '[' inside it does not look like a bracket expression.  */
+          char *close = memchr (keys + 3, ')', len - 3);
+          n = close ? close - keys + 1 : len;
+          p = mempcpy (p, keys, n);
+          continue;
+        }
+      *p++ = c;
+    }
+
+  /* A lone trailing backslash is passed through for PCRE to diagnose.  */
+  if (prev_backslash)
+    *p++ = '\\';
+  *p = '\n';
+  free (*keys_p);
+  *keys_p = new_keys;
+  *len_p = p - new_keys;
+}
+
/* If it is easy, convert the MATCHER-style patterns KEYS (of size
*LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and
return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and
@@ -2970,6 +3045,11 @@ main (int argc, char **argv)
matcher = try_fgrep_pattern (matcher, keys, &keycc);
}
+ /* If -P, replace each \d with [0-9].
+ Those who want to match non-ASCII digits must use \p{Nd}. */
+ if (matcher == P_MATCHER_INDEX)
+ pcre_pattern_expand_backslash_d (&keys, &keycc);
+
execute = matchers[matcher].execute;
compiled_pattern =
matchers[matcher].compile (keys, keycc, matchers[matcher].syntax,