From c63a0950ff852c94e27d14b6d0eea001eddb7de1 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sat, 1 Apr 2023 13:55:26 -0700
Subject: grep: fix -P [\d] by fixing \w only if PCRE2 10.43
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our prepass-based fixes for the -P \d bug have caused repeated
further bugs.  Avoid the need for a prepass, by using PCRE2_UCP
only if PCRE2_EXTRA_ASCII_BSD is also supported.  Since the -P \w
bug was present from grep 2.5 through 3.8 it’s OK if we wait a
little longer to fix it.
* NEWS: Mention this.
* src/pcresearch.c (pcre_pattern_expand_backslash_d): Remove.
Remove its use.
(Pcompile): Use PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD.
* tests/pcre-ascii-digits, tests/pcre-utf8-w:
Skip tests on older PCRE2 implementations.
---
 src/pcresearch.c | 97 +++++++-------------------------------------------------
 1 file changed, 11 insertions(+), 86 deletions(-)

(limited to 'src')

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 34b2aeb9..e77509c4 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -133,97 +133,12 @@ bad_utf8_from_pcre2 (int e)
 #endif
 }
 
-#if ! PCRE2_EXTRA_ASCII_BSD
-/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
-   digits.  Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
-   match non-ASCII digits in some locales.  Use \p{Nd} if you require to match
-   those.  Similarly, replace each \D with [^0-9].
-   FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42
-   and prior.  */
-static void
-pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
-{
-  idx_t len = *len_p;
-  char *keys = *keys_p;
-  mbstate_t mb_state = { 0 };
-  char *new_keys = xnmalloc (len / 2 + 1, 5);
-  char *p = new_keys;
-  bool prev_backslash = false;
-
-  for (ptrdiff_t n; len; keys += n, len -= n)
-    {
-      n = mb_clen (keys, len, &mb_state);
-      switch (n)
-        {
-        case -2:
-          n = len;
-          FALLTHROUGH;
-        default:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          p = mempcpy (p, keys, n);
-          break;
-
-        case -1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          memset (&mb_state, 0, sizeof mb_state);
-          n = 1;
-          FALLTHROUGH;
-        case 1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              switch (*keys)
-                {
-                case 'd':
-                  p = mempcpy (p, "[0-9]", 5);
-                  break;
-                case 'D':
-                  p = mempcpy (p, "[^0-9]", 6);
-                  break;
-                default:
-                  *p++ = '\\';
-                  *p++ = *keys;
-                  break;
-                }
-            }
-          else
-            {
-              if (*keys == '\\')
-                prev_backslash = true;
-              else
-                *p++ = *keys;
-            }
-          break;
-        }
-    }
-
-  if (prev_backslash)
-    *p++ = '\\';
-  *p = '\n';
-  free (*keys_p);
-  *keys_p = new_keys;
-  *len_p = p - new_keys;
-}
-#endif
-
 /* Compile the -P style PATTERN, containing SIZE bytes that are
    followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
-#if ! PCRE2_EXTRA_ASCII_BSD
-  pcre_pattern_expand_backslash_d (&pattern, &size);
-#endif
-
   PCRE2_SIZE e;
   int ec;
   int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
@@ -241,7 +156,17 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
              _("-P supports only unibyte locales on this platform"));
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= (PCRE2_UTF | PCRE2_UCP);
+
+      flags |= PCRE2_UTF;
+
+      /* Use PCRE2_UCP only if PCRE2_EXTRA_ASCII_BSD is available,
+         since with PCRE2_UCP alone \d would have the undesirable effect
+         of matching non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42
+         and earlier), escapes like \w have only their ASCII
+         interpretations, but that's better than the confusion that would
+         ensue if \d matched non-ASCII digits.  */
+      flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
+
 #if 0
       /* Do not match individual code units but only UTF-8.  */
       flags |= PCRE2_NEVER_BACKSLASH_C;
-- 
cgit v1.2.1