diff options
-rw-r--r-- | NEWS | 8
-rw-r--r-- | src/pcresearch.c | 97
-rwxr-xr-x | tests/pcre-ascii-digits | 19
-rwxr-xr-x | tests/pcre-utf8-w | 2
4 files changed, 27 insertions, 99 deletions
@@ -2,6 +2,14 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + With -P, patterns like [\d] now work again. The fix relies on PCRE2 + support for the PCRE2_EXTRA_ASCII_BSD flag planned for PCRE2 10.43. + With PCRE2 version 10.42 or earlier, behavior reverts to that of + grep 3.8, in that patterns like \w and \b use ASCII rather than + Unicode interpretations. + * Noteworthy changes in release 3.10 (2023-03-22) [stable] diff --git a/src/pcresearch.c b/src/pcresearch.c index 34b2aeb9..e77509c4 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -133,97 +133,12 @@ bad_utf8_from_pcre2 (int e) #endif } -#if ! PCRE2_EXTRA_ASCII_BSD -/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII - digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise - match non-ASCII digits in some locales. Use \p{Nd} if you require to match - those. Similarly, replace each \D with [^0-9]. - FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42 - and prior. 
*/ -static void -pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p) -{ - idx_t len = *len_p; - char *keys = *keys_p; - mbstate_t mb_state = { 0 }; - char *new_keys = xnmalloc (len / 2 + 1, 5); - char *p = new_keys; - bool prev_backslash = false; - - for (ptrdiff_t n; len; keys += n, len -= n) - { - n = mb_clen (keys, len, &mb_state); - switch (n) - { - case -2: - n = len; - FALLTHROUGH; - default: - if (prev_backslash) - { - prev_backslash = false; - *p++ = '\\'; - } - p = mempcpy (p, keys, n); - break; - - case -1: - if (prev_backslash) - { - prev_backslash = false; - *p++ = '\\'; - } - memset (&mb_state, 0, sizeof mb_state); - n = 1; - FALLTHROUGH; - case 1: - if (prev_backslash) - { - prev_backslash = false; - switch (*keys) - { - case 'd': - p = mempcpy (p, "[0-9]", 5); - break; - case 'D': - p = mempcpy (p, "[^0-9]", 6); - break; - default: - *p++ = '\\'; - *p++ = *keys; - break; - } - } - else - { - if (*keys == '\\') - prev_backslash = true; - else - *p++ = *keys; - } - break; - } - } - - if (prev_backslash) - *p++ = '\\'; - *p = '\n'; - free (*keys_p); - *keys_p = new_keys; - *len_p = p - new_keys; -} -#endif - /* Compile the -P style PATTERN, containing SIZE bytes that are followed by '\n'. Return a description of the compiled pattern. */ void * Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) { -#if ! PCRE2_EXTRA_ASCII_BSD - pcre_pattern_expand_backslash_d (&pattern, &size); -#endif - PCRE2_SIZE e; int ec; int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0); @@ -241,7 +156,17 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) _("-P supports only unibyte locales on this platform")); if (! localeinfo.using_utf8) die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); - flags |= (PCRE2_UTF | PCRE2_UCP); + + flags |= PCRE2_UTF; + + /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP + so that \d does not have the undesirable effect of matching + non-ASCII digits. 
Otherwise (i.e., with PCRE2 10.42 and earlier), + escapes like \w have only their ASCII interpretations, + but that's better than the confusion that would ensue if \d + matched non-ASCII digits. */ + flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0; + #if 0 /* Do not match individual code units but only UTF-8. */ flags |= PCRE2_NEVER_BACKSLASH_C; diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits index de9fe383..9dfc0fae 100755 --- a/tests/pcre-ascii-digits +++ b/tests/pcre-ascii-digits @@ -17,6 +17,8 @@ require_pcre_ echo . | grep -qP '(*UTF).' 2>/dev/null \ || skip_ 'PCRE unicode support is compiled out' +echo 0 | grep -qP '(?aD)\d' \ + || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD' fail=0 @@ -44,19 +46,10 @@ printf '\331\2404\n' > in2 || framework_failure_ returns_ 1 grep -P '\d\d' in2 > out || fail=1 compare /dev/null out || fail=1 -# The following tests work only when built with 10.43 or newer, -# with which, grep accepts the mode-setting '(?aD)': -if echo 0 | grep -qP '(?aD)\d'; then +grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1 +compare in2 out || fail=1 - grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1 - compare in2 out || fail=1 - - returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1 - compare /dev/null out || fail=1 - -else - warn_ 'skipped some tests: use PCRE2 10.43 or newer to enable' \ - 'support for e.g., (?aD) and (?-aD)' -fi +returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1 +compare /dev/null out || fail=1 Exit $fail diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w index a88ace4d..aa347840 100755 --- a/tests/pcre-utf8-w +++ b/tests/pcre-utf8-w @@ -16,6 +16,8 @@ require_pcre_ echo . | grep -qP '(*UTF).' 2>/dev/null \ || skip_ 'PCRE unicode support is compiled out' +echo 0 | grep -qP '(?aD)\d' \ + || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD' fail=0 |