diff options
-rw-r--r-- | NEWS | 6 | ||||
-rw-r--r-- | THANKS.in | 2 | ||||
-rw-r--r-- | src/pcresearch.c | 2 | ||||
-rw-r--r-- | tests/Makefile.am | 1 | ||||
-rwxr-xr-x | tests/pcre-utf8-w | 28 |
5 files changed, 38 insertions, 1 deletion
@@ -4,6 +4,12 @@ GNU grep NEWS -*- outline -*- ** Bug fixes + With -P, some non-ASCII UTF-8 characters were not recognized as + word-constituent due to our omission of the PCRE2_UCP flag. E.g., + given f(){ echo Perú|LC_ALL=en_US.UTF-8 grep -Po "$1"; } and + this command, echo $(f 'r\w'):$(f '.\b'), before it would print ":r". + After the fix, it prints the correct results: "rú:ú". + When given multiple patterns the last of which has a back-reference, grep no longer sometimes mistakenly matches lines in some cases. [Bug#36148#13 introduced in grep 3.4] @@ -35,6 +35,7 @@ Gerald Stoller gerald_stoller@hotmail.com Grant McDorman grant@isgtec.com Greg Boyd gboyd.ccsf@gmail.com Greg Louis glouis@dynamicro.on.ca +Gro-Tsen https://twitter.com/gro_tsen Guglielmo 'bond' Bondioni g.bondioni@libero.it H. Merijn Brand h.m.brand@hccnet.nl Harald Hanche-Olsen hanche@math.ntnu.no @@ -50,6 +51,7 @@ Joel N. Weber II devnull@gnu.org John Hughes john@nitelite.calvacom.fr Jorge Stolfi stolfi@dcc.unicamp.br Karl Heuer kwzh@gnu.org +Karl Petterson karl.pettersson@klpn.se Kaveh R. Ghazi ghazi@caip.rutgers.edu Kazuro Furukawa furukawa@apricot.kek.jp Keith Bostic bostic@bsdi.com diff --git a/src/pcresearch.c b/src/pcresearch.c index a107f4d0..45b67eed 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -149,7 +149,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) { if (! localeinfo.using_utf8) die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); - flags |= PCRE2_UTF; + flags |= (PCRE2_UTF | PCRE2_UCP); #if 0 /* Do not match individual code units but only UTF-8. 
*/ flags |= PCRE2_NEVER_BACKSLASH_C; diff --git a/tests/Makefile.am b/tests/Makefile.am index e0b0503c..a47cf5c0 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -147,6 +147,7 @@ TESTS = \ pcre-jitstack \ pcre-o \ pcre-utf8 \ + pcre-utf8-w \ pcre-w \ pcre-wx-backref \ pcre-z \ diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w new file mode 100755 index 00000000..4cd7db69 --- /dev/null +++ b/tests/pcre-utf8-w @@ -0,0 +1,28 @@ +#!/bin/sh +# Ensure non-ASCII UTF-8 characters are correctly identified as word-constituent +# +# Copyright (C) 2023 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +require_en_utf8_locale_ +LC_ALL=en_US.UTF-8 +export LC_ALL +require_pcre_ + +fail=0 + +echo 'Perú'> in || framework_failure_ + +echo 'ú' > exp || framework_failure_ +grep -Po '.\b' in > out || fail=1 +compare exp out || fail=1 + +echo 'rú' > exp || framework_failure_ +grep -Po 'r\w' in > out || fail=1 +compare exp out || fail=1 + +Exit $fail |