summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS 6
-rw-r--r-- THANKS.in 2
-rw-r--r-- src/pcresearch.c 2
-rw-r--r-- tests/Makefile.am 1
-rwxr-xr-x tests/pcre-utf8-w 28
5 files changed, 38 insertions, 1 deletion
diff --git a/NEWS b/NEWS
index b4047082..24ee084f 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,12 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ With -P, some non-ASCII UTF-8 characters were not recognized as
+ word-constituent due to our omission of the PCRE2_UCP flag. E.g.,
+ given f(){ echo Perú|LC_ALL=en_US.UTF-8 grep -Po "$1"; } and
+ this command, echo $(f 'r\w'):$(f '.\b'), before it would print ":r".
+ After the fix, it prints the correct results: "rú:ú".
+
When given multiple patterns the last of which has a back-reference,
grep no longer sometimes mistakenly matches lines in some cases.
[Bug#36148#13 introduced in grep 3.4]
diff --git a/THANKS.in b/THANKS.in
index 9872bfaa..d0d6f926 100644
--- a/THANKS.in
+++ b/THANKS.in
@@ -35,6 +35,7 @@ Gerald Stoller gerald_stoller@hotmail.com
Grant McDorman grant@isgtec.com
Greg Boyd gboyd.ccsf@gmail.com
Greg Louis glouis@dynamicro.on.ca
+Gro-Tsen https://twitter.com/gro_tsen
Guglielmo 'bond' Bondioni g.bondioni@libero.it
H. Merijn Brand h.m.brand@hccnet.nl
Harald Hanche-Olsen hanche@math.ntnu.no
@@ -50,6 +51,7 @@ Joel N. Weber II devnull@gnu.org
John Hughes john@nitelite.calvacom.fr
Jorge Stolfi stolfi@dcc.unicamp.br
Karl Heuer kwzh@gnu.org
+Karl Pettersson karl.pettersson@klpn.se
Kaveh R. Ghazi ghazi@caip.rutgers.edu
Kazuro Furukawa furukawa@apricot.kek.jp
Keith Bostic bostic@bsdi.com
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a107f4d0..45b67eed 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -149,7 +149,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
{
if (! localeinfo.using_utf8)
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
- flags |= PCRE2_UTF;
+ flags |= (PCRE2_UTF | PCRE2_UCP);
#if 0
/* Do not match individual code units but only UTF-8. */
flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e0b0503c..a47cf5c0 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -147,6 +147,7 @@ TESTS = \
pcre-jitstack \
pcre-o \
pcre-utf8 \
+ pcre-utf8-w \
pcre-w \
pcre-wx-backref \
pcre-z \
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
new file mode 100755
index 00000000..4cd7db69
--- /dev/null
+++ b/tests/pcre-utf8-w
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Ensure non-ASCII UTF-8 characters are correctly identified as word-constituent
+#
+# Copyright (C) 2023 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+require_pcre_
+
+fail=0
+
+echo 'Perú'> in || framework_failure_
+
+echo 'ú' > exp || framework_failure_
+grep -Po '.\b' in > out || fail=1
+compare exp out || fail=1
+
+echo 'rú' > exp || framework_failure_
+grep -Po 'r\w' in > out || fail=1
+compare exp out || fail=1
+
+Exit $fail