5 files changed, 138 insertions, 146 deletions
diff --git a/doc/grep.in.1 b/doc/grep.in.1
index b014f657..208cb76d 100644
--- a/doc/grep.in.1
+++ b/doc/grep.in.1
@@ -756,7 +756,7 @@ In other implementations, basic regular expressions are less powerful.
 The following description applies to extended regular expressions;
 differences for basic regular expressions are summarized afterwards.
 Perl-compatible regular expressions give additional functionality, and are
-documented in B<pcresyntax>(3) and B<pcrepattern>(3), but work only if
+documented in B<pcre2syntax>(3) and B<pcre2pattern>(3), but work only if
 PCRE support is enabled.
 .PP
 The fundamental building blocks are the regular expressions
@@ -1360,9 +1360,9 @@ from the globbing syntax that the shell uses to match file names.
 .BR sort (1),
 .BR xargs (1),
 .BR read (2),
-.BR pcre (3),
-.BR pcresyntax (3),
-.BR pcrepattern (3),
+.BR pcre2 (3),
+.BR pcre2syntax (3),
+.BR pcre2pattern (3),
 .BR terminfo (5),
 .BR glob (7),
 .BR regex (7)
diff --git a/doc/grep.texi b/doc/grep.texi
index e5b9fd8a..c3c4bbfa 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1168,7 +1168,7 @@ In other implementations, basic regular expressions are less powerful.
 The following description applies to extended regular expressions;
 differences for basic regular expressions are summarized afterwards.
 Perl-compatible regular expressions give additional functionality, and
-are documented in the @i{pcresyntax}(3) and @i{pcrepattern}(3) manual
+are documented in the @i{pcre2syntax}(3) and @i{pcre2pattern}(3) manual
 pages, but work only if PCRE is available in the system.
 
 @menu
diff --git a/m4/pcre.m4 b/m4/pcre.m4
index 78b7fda7..a1c6c824 100644
--- a/m4/pcre.m4
+++ b/m4/pcre.m4
@@ -1,4 +1,4 @@
-# pcre.m4 - check for libpcre support
+# pcre.m4 - check for PCRE library support
 
 # Copyright (C) 2010-2021 Free Software Foundation, Inc.
 # This file is free software; the Free Software Foundation
@@ -9,7 +9,7 @@ AC_DEFUN([gl_FUNC_PCRE],
 [
   AC_ARG_ENABLE([perl-regexp],
     AS_HELP_STRING([--disable-perl-regexp],
-                   [disable perl-regexp (pcre) support]),
+                   [disable perl-regexp (pcre2) support]),
     [case $enableval in
        yes|no) test_pcre=$enableval;;
        *) AC_MSG_ERROR([invalid value $enableval for --disable-perl-regexp]);;
@@ -21,24 +21,25 @@ AC_DEFUN([gl_FUNC_PCRE],
   use_pcre=no
 
   if test $test_pcre != no; then
-    PKG_CHECK_MODULES([PCRE], [libpcre], [], [: ${PCRE_LIBS=-lpcre}])
+    PKG_CHECK_MODULES([PCRE], [libpcre2-8], [], [: ${PCRE_LIBS=-lpcre2-8}])
 
-    AC_CACHE_CHECK([for pcre_compile], [pcre_cv_have_pcre_compile],
+    AC_CACHE_CHECK([for pcre2_compile], [pcre_cv_have_pcre2_compile],
       [pcre_saved_CFLAGS=$CFLAGS
        pcre_saved_LIBS=$LIBS
        CFLAGS="$CFLAGS $PCRE_CFLAGS"
        LIBS="$PCRE_LIBS $LIBS"
        AC_LINK_IFELSE(
-         [AC_LANG_PROGRAM([[#include <pcre.h>
+         [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+                            #include <pcre2.h>
                           ]],
-            [[pcre *p = pcre_compile (0, 0, 0, 0, 0);
+            [[pcre2_code *p = pcre2_compile (0, 0, 0, 0, 0, 0);
               return !p;]])],
-         [pcre_cv_have_pcre_compile=yes],
-         [pcre_cv_have_pcre_compile=no])
+         [pcre_cv_have_pcre2_compile=yes],
+         [pcre_cv_have_pcre2_compile=no])
        CFLAGS=$pcre_saved_CFLAGS
        LIBS=$pcre_saved_LIBS])
 
-    if test "$pcre_cv_have_pcre_compile" = yes; then
+    if test "$pcre_cv_have_pcre2_compile" = yes; then
       use_pcre=yes
     elif test $test_pcre = maybe; then
       AC_MSG_WARN([AC_PACKAGE_NAME will be built without pcre support.])
@@ -50,7 +51,7 @@ AC_DEFUN([gl_FUNC_PCRE],
   if test $use_pcre = yes; then
     AC_DEFINE([HAVE_LIBPCRE], [1],
       [Define to 1 if you have the Perl Compatible Regular Expressions
-       library (-lpcre).])
+       library (-lpcre2-8).])
   else
     PCRE_CFLAGS=
     PCRE_LIBS=
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 09f92c85..630678bf 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -17,41 +17,32 @@
    02110-1301, USA.  */
 
 /* Written August 1992 by Mike Haertel. */
+/* Updated for PCRE2 by Carlo Arenas. */
 
 #include <config.h>
 #include "search.h"
 #include "die.h"
 
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 
-/* This must be at least 2; everything after that is for performance
-   in pcre_exec.  */
-enum { NSUB = 300 };
-
-#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION
-# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
-#endif
-#ifndef PCRE_STUDY_JIT_COMPILE
-# define PCRE_STUDY_JIT_COMPILE 0
-#endif
-#ifndef PCRE_STUDY_EXTRA_NEEDED
-# define PCRE_STUDY_EXTRA_NEEDED 0
+/* Needed for backward compatibility for PCRE2 < 10.30  */
+#ifndef PCRE2_CONFIG_DEPTHLIMIT
+#define PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_RECURSIONLIMIT
+#define PCRE2_ERROR_DEPTHLIMIT  PCRE2_ERROR_RECURSIONLIMIT
+#define pcre2_set_depth_limit   pcre2_set_recursion_limit
 #endif
 
 struct pcre_comp
 {
-  /* Compiled internal form of a Perl regular expression.  */
-  pcre *cre;
-
-  /* Additional information about the pattern.  */
-  pcre_extra *extra;
-
-#if PCRE_STUDY_JIT_COMPILE
   /* The JIT stack and its maximum size.  */
-  pcre_jit_stack *jit_stack;
-  int jit_stack_size;
-#endif
+  pcre2_jit_stack *jit_stack;
+  PCRE2_SIZE jit_stack_size;
 
+  /* Compiled internal form of a Perl regular expression.  */
+  pcre2_code *cre;
+  pcre2_match_context *mcontext;
+  pcre2_match_data *data;
   /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
      string matches when that flag is used.  */
   int empty_match[2];
@@ -60,54 +51,49 @@ struct pcre_comp
 
 /* Match the already-compiled PCRE pattern against the data in SUBJECT,
    of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
-   options OPTIONS, and storing resulting matches into SUB.  Return
-   the (nonnegative) match location or a (negative) error number.  */
+   options OPTIONS, retrying with a larger JIT stack or depth limit as needed.
+   Return the (nonnegative) match count or a (negative) error number.  */
 static int
-jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
-          int search_offset, int options, int *sub)
+jit_exec (struct pcre_comp *pc, char const *subject, PCRE2_SIZE search_bytes,
+          PCRE2_SIZE search_offset, int options)
 {
+  uint32_t lim = 0;  /* Depth limit last set by this call; 0 if none.  */
   while (true)
     {
-      int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes,
-                         search_offset, options, sub, NSUB);
-
-#if PCRE_STUDY_JIT_COMPILE
-      /* Going over this would trigger an int overflow bug within PCRE.  */
-      int jitstack_max = INT_MAX - 8 * 1024;
-
-      if (e == PCRE_ERROR_JIT_STACKLIMIT
-          && 0 < pc->jit_stack_size && pc->jit_stack_size <= jitstack_max / 2)
+      int e = pcre2_match (pc->cre, (PCRE2_SPTR)subject, search_bytes,
+                           search_offset, options, pc->data, pc->mcontext);
+      if (e == PCRE2_ERROR_JIT_STACKLIMIT
+          && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2)
         {
-          int old_size = pc->jit_stack_size;
-          int new_size = pc->jit_stack_size = old_size * 2;
+          PCRE2_SIZE old_size = pc->jit_stack_size;
+          PCRE2_SIZE new_size = pc->jit_stack_size = old_size * 2;
+
           if (pc->jit_stack)
-            pcre_jit_stack_free (pc->jit_stack);
-          pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size);
-          if (!pc->jit_stack)
+            pcre2_jit_stack_free (pc->jit_stack);
+          pc->jit_stack = pcre2_jit_stack_create (old_size, new_size, NULL);
+
+          if (!pc->mcontext)
+            pc->mcontext = pcre2_match_context_create (NULL);
+
+          if (!pc->jit_stack || !pc->mcontext)
             die (EXIT_TROUBLE, 0,
                  _("failed to allocate memory for the PCRE JIT stack"));
-          pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack);
+          pcre2_jit_stack_assign (pc->mcontext, NULL, pc->jit_stack);
           continue;
         }
-#endif
-
-#if PCRE_EXTRA_MATCH_LIMIT_RECURSION
-      if (e == PCRE_ERROR_RECURSIONLIMIT
-          && (PCRE_STUDY_EXTRA_NEEDED || pc->extra))
+      if (e == PCRE2_ERROR_DEPTHLIMIT)
         {
-          unsigned long lim
-            = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION
-               ? pc->extra->match_limit_recursion
-               : 0);
-          if (lim <= ULONG_MAX / 2)
-            {
-              pc->extra->match_limit_recursion = lim ? 2 * lim : (1 << 24) - 1;
-              pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
-              continue;
-            }
-        }
-#endif
+          if (!lim)
+            pcre2_config (PCRE2_CONFIG_DEPTHLIMIT, &lim);
+          if (lim >= UINT32_MAX / 2)
+            return e;
+          lim <<= 1;
+          if (!pc->mcontext)
+            pc->mcontext = pcre2_match_context_create (NULL);
 
+          pcre2_set_depth_limit (pc->mcontext, lim);
+          continue;
+        }
       return e;
     }
 }
@@ -118,27 +104,35 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
-  int e;
-  char const *ep;
+  PCRE2_SIZE e;
+  int ec;
+  PCRE2_UCHAR8 ep[128]; /* 120 code units is suggested to avoid truncation  */
   static char const wprefix[] = "(?<!\\w)(?:";
   static char const wsuffix[] = ")(?!\\w)";
   static char const xprefix[] = "^(?:";
   static char const xsuffix[] = ")$";
   int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
                          sizeof xprefix - 1 + sizeof xsuffix - 1);
-  char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
-  int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
+  unsigned char *re = xmalloc (size + fix_len_max + 1);
+  int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
   char *patlim = pattern + size;
-  char *n = re;
-  char const *p;
-  char const *pnul;
+  char *n = (char *)re;
   struct pcre_comp *pc = xcalloc (1, sizeof (*pc));
+  pcre2_compile_context *ccontext = pcre2_compile_context_create(NULL);
 
   if (localeinfo.multibyte)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= PCRE_UTF8;
+      flags |= PCRE2_UTF;
+#if 0
+      /* do not match individual code units but only UTF-8  */
+      flags |= PCRE2_NEVER_BACKSLASH_C;
+#endif
+#ifdef PCRE2_MATCH_INVALID_UTF
+      /* consider invalid UTF-8 as a barrier, instead of error  */
+      flags |= PCRE2_MATCH_INVALID_UTF;
+#endif
     }
 
   /* FIXME: Remove this restriction.  */
@@ -151,56 +145,42 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
   if (match_lines)
     strcpy (n, xprefix);
   n += strlen (n);
-
-  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
-     replace each NUL byte in the pattern with the four characters
-     "\000", removing a preceding backslash if there are an odd
-     number of backslashes before the NUL.  */
-  *patlim = '\0';
-  for (p = pattern; (pnul = p + strlen (p)) < patlim; p = pnul + 1)
+  memcpy (n, pattern, size);
+  n += size;
+  if (match_words && !match_lines)
     {
-      memcpy (n, p, pnul - p);
-      n += pnul - p;
-      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
-        continue;
-      n -= (pnul - p) & 1;
-      strcpy (n, "\\000");
-      n += 4;
-    }
-  memcpy (n, p, patlim - p + 1);
-  n += patlim - p;
-  *patlim = '\n';
-
-  if (match_words)
     strcpy (n, wsuffix);
+    n += strlen(wsuffix);
+    }
   if (match_lines)
+    {
     strcpy (n, xsuffix);
+    n += strlen(xsuffix);
+    }
 
-  pc->cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+  pcre2_set_character_tables (ccontext, pcre2_maketables (NULL));
+  pc->cre = pcre2_compile (re, n - (char *)re, flags, &ec, &e, ccontext);
   if (!pc->cre)
-    die (EXIT_TROUBLE, 0, "%s", ep);
-
-  int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE;
-  pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep);
-  if (ep)
-    die (EXIT_TROUBLE, 0, "%s", ep);
+    {
+      pcre2_get_error_message (ec, ep, sizeof (ep));
+      die (EXIT_TROUBLE, 0, "%s", ep);
+    }
 
-#if PCRE_STUDY_JIT_COMPILE
-  if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e))
-    die (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
+  pc->data = pcre2_match_data_create_from_pattern (pc->cre, NULL);
 
-  /* The PCRE documentation says that a 32 KiB stack is the default.  */
-  if (e)
-    pc->jit_stack_size = 32 << 10;
-#endif
+  ec = pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
+  if (ec && ec != PCRE2_ERROR_JIT_BADOPTION && ec != PCRE2_ERROR_NOMEMORY)
+    die (EXIT_TROUBLE, 0, _("JIT internal error: %d"), ec);
+  else
+    {
+      /* The PCRE documentation says that a 32 KiB stack is the default.  */
+      pc->jit_stack_size = 32 << 10;
+    }
 
   free (re);
 
-  int sub[NSUB];
-  pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0,
-                                      PCRE_NOTBOL, sub, NSUB);
-  pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub,
-                                     NSUB);
+  pc->empty_match[false] = jit_exec (pc, "", 0, 0, PCRE2_NOTBOL);
+  pc->empty_match[true] = jit_exec (pc, "", 0, 0, 0);
 
   return pc;
 }
@@ -209,15 +189,15 @@ ptrdiff_t
 Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
           char const *start_ptr)
 {
-  int sub[NSUB];
   char const *p = start_ptr ? start_ptr : buf;
   bool bol = p[-1] == eolbyte;
   char const *line_start = buf;
-  int e = PCRE_ERROR_NOMATCH;
+  int e = PCRE2_ERROR_NOMATCH;
   char const *line_end;
   struct pcre_comp *pc = vcp;
+  PCRE2_SIZE *sub = pcre2_get_ovector_pointer (pc->data);
 
-  /* The search address to pass to pcre_exec.  This is the start of
+  /* The search address to pass to PCRE.  This is the start of
      the buffer, or just past the most-recently discovered encoding
      error or line end.  */
   char const *subject = buf;
@@ -229,14 +209,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
          better and the correctness issues were too puzzling.  See
          Bug#22655.  */
       line_end = rawmemchr (p, eolbyte);
-      if (INT_MAX < line_end - p)
+      if (PCRE2_SIZE_MAX < line_end - p)
         die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       for (;;)
         {
           /* Skip past bytes that are easily determined to be encoding
              errors, treating them as data that cannot match.  This is
-             faster than having pcre_exec check them.  */
+             faster than having PCRE check them.  */
           while (localeinfo.sbclen[to_uchar (*p)] == -1)
             {
               p++;
@@ -244,10 +224,10 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
               bol = false;
             }
 
-          int search_offset = p - subject;
+          PCRE2_SIZE search_offset = p - subject;
 
           /* Check for an empty match; this is faster than letting
-             pcre_exec do it.  */
+             PCRE do it.  */
           if (p == line_end)
             {
               sub[0] = sub[1] = search_offset;
@@ -257,13 +237,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
 
           int options = 0;
           if (!bol)
-            options |= PCRE_NOTBOL;
+            options |= PCRE2_NOTBOL;
 
-          e = jit_exec (pc, subject, line_end - subject, search_offset,
-                        options, sub);
-          if (e != PCRE_ERROR_BADUTF8)
+          e = jit_exec (pc, subject, line_end - subject,
+                        search_offset, options);
+          /* PCRE2 reports bad UTF-8 as UTF8_ERR1 .. UTF8_ERR21, inclusive.  */
+          if (! (PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1))
             break;
-          int valid_bytes = sub[0];
+          PCRE2_SIZE valid_bytes = pcre2_get_startchar (pc->data);
 
           if (search_offset <= valid_bytes)
             {
@@ -273,14 +254,15 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
                   /* Handle the empty-match case specially, for speed.
                      This optimization is valid if VALID_BYTES is zero,
                      which means SEARCH_OFFSET is also zero.  */
+                  sub[0] = valid_bytes;
                   sub[1] = 0;
                   e = pc->empty_match[bol];
                 }
               else
                 e = jit_exec (pc, subject, valid_bytes, search_offset,
-                              options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
+                              options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL);
 
-              if (e != PCRE_ERROR_NOMATCH)
+              if (e != PCRE2_ERROR_NOMATCH)
                 break;
 
               /* Treat the encoding error as data that cannot match.  */
@@ -291,7 +273,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
           subject += valid_bytes + 1;
         }
 
-      if (e != PCRE_ERROR_NOMATCH)
+      if (e != PCRE2_ERROR_NOMATCH)
         break;
       bol = true;
       p = subject = line_start = line_end + 1;
@@ -302,26 +284,35 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
     {
       switch (e)
         {
-        case PCRE_ERROR_NOMATCH:
+        case PCRE2_ERROR_NOMATCH:
           break;
 
-        case PCRE_ERROR_NOMEMORY:
+        case PCRE2_ERROR_NOMEMORY:
           die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
 
-#if PCRE_STUDY_JIT_COMPILE
-        case PCRE_ERROR_JIT_STACKLIMIT:
+        case PCRE2_ERROR_JIT_STACKLIMIT:
           die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
                input_filename ());
-#endif
 
-        case PCRE_ERROR_MATCHLIMIT:
+        case PCRE2_ERROR_MATCHLIMIT:
           die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
                input_filename ());
 
-        case PCRE_ERROR_RECURSIONLIMIT:
-          die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"),
+        case PCRE2_ERROR_DEPTHLIMIT:
+          die (EXIT_TROUBLE, 0,
+               _("%s: exceeded PCRE's nested backtracking limit"),
                input_filename ());
 
+        case PCRE2_ERROR_RECURSELOOP:
+          die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
+               input_filename ());
+
+#ifdef PCRE2_ERROR_HEAPLIMIT
+        case PCRE2_ERROR_HEAPLIMIT:
+          die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
+               input_filename ());
+#endif
+
         default:
           /* For now, we lump all remaining PCRE failures into this basket.
              If anyone cares to provide sample grep usage that can trigger
diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl
index 1e84b45e..1ff3d6af 100755
--- a/tests/filename-lineno.pl
+++ b/tests/filename-lineno.pl
@@ -101,13 +101,13 @@ my @Tests =
    ],
    ['invalid-re-P-paren', '-P ")"', {EXIT=>2},
     {ERR => $ENV{PCRE_WORKS} == 1
-       ? "$prog: unmatched parentheses\n"
+       ? "$prog: unmatched closing parenthesis\n"
        : $no_pcre
     },
    ],
    ['invalid-re-P-star-paren', '-P "a.*)"', {EXIT=>2},
     {ERR => $ENV{PCRE_WORKS} == 1
-       ? "$prog: unmatched parentheses\n"
+       ? "$prog: unmatched closing parenthesis\n"
        : $no_pcre
     },
    ],