summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarlo Marcelo Arenas Belón <carenas@gmail.com>2021-11-12 16:45:04 -0800
committerPaul Eggert <eggert@cs.ucla.edu>2021-11-14 12:13:28 -0800
commite0d39a9133e1507345d73ac5aff85f037f39aa54 (patch)
tree6180f608c8e7aeb689e8fba01da7d5848b0501a5
parentb07c82ccdb9fcb97d99a5871134b2227e7f08915 (diff)
downloadgrep-e0d39a9133e1507345d73ac5aff85f037f39aa54.tar.gz
grep: migrate to pcre2
Mostly a bug by bug translation of the original code to the PCRE2 API. Code still could do with some optimizations but should be good as a starting point. The API changes the sign of some types and therefore some ugly casts were needed, some of the changes are just to make sure all variables fit into the newer types better. Includes backward compatibility and could be made to build all the way to 10.00, but assumes a recent enough version and has been tested with 10.23 (from CentOS 7, the oldest). Performance seems equivalent, and it also seems functionally complete. * m4/pcre.m4 (gl_FUNC_PCRE): Check for PCRE2, not the original PCRE. * src/pcresearch.c (struct pcre_comp, jit_exec) (Pcompile, Pexecute): Use PCRE2, not the original PCRE. * tests/filename-lineno.pl: Adjust to match PCRE2 diagnostics.
-rw-r--r--doc/grep.in.18
-rw-r--r--doc/grep.texi2
-rw-r--r--m4/pcre.m421
-rw-r--r--src/pcresearch.c249
-rwxr-xr-xtests/filename-lineno.pl4
5 files changed, 138 insertions, 146 deletions
diff --git a/doc/grep.in.1 b/doc/grep.in.1
index b014f657..208cb76d 100644
--- a/doc/grep.in.1
+++ b/doc/grep.in.1
@@ -756,7 +756,7 @@ In other implementations, basic regular expressions are less powerful.
The following description applies to extended regular expressions;
differences for basic regular expressions are summarized afterwards.
Perl-compatible regular expressions give additional functionality, and are
-documented in B<pcresyntax>(3) and B<pcrepattern>(3), but work only if
+documented in B<pcre2syntax>(3) and B<pcre2pattern>(3), but work only if
PCRE support is enabled.
.PP
The fundamental building blocks are the regular expressions
@@ -1360,9 +1360,9 @@ from the globbing syntax that the shell uses to match file names.
.BR sort (1),
.BR xargs (1),
.BR read (2),
-.BR pcre (3),
-.BR pcresyntax (3),
-.BR pcrepattern (3),
+.BR pcre2 (3),
+.BR pcre2syntax (3),
+.BR pcre2pattern (3),
.BR terminfo (5),
.BR glob (7),
.BR regex (7)
diff --git a/doc/grep.texi b/doc/grep.texi
index e5b9fd8a..c3c4bbfa 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1168,7 +1168,7 @@ In other implementations, basic regular expressions are less powerful.
The following description applies to extended regular expressions;
differences for basic regular expressions are summarized afterwards.
Perl-compatible regular expressions give additional functionality, and
-are documented in the @i{pcresyntax}(3) and @i{pcrepattern}(3) manual
+are documented in the @i{pcre2syntax}(3) and @i{pcre2pattern}(3) manual
pages, but work only if PCRE is available in the system.
@menu
diff --git a/m4/pcre.m4 b/m4/pcre.m4
index 78b7fda7..a1c6c824 100644
--- a/m4/pcre.m4
+++ b/m4/pcre.m4
@@ -1,4 +1,4 @@
-# pcre.m4 - check for libpcre support
+# pcre.m4 - check for PCRE library support
# Copyright (C) 2010-2021 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
@@ -9,7 +9,7 @@ AC_DEFUN([gl_FUNC_PCRE],
[
AC_ARG_ENABLE([perl-regexp],
AS_HELP_STRING([--disable-perl-regexp],
- [disable perl-regexp (pcre) support]),
+ [disable perl-regexp (pcre2) support]),
[case $enableval in
yes|no) test_pcre=$enableval;;
*) AC_MSG_ERROR([invalid value $enableval for --disable-perl-regexp]);;
@@ -21,24 +21,25 @@ AC_DEFUN([gl_FUNC_PCRE],
use_pcre=no
if test $test_pcre != no; then
- PKG_CHECK_MODULES([PCRE], [libpcre], [], [: ${PCRE_LIBS=-lpcre}])
+ PKG_CHECK_MODULES([PCRE], [libpcre2-8], [], [: ${PCRE_LIBS=-lpcre2-8}])
- AC_CACHE_CHECK([for pcre_compile], [pcre_cv_have_pcre_compile],
+ AC_CACHE_CHECK([for pcre2_compile], [pcre_cv_have_pcre2_compile],
[pcre_saved_CFLAGS=$CFLAGS
pcre_saved_LIBS=$LIBS
CFLAGS="$CFLAGS $PCRE_CFLAGS"
LIBS="$PCRE_LIBS $LIBS"
AC_LINK_IFELSE(
- [AC_LANG_PROGRAM([[#include <pcre.h>
+ [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+ #include <pcre2.h>
]],
- [[pcre *p = pcre_compile (0, 0, 0, 0, 0);
+ [[pcre2_code *p = pcre2_compile (0, 0, 0, 0, 0, 0);
return !p;]])],
- [pcre_cv_have_pcre_compile=yes],
- [pcre_cv_have_pcre_compile=no])
+ [pcre_cv_have_pcre2_compile=yes],
+ [pcre_cv_have_pcre2_compile=no])
CFLAGS=$pcre_saved_CFLAGS
LIBS=$pcre_saved_LIBS])
- if test "$pcre_cv_have_pcre_compile" = yes; then
+ if test "$pcre_cv_have_pcre2_compile" = yes; then
use_pcre=yes
elif test $test_pcre = maybe; then
AC_MSG_WARN([AC_PACKAGE_NAME will be built without pcre support.])
@@ -50,7 +51,7 @@ AC_DEFUN([gl_FUNC_PCRE],
if test $use_pcre = yes; then
AC_DEFINE([HAVE_LIBPCRE], [1],
[Define to 1 if you have the Perl Compatible Regular Expressions
- library (-lpcre).])
+ library (-lpcre2).])
else
PCRE_CFLAGS=
PCRE_LIBS=
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 09f92c85..630678bf 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -17,41 +17,32 @@
02110-1301, USA. */
/* Written August 1992 by Mike Haertel. */
+/* Updated for PCRE2 by Carlo Arenas. */
#include <config.h>
#include "search.h"
#include "die.h"
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
-/* This must be at least 2; everything after that is for performance
- in pcre_exec. */
-enum { NSUB = 300 };
-
-#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION
-# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
-#endif
-#ifndef PCRE_STUDY_JIT_COMPILE
-# define PCRE_STUDY_JIT_COMPILE 0
-#endif
-#ifndef PCRE_STUDY_EXTRA_NEEDED
-# define PCRE_STUDY_EXTRA_NEEDED 0
+/* Needed for backward compatibility for PCRE2 < 10.30 */
+#ifndef PCRE2_CONFIG_DEPTHLIMIT
+#define PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_RECURSIONLIMIT
+#define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
+#define pcre2_set_depth_limit pcre2_set_recursion_limit
#endif
struct pcre_comp
{
- /* Compiled internal form of a Perl regular expression. */
- pcre *cre;
-
- /* Additional information about the pattern. */
- pcre_extra *extra;
-
-#if PCRE_STUDY_JIT_COMPILE
/* The JIT stack and its maximum size. */
- pcre_jit_stack *jit_stack;
- int jit_stack_size;
-#endif
+ pcre2_jit_stack *jit_stack;
+ PCRE2_SIZE jit_stack_size;
+ /* Compiled internal form of a Perl regular expression. */
+ pcre2_code *cre;
+ pcre2_match_context *mcontext;
+ pcre2_match_data *data;
/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
string matches when that flag is used. */
int empty_match[2];
@@ -60,54 +51,49 @@ struct pcre_comp
/* Match the already-compiled PCRE pattern against the data in SUBJECT,
of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
- options OPTIONS, and storing resulting matches into SUB. Return
- the (nonnegative) match location or a (negative) error number. */
+ options OPTIONS.
+ Return the (nonnegative) match count or a (negative) error number. */
static int
-jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
- int search_offset, int options, int *sub)
+jit_exec (struct pcre_comp *pc, char const *subject, PCRE2_SIZE search_bytes,
+ PCRE2_SIZE search_offset, int options)
{
while (true)
{
- int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes,
- search_offset, options, sub, NSUB);
-
-#if PCRE_STUDY_JIT_COMPILE
- /* Going over this would trigger an int overflow bug within PCRE. */
- int jitstack_max = INT_MAX - 8 * 1024;
-
- if (e == PCRE_ERROR_JIT_STACKLIMIT
- && 0 < pc->jit_stack_size && pc->jit_stack_size <= jitstack_max / 2)
+ int e = pcre2_match (pc->cre, (PCRE2_SPTR)subject, search_bytes,
+ search_offset, options, pc->data, pc->mcontext);
+ if (e == PCRE2_ERROR_JIT_STACKLIMIT
+ && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2)
{
- int old_size = pc->jit_stack_size;
- int new_size = pc->jit_stack_size = old_size * 2;
+ PCRE2_SIZE old_size = pc->jit_stack_size;
+ PCRE2_SIZE new_size = pc->jit_stack_size = old_size * 2;
+
if (pc->jit_stack)
- pcre_jit_stack_free (pc->jit_stack);
- pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size);
- if (!pc->jit_stack)
+ pcre2_jit_stack_free (pc->jit_stack);
+ pc->jit_stack = pcre2_jit_stack_create (old_size, new_size, NULL);
+
+ if (!pc->mcontext)
+ pc->mcontext = pcre2_match_context_create (NULL);
+
+ if (!pc->jit_stack || !pc->mcontext)
die (EXIT_TROUBLE, 0,
_("failed to allocate memory for the PCRE JIT stack"));
- pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack);
+ pcre2_jit_stack_assign (pc->mcontext, NULL, pc->jit_stack);
continue;
}
-#endif
-
-#if PCRE_EXTRA_MATCH_LIMIT_RECURSION
- if (e == PCRE_ERROR_RECURSIONLIMIT
- && (PCRE_STUDY_EXTRA_NEEDED || pc->extra))
+ if (e == PCRE2_ERROR_DEPTHLIMIT)
{
- unsigned long lim
- = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION
- ? pc->extra->match_limit_recursion
- : 0);
- if (lim <= ULONG_MAX / 2)
- {
- pc->extra->match_limit_recursion = lim ? 2 * lim : (1 << 24) - 1;
- pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
- continue;
- }
- }
-#endif
+ uint32_t lim;
+ pcre2_config (PCRE2_CONFIG_DEPTHLIMIT, &lim);
+ if (lim >= UINT32_MAX / 2)
+ return e;
+
+ lim <<= 1;
+ if (!pc->mcontext)
+ pc->mcontext = pcre2_match_context_create (NULL);
+ pcre2_set_depth_limit (pc->mcontext, lim);
+ continue;
+ }
return e;
}
}
@@ -118,27 +104,35 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
void *
Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
{
- int e;
- char const *ep;
+ PCRE2_SIZE e;
+ int ec;
+ PCRE2_UCHAR8 ep[128]; /* 120 code units is suggested to avoid truncation */
static char const wprefix[] = "(?<!\\w)(?:";
static char const wsuffix[] = ")(?!\\w)";
static char const xprefix[] = "^(?:";
static char const xsuffix[] = ")$";
int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
sizeof xprefix - 1 + sizeof xsuffix - 1);
- char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
- int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
+ unsigned char *re = xmalloc (size + fix_len_max + 1);
+ int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
char *patlim = pattern + size;
- char *n = re;
- char const *p;
- char const *pnul;
+ char *n = (char *)re;
struct pcre_comp *pc = xcalloc (1, sizeof (*pc));
+ pcre2_compile_context *ccontext = pcre2_compile_context_create(NULL);
if (localeinfo.multibyte)
{
if (! localeinfo.using_utf8)
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
- flags |= PCRE_UTF8;
+ flags |= PCRE2_UTF;
+#if 0
+ /* do not match individual code units but only UTF-8 */
+ flags |= PCRE2_NEVER_BACKSLASH_C;
+#endif
+#ifdef PCRE2_MATCH_INVALID_UTF
+ /* consider invalid UTF-8 as a barrier, instead of error */
+ flags |= PCRE2_MATCH_INVALID_UTF;
+#endif
}
/* FIXME: Remove this restriction. */
@@ -151,56 +145,42 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
if (match_lines)
strcpy (n, xprefix);
n += strlen (n);
-
- /* The PCRE interface doesn't allow NUL bytes in the pattern, so
- replace each NUL byte in the pattern with the four characters
- "\000", removing a preceding backslash if there are an odd
- number of backslashes before the NUL. */
- *patlim = '\0';
- for (p = pattern; (pnul = p + strlen (p)) < patlim; p = pnul + 1)
+ memcpy (n, pattern, size);
+ n += size;
+ if (match_words && !match_lines)
{
- memcpy (n, p, pnul - p);
- n += pnul - p;
- for (p = pnul; pattern < p && p[-1] == '\\'; p--)
- continue;
- n -= (pnul - p) & 1;
- strcpy (n, "\\000");
- n += 4;
- }
- memcpy (n, p, patlim - p + 1);
- n += patlim - p;
- *patlim = '\n';
-
- if (match_words)
strcpy (n, wsuffix);
+ n += strlen(wsuffix);
+ }
if (match_lines)
+ {
strcpy (n, xsuffix);
+ n += strlen(xsuffix);
+ }
- pc->cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+ pcre2_set_character_tables (ccontext, pcre2_maketables (NULL));
+ pc->cre = pcre2_compile (re, n - (char *)re, flags, &ec, &e, ccontext);
if (!pc->cre)
- die (EXIT_TROUBLE, 0, "%s", ep);
-
- int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE;
- pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep);
- if (ep)
- die (EXIT_TROUBLE, 0, "%s", ep);
+ {
+ pcre2_get_error_message (ec, ep, sizeof (ep));
+ die (EXIT_TROUBLE, 0, "%s", ep);
+ }
-#if PCRE_STUDY_JIT_COMPILE
- if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e))
- die (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
+ pc->data = pcre2_match_data_create_from_pattern (pc->cre, NULL);
- /* The PCRE documentation says that a 32 KiB stack is the default. */
- if (e)
- pc->jit_stack_size = 32 << 10;
-#endif
+ ec = pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
+ if (ec && ec != PCRE2_ERROR_JIT_BADOPTION && ec != PCRE2_ERROR_NOMEMORY)
+ die (EXIT_TROUBLE, 0, _("JIT internal error: %d"), ec);
+ else
+ {
+ /* The PCRE documentation says that a 32 KiB stack is the default. */
+ pc->jit_stack_size = 32 << 10;
+ }
free (re);
- int sub[NSUB];
- pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0,
- PCRE_NOTBOL, sub, NSUB);
- pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub,
- NSUB);
+ pc->empty_match[false] = jit_exec (pc, "", 0, 0, PCRE2_NOTBOL);
+ pc->empty_match[true] = jit_exec (pc, "", 0, 0, 0);
return pc;
}
@@ -209,15 +189,15 @@ ptrdiff_t
Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
char const *start_ptr)
{
- int sub[NSUB];
char const *p = start_ptr ? start_ptr : buf;
bool bol = p[-1] == eolbyte;
char const *line_start = buf;
- int e = PCRE_ERROR_NOMATCH;
+ int e = PCRE2_ERROR_NOMATCH;
char const *line_end;
struct pcre_comp *pc = vcp;
+ PCRE2_SIZE *sub = pcre2_get_ovector_pointer (pc->data);
- /* The search address to pass to pcre_exec. This is the start of
+ /* The search address to pass to PCRE. This is the start of
the buffer, or just past the most-recently discovered encoding
error or line end. */
char const *subject = buf;
@@ -229,14 +209,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
better and the correctness issues were too puzzling. See
Bug#22655. */
line_end = rawmemchr (p, eolbyte);
- if (INT_MAX < line_end - p)
+ if (PCRE2_SIZE_MAX < line_end - p)
die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
for (;;)
{
/* Skip past bytes that are easily determined to be encoding
errors, treating them as data that cannot match. This is
- faster than having pcre_exec check them. */
+ faster than having PCRE check them. */
while (localeinfo.sbclen[to_uchar (*p)] == -1)
{
p++;
@@ -244,10 +224,10 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
bol = false;
}
- int search_offset = p - subject;
+ PCRE2_SIZE search_offset = p - subject;
/* Check for an empty match; this is faster than letting
- pcre_exec do it. */
+ PCRE do it. */
if (p == line_end)
{
sub[0] = sub[1] = search_offset;
@@ -257,13 +237,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
int options = 0;
if (!bol)
- options |= PCRE_NOTBOL;
+ options |= PCRE2_NOTBOL;
- e = jit_exec (pc, subject, line_end - subject, search_offset,
- options, sub);
- if (e != PCRE_ERROR_BADUTF8)
+ e = jit_exec (pc, subject, line_end - subject,
+ search_offset, options);
+ /* PCRE2 provides 22 different error codes for bad UTF-8 */
+ if (! (PCRE2_ERROR_UTF8_ERR21 <= e && e < PCRE2_ERROR_UTF8_ERR1))
break;
- int valid_bytes = sub[0];
+ PCRE2_SIZE valid_bytes = pcre2_get_startchar (pc->data);
if (search_offset <= valid_bytes)
{
@@ -273,14 +254,15 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
/* Handle the empty-match case specially, for speed.
This optimization is valid if VALID_BYTES is zero,
which means SEARCH_OFFSET is also zero. */
+ sub[0] = valid_bytes;
sub[1] = 0;
e = pc->empty_match[bol];
}
else
e = jit_exec (pc, subject, valid_bytes, search_offset,
- options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
+ options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL);
- if (e != PCRE_ERROR_NOMATCH)
+ if (e != PCRE2_ERROR_NOMATCH)
break;
/* Treat the encoding error as data that cannot match. */
@@ -291,7 +273,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
subject += valid_bytes + 1;
}
- if (e != PCRE_ERROR_NOMATCH)
+ if (e != PCRE2_ERROR_NOMATCH)
break;
bol = true;
p = subject = line_start = line_end + 1;
@@ -302,26 +284,35 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
{
switch (e)
{
- case PCRE_ERROR_NOMATCH:
+ case PCRE2_ERROR_NOMATCH:
break;
- case PCRE_ERROR_NOMEMORY:
+ case PCRE2_ERROR_NOMEMORY:
die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
-#if PCRE_STUDY_JIT_COMPILE
- case PCRE_ERROR_JIT_STACKLIMIT:
+ case PCRE2_ERROR_JIT_STACKLIMIT:
die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
input_filename ());
-#endif
- case PCRE_ERROR_MATCHLIMIT:
+ case PCRE2_ERROR_MATCHLIMIT:
die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
input_filename ());
- case PCRE_ERROR_RECURSIONLIMIT:
- die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"),
+ case PCRE2_ERROR_DEPTHLIMIT:
+ die (EXIT_TROUBLE, 0,
+ _("%s: exceeded PCRE's nested backtracking limit"),
input_filename ());
+ case PCRE2_ERROR_RECURSELOOP:
+ die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
+ input_filename ());
+
+#ifdef PCRE2_ERROR_HEAPLIMIT
+ case PCRE2_ERROR_HEAPLIMIT:
+ die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
+ input_filename ());
+#endif
+
default:
/* For now, we lump all remaining PCRE failures into this basket.
If anyone cares to provide sample grep usage that can trigger
diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl
index 1e84b45e..1ff3d6af 100755
--- a/tests/filename-lineno.pl
+++ b/tests/filename-lineno.pl
@@ -101,13 +101,13 @@ my @Tests =
],
['invalid-re-P-paren', '-P ")"', {EXIT=>2},
{ERR => $ENV{PCRE_WORKS} == 1
- ? "$prog: unmatched parentheses\n"
+ ? "$prog: unmatched closing parenthesis\n"
: $no_pcre
},
],
['invalid-re-P-star-paren', '-P "a.*)"', {EXIT=>2},
{ERR => $ENV{PCRE_WORKS} == 1
- ? "$prog: unmatched parentheses\n"
+ ? "$prog: unmatched closing parenthesis\n"
: $no_pcre
},
],