summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2016-11-19 22:48:37 -0800
committerPaul Eggert <eggert@cs.ucla.edu>2016-11-19 23:53:39 -0800
commitcfdb4ba4249ac5860979d11436c6a925a298688e (patch)
treea7b1c19ea27aa1b609be41078125bb1cacfa2bac
parentda94c91a81fc63275371d0580d8688b6abd85346 (diff)
downloadgrep-cfdb4ba4249ac5860979d11436c6a925a298688e.tar.gz
grep: -P no longer uses PCRE_MULTILINE
This reverts commit f6603c4e1e04dbb87a7232c4b44acc6afdf65fef, as the extra performance is not worth the trouble for PCRE users. Problem reported by Stephane Chazelas in: http://bugs.gnu.org/22655#103 * NEWS: Document this and the next patch. * src/dfasearch.c (EGexecute): * src/grep.c (execute_fp_t): * src/kwsearch.c (Fexecute): * src/pcresearch.c (Pexecute): First arg is now a const pointer again. * src/grep.c (buf_has_encoding_errors): Now static. * src/grep.h (buf_has_encoding_errors): Remove decl. * src/search.h: Adjust decls. * src/pcresearch.c (reflags): Remove. All uses removed. (Pcompile, Pexecute): Do not use PCRE_MULTILINE.
-rw-r--r--NEWS8
-rw-r--r--src/dfasearch.c2
-rw-r--r--src/grep.c4
-rw-r--r--src/grep.h1
-rw-r--r--src/kwsearch.c2
-rw-r--r--src/pcresearch.c101
-rw-r--r--src/search.h6
7 files changed, 22 insertions, 102 deletions
diff --git a/NEWS b/NEWS
index 4972c018..6138b487 100644
--- a/NEWS
+++ b/NEWS
@@ -10,9 +10,11 @@ GNU grep NEWS -*- outline -*-
>/dev/null" where PROGRAM dies when writing into a broken pipe.
[bug introduced in grep-2.26]
- grep -Pz no longer rejects patterns containing ^ and $, is more
- cautious about special patterns like (?-m) and (*FAIL), and works
- when combined with -x. [bug introduced in grep-2.23]
+ grep -P no longer attempts multiline matches. This works more
+ intuitively with unusual patterns, and means that grep -Pz no longer
+ rejects patterns containing ^ and $ and works when combined with -x.
+ [bugs introduced in grep-2.23] A downside is that grep -P is now
+ significantly slower, albeit typically still faster than pcregrep.
grep -m0 -L PAT FILE now outputs "FILE". [bug introduced in grep-2.5]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index d41b6fd1..ded99173 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -216,7 +216,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
}
size_t
-EGexecute (char *buf, size_t size, size_t *match_size,
+EGexecute (char const *buf, size_t size, size_t *match_size,
char const *start_ptr)
{
char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
diff --git a/src/grep.c b/src/grep.c
index f120bcfd..4538f225 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -589,7 +589,7 @@ static bool seek_data_failed;
/* Functions we'll use to search. */
typedef void (*compile_fp_t) (char const *, size_t);
-typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *);
+typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *);
static compile_fp_t compile;
static execute_fp_t execute;
@@ -696,7 +696,7 @@ skip_easy_bytes (char const *buf)
/* Return true if BUF, of size SIZE, has an encoding error.
BUF must be followed by at least sizeof (uword) bytes,
the first of which may be modified. */
-bool
+static bool
buf_has_encoding_errors (char *buf, size_t size)
{
if (! unibyte_mask)
diff --git a/src/grep.h b/src/grep.h
index b45992f9..d10145e2 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -29,7 +29,6 @@ extern bool match_words; /* -w */
extern bool match_lines; /* -x */
extern char eolbyte; /* -z */
-extern bool buf_has_encoding_errors (char *, size_t);
extern char const *pattern_file_name (size_t, size_t *);
#endif
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 29d140cd..c3e69b39 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size)
}
size_t
-Fexecute (char *buf, size_t size, size_t *match_size,
+Fexecute (char const *buf, size_t size, size_t *match_size,
char const *start_ptr)
{
char const *beg, *try, *end, *mb_start;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 01616c2c..1948acfd 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -32,9 +32,6 @@ enum { NSUB = 300 };
/* Compiled internal form of a Perl regular expression. */
static pcre *cre;
-/* PCRE options used to compile the pattern. */
-static int reflags;
-
/* Additional information about the pattern. */
static pcre_extra *extra;
@@ -107,15 +104,13 @@ Pcompile (char const *pattern, size_t size)
int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
sizeof xprefix - 1 + sizeof xsuffix - 1);
char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
- int flags = (PCRE_MULTILINE
- | (match_icase ? PCRE_CASELESS : 0));
+ int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
char const *patlim = pattern + size;
char *n = re;
char const *p;
char const *pnul;
- bool multibyte_locale = 1 < MB_CUR_MAX;
- if (multibyte_locale)
+ if (1 < MB_CUR_MAX)
{
if (! localeinfo.using_utf8)
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
@@ -126,32 +121,6 @@ Pcompile (char const *pattern, size_t size)
if (memchr (pattern, '\n', size))
die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
- if (! eolbyte)
- {
- bool line_at_a_time = match_lines;
- if (! line_at_a_time)
- {
- bool escaped = false;
- bool after_unescaped_left_bracket = false;
- for (p = pattern; *p; p++)
- if (escaped)
- escaped = after_unescaped_left_bracket = false;
- else
- {
- if (*p == '$' || (*p == '^' && !after_unescaped_left_bracket)
- || (*p == '(' && (p[1] == '?' || p[1] == '*')))
- {
- line_at_a_time = true;
- break;
- }
- escaped = *p == '\\';
- after_unescaped_left_bracket = *p == '[';
- }
- }
- if (line_at_a_time)
- flags = (flags & ~ PCRE_MULTILINE) | PCRE_DOLLAR_ENDONLY;
- }
-
*n = '\0';
if (match_words)
strcpy (n, wprefix);
@@ -182,7 +151,6 @@ Pcompile (char const *pattern, size_t size)
if (match_lines)
strcpy (n, xsuffix);
- reflags = flags;
cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
if (!cre)
die (EXIT_TROUBLE, 0, "%s", ep);
@@ -210,7 +178,7 @@ Pcompile (char const *pattern, size_t size)
}
size_t
-Pexecute (char *buf, size_t size, size_t *match_size,
+Pexecute (char const *buf, size_t size, size_t *match_size,
char const *start_ptr)
{
#if !HAVE_LIBPCRE
@@ -229,38 +197,14 @@ Pexecute (char *buf, size_t size, size_t *match_size,
error. */
char const *subject = buf;
- /* If the pattern has no problematic operators and the input is
- unibyte or is free of encoding errors, a multiline search is
- typically more efficient. Otherwise, a single-line search is
- either less confusing because the problematic operators are
- interpreted more naturally, or it is typically faster because
- pcre_exec doesn't waste time validating the entire input
- buffer. */
- bool multiline = (reflags & PCRE_MULTILINE) != 0;
- if (multiline && (reflags & PCRE_UTF8) != 0)
- {
- multiline = ! buf_has_encoding_errors (buf, size - 1);
- buf[size - 1] = eolbyte;
- }
-
for (; p < buf + size; p = line_start = line_end + 1)
{
- bool too_big;
-
- if (multiline)
- {
- size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
- size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
- line_end = memrchr (p, eolbyte, scan_size);
- too_big = ! line_end;
- }
- else
- {
- line_end = memchr (p, eolbyte, buf + size - p);
- too_big = INT_MAX < line_end - p;
- }
-
- if (too_big)
+ /* Use a single_line search. Although this code formerly used
+ PCRE_MULTILINE for performance, the performance wasn't always
+ better and the correctness issues were too puzzling. See
+ Bug#22655. */
+ line_end = memchr (p, eolbyte, buf + size - p);
+ if (INT_MAX < line_end - p)
die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
for (;;)
@@ -289,27 +233,11 @@ Pexecute (char *buf, size_t size, size_t *match_size,
int options = 0;
if (!bol)
options |= PCRE_NOTBOL;
- if (multiline)
- options |= PCRE_NO_UTF8_CHECK;
e = jit_exec (subject, line_end - subject, search_offset,
options, sub);
if (e != PCRE_ERROR_BADUTF8)
- {
- if (0 < e && multiline && sub[1] - sub[0] != 0)
- {
- char const *nl = memchr (subject + sub[0], eolbyte,
- sub[1] - sub[0]);
- if (nl)
- {
- /* This match crosses a line boundary; reject it. */
- p = subject + sub[0];
- line_end = nl;
- continue;
- }
- }
- break;
- }
+ break;
int valid_bytes = sub[0];
if (search_offset <= valid_bytes)
@@ -382,15 +310,6 @@ Pexecute (char *buf, size_t size, size_t *match_size,
beg = matchbeg;
end = matchend;
}
- else if (multiline)
- {
- char const *prev_nl = memrchr (line_start - 1, eolbyte,
- matchbeg - (line_start - 1));
- char const *next_nl = memchr (matchend, eolbyte,
- line_end + 1 - matchend);
- beg = prev_nl + 1;
- end = next_nl + 1;
- }
else
{
beg = line_start;
diff --git a/src/search.h b/src/search.h
index b6c1945e..4957a639 100644
--- a/src/search.h
+++ b/src/search.h
@@ -54,15 +54,15 @@ extern wint_t mb_next_wc (char const *, char const *);
/* dfasearch.c */
extern struct localeinfo localeinfo;
extern void GEAcompile (char const *, size_t, reg_syntax_t);
-extern size_t EGexecute (char *, size_t, size_t *, char const *);
+extern size_t EGexecute (char const *, size_t, size_t *, char const *);
/* kwsearch.c */
extern void Fcompile (char const *, size_t);
-extern size_t Fexecute (char *, size_t, size_t *, char const *);
+extern size_t Fexecute (char const *, size_t, size_t *, char const *);
/* pcresearch.c */
extern void Pcompile (char const *, size_t);
-extern size_t Pexecute (char *, size_t, size_t *, char const *);
+extern size_t Pexecute (char const *, size_t, size_t *, char const *);
/* Return the number of bytes in the character at the start of S, which
is of size N. N must be positive. MBS is the conversion state.