diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2016-11-19 22:48:37 -0800 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2016-11-19 23:53:39 -0800 |
commit | cfdb4ba4249ac5860979d11436c6a925a298688e (patch) | |
tree | a7b1c19ea27aa1b609be41078125bb1cacfa2bac | |
parent | da94c91a81fc63275371d0580d8688b6abd85346 (diff) | |
download | grep-cfdb4ba4249ac5860979d11436c6a925a298688e.tar.gz |
grep: -P no longer uses PCRE_MULTILINE
This reverts commit f6603c4e1e04dbb87a7232c4b44acc6afdf65fef,
as the extra performance is not worth the trouble for PCRE users.
Problem reported by Stephane Chazelas in:
http://bugs.gnu.org/22655#103
* NEWS: Document this and the next patch.
* src/dfasearch.c (EGexecute):
* src/grep.c (execute_fp_t):
* src/kwsearch.c (Fexecute):
* src/pcresearch.c (Pexecute):
First arg is now a const pointer again.
* src/grep.c (buf_has_encoding_errors): Now static.
* src/grep.h (buf_has_encoding_errors): Remove decl.
* src/search.h: Adjust decls.
* src/pcresearch.c (reflags): Remove. All uses removed.
(Pcompile, Pexecute): Do not use PCRE_MULTILINE.
-rw-r--r-- | NEWS | 8 |
-rw-r--r-- | src/dfasearch.c | 2 |
-rw-r--r-- | src/grep.c | 4 |
-rw-r--r-- | src/grep.h | 1 |
-rw-r--r-- | src/kwsearch.c | 2 |
-rw-r--r-- | src/pcresearch.c | 101 |
-rw-r--r-- | src/search.h | 6 |
7 files changed, 22 insertions, 102 deletions
@@ -10,9 +10,11 @@ GNU grep NEWS -*- outline -*- >/dev/null" where PROGRAM dies when writing into a broken pipe. [bug introduced in grep-2.26] - grep -Pz no longer rejects patterns containing ^ and $, is more - cautious about special patterns like (?-m) and (*FAIL), and works - when combined with -x. [bug introduced in grep-2.23] + grep -P no longer attempts multiline matches. This works more + intuitively with unusual patterns, and means that grep -Pz no longer + rejects patterns containing ^ and $ and works when combined with -x. + [bugs introduced in grep-2.23] A downside is that grep -P is now + significantly slower, albeit typically still faster than pcregrep. grep -m0 -L PAT FILE now outputs "FILE". [bug introduced in grep-2.5] diff --git a/src/dfasearch.c b/src/dfasearch.c index d41b6fd1..ded99173 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -216,7 +216,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) } size_t -EGexecute (char *buf, size_t size, size_t *match_size, +EGexecute (char const *buf, size_t size, size_t *match_size, char const *start_ptr) { char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start; @@ -589,7 +589,7 @@ static bool seek_data_failed; /* Functions we'll use to search. */ typedef void (*compile_fp_t) (char const *, size_t); -typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *); +typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *); static compile_fp_t compile; static execute_fp_t execute; @@ -696,7 +696,7 @@ skip_easy_bytes (char const *buf) /* Return true if BUF, of size SIZE, has an encoding error. BUF must be followed by at least sizeof (uword) bytes, the first of which may be modified. */ -bool +static bool buf_has_encoding_errors (char *buf, size_t size) { if (! 
unibyte_mask) @@ -29,7 +29,6 @@ extern bool match_words; /* -w */ extern bool match_lines; /* -x */ extern char eolbyte; /* -z */ -extern bool buf_has_encoding_errors (char *, size_t); extern char const *pattern_file_name (size_t, size_t *); #endif diff --git a/src/kwsearch.c b/src/kwsearch.c index 29d140cd..c3e69b39 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size) } size_t -Fexecute (char *buf, size_t size, size_t *match_size, +Fexecute (char const *buf, size_t size, size_t *match_size, char const *start_ptr) { char const *beg, *try, *end, *mb_start; diff --git a/src/pcresearch.c b/src/pcresearch.c index 01616c2c..1948acfd 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -32,9 +32,6 @@ enum { NSUB = 300 }; /* Compiled internal form of a Perl regular expression. */ static pcre *cre; -/* PCRE options used to compile the pattern. */ -static int reflags; - /* Additional information about the pattern. */ static pcre_extra *extra; @@ -107,15 +104,13 @@ Pcompile (char const *pattern, size_t size) int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1, sizeof xprefix - 1 + sizeof xsuffix - 1); char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4); - int flags = (PCRE_MULTILINE - | (match_icase ? PCRE_CASELESS : 0)); + int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0); char const *patlim = pattern + size; char *n = re; char const *p; char const *pnul; - bool multibyte_locale = 1 < MB_CUR_MAX; - if (multibyte_locale) + if (1 < MB_CUR_MAX) { if (! localeinfo.using_utf8) die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); @@ -126,32 +121,6 @@ Pcompile (char const *pattern, size_t size) if (memchr (pattern, '\n', size)) die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); - if (! eolbyte) - { - bool line_at_a_time = match_lines; - if (! 
line_at_a_time) - { - bool escaped = false; - bool after_unescaped_left_bracket = false; - for (p = pattern; *p; p++) - if (escaped) - escaped = after_unescaped_left_bracket = false; - else - { - if (*p == '$' || (*p == '^' && !after_unescaped_left_bracket) - || (*p == '(' && (p[1] == '?' || p[1] == '*'))) - { - line_at_a_time = true; - break; - } - escaped = *p == '\\'; - after_unescaped_left_bracket = *p == '['; - } - } - if (line_at_a_time) - flags = (flags & ~ PCRE_MULTILINE) | PCRE_DOLLAR_ENDONLY; - } - *n = '\0'; if (match_words) strcpy (n, wprefix); @@ -182,7 +151,6 @@ Pcompile (char const *pattern, size_t size) if (match_lines) strcpy (n, xsuffix); - reflags = flags; cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); if (!cre) die (EXIT_TROUBLE, 0, "%s", ep); @@ -210,7 +178,7 @@ Pcompile (char const *pattern, size_t size) } size_t -Pexecute (char *buf, size_t size, size_t *match_size, +Pexecute (char const *buf, size_t size, size_t *match_size, char const *start_ptr) { #if !HAVE_LIBPCRE @@ -229,38 +197,14 @@ Pexecute (char *buf, size_t size, size_t *match_size, error. */ char const *subject = buf; - /* If the pattern has no problematic operators and the input is - unibyte or is free of encoding errors, a multiline search is - typically more efficient. Otherwise, a single-line search is - either less confusing because the problematic operators are - interpreted more naturally, or it is typically faster because - pcre_exec doesn't waste time validating the entire input - buffer. */ - bool multiline = (reflags & PCRE_MULTILINE) != 0; - if (multiline && (reflags & PCRE_UTF8) != 0) - { - multiline = ! buf_has_encoding_errors (buf, size - 1); - buf[size - 1] = eolbyte; - } - for (; p < buf + size; p = line_start = line_end + 1) { - bool too_big; - - if (multiline) - { - size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1); - size_t scan_size = MIN (pcre_size_max + 1, buf + size - p); - line_end = memrchr (p, eolbyte, scan_size); - too_big = ! 
line_end; - } - else - { - line_end = memchr (p, eolbyte, buf + size - p); - too_big = INT_MAX < line_end - p; - } - - if (too_big) + /* Use a single_line search. Although this code formerly used + PCRE_MULTILINE for performance, the performance wasn't always + better and the correctness issues were too puzzling. See + Bug#22655. */ + line_end = memchr (p, eolbyte, buf + size - p); + if (INT_MAX < line_end - p) die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); for (;;) @@ -289,27 +233,11 @@ Pexecute (char *buf, size_t size, size_t *match_size, int options = 0; if (!bol) options |= PCRE_NOTBOL; - if (multiline) - options |= PCRE_NO_UTF8_CHECK; e = jit_exec (subject, line_end - subject, search_offset, options, sub); if (e != PCRE_ERROR_BADUTF8) - { - if (0 < e && multiline && sub[1] - sub[0] != 0) - { - char const *nl = memchr (subject + sub[0], eolbyte, - sub[1] - sub[0]); - if (nl) - { - /* This match crosses a line boundary; reject it. */ - p = subject + sub[0]; - line_end = nl; - continue; - } - } - break; - } + break; int valid_bytes = sub[0]; if (search_offset <= valid_bytes) @@ -382,15 +310,6 @@ Pexecute (char *buf, size_t size, size_t *match_size, beg = matchbeg; end = matchend; } - else if (multiline) - { - char const *prev_nl = memrchr (line_start - 1, eolbyte, - matchbeg - (line_start - 1)); - char const *next_nl = memchr (matchend, eolbyte, - line_end + 1 - matchend); - beg = prev_nl + 1; - end = next_nl + 1; - } else { beg = line_start; diff --git a/src/search.h b/src/search.h index b6c1945e..4957a639 100644 --- a/src/search.h +++ b/src/search.h @@ -54,15 +54,15 @@ extern wint_t mb_next_wc (char const *, char const *); /* dfasearch.c */ extern struct localeinfo localeinfo; extern void GEAcompile (char const *, size_t, reg_syntax_t); -extern size_t EGexecute (char *, size_t, size_t *, char const *); +extern size_t EGexecute (char const *, size_t, size_t *, char const *); /* kwsearch.c */ extern void Fcompile (char const *, size_t); -extern 
size_t Fexecute (char *, size_t, size_t *, char const *); +extern size_t Fexecute (char const *, size_t, size_t *, char const *); /* pcresearch.c */ extern void Pcompile (char const *, size_t); -extern size_t Pexecute (char *, size_t, size_t *, char const *); +extern size_t Pexecute (char const *, size_t, size_t *, char const *); /* Return the number of bytes in the character at the start of S, which is of size N. N must be positive. MBS is the conversion state. |