diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2014-09-16 15:48:44 -0700 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2014-09-17 20:37:48 -0700 |
commit | 9fa500407137f49f6edc3c6b4ee6c7096f0190c5 (patch) | |
tree | 7f07cc141ebe3cac021ca8ab90c77f2d02afc7ea /src/pcresearch.c | |
parent | 3688f2f9cc0868dd7b828d11c8ba030400a5b2c3 (diff) | |
download | grep-9fa500407137f49f6edc3c6b4ee6c7096f0190c5.tar.gz |
grep: improve -P performance in typical cases
* src/grep.c, src/grep.h (enum textbin): Move to grep.h.
(input_textbin, validated_boundary): New vars.
* src/grep.c (grepbuf, grep): Initialize them.
* src/pcresearch.c (Pexecute): Do a multiline search
when the input is known to be free of encoding errors.
Quickly discard bytes that are obviously encoding errors.
Quickly match empty strings.
Diffstat (limited to 'src/pcresearch.c')
-rw-r--r-- | src/pcresearch.c | 120 |
1 file changed, 104 insertions, 16 deletions
diff --git a/src/pcresearch.c b/src/pcresearch.c index 1b15e53b..6f016b68 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -156,28 +156,91 @@ Pexecute (char const *buf, size_t size, size_t *match_size, char const *line_start = buf; int e = PCRE_ERROR_NOMATCH; char const *line_end; + char const *validated = validated_boundary; + + /* If the input type is unknown, the caller is still testing the + input, which means the current buffer cannot contain encoding + errors and a multiline search is typically more efficient. + Otherwise, a single-line search is typically faster, so that + pcre_exec doesn't waste time validating the entire input + buffer. */ + bool multiline = input_textbin == TEXTBIN_UNKNOWN; - /* pcre_exec mishandles matches that cross line boundaries. - PCRE_MULTILINE isn't a win, partly because it's incompatible with - -z, and partly because it checks the entire input buffer and is - therefore slow on a large buffer containing many matches. - Avoid these problems by matching line-by-line. */ for (; p < buf + size; p = line_start = line_end + 1) { - line_end = memchr (p, eolbyte, buf + size - p); + bool too_big; - if (INT_MAX < line_end - p) + if (multiline) + { + size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1); + size_t scan_size = MIN (pcre_size_max + 1, buf + size - p); + line_end = memrchr (p, eolbyte, scan_size); + too_big = ! line_end; + } + else + { + line_end = memchr (p, eolbyte, buf + size - p); + too_big = INT_MAX < line_end - p; + } + + if (too_big) error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); - /* Treat encoding-error bytes as data that cannot match. */ for (;;) { - int options = bol ? 0 : PCRE_NOTBOL; - int valid_bytes; - e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, NSUB); - if (e != PCRE_ERROR_BADUTF8) - break; - valid_bytes = sub[0]; + /* Skip past bytes that are easily determined to be encoding + errors, treating them as data that cannot match. 
This is + faster than having pcre_exec check them. */ + while (mbclen_cache[to_uchar (*p)] == (size_t) -1) + { + p++; + bol = false; + } + + /* Check for an empty match; this is faster than letting + pcre_exec do it. */ + int search_bytes = line_end - p; + if (search_bytes == 0) + { + sub[0] = sub[1] = 0; + e = empty_match[bol]; + break; + } + + int options = 0; + if (!bol) + options |= PCRE_NOTBOL; + if (multiline || p + search_bytes <= validated) + options |= PCRE_NO_UTF8_CHECK; + + int valid_bytes = validated - p; + if (valid_bytes < 0) + { + e = pcre_exec (cre, extra, p, search_bytes, 0, + options, sub, NSUB); + if (e != PCRE_ERROR_BADUTF8) + { + validated = p + search_bytes; + if (0 < e && multiline && sub[1] - sub[0] != 0) + { + char const *nl = memchr (p + sub[0], eolbyte, + sub[1] - sub[0]); + if (nl) + { + /* This match crosses a line boundary; reject it. */ + p += sub[0]; + line_end = nl; + continue; + } + } + break; + } + valid_bytes = sub[0]; + validated = p + valid_bytes; + } + + /* Try to match the string before the encoding error. + Again, handle the empty-match case specially, for speed. */ if (valid_bytes == 0) { sub[1] = 0; @@ -189,6 +252,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size, sub, NSUB); if (e != PCRE_ERROR_NOMATCH) break; + + /* Treat the encoding error as data that cannot match. */ p += valid_bytes + 1; bol = false; } @@ -198,6 +263,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size, bol = true; } + validated_boundary = validated; + if (e <= 0) { switch (e) @@ -224,8 +291,29 @@ Pexecute (char const *buf, size_t size, size_t *match_size, } else { - char const *beg = start_ptr ? p + sub[0] : line_start; - char const *end = start_ptr ? 
p + sub[1] : line_end + 1; + char const *matchbeg = p + sub[0]; + char const *matchend = p + sub[1]; + char const *beg; + char const *end; + if (start_ptr) + { + beg = matchbeg; + end = matchend; + } + else if (multiline) + { + char const *prev_nl = memrchr (line_start - 1, eolbyte, + matchbeg - (line_start - 1)); + char const *next_nl = memchr (matchend, eolbyte, + line_end + 1 - matchend); + beg = prev_nl + 1; + end = next_nl + 1; + } + else + { + beg = line_start; + end = line_end + 1; + } *match_size = end - beg; return beg - buf; } |