diff options
author | Norihiro Tanaka <noritnk@kcn.ne.jp> | 2019-12-22 16:39:09 -0800 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2019-12-22 16:40:08 -0800 |
commit | abb7f4f2325f26f930ff59b702fe42568a8e81e7 (patch) | |
tree | 734e40a3a595c4f491e102b5883fb8829cc2984a /src | |
parent | cf09252295c554dd3eba4cdb8eb53911fb495f40 (diff) | |
download | grep-abb7f4f2325f26f930ff59b702fe42568a8e81e7.tar.gz |
grep: grouping of a pattern with multiple lines
When grep uses regex, it splits a pattern with multiple lines by
newline character into fragments. Compilation and execution run for
each fragment. That causes slowdown. By this change, each fragment is
divided into groups by whether the fragment includes back references.
A fragment with back references constitutes group, and all fragments
that lack back references also constitute a group.
This change extremely speeds-up following case.
$ seq -f '%040g' 0 9999 | sed '1s/$/\\(0\\)\\1/' >pat
$ yes 00000000000000000000000000000000000000000x | head -10000 >in
$ time -p env LC_ALL=C src/grep -f pat in
* src/dfasearch.c (find_backref_in_pattern, regex_compile):
New functions.
(GEAcompile): Use the new functions to group fragments
as mentioned above.
Diffstat (limited to 'src')
-rw-r--r-- | src/dfasearch.c | 127 |
1 files changed, 107 insertions, 20 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c index 518c385f..42d16dc9 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -103,6 +103,59 @@ kwsmusts (struct dfa_comp *dc) dfamustfree (dm); } +static bool +find_backref_in_pattern (const char *keys, size_t len) +{ + for (; len; keys++, len--) + if (keys[0] == '\\') + { + if (1 <= keys[1] && keys[1] <= '9') + return true; + + if (keys[1] == '\\') + keys++; + } + + return false; +} + +static bool +regex_compile (struct dfa_comp *dc, const char *p, size_t len, size_t pcount, + size_t lineno, bool syntax_only) +{ + struct re_pattern_buffer pat0; + struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount]; + pat->buffer = NULL; + pat->allocated = 0; + + /* Do not use a fastmap with -i, to work around glibc Bug#20381. */ + pat->fastmap = (syntax_only || match_icase) ? NULL : xmalloc (UCHAR_MAX + 1); + + pat->translate = NULL; + + char const *err = re_compile_pattern (p, len, pat); + + if (!err) + return true; + + /* With patterns specified only on the command line, emit the bare + diagnostic. Otherwise, include a filename:lineno: prefix. */ + size_t new_lineno; + const char *pat_filename; + + if (lineno != (size_t) -1) + pat_filename = pattern_file_name (lineno + 1, &new_lineno); + else + pat_filename = "\0"; + + if (*pat_filename == '\0') + error (0, 0, "%s", err); + else + error (0, 0, "%s:%zu: %s", pat_filename, new_lineno, err); + + return false; +} + void * GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits) { @@ -126,6 +179,11 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits) bool compilation_failed = false; size_t palloc = 0; + char const *prev = pattern; + char *buf = NULL; + size_t buflen = 0; + size_t lineno = 0; + do { size_t len; @@ -138,39 +196,68 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits) else len = patlim - p; + bool backref = find_backref_in_pattern (p, len); + + if (backref && prev < p) + { + len = p - prev; + buf = xrealloc (buf, (buflen + len) * sizeof *buf); + memcpy (buf + buflen, p, len); + buflen += len; + } + if (palloc <= dc->pcount) dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns); - struct re_pattern_buffer *pat = &dc->patterns[dc->pcount]; - pat->buffer = NULL; - pat->allocated = 0; - /* Do not use a fastmap with -i, to work around glibc Bug#20381. */ - pat->fastmap = match_icase ? NULL : xmalloc (UCHAR_MAX + 1); + if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref)) + compilation_failed = true; - pat->translate = NULL; + p = sep; + lineno++; - char const *err = re_compile_pattern (p, len, pat); - if (err) + if (backref) { - /* With patterns specified only on the command line, emit the bare - diagnostic. Otherwise, include a filename:lineno: prefix. */ - size_t lineno; - char const *pat_filename = pattern_file_name (dc->pcount + 1, - &lineno); - if (*pat_filename == '\0') - error (0, 0, "%s", err); - else - error (0, 0, "%s:%zu: %s", pat_filename, lineno, err); - compilation_failed = true; + dc->pcount++; + prev = p; } - dc->pcount++; - p = sep; } while (p); if (compilation_failed) exit (EXIT_TROUBLE); + if (prev != NULL) + { + if (pattern < prev) + { + size_t len = patlim - prev; + buf = xrealloc (buf, (buflen + len) * sizeof *buf); + memcpy (buf + buflen, prev, len); + buflen += len; + } + else + { + buf = pattern; + buflen = size; + } + } + + if (buf != NULL) + { + dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns); + + for (size_t i = 0; i < dc->pcount; i++) + dc->patterns[i + 1] = dc->patterns[i]; + + if (!regex_compile (dc, buf, buflen, 0, -1, false)) + abort (); + + dc->pcount++; + + if (buf != pattern) + free (buf); + } + /* In the match_words and match_lines cases, we use a different pattern for the DFA matcher that will quickly throw out cases that won't work. Then if DFA succeeds we do some hairy stuff using the regex matcher |