summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorNorihiro Tanaka <noritnk@kcn.ne.jp>2019-12-22 16:39:09 -0800
committerPaul Eggert <eggert@cs.ucla.edu>2019-12-22 16:40:08 -0800
commitabb7f4f2325f26f930ff59b702fe42568a8e81e7 (patch)
tree734e40a3a595c4f491e102b5883fb8829cc2984a /src
parentcf09252295c554dd3eba4cdb8eb53911fb495f40 (diff)
downloadgrep-abb7f4f2325f26f930ff59b702fe42568a8e81e7.tar.gz
grep: grouping of a pattern with multiple lines
When grep uses regex, it splits a pattern with multiple lines by newline character into fragments. Compilation and execution run for each fragment. That causes slowdown. By this change, each fragment is divided into groups by whether the fragment includes back references. A fragment with back references constitutes group, and all fragments that lack back references also constitute a group. This change extremely speeds-up following case. $ seq -f '%040g' 0 9999 | sed '1s/$/\\(0\\)\\1/' >pat $ yes 00000000000000000000000000000000000000000x | head -10000 >in $ time -p env LC_ALL=C src/grep -f pat in * src/dfasearch.c (find_backref_in_pattern, regex_compile): New functions. (GEAcompile): Use the new functions to group fragments as mentioned above.
Diffstat (limited to 'src')
-rw-r--r--src/dfasearch.c127
1 files changed, 107 insertions, 20 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 518c385f..42d16dc9 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -103,6 +103,59 @@ kwsmusts (struct dfa_comp *dc)
dfamustfree (dm);
}
+static bool
+find_backref_in_pattern (const char *keys, size_t len)
+{
+ for (; len; keys++, len--)
+ if (keys[0] == '\\')
+ {
+ if (1 <= keys[1] && keys[1] <= '9')
+ return true;
+
+ if (keys[1] == '\\')
+ keys++;
+ }
+
+ return false;
+}
+
+static bool
+regex_compile (struct dfa_comp *dc, const char *p, size_t len, size_t pcount,
+ size_t lineno, bool syntax_only)
+{
+ struct re_pattern_buffer pat0;
+ struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
+ pat->buffer = NULL;
+ pat->allocated = 0;
+
+ /* Do not use a fastmap with -i, to work around glibc Bug#20381. */
+ pat->fastmap = (syntax_only || match_icase) ? NULL : xmalloc (UCHAR_MAX + 1);
+
+ pat->translate = NULL;
+
+ char const *err = re_compile_pattern (p, len, pat);
+
+ if (!err)
+ return true;
+
+ /* With patterns specified only on the command line, emit the bare
+ diagnostic. Otherwise, include a filename:lineno: prefix. */
+ size_t new_lineno;
+ const char *pat_filename;
+
+ if (lineno != (size_t) -1)
+ pat_filename = pattern_file_name (lineno + 1, &new_lineno);
+ else
+ pat_filename = "\0";
+
+ if (*pat_filename == '\0')
+ error (0, 0, "%s", err);
+ else
+ error (0, 0, "%s:%zu: %s", pat_filename, new_lineno, err);
+
+ return false;
+}
+
void *
GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
{
@@ -126,6 +179,11 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
bool compilation_failed = false;
size_t palloc = 0;
+ char const *prev = pattern;
+ char *buf = NULL;
+ size_t buflen = 0;
+ size_t lineno = 0;
+
do
{
size_t len;
@@ -138,39 +196,68 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
else
len = patlim - p;
+ bool backref = find_backref_in_pattern (p, len);
+
+ if (backref && prev < p)
+ {
+ len = p - prev;
+ buf = xrealloc (buf, (buflen + len) * sizeof *buf);
+ memcpy (buf + buflen, p, len);
+ buflen += len;
+ }
+
if (palloc <= dc->pcount)
dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
- struct re_pattern_buffer *pat = &dc->patterns[dc->pcount];
- pat->buffer = NULL;
- pat->allocated = 0;
- /* Do not use a fastmap with -i, to work around glibc Bug#20381. */
- pat->fastmap = match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
+ if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
+ compilation_failed = true;
- pat->translate = NULL;
+ p = sep;
+ lineno++;
- char const *err = re_compile_pattern (p, len, pat);
- if (err)
+ if (backref)
{
- /* With patterns specified only on the command line, emit the bare
- diagnostic. Otherwise, include a filename:lineno: prefix. */
- size_t lineno;
- char const *pat_filename = pattern_file_name (dc->pcount + 1,
- &lineno);
- if (*pat_filename == '\0')
- error (0, 0, "%s", err);
- else
- error (0, 0, "%s:%zu: %s", pat_filename, lineno, err);
- compilation_failed = true;
+ dc->pcount++;
+ prev = p;
}
- dc->pcount++;
- p = sep;
}
while (p);
if (compilation_failed)
exit (EXIT_TROUBLE);
+ if (prev != NULL)
+ {
+ if (pattern < prev)
+ {
+ size_t len = patlim - prev;
+ buf = xrealloc (buf, (buflen + len) * sizeof *buf);
+ memcpy (buf + buflen, prev, len);
+ buflen += len;
+ }
+ else
+ {
+ buf = pattern;
+ buflen = size;
+ }
+ }
+
+ if (buf != NULL)
+ {
+ dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
+
+ for (size_t i = 0; i < dc->pcount; i++)
+ dc->patterns[i + 1] = dc->patterns[i];
+
+ if (!regex_compile (dc, buf, buflen, 0, -1, false))
+ abort ();
+
+ dc->pcount++;
+
+ if (buf != pattern)
+ free (buf);
+ }
+
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
Then if DFA succeeds we do some hairy stuff using the regex matcher