From 33e4602c96e639ec7d56b92ffe3614aa700d3d76 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 7 Sep 2020 17:20:11 -0700 Subject: Omit duplicate regexps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not pass two copies of the same regexp to the regular-expression engine. Although the engines should perform nearly as well even with the copies, in practice they do not. Problem reported by Luca Borzacchiello (Bug#43040). * bootstrap.conf (gnulib_modules): Add hash. * src/grep.c: Include stdint.h, for SIZE_WIDTH. Include hash.h. (struct patloc, patloc, patlocs_allocated, patlocs_used): Rename from struct FL_pair, fl_pair, n_fl_pair_slots, n_pattern_files, respectively, since the data type is no longer a pair. All uses changed. (struct patloc): New member FILELINE. The lineno member is now ptrdiff_t since nowadays we prefer signed types. (pattern_array, patterns_table): New static vars. (count_nl_bytes, fl_add): Remove; no longer used. (hash_pattern, compare_patterns, update_patterns): New functions. update_patterns does what fl_add used to do, plus remove dups. (pattern_file_name): Adjust to change from fl_pair to patloc. (main): Move some variables to inner blocks for clarity. Maintain the pattern_table hash of all patterns. Update pattern_array to match keys, and use update_patterns instead of fl_add to remove duplicate keys. * tests/filename-lineno.pl (invalid-re-2-files) (invalid-re-2-files2, invalid-re-2e): Ensure regexps are unique in tests so that dups aren’t removed in diagnostics. (invalid-re-line-numbers): New test. --- NEWS | 4 + bootstrap.conf | 1 + src/grep.c | 277 ++++++++++++++++++++++++++++++----------------- tests/filename-lineno.pl | 19 +++- 4 files changed, 198 insertions(+), 103 deletions(-) diff --git a/NEWS b/NEWS index 5f4c0b4e..f661adf1 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,10 @@ GNU grep NEWS -*- outline -*- line is selected, not when a file is listed. The behavior in grep 3.2 through 3.4 was causing compatibility problems. +** Bug fixes + + A performance regression with many duplicate patterns has been fixed. + [Bug#43040 introduced in grep 3.4] * Noteworthy changes in release 3.4 (2020-01-02) [stable] diff --git a/bootstrap.conf b/bootstrap.conf index ef142fcd..fceb3188 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -49,6 +49,7 @@ git-version-gen gitlog-to-changelog gnu-web-doc-update gnupload +hash ignore-value intprops inttypes diff --git a/src/grep.c b/src/grep.c index 5764b2a9..c359ea9b 100644 --- a/src/grep.c +++ b/src/grep.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "system.h" @@ -41,6 +42,7 @@ #include "getopt.h" #include "getprogname.h" #include "grep.h" +#include "hash.h" #include "intprops.h" #include "propername.h" #include "quote.h" @@ -82,72 +84,136 @@ static bool align_tabs; /* Print width of line numbers and byte offsets. Nonzero if ALIGN_TABS. */ static int offset_width; -/* See below */ -struct FL_pair +/* An entry in the PATLOC array saying where patterns came from. */ +struct patloc { + /* Line number of the pattern in PATTERN_ARRAY. Line numbers + start at 0, and each pattern is terminated by '\n'. */ + ptrdiff_t lineno; + + /* Input location of the pattern. The FILENAME "-" represents + standard input, and "" represents the command line. FILELINE is + origin-1 for files and is irrelevant for the command line. */ char const *filename; - size_t lineno; + ptrdiff_t fileline; }; -/* A list of lineno,filename pairs corresponding to -f FILENAME - arguments. Since we store the concatenation of all patterns in - a single array, KEYS, be they from the command line via "-e PAT" - or read from one or more -f-specified FILENAMES. Given this - invocation, grep -f <(seq 5) -f <(seq 2) -f <(seq 3) FILE, there - will be three entries in LF_PAIR: {1, x} {6, y} {8, z}, where - x, y and z are just place-holders for shell-generated names. */ -static struct FL_pair *fl_pair; -static size_t n_fl_pair_slots; -/* Count not only -f-specified files, but also individual -e operands - and any command-line argument that serves as a regular expression. */ -static size_t n_pattern_files; - -/* The number of patterns seen so far. - It is advanced by fl_add and, when needed, used in pattern_file_name - to derive a file-relative line number. */ +/* The array of pattern locations. The concatenation of all patterns + is stored in a single array, KEYS. Given the invocation + 'grep -f <(seq 5) -f <(seq 6) -f <(seq 3)', there will initially be + 28 bytes in KEYS. After duplicate patterns are removed, KEYS + will have 12 bytes and PATLOC will be {0,x,1}, {10,y,1} + where x, y and z are just place-holders for shell-generated names + since and z is omitted as it contains only duplicates. Sometimes + removing duplicates will grow PATLOC, since each run of + removed patterns not at a file start or end requires another + PATLOC entry for the first non-removed pattern. */ +static struct patloc *patloc; +static size_t patlocs_allocated, patlocs_used; + +/* Pointer to the array of patterns, each terminated by newline. */ +static char *pattern_array; + +/* The number of unique patterns seen so far. */ static size_t n_patterns; -/* Return the number of newline bytes in BUF with size SIZE. */ +/* Hash table of patterns seen so far. */ +static Hash_table *pattern_table; + +/* Hash and compare newline-terminated patterns for textual equality. + Patterns are represented by origin-1 offsets into PATTERN_ARRAY, + cast to void *. The origin-1 is so that the first pattern offset + does not appear to be a null pointer when cast to void *. */ static size_t _GL_ATTRIBUTE_PURE -count_nl_bytes (char const *buf, size_t size) -{ - char const *p = buf; - char const *end_p = buf + size; - size_t n = 0; - while ((p = memchr (p, '\n', end_p - p))) - p++, n++; - return n; +hash_pattern (void const *pat, size_t n_buckets) +{ + size_t h = 0; + intptr_t pat_offset = (intptr_t) pat - 1; + for (char const *s = pattern_array + pat_offset; *s != '\n'; s++) + h = *s + ((h << 9) | (h >> (SIZE_WIDTH - 9))); + return h % n_buckets; +} +static bool _GL_ATTRIBUTE_PURE +compare_patterns (void const *a, void const *b) +{ + intptr_t a_offset = (intptr_t) a - 1; + intptr_t b_offset = (intptr_t) b - 1; + char const *p = pattern_array + a_offset; + char const *q = pattern_array + b_offset; + for (; *p == *q; p++, q++) + if (*p == '\n') + return true; + return false; } -/* Append a FILENAME,line-number pair to FL_PAIR, and update - pattern-related counts from the contents of BUF with SIZE bytes. */ -static void -fl_add (char const *buf, size_t size, char const *filename) +/* Update KEYS to remove duplicate patterns, and return the number of + bytes in the resulting KEYS. KEYS contains a sequence of patterns + each terminated by '\n'. The first DUPFREE_SIZE bytes are a + sequence of patterns with no duplicates; SIZE is the total number + of bytes in KEYS. If some patterns past the first DUPFREE_SIZE + bytes are not duplicates, update PATLOCS accordingly. */ +static ptrdiff_t +update_patterns (char *keys, ptrdiff_t dupfree_size, ptrdiff_t size, + char const *filename) { - if (n_fl_pair_slots <= n_pattern_files) - fl_pair = x2nrealloc (fl_pair, &n_fl_pair_slots, sizeof *fl_pair); + char *dst = keys + dupfree_size; + ptrdiff_t fileline = 1; + int prev_inserted = 0; + + char const *srclim = keys + size; + ptrdiff_t patsize; + for (char const *src = keys + dupfree_size; src < srclim; src += patsize) + { + char const *patend = memchr (src, '\n', srclim - src); + patsize = patend + 1 - src; + memmove (dst, src, patsize); + + intptr_t dst_offset_1 = dst - keys + 1; + int inserted = hash_insert_if_absent (pattern_table, + (void *) dst_offset_1, NULL); + if (inserted) + { + if (inserted < 0) + xalloc_die (); + dst += patsize; - fl_pair[n_pattern_files].lineno = n_patterns + 1; - fl_pair[n_pattern_files].filename = filename; - n_pattern_files++; - n_patterns += count_nl_bytes (buf, size); + /* Add a PATLOCS entry unless this input line is simply the + next one in the same file. */ + if (!prev_inserted) + { + if (patlocs_used == patlocs_allocated) + patloc = x2nrealloc (patloc, &patlocs_allocated, + sizeof *patloc); + patloc[patlocs_used++] + = (struct patloc) { .lineno = n_patterns, + .filename = filename, + .fileline = fileline }; + } + n_patterns++; + } + + prev_inserted = inserted; + fileline++; + } + + return dst - keys; } -/* Map the line number, LINENO, of one of the input patterns to the - name of the file from which it came. If it was read from stdin - or if it was specified on the command line, return "-". */ +/* Map LINENO, the origin-1 line number of one of the input patterns, + to the name of the file from which it came. Return "-" if it was + read from stdin, "" if it was specified on the command line. + Set *NEW_LINENO to the origin-1 line number of PATTERN in the file, + or to an unspecified value if PATTERN came from the command line. */ char const * _GL_ATTRIBUTE_PURE pattern_file_name (size_t lineno, size_t *new_lineno) { - size_t i; - for (i = 1; i < n_pattern_files; i++) - { - if (lineno < fl_pair[i].lineno) - break; - } - - *new_lineno = lineno - fl_pair[i - 1].lineno + 1; - return fl_pair[i - 1].filename; + lineno--; + ptrdiff_t i; + for (i = 1; i < patlocs_used; i++) + if (lineno < patloc[i].lineno) + break; + *new_lineno = lineno - patloc[i - 1].lineno + patloc[i - 1].fileline; + return patloc[i - 1].filename; } #if HAVE_ASAN @@ -2330,6 +2396,7 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p) } } + *p = '\n'; free (*keys_p); *keys_p = new_keys; *len_p = p - new_keys; @@ -2424,9 +2491,8 @@ int main (int argc, char **argv) { char *keys = NULL; - size_t keycc = 0, oldcc, keyalloc = 0; + size_t keycc = 0, keyalloc = 0; int matcher = -1; - size_t cc; int opt, prepended; int prev_optind, last_recursive; int fread_errno; @@ -2467,6 +2533,10 @@ main (int argc, char **argv) last_recursive = 0; + pattern_table = hash_initialize (0, 0, hash_pattern, compare_patterns, 0); + if (!pattern_table) + xalloc_die (); + prepended = prepend_default_options (getenv ("GREP_OPTIONS"), &argc, &argv); if (prepended) error (0, 0, _("warning: GREP_OPTIONS is deprecated;" @@ -2565,50 +2635,52 @@ main (int argc, char **argv) break; case 'e': - cc = strlen (optarg); - if (keyalloc < keycc + cc + 1) - { - keyalloc = keycc + cc + 1; - keys = x2realloc (keys, &keyalloc); - } - oldcc = keycc; - memcpy (keys + oldcc, optarg, cc); - keycc += cc; - keys[keycc++] = '\n'; - fl_add (keys + oldcc, cc + 1, ""); + { + ptrdiff_t cc = strlen (optarg); + if (keyalloc < keycc + cc + 1) + { + keyalloc = keycc + cc + 1; + pattern_array = keys = x2realloc (keys, &keyalloc); + } + char *keyend = mempcpy (keys + keycc, optarg, cc); + *keyend = '\n'; + keycc = update_patterns (keys, keycc, keycc + cc + 1, ""); + } break; case 'f': - if (STREQ (optarg, "-")) - { - if (binary) - xset_binary_mode (STDIN_FILENO, O_BINARY); - fp = stdin; - } - else - { - fp = fopen (optarg, binary ? "rb" : "r"); - if (!fp) - die (EXIT_TROUBLE, errno, "%s", optarg); - } - oldcc = keycc; - for (;; keycc += cc) - { - if (keyalloc <= keycc + 1) - keys = x2realloc (keys, &keyalloc); - cc = fread (keys + keycc, 1, keyalloc - (keycc + 1), fp); - if (cc == 0) - break; - } - fread_errno = errno; - if (ferror (fp)) - die (EXIT_TROUBLE, fread_errno, "%s", optarg); - if (fp != stdin) - fclose (fp); - /* Append final newline if file ended in non-newline. */ - if (oldcc != keycc && keys[keycc - 1] != '\n') - keys[keycc++] = '\n'; - fl_add (keys + oldcc, keycc - oldcc, optarg); + { + if (STREQ (optarg, "-")) + { + if (binary) + xset_binary_mode (STDIN_FILENO, O_BINARY); + fp = stdin; + } + else + { + fp = fopen (optarg, binary ? "rb" : "r"); + if (!fp) + die (EXIT_TROUBLE, errno, "%s", optarg); + } + ptrdiff_t newkeycc = keycc, cc; + for (;; newkeycc += cc) + { + if (keyalloc <= newkeycc + 1) + pattern_array = keys = x2realloc (keys, &keyalloc); + cc = fread (keys + newkeycc, 1, keyalloc - (newkeycc + 1), fp); + if (cc == 0) + break; + } + fread_errno = errno; + if (ferror (fp)) + die (EXIT_TROUBLE, fread_errno, "%s", optarg); + if (fp != stdin) + fclose (fp); + /* Append final newline if file ended in non-newline. */ + if (newkeycc != keycc && keys[newkeycc - 1] != '\n') + keys[newkeycc++] = '\n'; + keycc = update_patterns (keys, keycc, newkeycc, optarg); + } break; case 'h': @@ -2800,22 +2872,26 @@ main (int argc, char **argv) /* No keys were specified (e.g. -f /dev/null). Match nothing. */ out_invert ^= true; match_lines = match_words = false; + keys[keycc++] = '\n'; } - else - /* Strip trailing newline. */ - --keycc; } else if (optind < argc) { /* Make a copy so that it can be reallocated or freed later. */ - keycc = strlen (argv[optind]); - keys = xmemdup (argv[optind++], keycc + 1); - fl_add (keys, keycc, ""); + pattern_array = keys = xstrdup (argv[optind++]); + ptrdiff_t patlen = strlen (keys); + keys[patlen] = '\n'; + keycc = update_patterns (keys, 0, patlen + 1, ""); n_patterns++; } else usage (EXIT_TROUBLE); + /* Strip trailing newline from keys. */ + keycc--; + + hash_free (pattern_table); + bool possibly_tty = false; struct stat tmp_stat; if (! exit_on_match && fstat (STDOUT_FILENO, &tmp_stat) == 0) @@ -2887,7 +2963,8 @@ main (int argc, char **argv) : (contains_encoding_error (keys, keycc) || (match_icase && !fgrep_icase_available (keys, keycc))))) { - fgrep_to_grep_pattern (&keys, &keycc); + fgrep_to_grep_pattern (&pattern_array, &keycc); + keys = pattern_array; matcher = G_MATCHER_INDEX; } /* With two or more patterns, if -F works then switch from either -E diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl index 998af73d..8940b1c7 100755 --- a/tests/filename-lineno.pl +++ b/tests/filename-lineno.pl @@ -48,7 +48,7 @@ my @Tests = # Show that with two or more errors, grep now prints all diagnostics: ['invalid-re-2-files', '-f g -f h', {EXIT=>2}, {AUX=>{g=>"1\n2[[\n3\n4[[\n"}}, - {AUX=>{h=>"\n\n[[\n"}}, + {AUX=>{h=>"5\n6\n7[[\n"}}, $err_subst, {ERR => "$prog: g:2: Unmatched [...\n" . "$prog: g:4: Unmatched [...\n" @@ -59,7 +59,7 @@ my @Tests = # Like the above, but on the other lines. ['invalid-re-2-files2', '-f g -f h', {EXIT=>2}, {AUX=>{g=>"1[[\n2\n3[[\n4\n"}}, - {AUX=>{h=>"[[\n[[\n\n"}}, + {AUX=>{h=>"5[[\n6[[\n7\n"}}, $err_subst, {ERR => "$prog: g:1: Unmatched [...\n" . "$prog: g:3: Unmatched [...\n" @@ -68,9 +68,22 @@ my @Tests = }, ], + # Make sure the line numbers are right when some regexps are duplicates. + ['invalid-re-line-numbers', '-f g -f h', {EXIT=>2}, + {AUX=>{g=>"1[[\n\n3[[\n\n5[[\n"}}, + {AUX=>{h=>"1[[\n\n\n4[[\n\n6[[\n"}}, + $err_subst, + {ERR => "$prog: g:1: Unmatched [...\n" + . "$prog: g:3: Unmatched [...\n" + . "$prog: g:5: Unmatched [...\n" + . "$prog: h:4: Unmatched [...\n" + . "$prog: h:6: Unmatched [...\n" + }, + ], + # Show that with two '-e'-specified erroneous regexps, # there is no file name or line number. - ['invalid-re-2e', '-e "[[" -e "[["', {EXIT=>2}, + ['invalid-re-2e', '-e "1[[" -e "2[["', {EXIT=>2}, $err_subst, {ERR => "$prog: Unmatched [...\n" x 2}, ], -- cgit v1.2.1