grep: be less picky about encoding errors

This fixes a longstanding problem introduced in grep 2.21, which is overly picky about binary files.
* NEWS:
* doc/grep.texi (File and Directory Selection): Document this.
* src/grep.c (input_textbin, textbin_is_binary, buffer_textbin)
(file_textbin): Remove. All uses removed.
(encoding_error_output): New static var.
(buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls): New functions, which reuse bits and pieces of the removed functions.
(lastout, print_line_head, print_line_middle, print_line_tail, prline)
(prpending, prtext, grepbuf): Avoid use of const, now that we have functions that require modifying a sentinel.
(print_line_head): New arg LEN. All uses changed.
(print_line_head, print_line_tail): Return indicator whether the output line was printed. All uses changed.
(print_line_middle): Exit early on encoding error.
(grep): Use new method for determining whether file is binary.
* src/grep.h (enum textbin, TEXTBIN_BINARY, TEXTBIN_UNKNOWN)
(TEXTBIN_TEXT, input_textbin): Remove decls. All uses removed.
* src/pcresearch.c (Pexecute): Remove multiline optimization, since the main program no longer checks for encoding errors on input.
* tests/encoding-error: New file.
* tests/Makefile.am (TESTS): Add it.
author: Paul Eggert <eggert@cs.ucla.edu> 2015-12-30 19:10:14 -0800
committer: Paul Eggert <eggert@cs.ucla.edu> 2015-12-30 19:10:57 -0800
commit: 8521001643bc6a28c760552824eaea5ecee0aa8c (patch)
tree: 5bd32eb561873be110f222d5016b9503322fc922 /src/pcresearch.c
parent: f0a7e87ff12f6d0f804275b5c7bfb2dc3cc0c777 (diff)
download: grep-8521001643bc6a28c760552824eaea5ecee0aa8c.tar.gz
1 file changed, 6 insertions, 50 deletions
diff --git a/src/pcresearch.c b/src/pcresearch.c
index dc683451..c403032d 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -194,32 +194,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
      error.  */
   char const *subject = buf;
 
-  /* If the input type is unknown, the caller is still testing the
-     input, which means the current buffer cannot contain encoding
-     errors and a multiline search is typically more efficient.
-     Otherwise, a single-line search is typically faster, so that
-     pcre_exec doesn't waste time validating the entire input
-     buffer.  */
-  bool multiline = input_textbin == TEXTBIN_UNKNOWN;
-
   for (; p < buf + size; p = line_start = line_end + 1)
     {
-      bool too_big;
-
-      if (multiline)
-        {
-          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
-          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
-          line_end = memrchr (p, eolbyte, scan_size);
-          too_big = ! line_end;
-        }
-      else
-        {
-          line_end = memchr (p, eolbyte, buf + size - p);
-          too_big = INT_MAX < line_end - p;
-        }
-
-      if (too_big)
+      /* A single-line search is typically faster, so that
+         pcre_exec doesn't waste time validating the entire input
+         buffer.  */
+      line_end = memchr (p, eolbyte, buf + size - p);
+      if (INT_MAX < line_end - p)
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       for (;;)
@@ -247,27 +228,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           int options = 0;
           if (!bol)
             options |= PCRE_NOTBOL;
-          if (multiline)
-            options |= PCRE_NO_UTF8_CHECK;
 
           e = jit_exec (subject, line_end - subject, search_offset,
                         options, sub);
           if (e != PCRE_ERROR_BADUTF8)
-            {
-              if (0 < e && multiline && sub[1] - sub[0] != 0)
-                {
-                  char const *nl = memchr (subject + sub[0], eolbyte,
-                                           sub[1] - sub[0]);
-                  if (nl)
-                    {
-                      /* This match crosses a line boundary; reject it.  */
-                      p = subject + sub[0];
-                      line_end = nl;
-                      continue;
-                    }
-                }
-              break;
-            }
+            break;
           int valid_bytes = sub[0];
 
           /* Try to match the string before the encoding error.  */
@@ -339,15 +304,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           beg = matchbeg;
           end = matchend;
         }
-      else if (multiline)
-        {
-          char const *prev_nl = memrchr (line_start - 1, eolbyte,
-                                         matchbeg - (line_start - 1));
-          char const *next_nl = memchr (matchend, eolbyte,
-                                        line_end + 1 - matchend);
-          beg = prev_nl + 1;
-          end = next_nl + 1;
-        }
       else
         {
           beg = line_start;
author	Paul Eggert <eggert@cs.ucla.edu>	2015-12-30 19:10:14 -0800
committer	Paul Eggert <eggert@cs.ucla.edu>	2015-12-30 19:10:57 -0800
commit	8521001643bc6a28c760552824eaea5ecee0aa8c (patch)
tree	5bd32eb561873be110f222d5016b9503322fc922 /src/pcresearch.c
parent	f0a7e87ff12f6d0f804275b5c7bfb2dc3cc0c777 (diff)
download	grep-8521001643bc6a28c760552824eaea5ecee0aa8c.tar.gz