Improve on fix for Bug#22181

* src/pcresearch.c (Pexecute): Update subject when skipping past easily-determined encoding errors, as this is faster than letting pcre_exec skip them. On my platform this improves performance 4.7x on a benchmark created via "yes $(printf '\200\200\200\200 \200\200\200\200\200\200\200\200\200\200\200\200\200\200\200\200x\n') | head -n 1000000 >j; grep -oP y j" in a UTF-8 locale. Rework code that deals with PCRE_ERROR_BADUTF8 return, to avoid an incorrect (albeit currently harmless) 'bol = false' assignment.
author: Paul Eggert <eggert@cs.ucla.edu> 2016-01-06 22:40:23 -0800
committer: Paul Eggert <eggert@cs.ucla.edu> 2016-01-06 22:41:19 -0800
commit: 5cb49d2f375f0606ac9d916af6024d4b92ba0786 (patch)
tree: 176b3d293ee6280c288844482612a1da7fdbe962 /src/pcresearch.c
parent: 4f04b821d3bd00283cbfed17b81f68dba5fdd9cb (diff)
download: grep-5cb49d2f375f0606ac9d916af6024d4b92ba0786.tar.gz
1 file changed, 21 insertions, 19 deletions
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 8f3d935b..c0b86786 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -229,6 +229,7 @@ Pexecute (char *buf, size_t size, size_t *match_size,
           while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
             {
               p++;
+              subject = p;
               bol = false;
             }
 
@@ -269,29 +270,30 @@ Pexecute (char *buf, size_t size, size_t *match_size,
             }
           int valid_bytes = sub[0];
 
-          /* Try to match the string before the encoding error.  */
-          if (valid_bytes < search_offset)
-            e = PCRE_ERROR_NOMATCH;
-          else if (valid_bytes == 0)
+          if (search_offset <= valid_bytes)
             {
-              /* Handle the empty-match case specially, for speed.
-                 This optimization is valid if VALID_BYTES is zero,
-                 which means SEARCH_OFFSET is also zero.  */
-              sub[1] = 0;
-              e = empty_match[bol];
-            }
-          else
-            e = jit_exec (subject, valid_bytes, search_offset,
-                          options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
+              /* Try to match the string before the encoding error.  */
+              if (valid_bytes == 0)
+                {
+                  /* Handle the empty-match case specially, for speed.
+                     This optimization is valid if VALID_BYTES is zero,
+                     which means SEARCH_OFFSET is also zero.  */
+                  sub[1] = 0;
+                  e = empty_match[bol];
+                }
+              else
+                e = jit_exec (subject, valid_bytes, search_offset,
+                              options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
 
-          if (e != PCRE_ERROR_NOMATCH)
-            break;
+              if (e != PCRE_ERROR_NOMATCH)
+                break;
+
+              /* Treat the encoding error as data that cannot match.  */
+              p = subject + valid_bytes + 1;
+              bol = false;
+            }
 
-          /* Treat the encoding error as data that cannot match.  */
           subject += valid_bytes + 1;
-          if (p < subject)
-            p = subject;
-          bol = false;
         }
 
       if (e != PCRE_ERROR_NOMATCH)
author	Paul Eggert <eggert@cs.ucla.edu>	2016-01-06 22:40:23 -0800
committer	Paul Eggert <eggert@cs.ucla.edu>	2016-01-06 22:41:19 -0800
commit	5cb49d2f375f0606ac9d916af6024d4b92ba0786 (patch)
tree	176b3d293ee6280c288844482612a1da7fdbe962 /src/pcresearch.c
parent	4f04b821d3bd00283cbfed17b81f68dba5fdd9cb (diff)
download	grep-5cb49d2f375f0606ac9d916af6024d4b92ba0786.tar.gz