summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Paul Eggert <eggert@cs.ucla.edu> 2016-01-07 21:28:23 -0800
committer: Paul Eggert <eggert@cs.ucla.edu> 2016-01-07 21:30:31 -0800
commit: d1160ec6d239b2e0f20c2fb3395e3b70963bf916 (patch)
tree: d641664f6c8597183c96d9b8b92d9cfe99547fd5
parent: 5cb49d2f375f0606ac9d916af6024d4b92ba0786 (diff)
download: grep-d1160ec6d239b2e0f20c2fb3395e3b70963bf916.tar.gz
grep: improve unibyte -P performance
This is a follow-on to the recent changes prompted by Bug#20526. In <http://bugs.gnu.org/bug=20526#86>, Norihiro Tanaka pointed out that grep mistakenly assumed that unibyte locales cannot have encoding errors. Here, the mistake hurt performance significantly. On Fedora 23 x86-64 in the C locale, this patch improved grep's performance by a factor of 7 when run as "grep -P 'z.*a'" on the output of "yes $(printf '\200\n') | head -n 1000000000".
* src/pcresearch.c (multibyte_locale) [HAVE_LIBPCRE]: New static var.
(Pcompile): Set it.
(Pexecute): Use it to avoid the need to call buf_has_encoding_errors in unibyte locales.
-rw-r--r-- src/pcresearch.c 24
1 files changed, 17 insertions, 7 deletions
diff --git a/src/pcresearch.c b/src/pcresearch.c
index c0b86786..1fae94d8 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -84,6 +84,8 @@ jit_exec (char const *subject, int search_bytes, int search_offset,
/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
string matches when that flag is used. */
static int empty_match[2];
+
+static bool multibyte_locale;
#endif
void
@@ -104,10 +106,14 @@ Pcompile (char const *pattern, size_t size)
char const *p;
char const *pnul;
- if (using_utf8 ())
- flags |= PCRE_UTF8;
- else if (MB_CUR_MAX != 1)
- error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
+ if (1 < MB_CUR_MAX)
+ {
+ if (! using_utf8 ())
+ error (EXIT_TROUBLE, 0,
+ _("-P supports only unibyte and UTF-8 locales"));
+ multibyte_locale = true;
+ flags |= PCRE_UTF8;
+ }
/* FIXME: Remove these restrictions. */
if (memchr (pattern, '\n', size))
@@ -194,12 +200,16 @@ Pexecute (char *buf, size_t size, size_t *match_size,
error. */
char const *subject = buf;
- /* If the input is free of encoding errors a multiline search is
+ /* If the input is unibyte or is free of encoding errors a multiline search is
typically more efficient. Otherwise, a single-line search is
typically faster, so that pcre_exec doesn't waste time validating
the entire input buffer. */
- bool multiline = ! buf_has_encoding_errors (buf, size - 1);
- buf[size - 1] = eolbyte;
+ bool multiline = true;
+ if (multibyte_locale)
+ {
+ multiline = ! buf_has_encoding_errors (buf, size - 1);
+ buf[size - 1] = eolbyte;
+ }
for (; p < buf + size; p = line_start = line_end + 1)
{