summaryrefslogtreecommitdiff
path: root/src/kwset.c
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2014-04-22 23:34:22 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2014-04-22 23:55:18 -0700
commit14892aa6e0c21f49e5ec6d203253074ef15fedb0 (patch)
tree4d93f53c1624ce4150e6b331ca2379938abf5bda /src/kwset.c
parent73893ffbada36599fb6ec2eb489b6a7decf0c248 (diff)
downloadgrep-14892aa6e0c21f49e5ec6d203253074ef15fedb0.tar.gz
kwset: simplify and speed up Boyer-Moore unibyte -i in some cases
This improves the performance of, for example, yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 | grep -i jk in a unibyte locale. * src/kwset.c (memchr_trans): New function. (bmexec): Use it. Simplify the code and remove some of the confusing gotos and breaks and labels. Do not treat glibc memchr as a special case; if non-glibc memchr is slow, that is lower priority and I suppose we can try to work around the problem in gnulib.
Diffstat (limited to 'src/kwset.c')
-rw-r--r--src/kwset.c77
1 files changed, 33 insertions, 44 deletions
diff --git a/src/kwset.c b/src/kwset.c
index 78fb0b21..f86ee03f 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -524,6 +524,20 @@ bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
return false;
}
+/* Return the address of the first byte in the buffer S that equals C.
+ S contains N bytes. If TRANS is nonnull, use it to transliterate
+ S's bytes before comparing them. */
+static char const *
+memchr_trans (char const *s, char c, size_t n, char const *trans)
+{
+ if (! trans)
+ return memchr (s, c, n);
+ char const *slim = s + n;
+ for (; s < slim; s++)
+ if (trans[U(*s)] == c)
+ return s;
+ return NULL;
+}
/* Fast boyer-moore search. */
static size_t _GL_ATTRIBUTE_PURE
@@ -541,18 +555,8 @@ bmexec (kwset_t kwset, char const *text, size_t size)
return -1;
if (len == 1)
{
- if (trans)
- {
- for (tp = text; tp < text + size; tp++)
- if (trans[U(*tp)] == kwset->target[0])
- return tp - text;
- return -1;
- }
- else
- {
- tp = memchr (text, kwset->target[0], size);
- return tp ? tp - text : -1;
- }
+ tp = memchr_trans (text, kwset->target[0], size, trans);
+ return tp ? tp - text : -1;
}
d1 = kwset->delta;
@@ -564,48 +568,33 @@ bmexec (kwset_t kwset, char const *text, size_t size)
/* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
if (size > 12 * len)
/* 11 is not a bug, the initial offset happens only once. */
- for (ep = text + size - 11 * len;;)
+ for (ep = text + size - 11 * len; tp <= ep; )
{
- while (tp <= ep)
+ d = d1[U(tp[-1])], tp += d;
+ d = d1[U(tp[-1])], tp += d;
+ if (d != 0)
{
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
- if (d == 0)
- goto found;
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- if (d == 0)
- goto found;
- d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
- d = d1[U(tp[-1])], tp += d;
- if (d == 0)
- goto found;
- /* memchar() of glibc is faster than seeking by delta1 on
- some platforms. When there is no chance to match for a
- while, use it on them. */
-#if defined(__GLIBC__) && (defined(__i386__) || defined(__x86_64__))
- if (!trans)
- {
- tp = memchr (tp - 1, gc1, size + text - tp + 1);
- if (tp)
- {
- ++tp;
- goto found;
- }
- else
- return -1;
- }
- else
-#endif
+ if (d != 0)
{
d = d1[U(tp[-1])], tp += d;
d = d1[U(tp[-1])], tp += d;
+ d = d1[U(tp[-1])], tp += d;
+ if (d != 0)
+ {
+ /* Typically memchr is faster than seeking by
+ delta1 when there is no chance to match for
+ a while. */
+ tp--;
+ tp = memchr_trans (tp, gc1, text + size - tp, trans);
+ if (! tp)
+ return -1;
+ tp++;
+ }
}
}
- break;
- found:
if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
return tp - text;
}