summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2014-09-25 17:04:49 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2014-09-26 01:06:25 -0700
commitaaefbfa486df2587e25c8b59d1cca5f2092e7c7a (patch)
tree7f645c5651f1efeea1f6b3c697b2bb3a23842b5f
parentf6de00f6cec3831b8f334de7dbd1b59115627457 (diff)
downloadgrep-aaefbfa486df2587e25c8b59d1cca5f2092e7c7a.tar.gz
grep: scan for valid multibyte strings more quickly
Scan valid multibyte strings more quickly in the common case of encodings that are upward compatible with ASCII, such as UTF-8. You'd think there'd be a fast standard way to do this nowadays, but nooooo.... Problem reported by Jim Meyering in: http://bugs.gnu.org/18454#56 * src/grep.c (HIBYTE): New constant. (easy_encoding): New static var. (init_easy_encoding, skip_easy_bytes): New functions. (uword): New type. (buffer_textbin): Skip easy bytes quickly. Don't bother with mb_clen here, since skip_easy_bytes typically captures the easy cases; just use mbrlen directly. (buffer_textbin, file_textbin): First arg is no longer a const pointer, since the byte past the end is now an overwritten sentinel. (fillbuf): Make room for a uword after the buffer, for skip_easy_bytes. (main): Call init_easy_encoding.
-rw-r--r--src/grep.c93
1 files changed, 76 insertions, 17 deletions
diff --git a/src/grep.c b/src/grep.c
index 35d33586..1e5d0fa4 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -454,9 +454,60 @@ textbin_is_binary (enum textbin textbin)
return textbin < TEXTBIN_UNKNOWN;
}
-/* Return the text type of data in BUF, of size SIZE. */
+/* The high-order bit of a byte. */
+enum { HIBYTE = 0x80 };
+
+/* True if every byte with HIBYTE off is a single-byte character.
+ UTF-8 has this property. */
+static bool easy_encoding;
+
+static void
+init_easy_encoding (void)
+{
+ easy_encoding = true;
+ for (int i = 0; i < HIBYTE; i++)
+ easy_encoding &= mbclen_cache[i] == 1;
+}
+
+/* An unsigned type suitable for fast matching. */
+typedef uintmax_t uword;
+
+/* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
+ that is not easy, and return a pointer to the first non-easy byte.
+ In easy encodings, the easy bytes all have HIBYTE off.
+ In other encodings, no byte is easy. */
+static char const * _GL_ATTRIBUTE_PURE
+skip_easy_bytes (char const *buf)
+{
+ if (!easy_encoding)
+ return buf;
+
+ uword uword_max = -1;
+
+ /* 0x8080..., extended to be wide enough for uword. */
+ uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
+
+ /* Search a byte at a time until the pointer is aligned, then a
+ uword at a time until a match is found, then a byte at a time to
+ identify the exact byte. The uword search may go slightly past
+ the buffer end, but that's benign. */
+ char const *p;
+ uword const *s;
+ for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
+ if (*p & HIBYTE)
+ return p;
+ for (s = (uword const *) p; ! (*s & hibyte_mask); s++)
+ continue;
+ for (p = (char const *) s; ! (*p & HIBYTE); p++)
+ continue;
+ return p;
+}
+
+/* Return the text type of data in BUF, of size SIZE.
+ BUF must be followed by at least sizeof (uword) bytes,
+ which may be arbitrarily written to or read from. */
static enum textbin
-buffer_textbin (char const *buf, size_t size)
+buffer_textbin (char *buf, size_t size)
{
if (eolbyte && memchr (buf, '\0', size))
return TEXTBIN_BINARY;
@@ -467,9 +518,10 @@ buffer_textbin (char const *buf, size_t size)
size_t clen;
char const *p;
- for (p = buf; p < buf + size; p += clen)
+ buf[size] = -1;
+ for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
{
- clen = mb_clen (p, buf + size - p, &mbs);
+ clen = mbrlen (p, buf + size - p, &mbs);
if ((size_t) -2 <= clen)
return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
}
@@ -478,24 +530,26 @@ buffer_textbin (char const *buf, size_t size)
return TEXTBIN_TEXT;
}
-/* Return the text type of a file. BUF, of size BUFSIZE, is the initial
- buffer read from the file with descriptor FD and status ST. */
+/* Return the text type of a file. BUF, of size SIZE, is the initial
+ buffer read from the file with descriptor FD and status ST.
+ BUF must be followed by at least sizeof (uword) bytes,
+ which may be arbitrarily written to or read from. */
static enum textbin
-file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
+file_textbin (char *buf, size_t size, int fd, struct stat const *st)
{
- enum textbin textbin = buffer_textbin (buf, bufsize);
+ enum textbin textbin = buffer_textbin (buf, size);
if (textbin_is_binary (textbin))
return textbin;
if (usable_st_size (st))
{
- if (st->st_size <= bufsize)
+ if (st->st_size <= size)
return textbin == TEXTBIN_UNKNOWN ? TEXTBIN_BINARY : textbin;
/* If the file has holes, it must contain a null byte somewhere. */
if (SEEK_HOLE != SEEK_SET && eolbyte)
{
- off_t cur = bufsize;
+ off_t cur = size;
if (O_BINARY || fd == STDIN_FILENO)
{
cur = lseek (fd, 0, SEEK_CUR);
@@ -611,7 +665,8 @@ reset (int fd, struct stat const *st)
pagesize = getpagesize ();
if (pagesize == 0 || 2 * pagesize + 1 <= pagesize)
abort ();
- bufalloc = ALIGN_TO (INITIAL_BUFSIZE, pagesize) + pagesize + 1;
+ bufalloc = (ALIGN_TO (INITIAL_BUFSIZE, pagesize)
+ + pagesize + sizeof (uword));
buffer = xmalloc (bufalloc);
}
@@ -652,7 +707,7 @@ fillbuf (size_t save, struct stat const *st)
that we want to save. */
size_t saved_offset = buflim - save - buffer;
- if (pagesize <= buffer + bufalloc - buflim)
+ if (pagesize <= buffer + bufalloc - sizeof (uword) - buflim)
{
readbuf = buflim;
bufbeg = buflim - save;
@@ -665,8 +720,10 @@ fillbuf (size_t save, struct stat const *st)
char *newbuf;
/* Grow newsize until it is at least as great as minsize. */
- for (newsize = bufalloc - pagesize - 1; newsize < minsize; newsize *= 2)
- if (newsize * 2 < newsize || newsize * 2 + pagesize + 1 < newsize * 2)
+ for (newsize = bufalloc - pagesize - sizeof (uword);
+ newsize < minsize;
+ newsize *= 2)
+ if ((SIZE_MAX - pagesize - sizeof (uword)) / 2 < newsize)
xalloc_die ();
/* Try not to allocate more memory than the file size indicates,
@@ -686,8 +743,9 @@ fillbuf (size_t save, struct stat const *st)
}
/* Add enough room so that the buffer is aligned and has room
- for byte sentinels fore and aft. */
- newalloc = newsize + pagesize + 1;
+ for byte sentinels fore and aft, and so that a uword can
+ be read aft. */
+ newalloc = newsize + pagesize + sizeof (uword);
newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer;
readbuf = ALIGN_TO (newbuf + 1 + save, pagesize);
@@ -701,7 +759,7 @@ fillbuf (size_t save, struct stat const *st)
}
}
- readsize = buffer + bufalloc - readbuf;
+ readsize = buffer + bufalloc - sizeof (uword) - readbuf;
readsize -= readsize % pagesize;
while (true)
@@ -2417,6 +2475,7 @@ main (int argc, char **argv)
usage (EXIT_TROUBLE);
build_mbclen_cache ();
+ init_easy_encoding ();
/* If fgrep in a multibyte locale, then use grep if either
(1) case is ignored (where grep is typically faster), or