summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2016-09-01 08:43:48 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2016-09-01 08:53:16 -0700
commit1e166fdaa69902793057ed2a63408f51c3b4273b (patch)
treece8c9baaac32220793f1a1cf86400e2d6259dcca
parent960ad317db21e781b04010f4128bb149273a3327 (diff)
downloadgrep-1e166fdaa69902793057ed2a63408f51c3b4273b.tar.gz
grep: avoid code duplication with -iF
This follows up on the -iF performance improvement (Bug#23752). * NEWS: Simplify description of -iF improvement. * src/dfa.c: Do not include wctype.h. (lonesome_lower, case_folded_counterparts): Move to localeinfo.c. (CASE_FOLDED_BUFSIZE): Move to localeinfo.h. * src/grep.c: Do not include wctype.h. (lonesome_lower): Remove. (fgrep_icase_available): Use case_folded_counterparts instead. Do not call it for the same character twice. Return false on wcrtomb failures (which should never happen). (fgrep_to_grep_pattern, main): Simplify. Let fgrep_to_grep’s caller fiddle with the global variables. * src/localeinfo.c: Include <wctype.h> (lonesome_lower, case_folded_counterparts): Move here from src/dfa.c. Return int, not unsigned int. Verify that CASE_FOLDED_BUFSIZE is big enough. * src/localeinfo.h (CASE_FOLDED_BUFSIZE): Now 32, so that we don’t expose lonesome_lower’s size. * src/searchutils.c (kwsinit): Return new kwset instead of storing it via a pointer. All callers changed. Simplify a bit.
-rw-r--r--NEWS9
-rw-r--r--src/dfa.c48
-rw-r--r--src/dfasearch.c2
-rw-r--r--src/grep.c135
-rw-r--r--src/kwsearch.c2
-rw-r--r--src/localeinfo.c47
-rw-r--r--src/localeinfo.h7
-rw-r--r--src/search.h2
-rw-r--r--src/searchutils.c24
9 files changed, 115 insertions, 161 deletions
diff --git a/NEWS b/NEWS
index 65e663dd..91f1bfc5 100644
--- a/NEWS
+++ b/NEWS
@@ -10,18 +10,15 @@ GNU grep NEWS -*- outline -*-
as it now uses the Aho-Corasick algorithm instead of the
Commentz-Walter algorithm in that case.
+ grep -iF is typically much faster in a multibyte locale, if the
+ pattern and its case counterparts contain only single byte characters.
+
In multibyte locales that are not UTF-8, grep now handles leading
"." in patterns more efficiently.
grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
invalid regular expression that was read from an '-f'-specified file.
- grep -F now tries fgrep matcher for case insensitive matching in
- multibyte locale. It improves performance for a case which a pattern
- is composed of only single byte characters and their all counterparts
- are also single byte characters and the pattern does not have invalid
- sequences.
-
* Noteworthy changes in release 2.25 (2016-04-21) [stable]
diff --git a/src/dfa.c b/src/dfa.c
index cc96740e..0f5109e2 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -50,7 +50,6 @@
#define _(str) gettext (str)
#include <wchar.h>
-#include <wctype.h>
/* HPUX defines these as macros in sys/param.h. */
#ifdef setbit
@@ -857,53 +856,6 @@ using_simple_locale (bool multibyte)
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE,
- };
-
-/* Maximum number of characters that can be the case-folded
- counterparts of a single character, not counting the character
- itself. This is 1 for towupper, 1 for towlower, and 1 for each
- entry in LONESOME_LOWER. */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
- itself, and store them into FOLDED. Return the number of characters
- stored. */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
- unsigned int i;
- unsigned int n = 0;
- wint_t uc = towupper (c);
- wint_t lc = towlower (uc);
- if (uc != c)
- folded[n++] = uc;
- if (lc != uc && lc != c && towupper (lc) == uc)
- folded[n++] = lc;
- for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
- {
- wint_t li = lonesome_lower[i];
- if (li != lc && li != uc && li != c && towupper (li) == uc)
- folded[n++] = li;
- }
- return n;
-}
-
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 548ef080..61b1f708 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -89,7 +89,7 @@ kwsmusts (void)
struct dfamust *dm = dfamust (dfa);
if (!dm)
return;
- kwsinit (&kwset, false);
+ kwset = kwsinit (false);
if (dm->exact)
{
/* Prepare a substring whose presence implies a match.
diff --git a/src/grep.c b/src/grep.c
index ae3b6e7a..15c6dc6f 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -22,7 +22,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <wchar.h>
-#include <wctype.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdarg.h>
@@ -2265,91 +2264,53 @@ contains_encoding_error (char const *pat, size_t patlen)
return false;
}
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE
- };
+/* Return true if the fgrep pattern PAT, of size PATLEN, matches only
+ single-byte characters including case folding, and so is suitable
+ to be converted to a grep pattern. */
static bool
fgrep_icase_available (char const *pat, size_t patlen)
{
- for (size_t i = 0; i < patlen; ++i)
- {
- unsigned char c = pat[i];
- if (localeinfo.sbclen[c] > 1)
- return false;
- }
+ bool used[UCHAR_MAX + 1] = { 0, };
- for (size_t i = 0; i < patlen; ++i)
+ for (size_t i = 0; i < patlen; i++)
{
unsigned char c = pat[i];
-
- wint_t wc = localeinfo.sbctowc[c];
- if (wc == WEOF)
+ if (localeinfo.sbctowc[c] == WEOF)
return false;
+ used[c] = true;
+ }
- wint_t uwc = towupper (wc);
- if (uwc != wc)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- size_t len = wcrtomb (s, uwc, &mb_state);
- if (len > 1 && len != (size_t) -1)
- return false;
- }
-
- wint_t lwc = towlower (uwc);
- if (lwc != uwc && lwc != wc && towupper (lwc) == uwc)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- size_t len = wcrtomb (s, lwc, &mb_state);
- if (len > 1 && len != (size_t) -1)
- return false;
- }
+ for (int c = 0; c <= UCHAR_MAX; c++)
+ if (used[c])
+ {
+ wint_t wc = localeinfo.sbctowc[c];
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ size_t nfolded = case_folded_counterparts (wc, folded);
- for (size_t j = 0; lonesome_lower[j]; j++)
- {
- wint_t li = lonesome_lower[j];
- if (li != lwc && li != uwc && li != wc && towupper (li) == uwc)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- size_t len = wcrtomb (s, li, &mb_state);
- if (len > 1 && len != (size_t) -1)
- return false;
- }
- }
- }
+ for (size_t i = 0; i < nfolded; i++)
+ {
+ char s[MB_LEN_MAX];
+ mbstate_t mb_state = { 0 };
+ if (1 < wcrtomb (s, folded[i], &mb_state))
+ return false;
+ }
+ }
return true;
}
-/* Change a pattern for fgrep into grep. */
+/* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */
+
static void
fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
{
- char *keys, *new_keys, *p;
+ size_t len = *len_p;
+ char *keys = *keys_p;
mbstate_t mb_state = { 0 };
- size_t len, n;
-
- len = *len_p;
- keys = *keys_p;
-
- new_keys = xnmalloc (len + 1, 2);
- p = new_keys;
+ char *new_keys = xnmalloc (len + 1, 2);
+ char *p = new_keys;
+ size_t n;
for (; len; keys += n, len -= n)
{
@@ -2365,14 +2326,15 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
case (size_t) -1:
memset (&mb_state, 0, sizeof mb_state);
+ n = 1;
/* Fall through. */
case 1:
- *p = '\\';
- p += strchr ("$*.[\\^", *keys) != NULL;
- /* Fall through. */
- case 0:
+ switch (*keys)
+ {
+ case '$': case '*': case '.': case '[': case '\\': case '^':
+ *p++ = '\\'; break;
+ }
*p++ = *keys;
- n = 1;
break;
}
}
@@ -2380,10 +2342,6 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
free (*keys_p);
*keys_p = new_keys;
*len_p = p - new_keys;
-
- matcher = "grep";
- compile = Gcompile;
- execute = EGexecute;
}
int
@@ -2814,19 +2772,18 @@ main (int argc, char **argv)
/* In a unibyte locale, switch from fgrep to grep if
the pattern matches words (where grep is typically faster).
In a multibyte locale, switch from fgrep to grep if either
- (1) case is ignored (where grep is typically faster), or
- (2) the pattern has an encoding error (where fgrep might not work). */
- if (compile == Fcompile)
+ (1) the pattern has an encoding error (where fgrep might not work), or
+ (2) case is ignored and a fast fgrep ignore-case is not available. */
+ if (compile == Fcompile
+ && (MB_CUR_MAX <= 1
+ ? match_words
+ : (contains_encoding_error (keys, keycc)
+ || (match_icase && !fgrep_icase_available (keys, keycc)))))
{
- if (MB_CUR_MAX > 1)
- {
- if (contains_encoding_error (keys, keycc))
- fgrep_to_grep_pattern (&keys, &keycc);
- else if (match_icase && !fgrep_icase_available (keys, keycc))
- fgrep_to_grep_pattern (&keys, &keycc);
- }
- else if (match_words)
- fgrep_to_grep_pattern (&keys, &keycc);
+ fgrep_to_grep_pattern (&keys, &keycc);
+ matcher = "grep";
+ compile = Gcompile;
+ execute = EGexecute;
}
compile (keys, keycc);
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7fe08aa4..29d140cd 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size)
{
size_t total = size;
- kwsinit (&kwset, true);
+ kwset = kwsinit (true);
char const *p = pattern;
do
diff --git a/src/localeinfo.c b/src/localeinfo.c
index 329d4314..ca96afc7 100644
--- a/src/localeinfo.c
+++ b/src/localeinfo.c
@@ -29,6 +29,7 @@
#include <locale.h>
#include <stdlib.h>
#include <string.h>
+#include <wctype.h>
/* The sbclen implementation relies on this. */
verify (MB_LEN_MAX <= SCHAR_MAX);
@@ -64,3 +65,49 @@ init_localeinfo (struct localeinfo *localeinfo)
localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
}
}
+
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE,
+ };
+
+/* Verify that the worst case fits. This is 1 for towupper, 1 for
+ towlower, and 1 for each entry in LONESOME_LOWER. */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+ <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+ itself, and store them into FOLDED. Return the number of characters
+ stored. */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+ int i;
+ int n = 0;
+ wint_t uc = towupper (c);
+ wint_t lc = towlower (uc);
+ if (uc != c)
+ folded[n++] = uc;
+ if (lc != uc && lc != c && towupper (lc) == uc)
+ folded[n++] = lc;
+ for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+ {
+ wint_t li = lonesome_lower[i];
+ if (li != lc && li != uc && li != c && towupper (li) == uc)
+ folded[n++] = li;
+ }
+ return n;
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
index 70b55a8d..cf2f9a69 100644
--- a/src/localeinfo.h
+++ b/src/localeinfo.h
@@ -45,3 +45,10 @@ struct localeinfo
};
extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is a generous upper bound. */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/search.h b/src/search.h
index 534a49e7..fb4e5c82 100644
--- a/src/search.h
+++ b/src/search.h
@@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN
typedef signed char mb_len_map_t;
/* searchutils.c */
-extern void kwsinit (kwset_t *, bool);
+extern kwset_t kwsinit (bool);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
extern wint_t mb_prev_wc (char const *, char const *, char const *);
extern wint_t mb_next_wc (char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index 87f51a4a..73d6c1c7 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,42 +24,36 @@
#define NCHAR (UCHAR_MAX + 1)
-void
-kwsinit (kwset_t *kwset, bool mb_trans)
+kwset_t
+kwsinit (bool mb_trans)
{
static char trans[NCHAR];
- int i;
+ char *transptr = NULL;
if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
{
if (MB_CUR_MAX == 1)
- for (i = 0; i < NCHAR; ++i)
+ for (int i = 0; i < NCHAR; i++)
trans[i] = toupper (i);
else
- for (i = 0; i < NCHAR; ++i)
+ for (int i = 0; i < NCHAR; i++)
{
wint_t wc = localeinfo.sbctowc[i];
wint_t uwc = towupper (wc);
if (uwc != wc)
{
- char s[MB_LEN_MAX];
mbstate_t mbs = { 0 };
- size_t len = wcrtomb (s, uwc, &mbs);
- if (len > 1)
+ size_t len = wcrtomb (&trans[i], uwc, &mbs);
+ if (len != 1)
abort ();
- trans[i] = s[0];
}
else
trans[i] = i;
}
-
- *kwset = kwsalloc (trans, false);
+ transptr = trans;
}
- else
- *kwset = kwsalloc (NULL, false);
- if (!*kwset)
- xalloc_die ();
+ return kwsalloc (transptr, false);
}
/* In the buffer *MB_START, return the number of bytes needed to go