summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNorihiro Tanaka <noritnk@kcn.ne.jp>2016-08-31 23:44:08 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2016-09-01 08:52:47 -0700
commit960ad317db21e781b04010f4128bb149273a3327 (patch)
tree20d0faaef5877a432f12ef4843e8f851cc04c7b5
parentdc9d5bf33857b6c84f233e7c0d3192993b287928 (diff)
downloadgrep-960ad317db21e781b04010f4128bb149273a3327.tar.gz
grep: speed up -iF in multibyte locales
In a multibyte locale, if a pattern is composed of only single byte characters and their all counterparts are also single byte characters and the pattern does not have invalid sequences, grep -iF uses the fgrep matcher, the same as in a single byte locale (Bug#23752). * NEWS: Mention it. * src/grep.c (lonesome_lower): New constant. (fgrep_icase_available): New function. (fgrep_to_grep_pattern): Simplify it. (main): Use them. * src/searchutils.c (kwsinit): New arg MB_TRANS; all uses changed. Try fgrep matcher for case insensitive matching by grep -F in multibyte locale.
-rw-r--r--NEWS6
-rw-r--r--src/dfasearch.c2
-rw-r--r--src/grep.c116
-rw-r--r--src/kwsearch.c2
-rw-r--r--src/search.h2
-rw-r--r--src/searchutils.c26
6 files changed, 129 insertions, 25 deletions
diff --git a/NEWS b/NEWS
index 21db87a9..65e663dd 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,12 @@ GNU grep NEWS -*- outline -*-
grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
invalid regular expression that was read from an '-f'-specified file.
+ grep -F now tries fgrep matcher for case insensitive matching in
+ multibyte locale. It improves performance for a case which a pattern
+ is composed of only single byte characters and their all counterparts
+ are also single byte characters and the pattern does not have invalid
+ sequences.
+
* Noteworthy changes in release 2.25 (2016-04-21) [stable]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index c2e0177b..548ef080 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -89,7 +89,7 @@ kwsmusts (void)
struct dfamust *dm = dfamust (dfa);
if (!dm)
return;
- kwsinit (&kwset);
+ kwsinit (&kwset, false);
if (dm->exact)
{
/* Prepare a substring whose presence implies a match.
diff --git a/src/grep.c b/src/grep.c
index fc22c7b6..ae3b6e7a 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2265,14 +2265,91 @@ contains_encoding_error (char const *pat, size_t patlen)
return false;
}
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE
+ };
+
+static bool
+fgrep_icase_available (char const *pat, size_t patlen)
+{
+ for (size_t i = 0; i < patlen; ++i)
+ {
+ unsigned char c = pat[i];
+ if (localeinfo.sbclen[c] > 1)
+ return false;
+ }
+
+ for (size_t i = 0; i < patlen; ++i)
+ {
+ unsigned char c = pat[i];
+
+ wint_t wc = localeinfo.sbctowc[c];
+ if (wc == WEOF)
+ return false;
+
+ wint_t uwc = towupper (wc);
+ if (uwc != wc)
+ {
+ char s[MB_LEN_MAX];
+ mbstate_t mb_state = { 0 };
+ size_t len = wcrtomb (s, uwc, &mb_state);
+ if (len > 1 && len != (size_t) -1)
+ return false;
+ }
+
+ wint_t lwc = towlower (uwc);
+ if (lwc != uwc && lwc != wc && towupper (lwc) == uwc)
+ {
+ char s[MB_LEN_MAX];
+ mbstate_t mb_state = { 0 };
+ size_t len = wcrtomb (s, lwc, &mb_state);
+ if (len > 1 && len != (size_t) -1)
+ return false;
+ }
+
+ for (size_t j = 0; lonesome_lower[j]; j++)
+ {
+ wint_t li = lonesome_lower[j];
+ if (li != lwc && li != uwc && li != wc && towupper (li) == uwc)
+ {
+ char s[MB_LEN_MAX];
+ mbstate_t mb_state = { 0 };
+ size_t len = wcrtomb (s, li, &mb_state);
+ if (len > 1 && len != (size_t) -1)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
/* Change a pattern for fgrep into grep. */
static void
-fgrep_to_grep_pattern (size_t len, char const *keys,
- size_t *new_len, char **new_keys)
+fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
{
- char *p = *new_keys = xnmalloc (len + 1, 2);
+ char *keys, *new_keys, *p;
mbstate_t mb_state = { 0 };
- size_t n;
+ size_t len, n;
+
+ len = *len_p;
+ keys = *keys_p;
+
+ new_keys = xnmalloc (len + 1, 2);
+ p = new_keys;
for (; len; keys += n, len -= n)
{
@@ -2300,7 +2377,13 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
}
}
- *new_len = p - *new_keys;
+ free (*keys_p);
+ *keys_p = new_keys;
+ *len_p = p - new_keys;
+
+ matcher = "grep";
+ compile = Gcompile;
+ execute = EGexecute;
}
int
@@ -2733,20 +2816,17 @@ main (int argc, char **argv)
In a multibyte locale, switch from fgrep to grep if either
(1) case is ignored (where grep is typically faster), or
(2) the pattern has an encoding error (where fgrep might not work). */
- if (compile == Fcompile
- && (MB_CUR_MAX <= 1
- ? match_words
- : match_icase || contains_encoding_error (keys, keycc)))
+ if (compile == Fcompile)
{
- size_t new_keycc;
- char *new_keys;
- fgrep_to_grep_pattern (keycc, keys, &new_keycc, &new_keys);
- free (keys);
- keys = new_keys;
- keycc = new_keycc;
- matcher = "grep";
- compile = Gcompile;
- execute = EGexecute;
+ if (MB_CUR_MAX > 1)
+ {
+ if (contains_encoding_error (keys, keycc))
+ fgrep_to_grep_pattern (&keys, &keycc);
+ else if (match_icase && !fgrep_icase_available (keys, keycc))
+ fgrep_to_grep_pattern (&keys, &keycc);
+ }
+ else if (match_words)
+ fgrep_to_grep_pattern (&keys, &keycc);
}
compile (keys, keycc);
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 508ebc5e..7fe08aa4 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size)
{
size_t total = size;
- kwsinit (&kwset);
+ kwsinit (&kwset, true);
char const *p = pattern;
do
diff --git a/src/search.h b/src/search.h
index 431a67da..534a49e7 100644
--- a/src/search.h
+++ b/src/search.h
@@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN
typedef signed char mb_len_map_t;
/* searchutils.c */
-extern void kwsinit (kwset_t *);
+extern void kwsinit (kwset_t *, bool);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
extern wint_t mb_prev_wc (char const *, char const *, char const *);
extern wint_t mb_next_wc (char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index 8081d418..87f51a4a 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -25,15 +25,33 @@
#define NCHAR (UCHAR_MAX + 1)
void
-kwsinit (kwset_t *kwset)
+kwsinit (kwset_t *kwset, bool mb_trans)
{
static char trans[NCHAR];
int i;
- if (match_icase && MB_CUR_MAX == 1)
+ if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
{
- for (i = 0; i < NCHAR; ++i)
- trans[i] = toupper (i);
+ if (MB_CUR_MAX == 1)
+ for (i = 0; i < NCHAR; ++i)
+ trans[i] = toupper (i);
+ else
+ for (i = 0; i < NCHAR; ++i)
+ {
+ wint_t wc = localeinfo.sbctowc[i];
+ wint_t uwc = towupper (wc);
+ if (uwc != wc)
+ {
+ char s[MB_LEN_MAX];
+ mbstate_t mbs = { 0 };
+ size_t len = wcrtomb (s, uwc, &mbs);
+ if (len > 1)
+ abort ();
+ trans[i] = s[0];
+ }
+ else
+ trans[i] = i;
+ }
*kwset = kwsalloc (trans, false);
}