summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Norihiro Tanaka <noritnk@kcn.ne.jp> 2014-01-13 08:42:25 -0800
committer Jim Meyering <meyering@fb.com> 2014-01-26 08:32:41 -0800
commit 3e64071cce24398133e6f9989225b4dbe58b6c97 (patch)
tree 4e4d39b3aa7f970ce44a90aa04e83caf66be0b1b
parent f804d7afbf75f08fd209ad08358bf76c18cc1d56 (diff)
download grep-3e64071cce24398133e6f9989225b4dbe58b6c97.tar.gz
dfasearch: skip kwset optimization when multi-byte+case-insensitive

Now that DFA searching works with multi-byte locales, the only
remaining reason to case-convert the searched input is the kwset
optimization. But multi-byte case-conversion is so expensive that
it's not worthwhile even to attempt that optimization.

* src/dfasearch.c (kwsmusts): Skip this function in ignore-case mode
when the locale is multi-byte.
(EGexecute): Now that this code need not handle multi-byte
case-ignoring matches, remove the expensive copy/case-conversion code.
With no case-converted buffer, there is no longer any need to call
mb_case_map_apply, so remove it and associated code.
(kwsincr_case): Remove function. Now, every use of this function is
equivalent to a use of kwsincr. Replace all uses.
* tests/turkish-eyes: Test all of -E, -F and -G.
-rw-r--r-- src/dfasearch.c 51
-rwxr-xr-x tests/turkish-eyes 6
2 files changed, 19 insertions, 38 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 46581ffa..69856a83 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -46,6 +46,11 @@ static struct patterns
static struct patterns *patterns;
static size_t pcount;
+/* Number of compiled fixed strings known to exactly match the regexp.
+ If kwsexec returns < kwset_exact_matches, then we don't need to
+ call the regexp matcher at all. */
+static size_t kwset_exact_matches;
+
void
dfaerror (char const *mesg)
{
@@ -69,22 +74,6 @@ dfawarn (char const *mesg)
dfaerror (mesg);
}
-/* Number of compiled fixed strings known to exactly match the regexp.
- If kwsexec returns < kwset_exact_matches, then we don't need to
- call the regexp matcher at all. */
-static size_t kwset_exact_matches;
-
-static char const *
-kwsincr_case (const char *must)
-{
- size_t n = strlen (must);
- mb_len_map_t *map = NULL;
- const char *buf = (match_icase && MB_CUR_MAX > 1
- ? mbtolower (must, &n, &map)
- : must);
- return kwsincr (kwset, buf, n);
-}
-
/* If the DFA turns out to have some set of fixed strings one of
which must occur in the match, then we build a kwset matcher
to find those strings, and thus quickly filter out impossible
@@ -95,6 +84,12 @@ kwsmusts (void)
struct dfamust const *dm;
char const *err;
+ /* With case-insensitive matching in a multi-byte locale, do not
+ use kwsearch, because in that case, it would be too expensive,
+ requiring that we case-convert all searched input. */
+ if (MB_CUR_MAX > 1 && match_icase)
+ return;
+
dm = dfamusts (dfa);
if (dm)
{
@@ -107,7 +102,7 @@ kwsmusts (void)
if (!dm->exact)
continue;
++kwset_exact_matches;
- if ((err = kwsincr_case (dm->must)) != NULL)
+ if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
/* Now, we compile the substrings that will require
@@ -116,7 +111,7 @@ kwsmusts (void)
{
if (dm->exact)
continue;
- if ((err = kwsincr_case (dm->must)) != NULL)
+ if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
if ((err = kwsprep (kwset)) != NULL)
@@ -217,21 +212,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
regoff_t start;
size_t len, best_len;
struct kwsmatch kwsm;
- size_t i, ret_val;
- mb_len_map_t *map = NULL;
-
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- {
- /* mbtolower adds a NUL byte at the end. That will provide
- space for the sentinel byte dfaexec may add. */
- char *case_buf = mbtolower (buf, &size, &map);
- if (start_ptr)
- start_ptr = case_buf + (start_ptr - buf);
- buf = case_buf;
- }
- }
+ size_t i;
mb_start = buf;
buflim = buf + size;
@@ -418,8 +399,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
len = end - beg;
success_in_len:;
size_t off = beg - buf;
- mb_case_map_apply (map, &off, &len);
*match_size = len;
- ret_val = off;
- return ret_val;
+ return off;
}
diff --git a/tests/turkish-eyes b/tests/turkish-eyes
index 68301e7b..cc585dae 100755
--- a/tests/turkish-eyes
+++ b/tests/turkish-eyes
@@ -38,7 +38,9 @@ i=$(printf '\304\261') # lowercase dotless i
search_str="$i:i I:$I"
printf "$data\n" > in || framework_failure_
-LC_ALL=$L grep -i "^$search_str\$" in > out || fail=1
-compare out in || fail=1
+for opt in -E -F -G; do
+ LC_ALL=$L grep $opt -i "$search_str" in > out || fail=1
+ compare out in || fail=1
+done
Exit $fail