diff options
author | Norihiro Tanaka <noritnk@kcn.ne.jp> | 2014-01-13 08:42:25 -0800 |
---|---|---|
committer | Jim Meyering <meyering@fb.com> | 2014-01-26 08:32:41 -0800 |
commit | 3e64071cce24398133e6f9989225b4dbe58b6c97 (patch) | |
tree | 4e4d39b3aa7f970ce44a90aa04e83caf66be0b1b | |
parent | f804d7afbf75f08fd209ad08358bf76c18cc1d56 (diff) | |
download | grep-3e64071cce24398133e6f9989225b4dbe58b6c97.tar.gz |
dfasearch: skip kwset optimization when multi-byte+case-insensitive
Now that DFA searching works with multi-byte locales, the only remaining
reason to case-convert the searched input is the kwset optimization.
But multi-byte case-conversion is so expensive that it's not
worthwhile even to attempt that optimization.
* src/dfasearch.c (kwsmusts): Skip this function in ignore-case mode
when the locale is multi-byte.
(EGexecute): Now that this code need not handle multi-byte case-ignoring
matches, remove the expensive copy/case-conversion code.
With no case-converted buffer, there is no longer any need to call
mb_case_map_apply, so remove it and associated code.
(kwsincr_case): Remove function. Since kwsmusts now returns early for
multi-byte ignore-case matching, every remaining use of this function is
equivalent to kwsincr (kwset, must, strlen (must)). Replace all uses.
* tests/turkish-eyes: Test all of -E, -F and -G.
-rw-r--r-- | src/dfasearch.c | 51 |
-rwxr-xr-x | tests/turkish-eyes | 6 |
2 files changed, 19 insertions, 38 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c index 46581ffa..69856a83 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -46,6 +46,11 @@ static struct patterns static struct patterns *patterns; static size_t pcount; +/* Number of compiled fixed strings known to exactly match the regexp. + If kwsexec returns < kwset_exact_matches, then we don't need to + call the regexp matcher at all. */ +static size_t kwset_exact_matches; + void dfaerror (char const *mesg) { @@ -69,22 +74,6 @@ dfawarn (char const *mesg) dfaerror (mesg); } -/* Number of compiled fixed strings known to exactly match the regexp. - If kwsexec returns < kwset_exact_matches, then we don't need to - call the regexp matcher at all. */ -static size_t kwset_exact_matches; - -static char const * -kwsincr_case (const char *must) -{ - size_t n = strlen (must); - mb_len_map_t *map = NULL; - const char *buf = (match_icase && MB_CUR_MAX > 1 - ? mbtolower (must, &n, &map) - : must); - return kwsincr (kwset, buf, n); -} - /* If the DFA turns out to have some set of fixed strings one of which must occur in the match, then we build a kwset matcher to find those strings, and thus quickly filter out impossible @@ -95,6 +84,12 @@ kwsmusts (void) struct dfamust const *dm; char const *err; + /* With case-insensitive matching in a multi-byte locale, do not + use kwsearch, because in that case, it would be too expensive, + requiring that we case-convert all searched input. 
*/ + if (MB_CUR_MAX > 1 && match_icase) + return; + dm = dfamusts (dfa); if (dm) { @@ -107,7 +102,7 @@ kwsmusts (void) if (!dm->exact) continue; ++kwset_exact_matches; - if ((err = kwsincr_case (dm->must)) != NULL) + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL) error (EXIT_TROUBLE, 0, "%s", err); } /* Now, we compile the substrings that will require @@ -116,7 +111,7 @@ kwsmusts (void) { if (dm->exact) continue; - if ((err = kwsincr_case (dm->must)) != NULL) + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL) error (EXIT_TROUBLE, 0, "%s", err); } if ((err = kwsprep (kwset)) != NULL) @@ -217,21 +212,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, regoff_t start; size_t len, best_len; struct kwsmatch kwsm; - size_t i, ret_val; - mb_len_map_t *map = NULL; - - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - /* mbtolower adds a NUL byte at the end. That will provide - space for the sentinel byte dfaexec may add. */ - char *case_buf = mbtolower (buf, &size, &map); - if (start_ptr) - start_ptr = case_buf + (start_ptr - buf); - buf = case_buf; - } - } + size_t i; mb_start = buf; buflim = buf + size; @@ -418,8 +399,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, len = end - beg; success_in_len:; size_t off = beg - buf; - mb_case_map_apply (map, &off, &len); *match_size = len; - ret_val = off; - return ret_val; + return off; } diff --git a/tests/turkish-eyes b/tests/turkish-eyes index 68301e7b..cc585dae 100755 --- a/tests/turkish-eyes +++ b/tests/turkish-eyes @@ -38,7 +38,9 @@ i=$(printf '\304\261') # lowercase dotless i search_str="$i:i I:$I" printf "$data\n" > in || framework_failure_ -LC_ALL=$L grep -i "^$search_str\$" in > out || fail=1 -compare out in || fail=1 +for opt in -E -F -G; do + LC_ALL=$L grep $opt -i "$search_str" in > out || fail=1 + compare out in || fail=1 +done Exit $fail |