From eb3292b3b205e50d0373f26ff0950ec82f49c14a Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 5 May 2014 20:19:19 -0700 Subject: grep: fix encoding-error incompatibilities among regex, DFA, KWset This follows up to http://bugs.gnu.org/17376 and fixes a different set of incompatibilities, namely between the regex matcher and the other matchers, when the pattern contains encoding errors. The GNU regex matcher is not consistent in this area: sometimes an encoding error matches only itself, and sometimes it matches part of a multibyte character. There is no documentation for grep's behavior in this area and users don't seem to care, and it's simpler to defer to the regex matcher for problematic cases like these. * NEWS: Document this. * src/dfa.c (ctok): Remove. All uses removed. (parse_bracket_exp, atom): Use BACKREF if a pattern contains an encoding error, so that the matcher will revert to regex. * src/dfasearch.c, src/grep.c, src/pcresearch.c, src/searchutils.c: Don't include dfa.h, since search.h now does that for us. * src/dfasearch.c (EGexecute): * src/kwsearch.c (Fexecute): In a UTF-8 locale, there's no need to worry about matching part of a multibyte character. * src/grep.c (contains_encoding_error): New static function. (main): Use it, so that grep -F is consistent with plain fgrep when the pattern contains an encoding error. * src/search.h: Include dfa.h, so that kwsearch.c can call using_utf8. * src/searchutils.c (is_mb_middle): Remove UTF-8-specific code. Callers now ensure that we are in a non-UTF-8 locale. The code was clearly wrong, anyway. * tests/fgrep-infloop, tests/invalid-multibyte-infloop: * tests/prefix-of-multibyte: Do not require that grep have a particular behavior for this test. It's OK to match (exit status 0), not match (exit status 1), or report an error (exit status 2), since the pattern contains an encoding error and grep's behavior is not specified for such patterns. Test only that KWset, DFA, and regex agree. 
* tests/prefix-of-multibyte: Add tests for ABCABC and __..._ABCABC___. --- src/grep.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'src/grep.c') diff --git a/src/grep.c b/src/grep.c index d58ef03a..a661fc0b 100644 --- a/src/grep.c +++ b/src/grep.c @@ -32,7 +32,6 @@ #include "c-ctype.h" #include "closeout.h" #include "colorize.h" -#include "dfa.h" #include "error.h" #include "exclude.h" #include "exitfail.h" @@ -1888,6 +1887,22 @@ parse_grep_colors (void) return; } +/* Return true if PAT (of length PATLEN) contains an encoding error. */ +static bool +contains_encoding_error (char const *pat, size_t patlen) +{ + mbstate_t mbs = { 0 }; + size_t i, charlen; + + for (i = 0; i < patlen; i += charlen + (charlen == 0)) + { + charlen = mbrlen (pat + i, patlen - i, &mbs); + if ((size_t) -2 <= charlen) + return true; + } + return false; +} + /* Change a pattern for fgrep into grep. */ static void fgrep_to_grep_pattern (size_t len, char const *keys, @@ -2318,9 +2333,11 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); - /* If case-insensitive fgrep in a multibyte locale, improve - performance by using grep instead. */ - if (match_icase && compile == Fcompile && MB_CUR_MAX > 1) + /* If fgrep in a multibyte locale, then use grep if either + (1) case is ignored (where grep is typically faster), or + (2) the pattern has an encoding error (where fgrep might not work). */ + if (compile == Fcompile && MB_CUR_MAX > 1 + && (match_icase || contains_encoding_error (keys, keycc))) { size_t new_keycc; char *new_keys; -- cgit v1.2.1