summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Paul Eggert <eggert@cs.ucla.edu> 2020-09-23 18:57:57 -0700
committer Paul Eggert <eggert@cs.ucla.edu> 2020-09-23 19:56:24 -0700
commit 016e590a8198009bce0e1078f6d4c7e037e2df3c (patch)
tree 65f7540d987aa52c0a74e4462c613f1e136d8291 /src
parent c6b0b7df3a4824b9bd3ee2bb96f96ab6b1a7cb76 (diff)
download grep-016e590a8198009bce0e1078f6d4c7e037e2df3c.tar.gz
grep: fix more Turkish-eyes bugs
Fix more bugs recently uncovered by Norihiro Tanaka (Bug#43577).
* NEWS: Mention new bug report.
* src/grep.c (ok_fold): New static var.
(setup_ok_fold): New function.
(fgrep_icase_charlen): Reject single-byte characters if they match
some multibyte characters when ignoring case.
This part of the patch is partly derived from
<https://bugs.gnu.org/43577#14>, which means it is:
Co-authored-by: Norihiro Tanaka <noritnk@kcn.ne.jp>
(main): Call setup_ok_fold if ok_fold might be needed.
* src/searchutils.c (kwsinit): With the grep.c changes, this code
can now revert to classic 7th Edition Unix style; aborting would be wrong.
* tests/turkish-eyes: Add tests for these bugs.
Diffstat (limited to 'src')
-rw-r--r-- src/grep.c 116
-rw-r--r-- src/searchutils.c 23
2 files changed, 86 insertions, 53 deletions
diff --git a/src/grep.c b/src/grep.c
index 11856d86..1a52c898 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2300,37 +2300,75 @@ contains_encoding_error (char const *pat, size_t patlen)
return false;
}
+/* When ignoring case and (-E or -F or -G), then for each single-byte
+ character I, ok_fold[I] is 1 if every case folded counterpart of I
+ is also single-byte, and is -1 otherwise. */
+static signed char ok_fold[NCHAR];
+static void
+setup_ok_fold (void)
+{
+ for (int i = 0; i < NCHAR; i++)
+ {
+ wint_t wi = localeinfo.sbctowc[i];
+ if (wi == WEOF)
+ continue;
+
+ int ok = 1;
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ for (int n = case_folded_counterparts (wi, folded); 0 <= --n; )
+ {
+ char buf[MB_LEN_MAX];
+ mbstate_t s = { 0 };
+ if (wcrtomb (buf, folded[n], &s) != 1)
+ {
+ ok = -1;
+ break;
+ }
+ }
+ ok_fold[i] = ok;
+ }
+}
+
/* Return the number of bytes in the initial character of PAT, of size
PATLEN, if Fcompile can handle that character. Return -1 if
Fcompile cannot handle it. MBS is the multibyte conversion state.
-
- Fcompile can handle a character C if C is single-byte, or if C has no
- case folded counterparts and toupper translates none of its bytes. */
+ PATLEN must be nonzero. */
static int
fgrep_icase_charlen (char const *pat, size_t patlen, mbstate_t *mbs)
{
- int n = localeinfo.sbclen[to_uchar (*pat)];
- if (n < 0)
+ unsigned char pat0 = pat[0];
+
+ /* If PAT starts with a single-byte character, Fcompile works if
+ every case folded counterpart is also single-byte. */
+ if (localeinfo.sbctowc[pat0] != WEOF)
+ return ok_fold[pat0];
+
+ wchar_t wc;
+ size_t wn = mbrtowc (&wc, pat, patlen, mbs);
+
+ /* If PAT starts with an encoding error, Fcompile does not work. */
+ if (MB_LEN_MAX < wn)
+ return -1;
+
+ /* PAT starts with a multibyte character. Fcompile works if the
+ character has no case folded counterparts and toupper translates
+ none of its encoding's bytes. */
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ if (case_folded_counterparts (wc, folded))
+ return -1;
+ for (int i = wn; 0 < --i; )
{
- wchar_t wc;
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- size_t wn = mbrtowc (&wc, pat, patlen, mbs);
- if (MB_LEN_MAX < wn || case_folded_counterparts (wc, folded))
+ unsigned char c = pat[i];
+ if (toupper (c) != c)
return -1;
- for (int i = wn; 0 < --i; )
- {
- unsigned char c = pat[i];
- if (toupper (c) != c)
- return -1;
- }
- n = wn;
}
- return n;
+ return wn;
}
/* Return true if the -F patterns PAT, of size PATLEN, contain only
- single-byte characters or characters not subject to case folding,
+ single-byte characters that case-fold only to single-byte
+ characters, or multibyte characters not subject to case folding,
and so can be processed by Fcompile. */
static bool
@@ -2950,26 +2988,34 @@ main (int argc, char **argv)
if (matcher < 0)
matcher = G_MATCHER_INDEX;
- /* In a single-byte locale, switch from -F to -G if it is a single
- pattern that matches words, where -G is typically faster. In a
- multi-byte locale, switch if the patterns have an encoding error
- (where -F does not work) or if -i and the patterns will not work
- for -iF. */
if (matcher == F_MATCHER_INDEX
- && (! localeinfo.multibyte
- ? n_patterns == 1 && match_words
- : (contains_encoding_error (keys, keycc)
- || (match_icase && !fgrep_icase_available (keys, keycc)))))
+ || matcher == E_MATCHER_INDEX || matcher == G_MATCHER_INDEX)
{
- fgrep_to_grep_pattern (&pattern_array, &keycc);
- keys = pattern_array;
- matcher = G_MATCHER_INDEX;
+ if (match_icase)
+ setup_ok_fold ();
+
+ /* In a single-byte locale, switch from -F to -G if it is a single
+ pattern that matches words, where -G is typically faster. In a
+ multibyte locale, switch if the patterns have an encoding error
+ (where -F does not work) or if -i and the patterns will not work
+ for -iF. */
+ if (matcher == F_MATCHER_INDEX)
+ {
+ if (! localeinfo.multibyte
+ ? n_patterns == 1 && match_words
+ : (contains_encoding_error (keys, keycc)
+ || (match_icase && !fgrep_icase_available (keys, keycc))))
+ {
+ fgrep_to_grep_pattern (&pattern_array, &keycc);
+ keys = pattern_array;
+ matcher = G_MATCHER_INDEX;
+ }
+ }
+ /* With two or more patterns, if -F works then switch from either -E
+ or -G, as -F is probably faster then. */
+ else if (1 < n_patterns)
+ matcher = try_fgrep_pattern (matcher, keys, &keycc);
}
- /* With two or more patterns, if -F works then switch from either -E
- or -G, as -F is probably faster then. */
- else if ((matcher == G_MATCHER_INDEX || matcher == E_MATCHER_INDEX)
- && 1 < n_patterns)
- matcher = try_fgrep_pattern (matcher, keys, &keycc);
execute = matchers[matcher].execute;
compiled_pattern =
diff --git a/src/searchutils.c b/src/searchutils.c
index c4bb8020..aa110639 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -48,24 +48,11 @@ kwsinit (bool mb_trans)
if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
{
trans = xmalloc (NCHAR);
- if (MB_CUR_MAX == 1)
- for (int i = 0; i < NCHAR; i++)
- trans[i] = toupper (i);
- else
- for (int i = 0; i < NCHAR; i++)
- {
- wint_t wc = localeinfo.sbctowc[i];
- wint_t uwc = towupper (wc);
- if (uwc != wc)
- {
- mbstate_t mbs = { 0 };
- size_t len = wcrtomb (&trans[i], uwc, &mbs);
- if (len != 1)
- abort ();
- }
- else
- trans[i] = i;
- }
+ /* If I is a single-byte character that becomes a different
+ single-byte character when uppercased, set trans[I]
+ to that character. Otherwise, set trans[I] to I. */
+ for (int i = 0; i < NCHAR; i++)
+ trans[i] = toupper (i);
}
return kwsalloc (trans);