diff options
author | Paolo Bonzini <bonzini@gnu.org> | 2010-03-14 15:33:58 +0100 |
---|---|---|
committer | Paolo Bonzini <bonzini@gnu.org> | 2010-03-15 15:20:17 +0100 |
commit | f86dac527954cc4c2d86b05ab1481e7c8f26f286 (patch) | |
tree | d9edd185ee1c611dae86fc14941c480c4728ce59 | |
parent | 459663d87b9dc0af768857ae1c01e6156c58c2e2 (diff) | |
download | grep-f86dac527954cc4c2d86b05ab1481e7c8f26f286.tar.gz |
dfa: fix handling of ranges in multibyte character sets
* src/dfa.c (parse_bracket_exp_mb): Add separate ranges for
lowercase and uppercase endpoints if folding case.
* tests/Makefile.am (TESTS): Add case-fold-char-range.
* tests/case-fold-char-range: New.
-rw-r--r-- | NEWS | 10 |
-rw-r--r-- | src/dfa.c | 18 |
-rw-r--r-- | tests/Makefile.am | 1 |
-rw-r--r-- | tests/case-fold-char-range | 22 |
4 files changed, 44 insertions, 7 deletions
```diff
@@ -4,11 +4,11 @@ GNU grep NEWS -*- outline -*-
 
 ** Bug fixes
 
-  grep -i with a character class would malfunction in multi-byte locales.
-  For example, echo Y | LC_ALL=en_US.UTF-8 grep -i '[y]' would print nothing.
-  Character types would malfunction in multi-byte locales similarly; for
-  example, echo Y | LC_ALL=en_US.UTF-8 grep -i '[[:lower:]]' would print
-  nothing.
+  Character classes would malfunction in multi-byte locales when using grep -i.
+  Examples which would print nothing for LC_ALL=en_US.UTF-8 include:
+  - for ranges, echo Z | grep -i '[a-z]'
+  - for single characters, echo Y | grep -i '[y]'
+  - for character types, echo Y | grep -i '[[:lower:]]'
 
   Various bugs in grep -P, caused by expressions such as [^b] or \S matching
   newlines, were fixed. grep -P also supports the special sequences \Z and
@@ -569,6 +569,8 @@ parse_bracket_exp_mb (void)
                   wc1 = fetch_wc(_("unbalanced ["));
                 }
 
+              /* When case folding map a range, say [m-z] (or even [M-z]) to the
+                 pair of ranges, [m-z] [M-Z]. */
               if (range_sts_al == 0)
                 {
                   MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
@@ -576,10 +578,22 @@ parse_bracket_exp_mb (void)
                 }
               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
                                    range_sts_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
                                    range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+              work_mbc->range_sts[work_mbc->nranges] =
+                case_fold ? towlower(wc) : (wchar_t)wc;
+              work_mbc->range_ends[work_mbc->nranges++] =
+                case_fold ? towlower(wc2) : (wchar_t)wc2;
+
+              if (case_fold)
+                {
+                  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                                       range_sts_al, work_mbc->nranges + 1);
+                  work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
+                  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                                       range_ends_al, work_mbc->nranges + 1);
+                  work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+                }
             }
           else if (wc != WEOF)
             /* build normal characters. */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b71b2ee4..6a56dcff 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -18,6 +18,7 @@ TESTS = \
   backref.sh \
   bre.sh \
   case-fold-char-class \
+  case-fold-char-range \
   case-fold-char-type \
   dfaexec-multibyte \
   empty.sh \
diff --git a/tests/case-fold-char-range b/tests/case-fold-char-range
new file mode 100644
index 00000000..9b3120fb
--- /dev/null
+++ b/tests/case-fold-char-range
@@ -0,0 +1,22 @@
+#!/bin/sh
+# This would fail for grep-2.5.3
+: ${srcdir=.}
+. "$srcdir/init.sh"; path_prepend_ ../src
+
+printf 'A\nZ\n' > exp1 || framework_failure
+fail=0
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+  printf 'A\n1\nZ\n.\n' | LC_ALL=$LOC grep -i '[a-z]' > out1 || fail=1
+  compare out1 exp1 || fail=1
+done
+
+# This actually passes also for grep-2.5.3
+printf 'a\nz\n' > exp2 || framework_failure
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+  printf 'a\n1\nz\n.\n' | LC_ALL=$LOC grep -i '[A-Z]' > out2 || fail=1
+  compare out2 exp2 || fail=1
+done
+
+Exit $fail
```