summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paolo Bonzini <bonzini@gnu.org> 2010-03-14 15:33:58 +0100
committer Paolo Bonzini <bonzini@gnu.org> 2010-03-15 15:20:17 +0100
commit f86dac527954cc4c2d86b05ab1481e7c8f26f286 (patch)
tree d9edd185ee1c611dae86fc14941c480c4728ce59
parent 459663d87b9dc0af768857ae1c01e6156c58c2e2 (diff)
download grep-f86dac527954cc4c2d86b05ab1481e7c8f26f286.tar.gz
dfa: fix handling of ranges in multibyte character sets
* src/dfa.c (parse_bracket_exp_mb): Add separate ranges for lowercase
and uppercase endpoints if folding case.
* tests/Makefile.am (TESTS): Add case-fold-char-range.
* tests/case-fold-char-range: New.
-rw-r--r-- NEWS 10
-rw-r--r-- src/dfa.c 18
-rw-r--r-- tests/Makefile.am 1
-rw-r--r-- tests/case-fold-char-range 22
4 files changed, 44 insertions, 7 deletions
diff --git a/NEWS b/NEWS
index 2ade4fc7..4f9c7787 100644
--- a/NEWS
+++ b/NEWS
@@ -4,11 +4,11 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
- grep -i with a character class would malfunction in multi-byte locales.
- For example, echo Y | LC_ALL=en_US.UTF-8 grep -i '[y]' would print nothing.
- Character types would malfunction in multi-byte locales similarly; for
- example, echo Y | LC_ALL=en_US.UTF-8 grep -i '[[:lower:]]' would print
- nothing.
+ Character classes would malfunction in multi-byte locales when using grep -i.
+ Examples which would print nothing for LC_ALL=en_US.UTF-8 include:
+ - for ranges, echo Z | grep -i '[a-z]'
+ - for single characters, echo Y | grep -i '[y]'
+ - for character types, echo Y | grep -i '[[:lower:]]'
Various bugs in grep -P, caused by expressions such as [^b] or \S matching
newlines, were fixed. grep -P also supports the special sequences \Z and
diff --git a/src/dfa.c b/src/dfa.c
index 6c7494ec..90e3c187 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -569,6 +569,8 @@ parse_bracket_exp_mb (void)
wc1 = fetch_wc(_("unbalanced ["));
}
+ /* When case folding, map a range, say [m-z] (or even [M-z]), to the
+ pair of ranges [m-z] and [M-Z]. */
if (range_sts_al == 0)
{
MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
@@ -576,10 +578,22 @@ parse_bracket_exp_mb (void)
}
REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+ work_mbc->range_sts[work_mbc->nranges] =
+ case_fold ? towlower(wc) : (wchar_t)wc;
+ work_mbc->range_ends[work_mbc->nranges++] =
+ case_fold ? towlower(wc2) : (wchar_t)wc2;
+
+ if (case_fold)
+ {
+ REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+ range_sts_al, work_mbc->nranges + 1);
+ work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
+ REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+ range_ends_al, work_mbc->nranges + 1);
+ work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+ }
}
else if (wc != WEOF)
/* build normal characters. */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b71b2ee4..6a56dcff 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -18,6 +18,7 @@ TESTS = \
backref.sh \
bre.sh \
case-fold-char-class \
+ case-fold-char-range \
case-fold-char-type \
dfaexec-multibyte \
empty.sh \
diff --git a/tests/case-fold-char-range b/tests/case-fold-char-range
new file mode 100644
index 00000000..9b3120fb
--- /dev/null
+++ b/tests/case-fold-char-range
@@ -0,0 +1,22 @@
+#!/bin/sh
+# This would fail for grep-2.5.3
+: ${srcdir=.}
+. "$srcdir/init.sh"; path_prepend_ ../src
+
+printf 'A\nZ\n' > exp1 || framework_failure
+fail=0
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+ printf 'A\n1\nZ\n.\n' | LC_ALL=$LOC grep -i '[a-z]' > out1 || fail=1
+ compare out1 exp1 || fail=1
+done
+
+# This actually passes also for grep-2.5.3
+printf 'a\nz\n' > exp2 || framework_failure
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+ printf 'a\n1\nz\n.\n' | LC_ALL=$LOC grep -i '[A-Z]' > out2 || fail=1
+ compare out2 exp2 || fail=1
+done
+
+Exit $fail