diff options
author | Jim Meyering <meyering@fb.com> | 2013-09-22 10:50:05 -0700 |
---|---|---|
committer | Jim Meyering <meyering@fb.com> | 2013-10-01 17:10:49 -0700 |
commit | 01ec90be31ced413499acbafb9165f52d5903eaa (patch) | |
tree | bea59674f5ba4a4109c20a8ee4c3a1e27bc6e2c6 | |
parent | e27403159f3dafba8dcb541d7ada76b5caf92d99 (diff) | |
download | grep-01ec90be31ced413499acbafb9165f52d5903eaa.tar.gz |
dfa: fix \s and \S to work for multibyte
* src/dfa.c (lex): In multibyte mode, we can't treat \s and \S as we do
in single-byte mode. Map them to [[:space:]] and [^[:space:]] respectively,
to make the DFA matcher use the regex-matcher for this term.
* tests/multibyte-white-space: New file. Test for the bug.
* tests/Makefile.am (TESTS): Add it.
This bug was introduced with the addition of DFA support
for \s and \S in commit v2.5.4-112-gf979ca0.
-rw-r--r-- | NEWS | 7 |
-rw-r--r-- | src/dfa.c | 47 |
-rw-r--r-- | tests/Makefile.am | 1 |
-rwxr-xr-x | tests/multibyte-white-space | 51 |
4 files changed, 98 insertions, 8 deletions
@@ -4,6 +4,13 @@ GNU grep NEWS -*- outline -*- ** Bug fixes + grep's \s and \S failed to work with multi-byte white space characters. + For example, \s would fail to match a non-breaking space, and this + would print nothing: LC_ALL=en_US.utf8 printf '\xc2\xa0' | grep '\s' + A related bug is that \S would mistakenly match an invalid multibyte + character. E.g. this would match: printf '\x82\n' | src/grep '^\S$' + [bug present since 2.6] + grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin) when converting an input string containing certain 4-byte UTF-8 sequences to lower case. The conversions to wchar_t and back to @@ -1435,14 +1435,45 @@ lex (void) case 'S': if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - zeroset (ccl); - for (c2 = 0; c2 < NOTCHAR; ++c2) - if (isspace (c2)) - setbit (c2, ccl); - if (c == 'S') - notset (ccl); - laststart = 0; - return lasttok = CSET + charclass_index (ccl); + if (MB_CUR_MAX == 1) + { + zeroset (ccl); + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (isspace (c2)) + setbit (c2, ccl); + if (c == 'S') + notset (ccl); + laststart = 0; + return lasttok = CSET + charclass_index (ccl); + } + +#define PUSH_LEX_STATE(s) \ + do \ + { \ + char const *lexptr_saved = lexptr; \ + size_t lexleft_saved = lexleft; \ + lexptr = (s); \ + lexleft = strlen (lexptr) + +#define POP_LEX_STATE() \ + lexptr = lexptr_saved; \ + lexleft = lexleft_saved; \ + } \ + while (0) + + /* FIXME: see if optimizing this, as is done with ANYCHAR and + add_utf8_anychar, makes sense. */ + + /* \s and \S are documented to be equivalent to [[:space:]] and + [^[:space:]] respectively, so tell the lexer to process those + strings, each minus its "already processed" '['. */ + PUSH_LEX_STATE (c == 's' ? 
"[:space:]]" : "^[:space:]]"); + + lasttok = parse_bracket_exp (); + + POP_LEX_STATE (); + + return lasttok; case 'w': case 'W': diff --git a/tests/Makefile.am b/tests/Makefile.am index 581f6888..760f7939 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -70,6 +70,7 @@ TESTS = \ invalid-multibyte-infloop \ khadafy \ max-count-vs-context \ + multibyte-white-space \ empty-line-mb \ unibyte-bracket-expr \ high-bit-range \ diff --git a/tests/multibyte-white-space b/tests/multibyte-white-space new file mode 100755 index 00000000..df2fe1b8 --- /dev/null +++ b/tests/multibyte-white-space @@ -0,0 +1,51 @@ +#! /bin/sh +# Test whether \s matches SP and UTF-8 multi-byte white space characters. +# +# Copyright (C) 2013 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ + +LC_ALL=en_US.UTF-8 +export LC_ALL + +# FIXME: including any of the following in the list below would +# make this test fail on Fedora 19/glibc-2.17-18.fc19. +# Restore them to the list once it is fixed. 
+these_fail_with_glibc=' +U+00A0 NO-BREAK SPACE: c2 a0 +U+2007 FIGURE SPACE: e2 80 87 +U+200B ZERO WIDTH SPACE: e2 80 8b +U+202F NARROW NO-BREAK SPACE: e2 80 af +' + +utf8_space_characters=$(sed 's/.*://;s/ /\\x/g' <<\EOF +U+0020 SPACE: 20 +U+1680 OGHAM SPACE MARK: e1 9a 80 +U+2000 EN QUAD: e2 80 80 +U+2001 EM QUAD: e2 80 81 +U+2002 EN SPACE: e2 80 82 +U+2003 EM SPACE: e2 80 83 +U+2004 THREE-PER-EM SPACE: e2 80 84 +U+2005 FOUR-PER-EM SPACE: e2 80 85 +U+2006 SIX-PER-EM SPACE: e2 80 86 +U+2008 PUNCTUATION SPACE: e2 80 88 +U+2009 THIN SPACE: e2 80 89 +U+200A HAIR SPACE: e2 80 8a +U+205F MEDIUM MATHEMATICAL SPACE: e2 81 9f +U+3000 IDEOGRAPHIC SPACE: e3 80 80 +EOF +) + +fail=0 + +for i in $utf8_space_characters; do + printf "$i\n" | grep -q '^\s$' || { warn_ "$i FAILED"; fail=1; } +done + +Exit $fail |