dfa: fix \s and \S to work for multibyte

* src/dfa.c (lex): In multibyte mode, we can't treat \s and \S as we do in single-byte mode. Map them to [[:space:]] and [^[:space:]] respectively, to make the DFA matcher use the regex-matcher for this term. * tests/multibyte-white-space: New file. Test for the bug. * tests/Makefile.am (TESTS): Add it. This bug was introduced with the addition of DFA support for \s and \S in commit v2.5.4-112-gf979ca0.
author: Jim Meyering <meyering@fb.com> 2013-09-22 10:50:05 -0700
committer: Jim Meyering <meyering@fb.com> 2013-10-01 17:10:49 -0700
commit: 01ec90be31ced413499acbafb9165f52d5903eaa (patch)
tree: bea59674f5ba4a4109c20a8ee4c3a1e27bc6e2c6
parent: e27403159f3dafba8dcb541d7ada76b5caf92d99 (diff)
download: grep-01ec90be31ced413499acbafb9165f52d5903eaa.tar.gz
4 files changed, 98 insertions, 8 deletions
diff --git a/NEWS b/NEWS
index 2f0e862d..a2733672 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,13 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep's \s and \S failed to work with multi-byte white space characters.
+  For example, \s would fail to match a non-breaking space, and this
+  would print nothing: printf '\xc2\xa0' | LC_ALL=en_US.utf8 grep '\s'
+  A related bug is that \S would mistakenly match an invalid multibyte
+  character.  E.g. this would match: printf '\x82\n' | grep '^\S$'
+  [bug present since 2.6]
+
   grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin)
   when converting an input string containing certain 4-byte UTF-8
   sequences to lower case.  The conversions to wchar_t and back to
diff --git a/src/dfa.c b/src/dfa.c
index e464fa19..de6c6717 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1435,14 +1435,45 @@ lex (void)
         case 'S':
           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          zeroset (ccl);
-          for (c2 = 0; c2 < NOTCHAR; ++c2)
-            if (isspace (c2))
-              setbit (c2, ccl);
-          if (c == 'S')
-            notset (ccl);
-          laststart = 0;
-          return lasttok = CSET + charclass_index (ccl);
+          if (MB_CUR_MAX == 1)
+            {
+              zeroset (ccl);
+              for (c2 = 0; c2 < NOTCHAR; ++c2)
+                if (isspace (c2))
+                  setbit (c2, ccl);
+              if (c == 'S')
+                notset (ccl);
+              laststart = 0;
+              return lasttok = CSET + charclass_index (ccl);
+            }
+
+#define PUSH_LEX_STATE(s)			\
+  do						\
+    {						\
+      char const *lexptr_saved = lexptr;	\
+      size_t lexleft_saved = lexleft;		\
+      lexptr = (s);				\
+      lexleft = strlen (lexptr)
+
+#define POP_LEX_STATE()				\
+      lexptr = lexptr_saved;			\
+      lexleft = lexleft_saved;			\
+    }						\
+  while (0)
+
+          /* FIXME: see if optimizing this, as is done with ANYCHAR and
+             add_utf8_anychar, makes sense.  */
+
+          /* \s and \S are documented to be equivalent to [[:space:]] and
+             [^[:space:]] respectively, so tell the lexer to process those
+             strings, each minus its "already processed" '['.  */
+          PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
+
+          lasttok = parse_bracket_exp ();
+
+          POP_LEX_STATE ();
+
+          return lasttok;
 
         case 'w':
         case 'W':
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 581f6888..760f7939 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -70,6 +70,7 @@ TESTS =						\
   invalid-multibyte-infloop			\
   khadafy					\
   max-count-vs-context				\
+  multibyte-white-space				\
   empty-line-mb					\
   unibyte-bracket-expr				\
   high-bit-range				\
diff --git a/tests/multibyte-white-space b/tests/multibyte-white-space
new file mode 100755
index 00000000..df2fe1b8
--- /dev/null
+++ b/tests/multibyte-white-space
@@ -0,0 +1,51 @@
+#! /bin/sh
+# Test whether \s matches SP and UTF-8 multi-byte white space characters.
+#
+# Copyright (C) 2013 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+# FIXME: including any of the following in the list below would
+# make this test fail on Fedora 19/glibc-2.17-18.fc19.
+# Restore them to the list once it is fixed.
+these_fail_with_glibc='
+U+00A0 NO-BREAK SPACE: c2 a0
+U+2007 FIGURE SPACE: e2 80 87
+U+200B ZERO WIDTH SPACE: e2 80 8b
+U+202F NARROW NO-BREAK SPACE: e2 80 af
+'
+
+utf8_space_characters=$(sed 's/.*://;s/ /\\x/g' <<\EOF
+U+0020 SPACE: 20
+U+1680 OGHAM SPACE MARK: e1 9a 80
+U+2000 EN QUAD: e2 80 80
+U+2001 EM QUAD: e2 80 81
+U+2002 EN SPACE: e2 80 82
+U+2003 EM SPACE: e2 80 83
+U+2004 THREE-PER-EM SPACE: e2 80 84
+U+2005 FOUR-PER-EM SPACE: e2 80 85
+U+2006 SIX-PER-EM SPACE: e2 80 86
+U+2008 PUNCTUATION SPACE: e2 80 88
+U+2009 THIN SPACE: e2 80 89
+U+200A HAIR SPACE: e2 80 8a
+U+205F MEDIUM MATHEMATICAL SPACE: e2 81 9f
+U+3000 IDEOGRAPHIC SPACE: e3 80 80
+EOF
+)
+
+fail=0
+
+for i in $utf8_space_characters; do
+  printf "$i\n" | grep -q '^\s$' || { warn_ "$i FAILED"; fail=1; }
+done
+
+Exit $fail
author	Jim Meyering <meyering@fb.com>	2013-09-22 10:50:05 -0700
committer	Jim Meyering <meyering@fb.com>	2013-10-01 17:10:49 -0700
commit	01ec90be31ced413499acbafb9165f52d5903eaa (patch)
tree	bea59674f5ba4a4109c20a8ee4c3a1e27bc6e2c6
parent	e27403159f3dafba8dcb541d7ada76b5caf92d99 (diff)
download	grep-01ec90be31ced413499acbafb9165f52d5903eaa.tar.gz