summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Eli Zaretskii <eliz@gnu.org> 2011-10-02 21:33:53 +0200
committer Jim Meyering <meyering@redhat.com> 2011-10-04 08:26:50 +0200
commit 7d20c09e3e7cf3af9060f395e884fca285ce3598 (patch)
tree 41c1c0812dbded75ad10a966d857b2989fbe9f51
parent 49684e05ed0362928b9fd2d14ecc3153300b702f (diff)
download grep-7d20c09e3e7cf3af9060f395e884fca285ce3598.tar.gz
dfa: don't mishandle high-bit bytes in a regexp with signed-char
This appears to arise only on systems for which "char" is signed.
* src/dfa.c (FETCH_WC, FETCH): Produce an unsigned value, rather than a sign-extended one. Fixes a bug on MS-Windows with compiling patterns that include characters with the 8-th bit set.
(to_uchar): Define. From coreutils.
Reported by David Millis <tvtronix@yahoo.com>.
See http://thread.gmane.org/gmane.comp.gnu.grep.bugs/3893
* NEWS (Bug fixes): Mention it.
-rw-r--r-- NEWS 5
-rw-r--r-- src/dfa.c 9
2 files changed, 12 insertions, 2 deletions
diff --git a/NEWS b/NEWS
index 8578e821..2b06af4e 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Bug fixes
+
+ grep no longer mishandles high-bit-set pattern bytes on systems
+ where "char" is a signed type. [bug appears to affect only MS-Windows]
+
grep now rejects a command like "grep -r pattern . > out",
in which the output file is also one of the inputs,
because it can result in an "infinite" disk-filling loop.
diff --git a/src/dfa.c b/src/dfa.c
index 86114350..dc879159 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -86,6 +86,11 @@
/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
typedef int charclass[CHARCLASS_INTS];
+/* Convert a possibly-signed character to an unsigned character. This is
+ a bit safer than casting to unsigned char, since it catches some type
+ errors that the cast doesn't. */
+static inline unsigned char to_uchar (char ch) { return ch; }
+
/* Sometimes characters can only be matched depending on the surrounding
context. Such context decisions depend on what the previous character
was, and the value of the current (lookahead) character. Context
@@ -686,7 +691,7 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
{ \
cur_mb_len = 1; \
--lexleft; \
- (wc) = (c) = (unsigned char) *lexptr++; \
+ (wc) = (c) = to_uchar (*lexptr++); \
} \
else \
{ \
@@ -715,7 +720,7 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
else \
return lasttok = END; \
} \
- (c) = (unsigned char) *lexptr++; \
+ (c) = to_uchar (*lexptr++); \
--lexleft; \
} while(0)