diff options
author | Paolo Bonzini <bonzini@gnu.org> | 2012-02-05 18:00:45 +0100 |
---|---|---|
committer | Jim Meyering <meyering@redhat.com> | 2012-02-26 11:01:14 +0100 |
commit | 74dc111398feff35e54978d442b9a488c05829b5 (patch) | |
tree | a5abc7d7a20126a4c65152a52b593996f3464eb7 | |
parent | d7acc337e47563b8334b56c5d479c81a4a060c8f (diff) | |
download | grep-74dc111398feff35e54978d442b9a488c05829b5.tar.gz |
dfa: fix a subtle constraint encoding bug
* src/dfa.c (SUCCEEDS_IN_CONTEXT, PREV_NEWLINE_DEPENDENT,
PREV_LETTER_DEPENDENT): Rewrite to handle all 3*3=9 possible
combinations of previous and next character contexts.
(MATCHES_NEWLINE_CONTEXT, MATCHES_LETTER_CONTEXT): Remove.
(NO_CONSTRAINT, BEGLINE_CONSTRAINT, ENDLINE_CONSTRAINT,
BEGWORD_CONSTRAINT, ENDWORD_CONSTRAINT, LIMWORD_CONSTRAINT,
NOTLIMWORD_CONSTRAINT): Switch to new encoding.
* NEWS: Document resulting bugfix.
* tests/spencer1.tests: Add regression test.
-rw-r--r-- | NEWS | 5 | ||||
-rw-r--r-- | src/dfa.c | 54 | ||||
-rw-r--r-- | tests/spencer1.tests | 12 |
3 files changed, 43 insertions, 28 deletions
@@ -42,6 +42,11 @@ GNU grep NEWS -*- outline -*-
   grep no longer emits an error message and quits on MS-Windows when
   invoked with the -r option.
 
+  grep no longer misinterprets some alternations involving anchors
+  (^, $, \< \> \B, \b). For example, grep -E "(^|\B)a" no
+  longer reports a match for the string "x a".
+  [bug present since "the beginning"]
+
 ** New features
 
   If no file operand is given, and a command-line -r or equivalent
@@ -115,46 +115,44 @@ static inline unsigned char to_uchar (char ch) { return ch; }
    is set indicates that the constraint succeeds in the corresponding
    context.
 
-   bit 7 - previous and current are newlines
-   bit 6 - previous was newline, current isn't
-   bit 5 - previous wasn't newline, current is
-   bit 4 - neither previous nor current is a newline
-   bit 3 - previous and current are word-constituents
-   bit 2 - previous was word-constituent, current isn't
-   bit 1 - previous wasn't word-constituent, current is
-   bit 0 - neither previous nor current is word-constituent
+   bit 8-11 - valid contexts when next character is CTX_NEWLINE
+   bit 4-7 - valid contexts when next character is CTX_LETTER
+   bit 0-3 - valid contexts when next character is CTX_NONE
 
    The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
    succeeds in a particular context. Prev is a bitmask of possible
-   context values for the previous character, curr is the bitmask of
-   possible context values for the lookahead character. */
-#define MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
-  ((constraint) & \
-   1 << (((prev & ~CTX_NEWLINE) ? 0 : 2) + ((curr & ~CTX_NEWLINE) ? 0 : 1) + 4))
-#define MATCHES_LETTER_CONTEXT(constraint, prev, curr) \
-  ((constraint) & \
-   1 << (((prev & ~CTX_LETTER) ? 0 : 2) + ((curr & ~CTX_LETTER) ? 0 : 1)))
+   context values for the previous character, curr is the (single-bit)
+   context value for the lookahead character. */
+#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
+#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
+#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
+
 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
-  (MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
-   && MATCHES_LETTER_CONTEXT(constraint, prev, curr))
+  ((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
+    | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
+    | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
 
 /* The following macros give information about what a constraint depends on. */
+#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
+#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
+#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
+
 #define PREV_NEWLINE_DEPENDENT(constraint) \
-  (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
+  (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
 #define PREV_LETTER_DEPENDENT(constraint) \
-  (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
+  (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
 
 /* Tokens that match the empty string subject to some constraint actually
    work by applying that constraint to determine what may follow them,
    taking into account what has gone before. The following values are
    the constraints corresponding to the special tokens previously defined. */
-#define NO_CONSTRAINT 0xff
-#define BEGLINE_CONSTRAINT 0xcf
-#define ENDLINE_CONSTRAINT 0xaf
-#define BEGWORD_CONSTRAINT 0xf2
-#define ENDWORD_CONSTRAINT 0xf4
-#define LIMWORD_CONSTRAINT 0xf6
-#define NOTLIMWORD_CONSTRAINT 0xf9
+#define NO_CONSTRAINT 0x777
+#define BEGLINE_CONSTRAINT 0x444
+#define ENDLINE_CONSTRAINT 0x700
+#define BEGWORD_CONSTRAINT 0x050
+#define ENDWORD_CONSTRAINT 0x202
+#define LIMWORD_CONSTRAINT 0x252
+#define NOTLIMWORD_CONSTRAINT 0x525
 
 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
    are operators and others are terminal symbols. Most (but not all) of these
@@ -282,7 +280,7 @@ typedef struct
   position_set elems; /* Positions this state could match. */
   unsigned char context; /* Context from previous state. */
   char backref; /* True if this state matches a \<digit>. */
-  unsigned char constraint; /* Constraint for this state to accept. */
+  unsigned short constraint; /* Constraint for this state to accept. */
   int first_end; /* Token value of the first END in elems. */
   position_set mbps; /* Positions which can match multibyte
                         characters. e.g. period.
diff --git a/tests/spencer1.tests b/tests/spencer1.tests
index ecbed0fc..855265f9 100644
--- a/tests/spencer1.tests
+++ b/tests/spencer1.tests
@@ -129,3 +129,15 @@
 0@a(bc)d@abcd
 0@a[-]?c@ac
 0@(....).*\1@beriberi
+0@(^|\B)a@abc
+0@(^|\B)a@xyzabc
+1@(^|\B)a@xyz abc
+0@^a|\Ba@abc
+0@^a|\Ba@xyzabc
+1@^a|\Ba@xyz abc
+0@(^|\>)a@abc
+1@(^|\>)a@xyzabc
+1@(^|\>)a@xyz abc
+0@^a|\>a@abc
+1@^a|\>a@xyzabc
+1@^a|\>a@xyz abc