summary refs log tree commit diff
diff options
context:
space:
mode:
author Paolo Bonzini <bonzini@gnu.org> 2012-02-05 18:00:45 +0100
committer Jim Meyering <meyering@redhat.com> 2012-02-26 11:01:14 +0100
commit 74dc111398feff35e54978d442b9a488c05829b5 (patch)
tree a5abc7d7a20126a4c65152a52b593996f3464eb7
parent d7acc337e47563b8334b56c5d479c81a4a060c8f (diff)
download grep-74dc111398feff35e54978d442b9a488c05829b5.tar.gz
dfa: fix a subtle constraint encoding bug
* src/dfa.c (SUCCEEDS_IN_CONTEXT, PREV_NEWLINE_DEPENDENT, PREV_LETTER_DEPENDENT): Rewrite to handle all 3*3=9 possible combinations of previous and next character contexts. (MATCHES_NEWLINE_CONTEXT, MATCHES_LETTER_CONTEXT): Remove. (NO_CONSTRAINT, BEGLINE_CONSTRAINT, ENDLINE_CONSTRAINT, BEGWORD_CONSTRAINT, ENDWORD_CONSTRAINT, LIMWORD_CONSTRAINT, NOTLIMWORD_CONSTRAINT): Switch to new encoding. * NEWS: Document resulting bugfix. * tests/spencer1.tests: Add regression test.
-rw-r--r-- NEWS 5
-rw-r--r-- src/dfa.c 54
-rw-r--r-- tests/spencer1.tests 12
3 files changed, 43 insertions, 28 deletions
diff --git a/NEWS b/NEWS
index 1d687ec4..f895ed49 100644
--- a/NEWS
+++ b/NEWS
@@ -42,6 +42,11 @@ GNU grep NEWS -*- outline -*-
grep no longer emits an error message and quits on MS-Windows when
invoked with the -r option.
+ grep no longer misinterprets some alternations involving anchors
+ (^, $, \< \> \B, \b). For example, grep -E "(^|\B)a" no
+ longer reports a match for the string "x a".
+ [bug present since "the beginning"]
+
** New features
If no file operand is given, and a command-line -r or equivalent
diff --git a/src/dfa.c b/src/dfa.c
index 7f4730c7..3e19b402 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -115,46 +115,44 @@ static inline unsigned char to_uchar (char ch) { return ch; }
is set indicates that the constraint succeeds in the corresponding
context.
- bit 7 - previous and current are newlines
- bit 6 - previous was newline, current isn't
- bit 5 - previous wasn't newline, current is
- bit 4 - neither previous nor current is a newline
- bit 3 - previous and current are word-constituents
- bit 2 - previous was word-constituent, current isn't
- bit 1 - previous wasn't word-constituent, current is
- bit 0 - neither previous nor current is word-constituent
+ bit 8-11 - valid contexts when next character is CTX_NEWLINE
+ bit 4-7 - valid contexts when next character is CTX_LETTER
+ bit 0-3 - valid contexts when next character is CTX_NONE
The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
succeeds in a particular context. Prev is a bitmask of possible
- context values for the previous character, curr is the bitmask of
- possible context values for the lookahead character. */
-#define MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
- ((constraint) & \
- 1 << (((prev & ~CTX_NEWLINE) ? 0 : 2) + ((curr & ~CTX_NEWLINE) ? 0 : 1) + 4))
-#define MATCHES_LETTER_CONTEXT(constraint, prev, curr) \
- ((constraint) & \
- 1 << (((prev & ~CTX_LETTER) ? 0 : 2) + ((curr & ~CTX_LETTER) ? 0 : 1)))
+ context values for the previous character, curr is the (single-bit)
+ context value for the lookahead character. */
+#define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
+#define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
+#define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
+
#define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
- (MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
- && MATCHES_LETTER_CONTEXT(constraint, prev, curr))
+ ((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
+ | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
+ | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
/* The following macros give information about what a constraint depends on. */
+#define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
+#define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
+#define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
+
#define PREV_NEWLINE_DEPENDENT(constraint) \
- (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
+ (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
#define PREV_LETTER_DEPENDENT(constraint) \
- (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
+ (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
/* Tokens that match the empty string subject to some constraint actually
work by applying that constraint to determine what may follow them,
taking into account what has gone before. The following values are
the constraints corresponding to the special tokens previously defined. */
-#define NO_CONSTRAINT 0xff
-#define BEGLINE_CONSTRAINT 0xcf
-#define ENDLINE_CONSTRAINT 0xaf
-#define BEGWORD_CONSTRAINT 0xf2
-#define ENDWORD_CONSTRAINT 0xf4
-#define LIMWORD_CONSTRAINT 0xf6
-#define NOTLIMWORD_CONSTRAINT 0xf9
+#define NO_CONSTRAINT 0x777
+#define BEGLINE_CONSTRAINT 0x444
+#define ENDLINE_CONSTRAINT 0x700
+#define BEGWORD_CONSTRAINT 0x050
+#define ENDWORD_CONSTRAINT 0x202
+#define LIMWORD_CONSTRAINT 0x252
+#define NOTLIMWORD_CONSTRAINT 0x525
/* The regexp is parsed into an array of tokens in postfix form. Some tokens
are operators and others are terminal symbols. Most (but not all) of these
@@ -282,7 +280,7 @@ typedef struct
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
char backref; /* True if this state matches a \<digit>. */
- unsigned char constraint; /* Constraint for this state to accept. */
+ unsigned short constraint; /* Constraint for this state to accept. */
int first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
characters. e.g. period.
diff --git a/tests/spencer1.tests b/tests/spencer1.tests
index ecbed0fc..855265f9 100644
--- a/tests/spencer1.tests
+++ b/tests/spencer1.tests
@@ -129,3 +129,15 @@
0@a(bc)d@abcd
0@a[-]?c@ac
0@(....).*\1@beriberi
+0@(^|\B)a@abc
+0@(^|\B)a@xyzabc
+1@(^|\B)a@xyz abc
+0@^a|\Ba@abc
+0@^a|\Ba@xyzabc
+1@^a|\Ba@xyz abc
+0@(^|\>)a@abc
+1@(^|\>)a@xyzabc
+1@(^|\>)a@xyz abc
+0@^a|\>a@abc
+1@^a|\>a@xyzabc
+1@^a|\>a@xyz abc