summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2013-03-19 16:29:12 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2013-03-19 16:29:12 +0000
commit71b68580ed5c7d81eaa00be59d860272eabfe3d3 (patch)
tree6c12edddaa4f3d083ccf0ef7070b77a6f46cae93
parent536545ea7dd23c619f577c193a19108327f7dfb6 (diff)
downloadpcre-71b68580ed5c7d81eaa00be59d860272eabfe3d3.tar.gz
Code changes for simpler backtracking handling (docs to follow).
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1296 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog15
-rw-r--r--pcre_exec.c67
-rw-r--r--testdata/testinput168
-rw-r--r--testdata/testinput261
-rw-r--r--testdata/testoutput198
-rw-r--r--testdata/testoutput290
6 files changed, 290 insertions, 109 deletions
diff --git a/ChangeLog b/ChangeLog
index 54496ab..032cd2d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -110,11 +110,16 @@ Version 8.33 xx-xxxx-201x
30. Update RunTest with additional test selector options.
-31. PCRE has been changed to be more compatible with Perl when there is more
- than one backtracking verb present. Previously, in something like
- (*COMMIT)(*SKIP), COMMIT would override SKIP. Apart from one anomaly (which
- has been reported), Perl seems to act on whichever backtracking verb is
- reached first, so PCRE has been changed to follow this behaviour.
+31. The way PCRE handles backtracking verbs has been changed in two ways.
+
+ (1) Previously, in something like (*COMMIT)(*SKIP), COMMIT would override
+ SKIP. Now, PCRE acts on whichever backtracking verb is reached first by
+ backtracking. In some cases this makes it more Perl-compatible, but Perl's
+ rather obscure rules do not always do the same thing.
+
+ (2) Previously, backtracking verbs were confined within assertions. This is
+ no longer the case. Again, this sometimes improves Perl compatibility, and
+ sometimes does not.
Version 8.32 30-November-2012
diff --git a/pcre_exec.c b/pcre_exec.c
index 877e3af..bcc2c63 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1603,6 +1603,8 @@ for (;;)
}
else condassert = FALSE;
+ /* Loop for each branch */
+
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
@@ -1613,18 +1615,28 @@ for (;;)
}
md->mark = save_mark;
- /* A COMMIT failure must fail the entire assertion, without trying any
- subsequent branches. */
-
- if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
+ /* See comment in the code for capturing groups above about handling
+ THEN. */
- /* PCRE does not allow THEN to escape beyond an assertion; it
- is treated as NOMATCH. */
+ if (rrc == MATCH_THEN)
+ {
+ next = ecode + GET(ecode,1);
+ if (md->start_match_ptr < next &&
+ (*ecode == OP_ALT || *next == OP_ALT))
+ rrc = MATCH_NOMATCH;
+ }
+
+ /* Anything other than NOMATCH causes the assertion to fail. This
+ includes COMMIT, SKIP, and PRUNE. However, this consistent approach does
+ not always have exactly the same effect as in Perl. */
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
+
+ /* If we have tried all the alternative branches, the assertion has
+ failed. */
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
@@ -1632,17 +1644,16 @@ for (;;)
if (condassert) RRETURN(MATCH_MATCH);
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
+ /* Continue from after a successful assertion, updating the offsets high
+ water mark, since extracts may have been taken during the assertion. */
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
ecode += 1 + LINK_SIZE;
offset_top = md->end_offset_top;
continue;
- /* Negative assertion: all branches must fail to match. Encountering SKIP,
- PRUNE, or COMMIT means we must assume failure without checking subsequent
- branches. */
+ /* Negative assertion: all branches must fail to match for the assertion to
+ succeed. */
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
@@ -1654,28 +1665,42 @@ for (;;)
}
else condassert = FALSE;
+ /* Loop for each alternative branch. */
+
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
md->mark = save_mark;
+
+ /* A successful match means the assertion has failed. */
+
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
- if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
+
+ /* See comment in the code for capturing groups above about handling
+ THEN. */
+
+ if (rrc == MATCH_THEN)
{
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- break;
+ next = ecode + GET(ecode,1);
+ if (md->start_match_ptr < next &&
+ (*ecode == OP_ALT || *next == OP_ALT))
+ rrc = MATCH_NOMATCH;
}
+
+ /* No match on a branch means we must carry on and try the next branch.
+ Anything else, in particular, SKIP, PRUNE, etc. causes a failure in the
+ enclosing branch. This is a consistent approach, but does not always have
+ the same effect as in Perl. */
- /* PCRE does not allow THEN to escape beyond an assertion; it is treated
- as NOMATCH. */
-
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
+
+ /* All branches in the assertion failed to match. */
if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
-
- ecode += 1 + LINK_SIZE;
+ ecode += 1 + LINK_SIZE; /* Continue with current branch */
continue;
/* Move the subject pointer back. This occurs only at the start of
diff --git a/testdata/testinput1 b/testdata/testinput1
index f643bbb..b7b5934 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4656,16 +4656,10 @@
/(?<pn> \( ( [^()]++ | (?&pn) )* \) )/x
(ab(cd)ef)
-/^(?!a(*SKIP)b)/
- ac
-
/^(?=a(*SKIP)b|ac)/
** Failers
ac
-/^(?=a(*THEN)b|ac)/
- ac
-
/^(?=a(*PRUNE)b)/
ab
** Failers
@@ -4674,9 +4668,6 @@
/^(?=a(*ACCEPT)b)/
ac
-/^(?(?!a(*SKIP)b))/
- ac
-
/(?>a\Kb)/
ab
@@ -4899,33 +4890,15 @@ however, we need the complication for Perl. ---/
/(A (A|B(*ACCEPT)|C) D)(E)/x
AB
-/\A.*?(?:a|b(*THEN)c)/
- ba
-
-/\A.*?(?:a|bc)/
- ba
-
-/\A.*?(a|b(*THEN)c)/
- ba
-
/\A.*?(a|bc)/
ba
-/\A.*?(?:a|b(*THEN)c)++/
- ba
-
/\A.*?(?:a|bc)++/
ba
-/\A.*?(a|b(*THEN)c)++/
- ba
-
/\A.*?(a|bc)++/
ba
-/\A.*?(?:a|b(*THEN)c|d)/
- ba
-
/\A.*?(?:a|bc|d)/
ba
@@ -5253,9 +5226,6 @@ name were given. ---/
/(a(*COMMIT)b){0}a(?1)|aac/
aac
-/(?!a(*COMMIT)b)ac|cd/
- ac
-
/((?:a?)*)*c/
aac
@@ -5309,9 +5279,6 @@ name were given. ---/
/(?:(a(*SKIP)b)){0}(?:(?1)|ac)/
ac
-/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
- aac
-
/(?<=(*SKIP)ac)a/
aa
@@ -5442,4 +5409,39 @@ backtracked onto, whether or not it has a label. --/
/a(*:m)a(*COMMIT)(*SKIP:m)b|a+c/K
aaaaaac
+/.?(a|b(*THEN)c)/
+ ba
+
+/(a(*COMMIT)b)c|abd/
+ abc
+ abd
+
+/(?=a(*COMMIT)b)abc|abd/
+ abc
+ abd
+
+/(?>a(*COMMIT)b)c|abd/
+ abc
+ abd
+
+/a(?=b(*COMMIT)c)[^d]|abd/
+ abd
+ abc
+
+/a(?=bc).|abd/
+ abd
+ abc
+
+/a(?>b(*COMMIT)c)d|abd/
+ abceabd
+
+/a(?>bc)d|abd/
+ abceabd
+
+/(?>a(*COMMIT)b)c|abd/
+ abd
+
+/(?>a(*COMMIT)c)d|abd/
+ abd
+
/-- End of testinput1 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index 70248b6..27e2d3c 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3855,5 +3855,66 @@ settings of the anchored and startline bits. --/
/aaaaa(*COMMIT)(*PRUNE)b|a+c/
aaaaaac
+
+/-- Here are some that Perl treats differently because of the way it handles
+backtracking verbs. --/
+
+ /^(?!a(*SKIP)b)/
+ ac
+
+ /^(?!a(*SKIP)b)../
+ acd
+
+/(?!a(*SKIP)b)../
+ acd
+
+/^(?(?!a(*SKIP)b))/
+ ac
+
+/^(?!a(*PRUNE)b)../
+ acd
+
+/(?!a(*PRUNE)b)../
+ acd
+
+ /(?!a(*COMMIT)b)ac|cd/
+ ac
+
+ /(?!a(*COMMIT)b)ac|ad/
+ ac
+ ad
+
+/^(?!a(*THEN)b|ac)../
+ ac
+ ad
+
+/^(?=a(*THEN)b|ac)/
+ ac
+
+/\A.*?(?:a|b(*THEN)c)/
+ ba
+
+/\A.*?(?:a|bc)/
+ ba
+
+/\A.*?(?:a|b(*THEN)c)++/
+ ba
+
+/\A.*?(?:a|b(*THEN)c|d)/
+ ba
+
+/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
+ aac
+
+/\A.*?(a|b(*THEN)c)/
+ ba
+
+/^(A(*THEN)B|C(*THEN)D)/
+ CD
+
+/^(A(*THEN)B|A(*THEN)D)/
+ AD
+
+/-- End of Perl diffences --/
/-- End of testinput2 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index ae10a64..b38a876 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -7770,20 +7770,12 @@ No match
1: (ab(cd)ef)
2: ef
-/^(?!a(*SKIP)b)/
- ac
- 0:
-
/^(?=a(*SKIP)b|ac)/
** Failers
No match
ac
No match
-/^(?=a(*THEN)b|ac)/
- ac
- 0:
-
/^(?=a(*PRUNE)b)/
ab
0:
@@ -7796,10 +7788,6 @@ No match
ac
0:
-/^(?(?!a(*SKIP)b))/
- ac
- 0:
-
/(?>a\Kb)/
ab
0: b
@@ -8169,46 +8157,20 @@ No match
1: AB
2: B
-/\A.*?(?:a|b(*THEN)c)/
- ba
- 0: ba
-
-/\A.*?(?:a|bc)/
- ba
- 0: ba
-
-/\A.*?(a|b(*THEN)c)/
- ba
- 0: ba
- 1: a
-
/\A.*?(a|bc)/
ba
0: ba
1: a
-/\A.*?(?:a|b(*THEN)c)++/
- ba
- 0: ba
-
/\A.*?(?:a|bc)++/
ba
0: ba
-/\A.*?(a|b(*THEN)c)++/
- ba
- 0: ba
- 1: a
-
/\A.*?(a|bc)++/
ba
0: ba
1: a
-/\A.*?(?:a|b(*THEN)c|d)/
- ba
- 0: ba
-
/\A.*?(?:a|bc|d)/
ba
0: ba
@@ -8719,10 +8681,6 @@ No match
aac
0: aac
-/(?!a(*COMMIT)b)ac|cd/
- ac
- 0: ac
-
/((?:a?)*)*c/
aac
0: aac
@@ -8803,10 +8761,6 @@ No match
ac
0: ac
-/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
- aac
- 0: aac
-
/(?<=(*SKIP)ac)a/
aa
No match
@@ -8967,4 +8921,56 @@ No match
aaaaaac
0: ac
+/.?(a|b(*THEN)c)/
+ ba
+ 0: ba
+ 1: a
+
+/(a(*COMMIT)b)c|abd/
+ abc
+ 0: abc
+ 1: ab
+ abd
+No match
+
+/(?=a(*COMMIT)b)abc|abd/
+ abc
+ 0: abc
+ abd
+ 0: abd
+
+/(?>a(*COMMIT)b)c|abd/
+ abc
+ 0: abc
+ abd
+ 0: abd
+
+/a(?=b(*COMMIT)c)[^d]|abd/
+ abd
+No match
+ abc
+ 0: ab
+
+/a(?=bc).|abd/
+ abd
+ 0: abd
+ abc
+ 0: ab
+
+/a(?>b(*COMMIT)c)d|abd/
+ abceabd
+No match
+
+/a(?>bc)d|abd/
+ abceabd
+ 0: abd
+
+/(?>a(*COMMIT)b)c|abd/
+ abd
+ 0: abd
+
+/(?>a(*COMMIT)c)d|abd/
+ abd
+No match
+
/-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 3956544..7ef3f96 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -12345,13 +12345,11 @@ COMMIT to escape from the assertion. --/
/(?=a(*COMMIT)b|ac)ac|ac/
ac
- 0: ac
+No match
/(?=a(*COMMIT)b|(ac)) ac | (a)c/x
ac
- 0: ac
- 1: <unset>
- 2: a
+No match
"AB(C(D))(E(F))?(?(?=\2)(?=\4))"
ABCDGHI\O03
@@ -12648,5 +12646,89 @@ Partial match: 123a
/aaaaa(*COMMIT)(*PRUNE)b|a+c/
aaaaaac
0: aaaac
+
+/-- Here are some that Perl treats differently because of the way it handles
+backtracking verbs. --/
+
+ /^(?!a(*SKIP)b)/
+ ac
+No match
+
+ /^(?!a(*SKIP)b)../
+ acd
+No match
+
+/(?!a(*SKIP)b)../
+ acd
+ 0: cd
+
+/^(?(?!a(*SKIP)b))/
+ ac
+No match
+
+/^(?!a(*PRUNE)b)../
+ acd
+No match
+
+/(?!a(*PRUNE)b)../
+ acd
+ 0: cd
+
+ /(?!a(*COMMIT)b)ac|cd/
+ ac
+No match
+
+ /(?!a(*COMMIT)b)ac|ad/
+ ac
+No match
+ ad
+No match
+
+/^(?!a(*THEN)b|ac)../
+ ac
+No match
+ ad
+ 0: ad
+
+/^(?=a(*THEN)b|ac)/
+ ac
+ 0:
+
+/\A.*?(?:a|b(*THEN)c)/
+ ba
+ 0: ba
+
+/\A.*?(?:a|bc)/
+ ba
+ 0: ba
+
+/\A.*?(?:a|b(*THEN)c)++/
+ ba
+ 0: ba
+
+/\A.*?(?:a|b(*THEN)c|d)/
+ ba
+ 0: ba
+
+/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
+ aac
+ 0: aac
+
+/\A.*?(a|b(*THEN)c)/
+ ba
+ 0: ba
+ 1: a
+
+/^(A(*THEN)B|C(*THEN)D)/
+ CD
+ 0: CD
+ 1: CD
+
+/^(A(*THEN)B|A(*THEN)D)/
+ AD
+ 0: AD
+ 1: AD
+
+/-- End of Perl diffences --/
/-- End of testinput2 --/