diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-03-19 16:29:12 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-03-19 16:29:12 +0000 |
commit | 71b68580ed5c7d81eaa00be59d860272eabfe3d3 (patch) | |
tree | 6c12edddaa4f3d083ccf0ef7070b77a6f46cae93 | |
parent | 536545ea7dd23c619f577c193a19108327f7dfb6 (diff) | |
download | pcre-71b68580ed5c7d81eaa00be59d860272eabfe3d3.tar.gz |
Code changes for simpler backtracking handling (docs to follow).
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1296 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | pcre_exec.c | 67 | ||||
-rw-r--r-- | testdata/testinput1 | 68 | ||||
-rw-r--r-- | testdata/testinput2 | 61 | ||||
-rw-r--r-- | testdata/testoutput1 | 98 | ||||
-rw-r--r-- | testdata/testoutput2 | 90 |
6 files changed, 290 insertions, 109 deletions
@@ -110,11 +110,16 @@ Version 8.33 xx-xxxx-201x 30. Update RunTest with additional test selector options. -31. PCRE has been changed to be more compatible with Perl when there is more - than one backtracking verb present. Previously, in something like - (*COMMIT)(*SKIP), COMMIT would override SKIP. Apart from one anomaly (which - has been reported), Perl seems to act on whichever backtracking verb is - reached first, so PCRE has been changed to follow this behaviour. +31. The way PCRE handles backtracking verbs has been changed in two ways. + + (1) Previously, in something like (*COMMIT)(*SKIP), COMMIT would override + SKIP. Now, PCRE acts on whichever backtracking verb is reached first by + backtracking. In some cases this makes it more Perl-compatible, but Perl's + rather obscure rules do not always do the same thing. + + (2) Previously, backtracking verbs were confined within assertions. This is + no longer the case. Again, this sometimes improves Perl compatibility, and + sometimes does not. Version 8.32 30-November-2012 diff --git a/pcre_exec.c b/pcre_exec.c index 877e3af..bcc2c63 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -1603,6 +1603,8 @@ for (;;) } else condassert = FALSE; + /* Loop for each branch */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); @@ -1613,18 +1615,28 @@ for (;;) } md->mark = save_mark; - /* A COMMIT failure must fail the entire assertion, without trying any - subsequent branches. */ - - if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH); + /* See comment in the code for capturing groups above about handling + THEN. */ - /* PCRE does not allow THEN to escape beyond an assertion; it - is treated as NOMATCH. */ + if (rrc == MATCH_THEN) + { + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + rrc = MATCH_NOMATCH; + } + + /* Anything other than NOMATCH causes the assertion to fail. This + includes COMMIT, SKIP, and PRUNE. 
However, this consistent approach does + not always have exactly the same effect as in Perl. */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); } while (*ecode == OP_ALT); + + /* If we have tried all the alternative branches, the assertion has + failed. */ if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); @@ -1632,17 +1644,16 @@ for (;;) if (condassert) RRETURN(MATCH_MATCH); - /* Continue from after the assertion, updating the offsets high water - mark, since extracts may have been taken during the assertion. */ + /* Continue from after a successful assertion, updating the offsets high + water mark, since extracts may have been taken during the assertion. */ do ecode += GET(ecode,1); while (*ecode == OP_ALT); ecode += 1 + LINK_SIZE; offset_top = md->end_offset_top; continue; - /* Negative assertion: all branches must fail to match. Encountering SKIP, - PRUNE, or COMMIT means we must assume failure without checking subsequent - branches. */ + /* Negative assertion: all branches must fail to match for the assertion to + succeed. */ case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: @@ -1654,28 +1665,42 @@ for (;;) } else condassert = FALSE; + /* Loop for each alternative branch. */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); md->mark = save_mark; + + /* A successful match means the assertion has failed. */ + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); - if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) + + /* See comment in the code for capturing groups above about handling + THEN. */ + + if (rrc == MATCH_THEN) { - do ecode += GET(ecode,1); while (*ecode == OP_ALT); - break; + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + rrc = MATCH_NOMATCH; } + + /* No match on a branch means we must carry on and try the next branch. 
+ Anything else, in particular, SKIP, PRUNE, etc. causes a failure in the + enclosing branch. This is a consistent approach, but does not always have + the same effect as in Perl. */ - /* PCRE does not allow THEN to escape beyond an assertion; it is treated - as NOMATCH. */ - - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); + + /* All branches in the assertion failed to match. */ if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ - - ecode += 1 + LINK_SIZE; + ecode += 1 + LINK_SIZE; /* Continue with current branch */ continue; /* Move the subject pointer back. This occurs only at the start of diff --git a/testdata/testinput1 b/testdata/testinput1 index f643bbb..b7b5934 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -4656,16 +4656,10 @@ /(?<pn> \( ( [^()]++ | (?&pn) )* \) )/x (ab(cd)ef) -/^(?!a(*SKIP)b)/ - ac - /^(?=a(*SKIP)b|ac)/ ** Failers ac -/^(?=a(*THEN)b|ac)/ - ac - /^(?=a(*PRUNE)b)/ ab ** Failers @@ -4674,9 +4668,6 @@ /^(?=a(*ACCEPT)b)/ ac -/^(?(?!a(*SKIP)b))/ - ac - /(?>a\Kb)/ ab @@ -4899,33 +4890,15 @@ however, we need the complication for Perl. ---/ /(A (A|B(*ACCEPT)|C) D)(E)/x AB -/\A.*?(?:a|b(*THEN)c)/ - ba - -/\A.*?(?:a|bc)/ - ba - -/\A.*?(a|b(*THEN)c)/ - ba - /\A.*?(a|bc)/ ba -/\A.*?(?:a|b(*THEN)c)++/ - ba - /\A.*?(?:a|bc)++/ ba -/\A.*?(a|b(*THEN)c)++/ - ba - /\A.*?(a|bc)++/ ba -/\A.*?(?:a|b(*THEN)c|d)/ - ba - /\A.*?(?:a|bc|d)/ ba @@ -5253,9 +5226,6 @@ name were given. ---/ /(a(*COMMIT)b){0}a(?1)|aac/ aac -/(?!a(*COMMIT)b)ac|cd/ - ac - /((?:a?)*)*c/ aac @@ -5309,9 +5279,6 @@ name were given. ---/ /(?:(a(*SKIP)b)){0}(?:(?1)|ac)/ ac -/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/ - aac - /(?<=(*SKIP)ac)a/ aa @@ -5442,4 +5409,39 @@ backtracked onto, whether or not it has a label. 
--/ /a(*:m)a(*COMMIT)(*SKIP:m)b|a+c/K aaaaaac +/.?(a|b(*THEN)c)/ + ba + +/(a(*COMMIT)b)c|abd/ + abc + abd + +/(?=a(*COMMIT)b)abc|abd/ + abc + abd + +/(?>a(*COMMIT)b)c|abd/ + abc + abd + +/a(?=b(*COMMIT)c)[^d]|abd/ + abd + abc + +/a(?=bc).|abd/ + abd + abc + +/a(?>b(*COMMIT)c)d|abd/ + abceabd + +/a(?>bc)d|abd/ + abceabd + +/(?>a(*COMMIT)b)c|abd/ + abd + +/(?>a(*COMMIT)c)d|abd/ + abd + /-- End of testinput1 --/ diff --git a/testdata/testinput2 b/testdata/testinput2 index 70248b6..27e2d3c 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -3855,5 +3855,66 @@ settings of the anchored and startline bits. --/ /aaaaa(*COMMIT)(*PRUNE)b|a+c/ aaaaaac + +/-- Here are some that Perl treats differently because of the way it handles +backtracking verbs. --/ + + /^(?!a(*SKIP)b)/ + ac + + /^(?!a(*SKIP)b)../ + acd + +/(?!a(*SKIP)b)../ + acd + +/^(?(?!a(*SKIP)b))/ + ac + +/^(?!a(*PRUNE)b)../ + acd + +/(?!a(*PRUNE)b)../ + acd + + /(?!a(*COMMIT)b)ac|cd/ + ac + + /(?!a(*COMMIT)b)ac|ad/ + ac + ad + +/^(?!a(*THEN)b|ac)../ + ac + ad + +/^(?=a(*THEN)b|ac)/ + ac + +/\A.*?(?:a|b(*THEN)c)/ + ba + +/\A.*?(?:a|bc)/ + ba + +/\A.*?(?:a|b(*THEN)c)++/ + ba + +/\A.*?(?:a|b(*THEN)c|d)/ + ba + +/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/ + aac + +/\A.*?(a|b(*THEN)c)/ + ba + +/^(A(*THEN)B|C(*THEN)D)/ + CD + +/^(A(*THEN)B|A(*THEN)D)/ + AD + +/-- End of Perl diffences --/ /-- End of testinput2 --/ diff --git a/testdata/testoutput1 b/testdata/testoutput1 index ae10a64..b38a876 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -7770,20 +7770,12 @@ No match 1: (ab(cd)ef) 2: ef -/^(?!a(*SKIP)b)/ - ac - 0: - /^(?=a(*SKIP)b|ac)/ ** Failers No match ac No match -/^(?=a(*THEN)b|ac)/ - ac - 0: - /^(?=a(*PRUNE)b)/ ab 0: @@ -7796,10 +7788,6 @@ No match ac 0: -/^(?(?!a(*SKIP)b))/ - ac - 0: - /(?>a\Kb)/ ab 0: b @@ -8169,46 +8157,20 @@ No match 1: AB 2: B -/\A.*?(?:a|b(*THEN)c)/ - ba - 0: ba - -/\A.*?(?:a|bc)/ - ba - 0: ba - -/\A.*?(a|b(*THEN)c)/ - ba - 0: ba - 1: a - /\A.*?(a|bc)/ ba 0: ba 
1: a -/\A.*?(?:a|b(*THEN)c)++/ - ba - 0: ba - /\A.*?(?:a|bc)++/ ba 0: ba -/\A.*?(a|b(*THEN)c)++/ - ba - 0: ba - 1: a - /\A.*?(a|bc)++/ ba 0: ba 1: a -/\A.*?(?:a|b(*THEN)c|d)/ - ba - 0: ba - /\A.*?(?:a|bc|d)/ ba 0: ba @@ -8719,10 +8681,6 @@ No match aac 0: aac -/(?!a(*COMMIT)b)ac|cd/ - ac - 0: ac - /((?:a?)*)*c/ aac 0: aac @@ -8803,10 +8761,6 @@ No match ac 0: ac -/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/ - aac - 0: aac - /(?<=(*SKIP)ac)a/ aa No match @@ -8967,4 +8921,56 @@ No match aaaaaac 0: ac +/.?(a|b(*THEN)c)/ + ba + 0: ba + 1: a + +/(a(*COMMIT)b)c|abd/ + abc + 0: abc + 1: ab + abd +No match + +/(?=a(*COMMIT)b)abc|abd/ + abc + 0: abc + abd + 0: abd + +/(?>a(*COMMIT)b)c|abd/ + abc + 0: abc + abd + 0: abd + +/a(?=b(*COMMIT)c)[^d]|abd/ + abd +No match + abc + 0: ab + +/a(?=bc).|abd/ + abd + 0: abd + abc + 0: ab + +/a(?>b(*COMMIT)c)d|abd/ + abceabd +No match + +/a(?>bc)d|abd/ + abceabd + 0: abd + +/(?>a(*COMMIT)b)c|abd/ + abd + 0: abd + +/(?>a(*COMMIT)c)d|abd/ + abd +No match + /-- End of testinput1 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 3956544..7ef3f96 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -12345,13 +12345,11 @@ COMMIT to escape from the assertion. --/ /(?=a(*COMMIT)b|ac)ac|ac/ ac - 0: ac +No match /(?=a(*COMMIT)b|(ac)) ac | (a)c/x ac - 0: ac - 1: <unset> - 2: a +No match "AB(C(D))(E(F))?(?(?=\2)(?=\4))" ABCDGHI\O03 @@ -12648,5 +12646,89 @@ Partial match: 123a /aaaaa(*COMMIT)(*PRUNE)b|a+c/ aaaaaac 0: aaaac + +/-- Here are some that Perl treats differently because of the way it handles +backtracking verbs. 
--/ + + /^(?!a(*SKIP)b)/ + ac +No match + + /^(?!a(*SKIP)b)../ + acd +No match + +/(?!a(*SKIP)b)../ + acd + 0: cd + +/^(?(?!a(*SKIP)b))/ + ac +No match + +/^(?!a(*PRUNE)b)../ + acd +No match + +/(?!a(*PRUNE)b)../ + acd + 0: cd + + /(?!a(*COMMIT)b)ac|cd/ + ac +No match + + /(?!a(*COMMIT)b)ac|ad/ + ac +No match + ad +No match + +/^(?!a(*THEN)b|ac)../ + ac +No match + ad + 0: ad + +/^(?=a(*THEN)b|ac)/ + ac + 0: + +/\A.*?(?:a|b(*THEN)c)/ + ba + 0: ba + +/\A.*?(?:a|bc)/ + ba + 0: ba + +/\A.*?(?:a|b(*THEN)c)++/ + ba + 0: ba + +/\A.*?(?:a|b(*THEN)c|d)/ + ba + 0: ba + +/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/ + aac + 0: aac + +/\A.*?(a|b(*THEN)c)/ + ba + 0: ba + 1: a + +/^(A(*THEN)B|C(*THEN)D)/ + CD + 0: CD + 1: CD + +/^(A(*THEN)B|A(*THEN)D)/ + AD + 0: AD + 1: AD + +/-- End of Perl diffences --/ /-- End of testinput2 --/ |