diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-04-22 17:35:23 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-04-22 17:35:23 +0000 |
commit | cfb27a536da197b768d56cc7c18e17752bcb9b38 (patch) | |
tree | 3e0eb84f3fa1ed63b5724c1ce220fcd8aa1cb6e5 | |
parent | 519e96a9dba044fda33be393d49696f50ced96ba (diff) | |
download | pcre-cfb27a536da197b768d56cc7c18e17752bcb9b38.tar.gz |
Use tail recursion in maximizing character and character type repetitions, to
reduce stack usage.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1311 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 3 | ||||
-rw-r--r-- | pcre_exec.c | 94 | ||||
-rw-r--r-- | testdata/testinput1 | 20 | ||||
-rw-r--r-- | testdata/testinput4 | 28 | ||||
-rw-r--r-- | testdata/testoutput1 | 30 | ||||
-rw-r--r-- | testdata/testoutput2 | 2 | ||||
-rw-r--r-- | testdata/testoutput4 | 42 |
7 files changed, 176 insertions, 43 deletions
@@ -134,6 +134,9 @@ Version 8.33 xx-xxxx-201x 35. Implement PCRE_NEVER_UTF to lock out the use of UTF, in particular, blocking (*UTF) etc. + +36. In the interpreter, maximizing pattern repetitions for characters and + character types now use tail recursion, which reduces stack usage. Version 8.32 30-November-2012 diff --git a/pcre_exec.c b/pcre_exec.c index 5b030a9..6baaf45 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -3387,7 +3387,22 @@ for (;;) max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-character matches. */ + /* Common code for all repeated single-character matches. We first check + for the minimum number of characters. If the minimum equals the maximum, we + are done. Otherwise, if minimizing, check the rest of the pattern for a + match; if there isn't one, advance up to the maximum, one character at a + time. + + If maximizing, advance up to the maximum number of matching characters, + until eptr is past the end of the maximum run. If possessive, we are + then done (no backing up). Otherwise, match at this position; anything + other than no match is immediately returned. For nomatch, back up one + character, unless we are matching \R and the last thing matched was + \r\n, in which case, back up two bytes. When we reach the first optional + character position, we can save stack by doing a tail recurse. + + The various UTF/non-UTF and caseful/caseless cases are handled separately, + for speed. */ REPEATCHAR: #ifdef SUPPORT_UTF @@ -3471,13 +3486,13 @@ for (;;) } } - if (possessive) continue; - + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) { RRETURN(MATCH_NOMATCH); } + /* if (eptr == pp) { RRETURN(MATCH_NOMATCH); } */ #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); @@ -3577,10 +3592,10 @@ for (;;) eptr++; } - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -3635,10 +3650,10 @@ for (;;) if (fc != RAWUCHARTEST(eptr)) break; eptr++; } - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -3815,7 +3830,7 @@ for (;;) } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = 1; i <= min; i++) @@ -3853,7 +3868,7 @@ for (;;) } } else -#endif +#endif /*SUPPORT_UTF */ /* Not UTF mode */ { for (fi = min;; fi++) @@ -3895,17 +3910,18 @@ for (;;) if (fc == d || (unsigned int)foc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = min; i < max; i++) @@ -3918,9 +3934,10 @@ for (;;) if (fc == *eptr || foc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -4030,12 +4047,13 @@ for (;;) if (fc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } @@ -4053,9 +4071,10 @@ for (;;) if (fc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -5613,12 +5632,13 @@ for (;;) /* eptr is now past the end of the maximum run */ - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; if (utf) BACKCHAR(eptr); } } @@ -5655,13 +5675,13 @@ for (;;) /* eptr is now past the end of the maximum run */ - if (possessive) continue; - + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; for (;;) /* Move back over one extended */ { if (!utf) c = *eptr; else @@ -5936,18 +5956,13 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character, unless we are matching \R and the last thing matched was - \r\n, in which case, back up two bytes. */ - - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL && RAWUCHAR(eptr - 1) == CHAR_CR) eptr--; @@ -6185,15 +6200,10 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character (byte), unless we are matching \R and the last thing matched - was \r\n, in which case, back up two bytes. */ - - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; diff --git a/testdata/testinput1 b/testdata/testinput1 index 6319a25..c5f21e0 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5588,4 +5588,24 @@ AbcdCBefgBhiBqz /(*THEN:m(m)(?&y)(?(DEFINE)(?<y>b))/K abc +/^\d*\w{4}/ + 1234 + 123 + +/^[^b]*\w{4}/ + aaaa + aaa + +/^[^b]*\w{4}/i + aaaa + aaa + +/^a*\w{4}/ + aaaa + aaa + +/^a*\w{4}/i + aaaa + aaa + /-- End of testinput1 --/ diff --git a/testdata/testinput4 b/testdata/testinput4 index a6ebcba..5d77ada 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -691,4 +691,32 @@ \x{fdee} \x{fdef} +/^\d*\w{4}/8 + 1234 + 123 + +/^\p{Any}*\d{4}/8 + 1234 + 123 + +/^\X*\w{4}/8 + 1234 + 123 + +/^[^b]*\w{4}/8 + aaaa + aaa + +/^[^b]*\w{4}/8i + aaaa + aaa + +/^\x{100}*.{4}/8 + \x{100}\x{100}\x{100}\x{100} + \x{100}\x{100}\x{100} + +/^\x{100}*.{4}/8i + \x{100}\x{100}\x{100}\x{100} + \x{100}\x{100}\x{100} + /-- End of testinput4 --/ diff --git a/testdata/testoutput1 b/testdata/testoutput1 index c07816b..4033b10 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9172,4 +9172,34 @@ MK: m(m 0: b MK: m(m +/^\d*\w{4}/ + 1234 + 0: 1234 + 123 +No match + +/^[^b]*\w{4}/ + aaaa + 0: aaaa + aaa +No match + +/^[^b]*\w{4}/i + aaaa + 0: aaaa + aaa +No match + +/^a*\w{4}/ + aaaa + 0: aaaa + aaa +No match + +/^a*\w{4}/i + aaaa + 0: aaaa + aaa +No match + /-- End of testinput1 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index a3f799a..de64502 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -4348,7 +4348,7 @@ Minimum match() recursion limit = 6 1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaz\M Minimum match() limit = 32768 -Minimum match() recursion limit = 42 +Minimum match() recursion limit = 29 No match /(aaa(?C1)bbb|ab)/I diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 10b17e7..0b8a306 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1227,4 +1227,46 @@ MK: a\x{a3}b \x{fdef} 0: \x{fdef} +/^\d*\w{4}/8 + 1234 + 0: 1234 + 123 +No match + +/^\p{Any}*\d{4}/8 + 1234 + 0: 1234 + 123 +No match + +/^\X*\w{4}/8 + 1234 + 0: 1234 + 123 +No match + +/^[^b]*\w{4}/8 + aaaa + 0: aaaa + aaa +No match + +/^[^b]*\w{4}/8i + aaaa + 0: aaaa + aaa +No match + +/^\x{100}*.{4}/8 + \x{100}\x{100}\x{100}\x{100} + 0: \x{100}\x{100}\x{100}\x{100} + \x{100}\x{100}\x{100} +No match + +/^\x{100}*.{4}/8i + \x{100}\x{100}\x{100}\x{100} + 0: \x{100}\x{100}\x{100}\x{100} + \x{100}\x{100}\x{100} +No match + /-- End of testinput4 --/ |