summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2013-04-22 17:35:23 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2013-04-22 17:35:23 +0000
commitcfb27a536da197b768d56cc7c18e17752bcb9b38 (patch)
tree3e0eb84f3fa1ed63b5724c1ce220fcd8aa1cb6e5
parent519e96a9dba044fda33be393d49696f50ced96ba (diff)
downloadpcre-cfb27a536da197b768d56cc7c18e17752bcb9b38.tar.gz
Use tail recursion in maximizing character and character type repetitions, to
reduce stack usage. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1311 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog3
-rw-r--r--pcre_exec.c94
-rw-r--r--testdata/testinput120
-rw-r--r--testdata/testinput428
-rw-r--r--testdata/testoutput130
-rw-r--r--testdata/testoutput22
-rw-r--r--testdata/testoutput442
7 files changed, 176 insertions, 43 deletions
diff --git a/ChangeLog b/ChangeLog
index b48abbd..a65b5d1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -134,6 +134,9 @@ Version 8.33 xx-xxxx-201x
35. Implement PCRE_NEVER_UTF to lock out the use of UTF, in particular,
blocking (*UTF) etc.
+
+36. In the interpreter, maximizing pattern repetitions for characters and
+ character types now use tail recursion, which reduces stack usage.
Version 8.32 30-November-2012
diff --git a/pcre_exec.c b/pcre_exec.c
index 5b030a9..6baaf45 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -3387,7 +3387,22 @@ for (;;)
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. */
+ /* Common code for all repeated single-character matches. We first check
+ for the minimum number of characters. If the minimum equals the maximum, we
+ are done. Otherwise, if minimizing, check the rest of the pattern for a
+ match; if there isn't one, advance up to the maximum, one character at a
+ time.
+
+ If maximizing, advance up to the maximum number of matching characters,
+ until eptr is past the end of the maximum run. If possessive, we are
+ then done (no backing up). Otherwise, match at this position; anything
+ other than no match is immediately returned. For nomatch, back up one
+ character, unless we are matching \R and the last thing matched was
+ \r\n, in which case, back up two bytes. When we reach the first optional
+ character position, we can save stack by doing a tail recurse.
+
+ The various UTF/non-UTF and caseful/caseless cases are handled separately,
+ for speed. */
REPEATCHAR:
#ifdef SUPPORT_UTF
@@ -3471,13 +3486,13 @@ for (;;)
}
}
- if (possessive) continue;
-
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
+ /* if (eptr == pp) { RRETURN(MATCH_NOMATCH); } */
#ifdef SUPPORT_UCP
eptr--;
BACKCHAR(eptr);
@@ -3577,10 +3592,10 @@ for (;;)
eptr++;
}
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3635,10 +3650,10 @@ for (;;)
if (fc != RAWUCHARTEST(eptr)) break;
eptr++;
}
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3815,7 +3830,7 @@ for (;;)
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = 1; i <= min; i++)
@@ -3853,7 +3868,7 @@ for (;;)
}
}
else
-#endif
+#endif /*SUPPORT_UTF */
/* Not UTF mode */
{
for (fi = min;; fi++)
@@ -3895,17 +3910,18 @@ for (;;)
if (fc == d || (unsigned int)foc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = min; i < max; i++)
@@ -3918,9 +3934,10 @@ for (;;)
if (fc == *eptr || foc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@@ -4030,12 +4047,13 @@ for (;;)
if (fc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
@@ -4053,9 +4071,10 @@ for (;;)
if (fc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@@ -5613,12 +5632,13 @@ for (;;)
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
if (utf) BACKCHAR(eptr);
}
}
@@ -5655,13 +5675,13 @@ for (;;)
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
-
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
for (;;) /* Move back over one extended */
{
if (!utf) c = *eptr; else
@@ -5936,18 +5956,13 @@ for (;;)
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character, unless we are matching \R and the last thing matched was
- \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
@@ -6185,15 +6200,10 @@ for (;;)
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character (byte), unless we are matching \R and the last thing matched
- was \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
diff --git a/testdata/testinput1 b/testdata/testinput1
index 6319a25..c5f21e0 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5588,4 +5588,24 @@ AbcdCBefgBhiBqz
/(*THEN:m(m)(?&y)(?(DEFINE)(?<y>b))/K
abc
+/^\d*\w{4}/
+ 1234
+ 123
+
+/^[^b]*\w{4}/
+ aaaa
+ aaa
+
+/^[^b]*\w{4}/i
+ aaaa
+ aaa
+
+/^a*\w{4}/
+ aaaa
+ aaa
+
+/^a*\w{4}/i
+ aaaa
+ aaa
+
/-- End of testinput1 --/
diff --git a/testdata/testinput4 b/testdata/testinput4
index a6ebcba..5d77ada 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -691,4 +691,32 @@
\x{fdee}
\x{fdef}
+/^\d*\w{4}/8
+ 1234
+ 123
+
+/^\p{Any}*\d{4}/8
+ 1234
+ 123
+
+/^\X*\w{4}/8
+ 1234
+ 123
+
+/^[^b]*\w{4}/8
+ aaaa
+ aaa
+
+/^[^b]*\w{4}/8i
+ aaaa
+ aaa
+
+/^\x{100}*.{4}/8
+ \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+
+/^\x{100}*.{4}/8i
+ \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+
/-- End of testinput4 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index c07816b..4033b10 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9172,4 +9172,34 @@ MK: m(m
0: b
MK: m(m
+/^\d*\w{4}/
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^[^b]*\w{4}/
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^[^b]*\w{4}/i
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^a*\w{4}/
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^a*\w{4}/i
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
/-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index a3f799a..de64502 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -4348,7 +4348,7 @@ Minimum match() recursion limit = 6
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\M
Minimum match() limit = 32768
-Minimum match() recursion limit = 42
+Minimum match() recursion limit = 29
No match
/(aaa(?C1)bbb|ab)/I
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 10b17e7..0b8a306 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1227,4 +1227,46 @@ MK: a\x{a3}b
\x{fdef}
0: \x{fdef}
+/^\d*\w{4}/8
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^\p{Any}*\d{4}/8
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^\X*\w{4}/8
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^[^b]*\w{4}/8
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^[^b]*\w{4}/8i
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^\x{100}*.{4}/8
+ \x{100}\x{100}\x{100}\x{100}
+ 0: \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+No match
+
+/^\x{100}*.{4}/8i
+ \x{100}\x{100}\x{100}\x{100}
+ 0: \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+No match
+
/-- End of testinput4 --/