summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorDavid Mitchell <davem@iabyn.com>2013-05-31 16:16:46 +0100
committerDavid Mitchell <davem@iabyn.com>2013-06-02 22:28:54 +0100
commit1cb95af747617861d08c1213f05d8f1e4b55f942 (patch)
tree8ee92adf125643d951a48729f6818c1341c390f2 /regexec.c
parent2ac8ff4bf339d781f96fdfabd34d199a7f9569da (diff)
downloadperl-1cb95af747617861d08c1213f05d8f1e4b55f942.tar.gz
better document the regex super-linear pos cache
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c45
1 files changed, 41 insertions, 4 deletions
diff --git a/regexec.c b/regexec.c
index e1580ded42..a144138dd3 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5097,7 +5097,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
maxopenparen = 0;
- /* XXXX This is too dramatic a measure... */
+ /* invalidate the S-L poscache. We're now executing a
+ * different set of WHILEM ops (and their associated
+ * indexes) against the same string, so the bits in the
+ * cache are meaningless. Setting maxiter to zero forces
+ * the cache to be invalidated and zeroed before reuse.
+ * XXX This is too dramatic a measure. Ideally we should
+ * save the old cache and restore when running the outer
+ * pattern again */
reginfo->poscache_maxiter = 0;
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
@@ -5128,7 +5135,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
cur_eval = ST.prev_eval;
cur_curlyx = ST.prev_curlyx;
- /* XXXX This is too dramatic a measure... */
+ /* Invalidate cache. See "invalidate" comment above. */
reginfo->poscache_maxiter = 0;
if ( nochange_depth )
nochange_depth--;
@@ -5147,7 +5154,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
regcppop(rex, &maxopenparen);
cur_eval = ST.prev_eval;
cur_curlyx = ST.prev_curlyx;
- /* XXXX This is too dramatic a measure... */
+ /* Invalidate cache. See "invalidate" comment above. */
reginfo->poscache_maxiter = 0;
if ( nochange_depth )
nochange_depth--;
@@ -5426,7 +5433,37 @@ NULL
goto do_whilem_B_max;
}
- /* super-linear cache processing */
+ /* super-linear cache processing.
+ *
+ * The idea here is that for certain types of CURLYX/WHILEM -
+ * principally those whose upper bound is infinity (and
+ * excluding regexes that have things like \1 and other very
+ * non-regular expresssiony things), then if a pattern like
+ * /....A*.../ fails and we backtrack to the WHILEM, then we
+ * make a note that this particular WHILEM op was at string
+ * position 47 (say) when the rest of pattern failed. Then, if
+ * we ever find ourselves back at that WHILEM, and at string
+ * position 47 again, we can just fail immediately rather than
+ * running the rest of the pattern again.
+ *
+ * This is very handy when patterns start to go
+ * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
+ * with a combinatorial explosion of backtracking.
+ *
+ * The cache is implemented as a bit array, with one bit per
+ * string byte position per WHILEM op (up to 16) - so its
+ * between 0.25 and 2x the string size.
+ *
+ * To avoid allocating a poscache buffer every time, we do an
+ * initially countdown; only after we have executed a WHILEM
+ * op (string-length x #WHILEMs) times do we allocate the
+ * cache.
+ *
+ * The top 4 bits of scan->flags byte say how many different
+ * relevant CURLLYX/WHILEM op pairs there are, while the
+ * bottom 4-bits is the identifying index number of this
+ * WHILEM.
+ */
if (scan->flags) {