regexec.c: Stop looking for match sooner

This is a partial reversion of commit 7c1b9f38fcbfdb3a9e1766e02bcb991d1a5452d9 which went unnecessarily far in fixing the problem. After studying the situation some more, I see more clearly what was going on. The point is that if you have only 2 characters left in the string, but the pattern requires 3 to work, it's guaranteed to fail, so pointless, and unnecessary work, to try. So don't being a match trial at a position when there are fewer than the minimum number of characters necessary. That is what the code before that commit did. However it neglected the fact that it is possible for a single character to match multiple ones, so there is not a 1:1 ratio. This new commit assumes the worst possible ratio to calculate how far into a string is the furthest a successful match could start. This is going to in most cases still look too far, but it is much better than always going up to the final character, as the previous patch did. The maximum ratio is guaranteed by Unicode to be 3:1, but when the target isn't in UTF-8, the max is 2:1, determined simply by inspection of the defined folds. And actually, currently, the single case where it isn't 1:1 doesn't come up here, because regcomp.c guarantees that that match doesn't generate one of these EXACTFish nodes. However, I expect that to change for 5.16, and so am preparing for that case by making it 2:1.
author: Karl Williamson <public@khwilliamson.com> 2011-10-16 11:43:08 -0600
committer: Karl Williamson <public@khwilliamson.com> 2011-10-17 17:04:28 -0600
commit: e067297c376fbbb5a0dc8428c65d922f11e1f4c6 (patch)
tree: 593b25c52b8899ff520f4842e536a6c0baba17ab /regexec.c
parent: 8a90a8fee1032a1bdee2a164f8265ff160fe22f0 (diff)
download: perl-e067297c376fbbb5a0dc8428c65d922f11e1f4c6.tar.gz
1 files changed, 26 insertions, 2 deletions
diff --git a/regexec.c b/regexec.c
index 4abb71234c..6100fce1e1 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1528,6 +1528,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    break;
 
 	do_exactf_utf8:
+	{
+	    unsigned expansion;
+
 
 	    /* If one of the operands is in utf8, we can't use the simpler
 	     * folding above, due to the fact that many different characters
@@ -1540,13 +1543,33 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		    ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
 		    : ln;
 
-	    /* Set the end position to the final character available */
-	    e = HOP3c(strend, -1, s);
+	    /* We have 'lnc' characters to match in the pattern, but because of
+	     * multi-character folding, each character in the target can match
+	     * up to 3 characters (Unicode guarantees it will never exceed
+	     * this) if it is utf8-encoded; and up to 2 if not (based on the
+	     * fact that the Latin 1 folds are already determined, and the
+	     * only multi-char fold in that range is the sharp-s folding to
+	     * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
+	     * string character.  Adjust lnc accordingly, always matching at
+	     * least 1 */
+	    expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
+	    lnc = (lnc < expansion) ? 1 : lnc / expansion;
+
+	    /* As in the non-UTF8 case, if we have to match 3 characters, and
+	     * only 2 are left, it's guaranteed to fail, so don't start a
+	     * match that would require us to go beyond the end of the string
+	     */
+	    e = HOP3c(strend, -((I32)lnc), s);
 
 	    if (!reginfo && e < s) {
 		e = s;			/* Due to minlen logic of intuit() */
 	    }
 
+	    /* XXX Note that we could recalculate e every so-often through the
+	     * loop to stop earlier, as the worst case expansion above will
+	     * rarely be met, and as we go along we would usually find that e
+	     * moves further to the left.  Unclear if worth the expense */
+
 	    while (s <= e) {
 		char *my_strend= (char *)strend;
 		if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
@@ -1558,6 +1581,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		s += UTF8SKIP(s);
 	    }
 	    break;
+	}
 	case BOUNDL:
 	    PL_reg_flags |= RF_tainted;
 	    FBC_BOUND(isALNUM_LC,
author	Karl Williamson <public@khwilliamson.com>	2011-10-16 11:43:08 -0600
committer	Karl Williamson <public@khwilliamson.com>	2011-10-17 17:04:28 -0600
commit	e067297c376fbbb5a0dc8428c65d922f11e1f4c6 (patch)
tree	593b25c52b8899ff520f4842e536a6c0baba17ab /regexec.c
parent	8a90a8fee1032a1bdee2a164f8265ff160fe22f0 (diff)
download	perl-e067297c376fbbb5a0dc8428c65d922f11e1f4c6.tar.gz