summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-09-22 13:59:39 -0600
committerKarl Williamson <khw@cpan.org>2014-09-29 13:07:07 -0600
commitb35552de5cea8eb47ccb046284ecb9a099430255 (patch)
tree9cddbc9de1b38404bcf6fdae9e65f46b5a5d3e79 /regcomp.c
parentdea37815c59831b7e586fa51968348fbb8009e1a (diff)
downloadperl-b35552de5cea8eb47ccb046284ecb9a099430255.tar.gz
Tighten uses of regex synthetic start class
A synthetic start class (SSC) is generated by the regular expression pattern compiler to give a consolidation of all the possible things that can match at the beginning of where a pattern can possibly match. For example qr/a?bfoo/; requires the match to begin with either an 'a' or a 'b'. There are no other possibilities. We can set things up to quickly scan for either of these in the target string, and only when one of these is found do we need to look for 'foo'. There is an overhead associated with using SSCs. If the number of possibilities that the SSC excludes is relatively small, it can be counter-productive to use them. This patch creates a crude sieve to decide whether to use an SSC or not. If the SSC doesn't exclude at least half the "likely" possiblities, it is discarded. This patch is a starting point, and can be refined if necessary as we gain experience. See thread beginning with http://nntp.perl.org/group/perl.perl5.porters/212644 In many patterns, no SSC is generated; and with the advent of tries, SSC's have become less important, so whatever we do is not terribly critical.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c69
1 files changed, 67 insertions, 2 deletions
diff --git a/regcomp.c b/regcomp.c
index 5703cf0ef2..c8df34806e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1444,6 +1444,71 @@ S_ssc_clear_locale(regnode_ssc *ssc)
ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
}
+#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+STATIC bool
+S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
+{
+ /* The synthetic start class is used to hopefully quickly winnow down
+ * places where a pattern could start a match in the target string. If it
+ * doesn't really narrow things down that much, there isn't much point to
+ * having the overhead of using it. This function uses some very crude
+ * heuristics to decide if to use the ssc or not.
+ *
+ * It returns TRUE if 'ssc' rules out more than half what it considers to
+ * be the "likely" possible matches, but of course it doesn't know what the
+ * actual things being matched are going to be; these are only guesses
+ *
+ * For /l matches, it assumes that the only likely matches are going to be
+ * in the 0-255 range, uniformly distributed, so half of that is 127
+ * For /a and /d matches, it assumes that the likely matches will be just
+ * the ASCII range, so half of that is 63
+ * For /u and there isn't anything matching above the Latin1 range, it
+ * assumes that that is the only range likely to be matched, and uses
+ * half that as the cut-off: 127. If anything matches above Latin1,
+ * it assumes that all of Unicode could match (uniformly), except for
+ * non-Unicode code points and things in the General Category "Other"
+ * (unassigned, private use, surrogates, controls and formats). This
+ * is a much large number. */
+
+ const U32 max_match = (LOC)
+ ? 127
+ : (! UNI_SEMANTICS)
+ ? 63
+ : (invlist_highest(ssc->invlist) < 256)
+ ? 127
+ : ((NON_OTHER_COUNT + 1) / 2) - 1;
+ U32 count = 0; /* Running total of number of code points matched by
+ 'ssc' */
+ UV start, end; /* Start and end points of current range in inversion
+ list */
+
+ PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
+
+ invlist_iterinit(ssc->invlist);
+ while (invlist_iternext(ssc->invlist, &start, &end)) {
+
+ /* /u is the only thing that we expect to match above 255; so if not /u
+ * and even if there are matches above 255, ignore them. This catches
+ * things like \d under /d which does match the digits above 255, but
+ * since the pattern is /d, it is not likely to be expecting them */
+ if (! UNI_SEMANTICS) {
+ if (start > 255) {
+ break;
+ }
+ end = MIN(end, 255);
+ }
+ count += end - start + 1;
+ if (count > max_match) {
+ invlist_iterfinish(ssc->invlist);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+
STATIC void
S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
{
@@ -7072,7 +7137,7 @@ reStudy:
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
- && !ssc_is_anything(data.start_class))
+ && is_ssc_worth_it(pRExC_state, data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
@@ -7152,7 +7217,7 @@ reStudy:
= r->float_substr = r->float_utf8 = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
- && ! ssc_is_anything(data.start_class))
+ && is_ssc_worth_it(pRExC_state, data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));