summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2014-09-17 04:36:57 +0200
committerYves Orton <demerphq@gmail.com>2014-09-17 04:47:34 +0200
commit1645b83c5ceecd8a95db0310d80125d8b188eb83 (patch)
tree897530317120932fc88111dda78ff9b9d0b50a10 /regcomp.c
parentaa48e906ca55e0da8e1317549a4ddafff3837f3f (diff)
downloadperl-1645b83c5ceecd8a95db0310d80125d8b188eb83.tar.gz
Perl RT #122761 - split /\A/ should not behave like split /^/m
Long ago a weird special case was hacked into split so that it treated C<split /^/> as if it was C<split /^/m>. At the time this was done by letting the split PP code inspect the pattern, and IFF it matched "^\0" the special behavior was enabled (which also bypasses using the regex engine for matching.) Later on when we added pluggable regex engines and when we encountered various counter-intuitive behaviors related to split we changed who this worked so that the regex engine would set flags appropriate for split to use. This meant that regex plugins using totally different regex syntax could still enable the optimisation. At the same time I modified how we detected this pattern type by looking at the *compiled* regops, and not the raw pattern. This had the side effect of making things like C< split /(?:)^/ > also enable the optimisation. Unfortunately this did not play nicely with the fact that /^/ produces an SBOL node, as does /\A/, but we definitely don't want C<split /\A/> to behave like C<split /^/m>. In fact C<split /\A/> should behave like a noop (which means there is room for a future optimisation here if someone cares to implement it.) In the discussion attached to the ticket I propose what I consider to be a better fix, default split patterns to be compiled by default with the /m modifier enabled. This patch does NOT do this. It is instead the "simple" patch. This means that C<split /^/> behaves like C<split /^/m> but C<split /^x/> does NOT behave like C<split /^x/m> which I consider to be a bug which I will fix in a future patch.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c14
1 files changed, 13 insertions, 1 deletions
diff --git a/regcomp.c b/regcomp.c
index 22be5ccc14..0c4dfe7fbb 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -7188,7 +7188,12 @@ reStudy:
if (PL_regkind[fop] == NOTHING && nop == END)
r->extflags |= RXf_NULL;
- else if (PL_regkind[fop] == BOL && nop == END)
+ else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
+ /* when fop is SBOL first->flags will be true only when it was
+ * produced by parsing /\A/, and not when parsing /^/. This is
+ * very important for the split code as there we want to
+ * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
+ * See rt #122761 for more details. -- Yves */
r->extflags |= RXf_START_ONLY;
else if (fop == PLUS
&& PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
@@ -11427,6 +11432,11 @@ tryagain:
case 'A':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, SBOL);
+ /* SBOL is shared with /^/ so we set the flags so we can tell
+ * /\A/ from /^/ in split. We check ret because first pass we
+ * have no regop struct to set the flags on. */
+ if (PASS2)
+ ret->flags = 1;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
@@ -16205,6 +16215,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+ else if (OP(o) == SBOL)
+ Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
#else
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(sv);