Perl RT #122761 - split /\A/ should not behave like split /^/m

Long ago a weird special case was hacked into split so that it treated C<split /^/> as if it was C<split /^/m>. At the time this was done by letting the split PP code inspect the pattern, and IFF it matched "^\0" the special behavior was enabled (which also bypasses using the regex engine for matching.) Later on when we added pluggable regex engines and when we encountered various counter-intuitive behaviors related to split we changed who this worked so that the regex engine would set flags appropriate for split to use. This meant that regex plugins using totally different regex syntax could still enable the optimisation. At the same time I modified how we detected this pattern type by looking at the *compiled* regops, and not the raw pattern. This had the side effect of making things like C< split /(?:)^/ > also enable the optimisation. Unfortunately this did not play nicely with the fact that /^/ produces an SBOL node, as does /\A/, but we definitely don't want C<split /\A/> to behave like C<split /^/m>. In fact C<split /\A/> should behave like a noop (which means there is room for a future optimisation here if someone cares to implement it.) In the discussion attached to the ticket I propose what I consider to be a better fix, default split patterns to be compiled by default with the /m modifier enabled. This patch does NOT do this. It is instead the "simple" patch. This means that C<split /^/> behaves like C<split /^/m> but C<split /^x/> does NOT behave like C<split /^x/m> which I consider to be a bug which I will fix in a future patch.
author: Yves Orton <demerphq@gmail.com> 2014-09-17 04:36:57 +0200
committer: Yves Orton <demerphq@gmail.com> 2014-09-17 04:47:34 +0200
commit: 1645b83c5ceecd8a95db0310d80125d8b188eb83 (patch)
tree: 897530317120932fc88111dda78ff9b9d0b50a10
parent: aa48e906ca55e0da8e1317549a4ddafff3837f3f (diff)
download: perl-1645b83c5ceecd8a95db0310d80125d8b188eb83.tar.gz
3 files changed, 19 insertions, 3 deletions
diff --git a/regcomp.c b/regcomp.c
index 22be5ccc14..0c4dfe7fbb 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -7188,7 +7188,12 @@ reStudy:
 
         if (PL_regkind[fop] == NOTHING && nop == END)
             r->extflags |= RXf_NULL;
-        else if (PL_regkind[fop] == BOL && nop == END)
+        else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
+            /* when fop is SBOL first->flags will be true only when it was
+             * produced by parsing /\A/, and not when parsing /^/. This is
+             * very important for the split code as there we want to
+             * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
+             * See rt #122761 for more details. -- Yves */
             r->extflags |= RXf_START_ONLY;
         else if (fop == PLUS
                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
@@ -11427,6 +11432,11 @@ tryagain:
 	case 'A':
 	    RExC_seen_zerolen++;
 	    ret = reg_node(pRExC_state, SBOL);
+            /* SBOL is shared with /^/ so we set the flags so we can tell
+             * /\A/ from /^/ in split. We check ret because first pass we
+             * have no regop struct to set the flags on. */
+            if (PASS2)
+                ret->flags = 1;
 	    *flagp |= SIMPLE;
 	    goto finish_meta_pat;
 	case 'G':
@@ -16205,6 +16215,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
     }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
 	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+    else if (OP(o) == SBOL)
+        Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
 #else
     PERL_UNUSED_CONTEXT;
     PERL_UNUSED_ARG(sv);
diff --git a/regcomp.sym b/regcomp.sym
index b285647086..6908712a05 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -25,6 +25,7 @@ END         END,        no        ; End of program.
 SUCCEED     END,        no        ; Return from a subroutine, basically.
 
 #* Line Start Anchors:
+#Note flags field for SBOL indicates if it is a /^/ or a /\A/
 SBOL        BOL,        no        ; Match "" at beginning of line: /^/, /\A/
 MBOL        BOL,        no        ; Same, assuming multiline: /^/m
 
diff --git a/t/op/split.t b/t/op/split.t
index 007eb00f68..2d038ed8da 100644
--- a/t/op/split.t
+++ b/t/op/split.t
@@ -6,7 +6,7 @@ BEGIN {
     set_up_inc('../lib');
 }
 
-plan tests => 119;
+plan tests => 120;
 
 $FS = ':';
 
@@ -180,7 +180,10 @@ is($cnt, scalar(@ary));
 
 # /^/ treated as /^/m
 $_ = join ':', split /^/, "ab\ncd\nef\n";
-is($_, "ab\n:cd\n:ef\n");
+is($_, "ab\n:cd\n:ef\n","check that split /^/ is treated as split /^/m");
+
+$_ = join ':', split /\A/, "ab\ncd\nef\n";
+is($_, "ab\ncd\nef\n","check that split /\A/ is NOT treated as split /^/m");
 
 # see if @a = @b = split(...) optimization works
 @list1 = @list2 = split ('p',"a p b c p");
author	Yves Orton <demerphq@gmail.com>	2014-09-17 04:36:57 +0200
committer	Yves Orton <demerphq@gmail.com>	2014-09-17 04:47:34 +0200
commit	1645b83c5ceecd8a95db0310d80125d8b188eb83 (patch)
tree	897530317120932fc88111dda78ff9b9d0b50a10
parent	aa48e906ca55e0da8e1317549a4ddafff3837f3f (diff)
download	perl-1645b83c5ceecd8a95db0310d80125d8b188eb83.tar.gz