From 1645b83c5ceecd8a95db0310d80125d8b188eb83 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Wed, 17 Sep 2014 04:36:57 +0200 Subject: Perl RT #122761 - split /\A/ should not behave like split /^/m Long ago a weird special case was hacked into split so that it treated C<split /^/> as if it was C<split /^/m>. At the time this was done by letting the split PP code inspect the pattern, and IFF it matched "^\0" the special behavior was enabled (which also bypasses using the regex engine for matching.) Later on, when we added pluggable regex engines and when we encountered various counter-intuitive behaviors related to split, we changed how this worked so that the regex engine would set flags appropriate for split to use. This meant that regex plugins using totally different regex syntax could still enable the optimisation. At the same time I modified how we detected this pattern type by looking at the *compiled* regops, and not the raw pattern. This had the side effect of making things like C< split /(?:)^/ > also enable the optimisation. Unfortunately this did not play nicely with the fact that /^/ produces an SBOL node, as does /\A/, but we definitely don't want C<split /\A/> to behave like C<split /^/m>. In fact C<split /\A/> should behave like a noop (which means there is room for a future optimisation here if someone cares to implement it.) In the discussion attached to the ticket I propose what I consider to be a better fix: compiling split patterns with the /m modifier enabled by default. This patch does NOT do this. It is instead the "simple" patch. This means that C<split /^/> behaves like C<split /^/m> but C<split /^x/> does NOT behave like C<split /^x/m>, which I consider to be a bug which I will fix in a future patch. 
--- regcomp.c | 14 +++++++++++++- regcomp.sym | 1 + t/op/split.t | 7 +++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/regcomp.c b/regcomp.c index 22be5ccc14..0c4dfe7fbb 100644 --- a/regcomp.c +++ b/regcomp.c @@ -7188,7 +7188,12 @@ reStudy: if (PL_regkind[fop] == NOTHING && nop == END) r->extflags |= RXf_NULL; - else if (PL_regkind[fop] == BOL && nop == END) + else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END) + /* when fop is SBOL first->flags will be true only when it was + * produced by parsing /\A/, and not when parsing /^/. This is + * very important for the split code as there we want to + * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. + * See rt #122761 for more details. -- Yves */ r->extflags |= RXf_START_ONLY; else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE @@ -11427,6 +11432,11 @@ tryagain: case 'A': RExC_seen_zerolen++; ret = reg_node(pRExC_state, SBOL); + /* SBOL is shared with /^/ so we set the flags so we can tell + * /\A/ from /^/ in split. We check ret because first pass we + * have no regop struct to set the flags on. */ + if (PASS2) + ret->flags = 1; *flagp |= SIMPLE; goto finish_meta_pat; case 'G': @@ -16205,6 +16215,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); + else if (OP(o) == SBOL) + Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(sv); diff --git a/regcomp.sym b/regcomp.sym index b285647086..6908712a05 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -25,6 +25,7 @@ END END, no ; End of program. SUCCEED END, no ; Return from a subroutine, basically. 
#* Line Start Anchors: +#Note flags field for SBOL indicates if it is a /^/ or a /\A/ SBOL BOL, no ; Match "" at beginning of line: /^/, /\A/ MBOL BOL, no ; Same, assuming multiline: /^/m diff --git a/t/op/split.t b/t/op/split.t index 007eb00f68..2d038ed8da 100644 --- a/t/op/split.t +++ b/t/op/split.t @@ -6,7 +6,7 @@ BEGIN { set_up_inc('../lib'); } -plan tests => 119; +plan tests => 120; $FS = ':'; @@ -180,7 +180,10 @@ is($cnt, scalar(@ary)); # /^/ treated as /^/m $_ = join ':', split /^/, "ab\ncd\nef\n"; -is($_, "ab\n:cd\n:ef\n"); +is($_, "ab\n:cd\n:ef\n","check that split /^/ is treated as split /^/m"); + +$_ = join ':', split /\A/, "ab\ncd\nef\n"; +is($_, "ab\ncd\nef\n","check that split /\A/ is NOT treated as split /^/m"); # see if @a = @b = split(...) optimization works @list1 = @list2 = split ('p',"a p b c p"); -- cgit v1.2.1