diff options
-rw-r--r-- | lib/encoding.pm | 23 | ||||
-rw-r--r-- | lib/encoding.t | 16 | ||||
-rw-r--r-- | regcomp.c | 15 | ||||
-rw-r--r-- | regexec.c | 44 |
4 files changed, 75 insertions, 23 deletions
diff --git a/lib/encoding.pm b/lib/encoding.pm index 6f5970f2ca..94ee3231fb 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -57,14 +57,33 @@ encoding pragma you can change this default. The pragma is a per script, not a per block lexical. Only the last C<use encoding> matters, and it affects B<the whole script>. +Notice that only literals (string or regular expression) having only +legacy code points are affected: if you mix data like this + + \xDF\x{100} + +the data is assumed to be in (Latin 1 and) Unicode, not in your native +encoding. In other words, this will match in "greek": + + "\xDF" =~ /\x{3af}/ + +but this will not + + "\xDF\x{100}" =~ /\x{3af}\x{100}/ + +since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> +because of the C<\x{100}> on the left. You should not be mixing your +legacy data and Unicode in the same string. + If no encoding is specified, the environment variable L<PERL_ENCODING> is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no encoding can be found, C<Unknown encoding '...'> error will be thrown. =head1 KNOWN PROBLEMS -Literals in regular expressions are not affected by this pragma. -They very probably should. +For native multibyte encodings (either fixed or variable length) +the current implementation of the regular expressions may introduce +recoding errors for longer regular expression literals than 127 bytes. =head1 SEE ALSO diff --git a/lib/encoding.t b/lib/encoding.t index 03634410f4..8b14c8881a 100644 --- a/lib/encoding.t +++ b/lib/encoding.t @@ -1,4 +1,4 @@ -print "1..15\n"; +print "1..19\n"; use encoding "latin1"; # ignored (overwritten by the next line) use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) @@ -68,3 +68,17 @@ print "ok 14\n"; print "not " unless ord(substr($c, 2, 1)) == 0x3af; print "ok 15\n"; +# regex literals + +print "not " unless "\xDF" =~ /\x{3AF}/; +print "ok 16\n"; + +print "not " unless "\x{3AF}" =~ /\xDF/; +print "ok 17\n"; + +print "not " unless "\xDF" =~ /\xDF/; +print "ok 18\n"; + +print "not " unless "\x{3AF}" =~ /\x{3AF}/; +print "ok 19\n"; + @@ -3155,6 +3155,21 @@ tryagain: break; } + if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) { + STRLEN oldlen = STR_LEN(ret); + SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); + char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + STRLEN newlen = SvCUR(sv); + if (!SIZE_ONLY) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", + oldlen, STRING(ret), newlen, s)); + Copy(s, STRING(ret), newlen, char); + STR_LEN(ret) += newlen - oldlen; + RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); + } else + RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); + } + return(ret); } @@ -408,7 +408,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, PL_reg_flags |= RF_utf8; if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) { - DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n")); + DEBUG_r(PerlIO_printf(Perl_debug_log, + "String too short... [re_intuit_start]\n")); goto fail; } strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; @@ -1474,19 +1475,10 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } minlen = prog->minlen; - if (do_utf8 && !(prog->reganch & ROPT_CANY_SEEN)) { - if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) { - DEBUG_r(PerlIO_printf(Perl_debug_log, - "Too short (in characters)...\n")); - goto phooey; - } - } - else { - if (strend - startpos < minlen) { - DEBUG_r(PerlIO_printf(Perl_debug_log, - "Too short (in bytes)...\n")); - goto phooey; - } + if (strend - startpos < minlen) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "String too short [regexec_flags]...\n")); + goto phooey; } /* Check validity of program. */ @@ -2215,14 +2207,26 @@ S_regmatch(pTHX_ regnode *prog) char *l = locinput; char *e = s + ln; STRLEN len; + if (do_utf8) while (s < e) { + UV uv; + if (l >= PL_regeol) sayNO; - if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len)) - sayNO; - s += len; - l += len; + uv = NATIVE_TO_UNI(*(U8*)s); + if (UTF8_IS_START(uv)) { + len = UTF8SKIP(s); + if (memNE(s, l, len)) + sayNO; + l += len; + s += len; + } else { + if (uv != utf8_to_uvchr((U8*)l, &len)) + sayNO; + l += len; + s ++; + } } else while (s < e) { @@ -2230,8 +2234,8 @@ S_regmatch(pTHX_ regnode *prog) sayNO; if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len)) sayNO; - s+=len; - l++; + s += len; + l ++; } locinput = l; nextchr = UCHARAT(locinput); |