diff options
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | lib/unicore/mktables | 38 | ||||
-rw-r--r-- | regcomp.c | 46 | ||||
-rw-r--r-- | regcomp.h | 18 | ||||
-rwxr-xr-x | t/op/pat.t | 3 | ||||
-rw-r--r-- | t/op/reg_posixcc.t | 127 |
6 files changed, 220 insertions, 13 deletions
@@ -4017,6 +4017,7 @@ t/op/read.t See if read() works t/op/recurse.t See if deep recursion works t/op/ref.t See if refs and objects work t/op/reg_email.t See if regex recursion works by parsing email addresses +t/op/reg_posixcc.t See if posix characterclasses behave consistantly t/op/reg_email_thr.t See if regex recursion works by parsing email addresses in another thread t/op/regexp_noamp.t See if regular expressions work with optimizations t/op/regexp_notrie.t See if regular expressions work without trie optimisation diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 64de8b1fee..242465d548 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -779,6 +779,31 @@ sub simple_dumper { ## ## Process UnicodeData.txt (Categories, etc.) ## +# These are the character mappings as defined in the POSIX standard +# and in the case of PerlSpace and PerlWord as is defined in the test macros +# for binary strings. IOW, PerlWord is [A-Za-z_] and PerlSpace is [\f\r\n\t ] +# This differs from Word and the existing SpacePerl (note the prefix/suffix difference) +# which is basically the Unicode WhiteSpace without the vertical tab included +# +my %TRUE_POSIX_PERL_CC= ( + PosixAlnum => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x0061..0x007a )}, + PosixAlpha => { map { $_ => 1 } ( 0x0041..0x005a, 0x0061..0x007a )}, + # Not Needed: Ascii => { map { $_ => 1 } ( 0x0000..0x007f )}, + PosixBlank => { map { $_ => 1 } ( 0x0009, 0x0020 )}, + PosixCntrl => { map { $_ => 1 } ( 0x0000..0x001f, 0x007f )}, + PosixGraph => { map { $_ => 1 } ( 0x0021..0x007e )}, + PosixLower => { map { $_ => 1 } ( 0x0061..0x007a )}, + PosixPrint => { map { $_ => 1 } ( 0x0020..0x007e )}, + PosixPunct => { map { $_ => 1 } ( 0x0021..0x002f, 0x003a..0x0040, 0x005b..0x0060, 0x007b..0x007e )}, + PosixSpace => { map { $_ => 1 } ( 0x0009..0x000d, 0x0020 )}, + PosixUpper => { map { $_ => 1 } ( 0x0041..0x005a )}, + # Not needed: PosixXdigit => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x0046, 0x0061..0x0066 )}, + PosixDigit => { map { $_ => 1 } ( 0x0030..0x0039 )}, + + PerlSpace => { map { $_ => 1 } ( 0x0009..0x000a, 0x000c..0x000d, 0x0020 )}, + PerlWord => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x005f, 0x0061..0x007a )}, +); + sub UnicodeData_Txt() { my $Bidi = Table->New(); @@ -795,7 +820,7 @@ sub UnicodeData_Txt() $DC{can} = Table->New(); $DC{com} = Table->New(); - ## Initialize Perl-generated categories + ## Initialize Broken Perl-generated categories ## (Categories from UnicodeData.txt are auto-initialized in gencat) $Cat{Alnum} = Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0); @@ -839,6 +864,10 @@ sub UnicodeData_Txt() $To{Title} = Table->New(); $To{Digit} = Table->New(); + foreach my $cat (keys %TRUE_POSIX_PERL_CC) { + $Cat{$cat} = Table->New(Is=>$cat, Fuzzy => 0); + } + sub gencat($$$$) { my ($name, ## Name ("LATIN CAPITAL LETTER A") @@ -920,6 +949,13 @@ sub UnicodeData_Txt() $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9 || ($code >= 0x41 && $code <= 0x46) ## A..F || ($code >= 0x61 && $code <= 0x66); ## a..f + if ($code<=0x7F) { + foreach my $cat (keys %TRUE_POSIX_PERL_CC) { + if ($TRUE_POSIX_PERL_CC{$cat}{$code}) { + $Cat{$cat}->$op($code); + } + } + } } ## open ane read file..... @@ -7705,6 +7705,22 @@ case ANYOF_N##NAME: \ what = WORD; \ break +/* + We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test + so that it is possible to override the option here without having to + rebuild the entire core. as we are required to do if we change regcomp.h + which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. +*/ +#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS +#define BROKEN_UNICODE_CHARCLASS_MAPPINGS +#endif + +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS +#define POSIX_CC_UNI_NAME(CCNAME) CCNAME +#else +#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME +#endif + /* parse a class specification and produce either an ANYOF node that matches the pattern or if the pattern matches a single char only and @@ -7991,18 +8007,24 @@ parseit: * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { + + case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); + case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); + case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); + case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); + case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); + case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); + case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); + case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); + case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); + case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS case _C_C_T_(ALNUM, isALNUM(value), "Word"); - case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum"); - case _C_C_T_(ALPHA, isALPHA(value), "Alpha"); - case _C_C_T_(BLANK, isBLANK(value), "Blank"); - case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl"); - case _C_C_T_(GRAPH, isGRAPH(value), "Graph"); - case _C_C_T_(LOWER, isLOWER(value), "Lower"); - case _C_C_T_(PRINT, isPRINT(value), "Print"); - case _C_C_T_(PSXSPC, isPSXSPC(value), "Space"); - case _C_C_T_(PUNCT, isPUNCT(value), "Punct"); case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); - case _C_C_T_(UPPER, isUPPER(value), "Upper"); +#else + case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); +#endif case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); @@ -8049,7 +8071,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '+'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_NDIGIT: if (LOC) @@ -8062,7 +8084,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '!'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_MAX: /* this is to handle \p and \P */ @@ -18,6 +18,24 @@ typedef OP OP_4tree; /* Will be redefined later. */ /* Be really agressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 +/* Use old style unicode mappings for perl and posix character classes + * + * NOTE: Enabling this essentially breaks character class matching against unicode + * strings, so that POSIX char classes match when they shouldn't, and \d matches + * way more than 10 characters, and sometimes a charclass and its complement either + * both match or neither match. + * NOTE: Disabling this will cause various backwards compatibility issues to rear + * their head, and tests to fail. However it will make the charclass behaviour + * consistant regardless of internal string type, and make character class inversions + * consistant. The tests that fail in the regex engine are basically broken tests. + * + * Personally I think 5.12 should disable this for sure. Its a bit more debatable for + * 5.10, so for now im leaving it enabled. + * + * -demerphq + */ +#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1 + /* Should the optimiser take positive assertions into account? */ #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0 diff --git a/t/op/pat.t b/t/op/pat.t index 774e10b7f0..ecd19acef4 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4630,6 +4630,9 @@ if ($::running_as_thread) { } SKIP: { + # XXX: This set of tests is essentially broken, POSIX character classes + # should not have differing definitions under unicode. + # There are property names for that. unless ($ordA == 65) { skip("Assumes ASCII", 4) } my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/} diff --git a/t/op/reg_posixcc.t b/t/op/reg_posixcc.t new file mode 100644 index 0000000000..73353993b2 --- /dev/null +++ b/t/op/reg_posixcc.t @@ -0,0 +1,127 @@ +#!perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; +} + +use strict; +use warnings; +use Test::More tests => 1; +my @pats=( + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" ); +sub rangify { + my $ary= shift; + my $fmt= shift || '%d'; + my $sep= shift || ' '; + my $rng= shift || '..'; + + + my $first= $ary->[0]; + my $last= $ary->[0]; + my $ret= sprintf $fmt, $first; + for my $idx (1..$#$ary) { + if ( $ary->[$idx] != $last + 1) { + if ($last!=$first) { + $ret.=sprintf "%s$fmt",$rng, $last; + } + $first= $last= $ary->[$idx]; + $ret.=sprintf "%s$fmt",$sep,$first; + } else { + $last= $ary->[$idx]; + } + } + if ( $last != $first) { + $ret.=sprintf "%s$fmt",$rng, $last; + } + return $ret; +} + +my $description = ""; +while (@pats) { + my ($yes,$no)= splice @pats,0,2; + + my %err_by_type; + my %singles; + foreach my $b (0..255) { + my %got; + for my $type ('unicode','not-unicode') { + my $str=chr($b).chr($b); + if ($type eq 'unicode') { + $str.=chr(256); + chop $str; + } + if ($str=~/[$yes][$no]/) { + push @{$err_by_type{$type}},$b; + } + $got{"[$yes]"}{$type} = $str=~/[$yes]/ ? 1 : 0; + $got{"[$no]"}{$type} = $str=~/[$no]/ ? 1 : 0; + $got{"[^$yes]"}{$type} = $str=~/[^$yes]/ ? 1 : 0; + $got{"[^$no]"}{$type} = $str=~/[^$no]/ ? 1 : 0; + } + foreach my $which ("[$yes]","[$no]","[^$yes]","[^$no]") { + if ($got{$which}{'unicode'} != $got{$which}{'not-unicode'}) { + push @{$singles{$which}},$b; + } + } + } + + + if (%err_by_type || %singles) { + $description||=" Error:\n"; + $description .= "/[$yes][$no]/\n"; + if (%err_by_type) { + foreach my $type (keys %err_by_type) { + $description .= "\tmatches $type codepoints:\t"; + $description .= rangify($err_by_type{$type}); + $description .= "\n"; + } + $description .= "\n"; + } + if (%singles) { + $description .= "Unicode/Nonunicode mismatches:\n"; + foreach my $type (keys %singles) { + $description .= "\t$type:\t"; + $description .= rangify($singles{$type}); + $description .= "\n"; + } + $description .= "\n"; + } + + } + +} +TODO: { + local $TODO = "Only works under PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS = 0"; + is( $description, "", "POSIX and perl charclasses should not depend on string type"); +}; +__DATA__ |