summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANIFEST1
-rw-r--r--lib/unicore/mktables38
-rw-r--r--regcomp.c46
-rw-r--r--regcomp.h18
-rwxr-xr-xt/op/pat.t3
-rw-r--r--t/op/reg_posixcc.t127
6 files changed, 220 insertions, 13 deletions
diff --git a/MANIFEST b/MANIFEST
index b315e0b3d4..45a0dd6a98 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -4017,6 +4017,7 @@ t/op/read.t See if read() works
t/op/recurse.t See if deep recursion works
t/op/ref.t See if refs and objects work
t/op/reg_email.t See if regex recursion works by parsing email addresses
+t/op/reg_posixcc.t See if posix characterclasses behave consistantly
t/op/reg_email_thr.t See if regex recursion works by parsing email addresses in another thread
t/op/regexp_noamp.t See if regular expressions work with optimizations
t/op/regexp_notrie.t See if regular expressions work without trie optimisation
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 64de8b1fee..242465d548 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -779,6 +779,31 @@ sub simple_dumper {
##
## Process UnicodeData.txt (Categories, etc.)
##
+# These are the character mappings as defined in the POSIX standard
+# and in the case of PerlSpace and PerlWord as is defined in the test macros
+# for binary strings. IOW, PerlWord is [A-Za-z_] and PerlSpace is [\f\r\n\t ]
+# This differs from Word and the existing SpacePerl (note the prefix/suffix difference)
+# which is basically the Unicode WhiteSpace without the vertical tab included
+#
+my %TRUE_POSIX_PERL_CC= (
+ PosixAlnum => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x0061..0x007a )},
+ PosixAlpha => { map { $_ => 1 } ( 0x0041..0x005a, 0x0061..0x007a )},
+ # Not Needed: Ascii => { map { $_ => 1 } ( 0x0000..0x007f )},
+ PosixBlank => { map { $_ => 1 } ( 0x0009, 0x0020 )},
+ PosixCntrl => { map { $_ => 1 } ( 0x0000..0x001f, 0x007f )},
+ PosixGraph => { map { $_ => 1 } ( 0x0021..0x007e )},
+ PosixLower => { map { $_ => 1 } ( 0x0061..0x007a )},
+ PosixPrint => { map { $_ => 1 } ( 0x0020..0x007e )},
+ PosixPunct => { map { $_ => 1 } ( 0x0021..0x002f, 0x003a..0x0040, 0x005b..0x0060, 0x007b..0x007e )},
+ PosixSpace => { map { $_ => 1 } ( 0x0009..0x000d, 0x0020 )},
+ PosixUpper => { map { $_ => 1 } ( 0x0041..0x005a )},
+ # Not needed: PosixXdigit => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x0046, 0x0061..0x0066 )},
+ PosixDigit => { map { $_ => 1 } ( 0x0030..0x0039 )},
+
+ PerlSpace => { map { $_ => 1 } ( 0x0009..0x000a, 0x000c..0x000d, 0x0020 )},
+ PerlWord => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x005f, 0x0061..0x007a )},
+);
+
sub UnicodeData_Txt()
{
my $Bidi = Table->New();
@@ -795,7 +820,7 @@ sub UnicodeData_Txt()
$DC{can} = Table->New();
$DC{com} = Table->New();
- ## Initialize Perl-generated categories
+ ## Initialize Broken Perl-generated categories
## (Categories from UnicodeData.txt are auto-initialized in gencat)
$Cat{Alnum} =
Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0);
@@ -839,6 +864,10 @@ sub UnicodeData_Txt()
$To{Title} = Table->New();
$To{Digit} = Table->New();
+ foreach my $cat (keys %TRUE_POSIX_PERL_CC) {
+ $Cat{$cat} = Table->New(Is=>$cat, Fuzzy => 0);
+ }
+
sub gencat($$$$)
{
my ($name, ## Name ("LATIN CAPITAL LETTER A")
@@ -920,6 +949,13 @@ sub UnicodeData_Txt()
$Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9
|| ($code >= 0x41 && $code <= 0x46) ## A..F
|| ($code >= 0x61 && $code <= 0x66); ## a..f
+ if ($code<=0x7F) {
+ foreach my $cat (keys %TRUE_POSIX_PERL_CC) {
+ if ($TRUE_POSIX_PERL_CC{$cat}{$code}) {
+ $Cat{$cat}->$op($code);
+ }
+ }
+ }
}
## open ane read file.....
diff --git a/regcomp.c b/regcomp.c
index 10f89947ff..c9d8677536 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -7705,6 +7705,22 @@ case ANYOF_N##NAME: \
what = WORD; \
break
+/*
+ We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+ so that it is possible to override the option here without having to
+ rebuild the entire core. as we are required to do if we change regcomp.h
+ which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or if the pattern matches a single char only and
@@ -7991,18 +8007,24 @@ parseit:
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
+
+ case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+ case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+ case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+ case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+ case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+ case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+ case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+ case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+ case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+ case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
case _C_C_T_(ALNUM, isALNUM(value), "Word");
- case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
- case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
- case _C_C_T_(BLANK, isBLANK(value), "Blank");
- case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
- case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
- case _C_C_T_(LOWER, isLOWER(value), "Lower");
- case _C_C_T_(PRINT, isPRINT(value), "Print");
- case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
- case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
- case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+ case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+ case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif
case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -8049,7 +8071,7 @@ parseit:
ANYOF_BITMAP_SET(ret, value);
}
yesno = '+';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_NDIGIT:
if (LOC)
@@ -8062,7 +8084,7 @@ parseit:
ANYOF_BITMAP_SET(ret, value);
}
yesno = '!';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
diff --git a/regcomp.h b/regcomp.h
index 535b7a98c7..9bf567701c 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -18,6 +18,24 @@ typedef OP OP_4tree; /* Will be redefined later. */
/* Be really agressive about optimising patterns with trie sequences? */
#define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
+/* Use old style unicode mappings for perl and posix character classes
+ *
+ * NOTE: Enabling this essentially breaks character class matching against unicode
+ * strings, so that POSIX char classes match when they shouldn't, and \d matches
+ * way more than 10 characters, and sometimes a charclass and its complement either
+ * both match or neither match.
+ * NOTE: Disabling this will cause various backwards compatibility issues to rear
+ * their head, and tests to fail. However it will make the charclass behaviour
+ * consistant regardless of internal string type, and make character class inversions
+ * consistant. The tests that fail in the regex engine are basically broken tests.
+ *
+ * Personally I think 5.12 should disable this for sure. Its a bit more debatable for
+ * 5.10, so for now im leaving it enabled.
+ *
+ * -demerphq
+ */
+#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1
+
/* Should the optimiser take positive assertions into account? */
#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0
diff --git a/t/op/pat.t b/t/op/pat.t
index 774e10b7f0..ecd19acef4 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4630,6 +4630,9 @@ if ($::running_as_thread) {
}
SKIP: {
+ # XXX: This set of tests is essentially broken, POSIX character classes
+ # should not have differing definitions under unicode.
+ # There are property names for that.
unless ($ordA == 65) { skip("Assumes ASCII", 4) }
my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/}
diff --git a/t/op/reg_posixcc.t b/t/op/reg_posixcc.t
new file mode 100644
index 0000000000..73353993b2
--- /dev/null
+++ b/t/op/reg_posixcc.t
@@ -0,0 +1,127 @@
+#!perl
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+}
+
+use strict;
+use warnings;
+use Test::More tests => 1;
+my @pats=(
+ "\\w",
+ "\\W",
+ "\\s",
+ "\\S",
+ "\\d",
+ "\\D",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:space:]",
+ "[:^space:]",
+ "[:blank:]",
+ "[:^blank:]" );
+sub rangify {
+ my $ary= shift;
+ my $fmt= shift || '%d';
+ my $sep= shift || ' ';
+ my $rng= shift || '..';
+
+
+ my $first= $ary->[0];
+ my $last= $ary->[0];
+ my $ret= sprintf $fmt, $first;
+ for my $idx (1..$#$ary) {
+ if ( $ary->[$idx] != $last + 1) {
+ if ($last!=$first) {
+ $ret.=sprintf "%s$fmt",$rng, $last;
+ }
+ $first= $last= $ary->[$idx];
+ $ret.=sprintf "%s$fmt",$sep,$first;
+ } else {
+ $last= $ary->[$idx];
+ }
+ }
+ if ( $last != $first) {
+ $ret.=sprintf "%s$fmt",$rng, $last;
+ }
+ return $ret;
+}
+
+my $description = "";
+while (@pats) {
+ my ($yes,$no)= splice @pats,0,2;
+
+ my %err_by_type;
+ my %singles;
+ foreach my $b (0..255) {
+ my %got;
+ for my $type ('unicode','not-unicode') {
+ my $str=chr($b).chr($b);
+ if ($type eq 'unicode') {
+ $str.=chr(256);
+ chop $str;
+ }
+ if ($str=~/[$yes][$no]/) {
+ push @{$err_by_type{$type}},$b;
+ }
+ $got{"[$yes]"}{$type} = $str=~/[$yes]/ ? 1 : 0;
+ $got{"[$no]"}{$type} = $str=~/[$no]/ ? 1 : 0;
+ $got{"[^$yes]"}{$type} = $str=~/[^$yes]/ ? 1 : 0;
+ $got{"[^$no]"}{$type} = $str=~/[^$no]/ ? 1 : 0;
+ }
+ foreach my $which ("[$yes]","[$no]","[^$yes]","[^$no]") {
+ if ($got{$which}{'unicode'} != $got{$which}{'not-unicode'}) {
+ push @{$singles{$which}},$b;
+ }
+ }
+ }
+
+
+ if (%err_by_type || %singles) {
+ $description||=" Error:\n";
+ $description .= "/[$yes][$no]/\n";
+ if (%err_by_type) {
+ foreach my $type (keys %err_by_type) {
+ $description .= "\tmatches $type codepoints:\t";
+ $description .= rangify($err_by_type{$type});
+ $description .= "\n";
+ }
+ $description .= "\n";
+ }
+ if (%singles) {
+ $description .= "Unicode/Nonunicode mismatches:\n";
+ foreach my $type (keys %singles) {
+ $description .= "\t$type:\t";
+ $description .= rangify($singles{$type});
+ $description .= "\n";
+ }
+ $description .= "\n";
+ }
+
+ }
+
+}
+TODO: {
+ local $TODO = "Only works under PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS = 0";
+ is( $description, "", "POSIX and perl charclasses should not depend on string type");
+};
+__DATA__