summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2008-11-07 20:20:21 +0000
committerDavid Mitchell <davem@iabyn.com>2009-01-05 21:04:00 +0000
commita4a5e9c8b03b429df0f41c267c6465848d52ed49 (patch)
treeb801e257dc5e9f2f3675cc0d1e8bccee04f809b3 /regcomp.c
parentd95b79a3450f7055135f9dc4421d80f9ebe23c91 (diff)
downloadperl-a4a5e9c8b03b429df0f41c267c6465848d52ed49.tar.gz
create new unicode props as defined in POSIX spec (optionally use them in the regex engine)
Perlbug #60156 and #49302 (and probably others) resolve down to the problem that the definition of \s and \w and \d and the POSIX charclasses are different for unicode strings and for non-unicode strings. This broke the character class logic in the regex engine. The easiest fix to make the character class logic sane again is to define new properties which do match. This change creates new property classes that can be used instead of the traditional ones (it does not change the previously defined ones). If the define in regcomp.h: is changed to 0, then the new mappings will be used. This will fix a bunch of bugs that are reported as TODO items in the new reg_posixcc.t test file. p4raw-id: //depot/perl@34769 (cherry picked from commit da7fcca4b8d6fb4dc88e0305bf9830bf24912ebd)
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c46
1 files changed, 34 insertions, 12 deletions
diff --git a/regcomp.c b/regcomp.c
index 10f89947ff..c9d8677536 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -7705,6 +7705,22 @@ case ANYOF_N##NAME: \
what = WORD; \
break
+/*
+ We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+ so that it is possible to override the option here without having to
+ rebuild the entire core. as we are required to do if we change regcomp.h
+ which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or if the pattern matches a single char only and
@@ -7991,18 +8007,24 @@ parseit:
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
+
+ case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+ case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+ case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+ case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+ case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+ case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+ case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+ case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+ case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+ case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
case _C_C_T_(ALNUM, isALNUM(value), "Word");
- case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
- case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
- case _C_C_T_(BLANK, isBLANK(value), "Blank");
- case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
- case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
- case _C_C_T_(LOWER, isLOWER(value), "Lower");
- case _C_C_T_(PRINT, isPRINT(value), "Print");
- case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
- case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
- case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+ case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+ case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif
case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -8049,7 +8071,7 @@ parseit:
ANYOF_BITMAP_SET(ret, value);
}
yesno = '+';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_NDIGIT:
if (LOC)
@@ -8062,7 +8084,7 @@ parseit:
ANYOF_BITMAP_SET(ret, value);
}
yesno = '!';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_MAX:
/* this is to handle \p and \P */