summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-03-18 08:36:17 -0600
committerKarl Williamson <public@khwilliamson.com>2011-03-18 09:18:00 -0600
commitdd58aee1a749e6188a04cac2e4ba58a7004c1ec1 (patch)
tree49bd56e35c794fe000447b3aff2d9d0a6ee3dc11
parentfff7535cca3d7faa5b59aa82de74bd71e938ea92 (diff)
downloadperl-dd58aee1a749e6188a04cac2e4ba58a7004c1ec1.tar.gz
regex: Fix locale regression
Things like \S have not been accessible to the synthetic start class under locale matching rules. They have been placed there, but the start class didn't know they were there. This patch sets ANYOF_CLASS in initializing the synthetic start class so that downstream code knows it is a charclass_class, and removes the code that partially allowed this bit to be shared, and which isn't needed in 5.14, and more thought would have to go into doing it than was reflected in the code. I can't come up with a test case that would verify that this works, because of general locale testing issues, except it looked at a dump of the generated regex synthetic start class, but the dump isn't the same thing as the real behavior, and using one is also subject to breakage if the regex code changes in the slightest.
-rw-r--r--regcomp.c29
-rw-r--r--regcomp.h49
2 files changed, 28 insertions, 50 deletions
diff --git a/regcomp.c b/regcomp.c
index 75da2bfa06..addc0d0266 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -728,7 +728,8 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c
ANYOF_BITMAP_SETALL(cl);
ANYOF_CLASS_ZERO(cl); /* all bits set, so class is irrelevant */
- cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+ cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
+ |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
/* If any portion of the regex is to operate under locale rules,
* initialization includes it. The reason this isn't done for all regexes
@@ -775,8 +776,9 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
/* These two functions currently do the exact same thing */
#define cl_init_zero S_cl_init
-/* 'And' a given class with another one. Can create false positives */
-/* cl should not be inverted */
+/* 'AND' a given class with another one. Can create false positives. 'cl'
+ * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
+ * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
STATIC void
S_cl_and(struct regnode_charclass_class *cl,
const struct regnode_charclass_class *and_with)
@@ -866,8 +868,9 @@ S_cl_and(struct regnode_charclass_class *cl,
}
}
-/* 'OR' a given class with another one. Can create false positives */
-/* cl should not be inverted */
+/* 'OR' a given class with another one. Can create false positives. 'cl'
+ * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
+ * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
STATIC void
S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
{
@@ -9542,20 +9545,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
-#ifdef ANYOF_ADD_LOC_SKIP
- if (LOC) {
- RExC_size += ANYOF_ADD_LOC_SKIP;
- }
-#endif
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
}
else {
RExC_emit += ANYOF_SKIP;
if (LOC) {
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
-#ifdef ANYOF_ADD_LOC_SKIP
- RExC_emit += ANYOF_ADD_LOC_SKIP;
-#endif
}
ANYOF_BITMAP_ZERO(ret);
listsv = newSVpvs("# comment\n");
@@ -9784,14 +9779,10 @@ parseit:
if (LOC && namedclass < ANYOF_MAX && ! need_class) {
need_class = 1;
if (SIZE_ONLY) {
-#ifdef ANYOF_CLASS_ADD_SKIP
- RExC_size += ANYOF_CLASS_ADD_SKIP;
-#endif
+ RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
}
else {
-#ifdef ANYOF_CLASS_ADD_SKIP
- RExC_emit += ANYOF_CLASS_ADD_SKIP;
-#endif
+ RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
ANYOF_CLASS_ZERO(ret);
}
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
diff --git a/regcomp.h b/regcomp.h
index cc6708b0e8..b701cb6681 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -309,14 +309,13 @@ struct regnode_charclass_class {
/* Flags for node->flags of ANYOF. These are in short supply, so some games
* are done to share them, as described below. If necessary, the ANYOF_LOCALE
- * and ANYOF_CLASS bits could be shared with a space penalty for locale nodes
- * (and the code at the time this comment was written, is written so that all
- * that is necessary to make the change would be to redefine the ANYOF_CLASS
- * define). Once the planned change to compile all the above-latin1 code points
- * is done, then the UNICODE_ALL bit can be freed up. If flags need to be
- * added that are applicable to the synthetic start class only, with some work,
- * they could be put in the next-node field, or in an unused bit of the
- * classflags field. */
+ * and ANYOF_CLASS bits could be shared with a space penalty for locale nodes,
+ * but this isn't quite so easy, as the optimizer also uses ANYOF_CLASS.
+ * Once the planned change to compile all the above-latin1 code points is done,
+ * then the UNICODE_ALL bit can be freed up, with a small performance penalty.
+ * If flags need to be added that are applicable to the synthetic start class
+ * only, with some work, they could be put in the next-node field, or in an
+ * unused bit of the classflags field. */
#define ANYOF_LOCALE 0x01
@@ -331,8 +330,10 @@ struct regnode_charclass_class {
#define ANYOF_INVERT 0x04
-/* CLASS is never set unless LOCALE is too: has runtime \d, \w, [:posix:], ...
- * The non-locale ones are resolved at compile-time */
+/* Set if this is a struct regnode_charclass_class vs a regnode_charclass. This
+ * is used for runtime \d, \w, [:posix:], ..., which are used only in locale
+ * and the optimizer's synthetic start class. Non-locale \d, etc are resolved
+ * at compile-time */
#define ANYOF_CLASS 0x08
#define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */
@@ -457,28 +458,14 @@ struct regnode_charclass_class {
#define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode))
#define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode))
-/* The class bit can be set to the locale one if necessary to save bits at the
- * expense of having locale ANYOF nodes always have a class bit map, and hence
- * take up extra space. This allows convenient changing it as development
- * proceeds on this */
-#if ANYOF_CLASS == ANYOF_LOCALE
-# undef ANYOF_CLASS_ADD_SKIP
-# define ANYOF_ADD_LOC_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP)
-
- /* Quicker way to see if there are actually any tests. This is because
- * currently the set of tests can be empty even when the class bitmap is
- * allocated */
-# if ANYOF_CLASSBITMAP_SIZE != 4
-# error ANYOF_CLASSBITMAP_SIZE is expected to be 4
-# endif
-# define ANYOF_CLASS_TEST_ANY_SET(p) /* assumes sizeof(p) = 4 */ \
- memNE (((struct regnode_charclass_class*)(p))->classflags, \
- "\0\0\0\0", ANYOF_CLASSBITMAP_SIZE)
-#else
-# define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP)
-# undef ANYOF_ADD_LOC_SKIP
-# define ANYOF_CLASS_TEST_ANY_SET(p) (ANYOF_FLAGS(p) & ANYOF_CLASS)
+#if ANYOF_CLASSBITMAP_SIZE != 4
+# error ANYOF_CLASSBITMAP_SIZE is expected to be 4
#endif
+#define ANYOF_CLASS_TEST_ANY_SET(p) ((ANYOF_FLAGS(p) & ANYOF_CLASS) \
+ && memNE (((struct regnode_charclass_class*)(p))->classflags, \
+ "\0\0\0\0", ANYOF_CLASSBITMAP_SIZE))
+/*#define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP)
+ * */
/*