summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-11-24 13:19:21 -0700
committerKarl Williamson <khw@cpan.org>2014-11-24 13:43:07 -0700
commitc7d255944c0b238f9cec18e728822535d42a9ed2 (patch)
tree4ac5dfc5e6cbd25c3a26fad3f166b37ab639acca /regcomp.c
parent22e7ef05c1f7a7fcd58d10d6e720579b9bbea728 (diff)
downloadperl-c7d255944c0b238f9cec18e728822535d42a9ed2.tar.gz
Make /[\N{}-\N{}]/ match Unicodely on EBCDIC
This makes [\N{U+06}-\N{U+09}] match U+06, U+07, U+08, and U+09 even on EBCDIC platforms, allowing one to write portable ranges. On EBCDIC code page 1047, that range therefore matches the native code points 0x2E, 0x2F, 0x16, and 0x05. Thanks to Yaroslav Kuzmin for finding a bug in an earlier incarnation of this patch.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c82
1 files changed, 56 insertions, 26 deletions
diff --git a/regcomp.c b/regcomp.c
index 442d0ba171..85a142edf2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13742,6 +13742,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
/* In a range, counts how many 0-2 of the ends of it came from literals,
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
UV literal_endpoint = 0;
+
+ /* Is the range Unicode? If so, then on a platform whose native character
+ * set is not 1-1 with Unicode (i.e. non-ASCII), each code point in the
+ * range should be considered to be a Unicode value. */
+ bool unicode_range = FALSE;
#endif
bool invert = FALSE; /* Is this class to be complemented */
@@ -13947,8 +13952,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
}
/* Here, is a single code point, and <value> contains it */
#ifdef EBCDIC
- /* We consider named characters to be literal characters */
+ /* We consider named characters to be literal characters,
+ * and they are Unicode */
literal_endpoint++;
+ unicode_range = TRUE;
#endif
}
break;
@@ -14406,8 +14413,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
* minus sign */
if (range) {
+#ifdef EBCDIC
+ /* For Unicode ranges, we have to test that the Unicode values, as
+ * opposed to the native ones, are not decreasing. (Above 255, there
+ * is no difference between native and Unicode.) */
+ if (unicode_range && prevvalue < 255 && value < 255) {
+ if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
+ goto backwards_range;
+ }
+ }
+ else
+#endif
if (prevvalue > value) /* b-a */ {
- const int w = RExC_parse - rangebegin;
+ int w;
+#ifdef EBCDIC
+ backwards_range:
+#endif
+ w = RExC_parse - rangebegin;
vFAIL2utf8f(
"Invalid [] range \"%"UTF8f"\"",
UTF8fARG(UTF, w, rangebegin));
@@ -14542,32 +14564,40 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
prevvalue, value);
#else
- SV* this_range = _new_invlist(1);
- _append_range_to_invlist(this_range, prevvalue, value);
-
- /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
- * If this range was specified using something like 'i-j', we want
- * to include only the 'i' and the 'j', and not anything in
- * between, so exclude non-ASCII, non-alphabetics from it.
- * However, if the range was specified with something like
- * [\x89-\x91] or [\x89-j], all code points within it should be
- * included. literal_endpoint==2 means both ends of the range used
- * a literal character, not \x{foo} */
- if (literal_endpoint == 2
- && ((isLOWER_A(prevvalue) && isLOWER_A(value))
- || (isUPPER_A(prevvalue) && isUPPER_A(value))))
+ /* On non-ASCII platforms, for ranges that span all of 0..255, and
+ * ones that don't require special handling, we can just add the
+ * range like we do for ASCII platforms */
+ if ((UNLIKELY(prevvalue == 0) && value >= 255)
+ || ! (prevvalue < 256
+ && (unicode_range
+ || (literal_endpoint == 2
+ && ((isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue)
+ && isUPPER_A(value)))))))
{
- _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
- &this_range);
-
- /* Since 'this_range' now only contains ascii, the intersection
- * of it with anything will still yield only ascii */
- _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
- &this_range);
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ prevvalue, value);
+ }
+ else {
+ /* Here, requires special handling. This can be because it is
+ * a range whose code points are considered to be Unicode, and
+ * so must be individually translated into native, or because
+ * it's a subrange of 'A-Z' or 'a-z' which each aren't
+ * contiguous in EBCDIC, but we have defined them to include
+ * only the "expected" upper or lower case ASCII alphabetics.
+ * Subranges above 255 are the same in native and Unicode, so
+ * can be added as a range */
+ U8 start = NATIVE_TO_LATIN1(prevvalue);
+ unsigned j;
+ U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
+ for (j = start; j <= end; j++) {
+ cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
+ }
+ if (value > 255) {
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ 256, value);
+ }
}
- _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
- literal_endpoint = 0;
- SvREFCNT_dec_NN(this_range);
#endif
}