summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-09-05 09:09:28 -0600
committerKarl Williamson <khw@cpan.org>2014-09-06 21:44:49 -0600
commit8f0cd35a38dde9ab975f5ee1a663b81939e17745 (patch)
tree1b79e320980b4937f349841c068458ce5d68c529 /regcomp.c
parenta5454c469023876ca9422440f302f587dba2a438 (diff)
downloadperl-8f0cd35a38dde9ab975f5ee1a663b81939e17745.tar.gz
Allow \N{named seq} in qr/[...]/
This commit changes the regex handler to properly match in many instances a \N{named sequence} in a bracketed character class. A named sequence is one which consists of a string of multiple characters but is given one name. Unicode has hundreds of them, like LATIN CAPITAL LETTER A WITH MACRON AND GRAVE. These are encoded by Unicode when there is some user community that thinks of the conglomeration as a single unit, but there was no prior standard that had it so, and it is possible to encode it in Unicode using other means, typically a sequence of a base character followed by some combining marks. (If there had not been such a prior standard, 8859-1, things like LATIN CAPITAL LETTER A WITH GRAVE would have been put into Unicode this way too.) If they did not do it this way, they would run out of available code points much sooner. Not having these as single characters adds a burden to the programmer having to deal with them. Hiding this detail as much as possible makes it easier to program. This commit hides this in one more place than previously. It takes advantage of the infrastructure added some releases ago dealing with the fact that the match of some single characters case-insensitively can be 2 or even 3 characters. "ss" =~ /[ß]/i; is the most prominent example. We earlier discovered that /[^ß]/ leads to unexpected behavior, and using one of these sequences as an endpoint in a range is also unclear as to what is meant. This commit leaves existing behavior for those cases. That behavior is to use just the first code point in the sequence for regular [...], and to generate a fatal syntax error for (?[...]).
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c51
1 files changed, 29 insertions, 22 deletions
diff --git a/regcomp.c b/regcomp.c
index f531026d65..73ad315c29 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13329,24 +13329,28 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
}
STATIC AV *
-S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
- /* This adds the string scalar <multi_fold> to the array
- * <multi_char_matches>. <multi_fold> is known to have exactly
+ /* This adds the string scalar <multi_string> to the array
+ * <multi_char_matches>. <multi_string> is known to have exactly
* <cp_count> code points in it. This is used when constructing a
* bracketed character class and we find something that needs to match more
* than a single character.
*
- * <multi_char_matches> is actually an array of arrays. There will be one
- * or two top-level elements: [2], and/or [3]. The [2] element is an
- * array, each element thereof is a character which folds to TWO
- * characters; [3] is for folds to THREE characters. (Unicode guarantees a
- * maximum of 3 characters in any fold.) When we rewrite the character
- * class below, we will do so such that the longest folds are written
- * first, so that it prefers the longest matching strings first. This is
- * done even if it turns out that any quantifier is non-greedy, out of
- * programmer laziness. Tom Christiansen has agreed that this is ok. This
- * makes the test for the ligature 'ffi' come before the test for 'ff' */
+ * <multi_char_matches> is actually an array of arrays. Each top-level
+ * element is an array that contains all the strings known so far that are
+ * the same length. And that length (in number of code points) is the same
+ * as the index of the top-level array. Hence, the [2] element is an
+ * array, each element thereof is a string containing TWO code points; while element
+ * [3] is for strings of THREE characters, and so on. Since this is for
+ * multi-char strings there can never be a [0] nor [1] element.
+ *
+ * When we rewrite the character class below, we will do so such that the
+ * longest strings are written first, so that it prefers the longest
+ * matching strings first. This is done even if it turns out that any
+ * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
+ * Christiansen has agreed that this is ok. This makes the test for the
+ * ligature 'ffi' come before the test for 'ff', for example */
AV* this_array;
AV** this_array_ptr;
@@ -13366,7 +13370,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_
av_store(multi_char_matches, cp_count,
(SV*) this_array);
}
- av_push(this_array, multi_fold);
+ av_push(this_array, multi_string);
return multi_char_matches;
}
@@ -13650,23 +13654,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
}
}
else { /* cp_count > 1 */
- /* We only pay attention to the first char of
- * multichar strings being returned in char
- * classes. I kinda wonder if this makes sense as
- * it does change the behaviour from earlier
- * versions, OTOH that behaviour was broken as
- * well. XXX Solution is to recharacterize as
- * [rest-of-class]|multi1|multi2... */
+ if (! RExC_in_multi_char_class) {
+ if (invert || range || *RExC_parse == '-') {
if (strict) {
RExC_parse--;
- vFAIL("\\N{} in character class restricted to one character");
+ vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
}
else if (PASS2) {
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
}
+ }
+ else {
+ multi_char_matches
+ = add_multi_match(multi_char_matches,
+ as_text,
+ cp_count);
+ }
break; /* <value> contains the first code
point. Drop out of the switch to
process it */
+ }
} /* End of cp_count != 1 */
/* This element should not be processed further in this