summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc2
-rw-r--r--lib/diagnostics.t2
-rw-r--r--pod/perldelta.pod14
-rw-r--r--pod/perldiag.pod31
-rw-r--r--pod/perlrecharclass.pod64
-rw-r--r--proto.h4
-rw-r--r--regcomp.c51
-rw-r--r--t/re/pat_advanced.t5
-rw-r--r--t/re/reg_mesg.t6
9 files changed, 122 insertions, 57 deletions
diff --git a/embed.fnc b/embed.fnc
index d25c78ed47..88adce209b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2099,7 +2099,7 @@ Es |void |set_ANYOF_arg |NN RExC_state_t* const pRExC_state \
|NULLOK SV* const swash \
|const bool has_user_defined_property
Es |AV* |add_multi_match|NULLOK AV* multi_char_matches \
- |NN SV* multi_fold \
+ |NN SV* multi_string \
|const STRLEN cp_count
Es |regnode*|regclass |NN RExC_state_t *pRExC_state \
|NN I32 *flagp|U32 depth|const bool stop_at_1 \
diff --git a/lib/diagnostics.t b/lib/diagnostics.t
index 4ac2ebfe2b..0b35d16c06 100644
--- a/lib/diagnostics.t
+++ b/lib/diagnostics.t
@@ -106,7 +106,7 @@ seek STDERR, 0,0;
$warning = '';
warn "Using just the first character returned by \\N{} in character class in regex; marked by <-- HERE in m/%s/";
like $warning,
- qr/A charnames handler may return a sequence/s,
+ qr/Named Unicode character escapes/s,
'multi-line entries in perldiag.pod match';
# ; at end of entry in perldiag.pod
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 016c1bfd48..4d601dd5f2 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -443,6 +443,20 @@ Under some conditions a warning raised in compilation of regular
expression patterns could be displayed multiple times. This is now
fixed.
+=item *
+
+C<qr/[\N{named sequence}]/> now works properly in many instances. Some
+names known to C<\N{...}> refer to a sequence of multiple characters,
+instead of the usual single character. Bracketed character classes
+generally only match single characters, but now special handling has
+been added so that they can match named sequences, but not if the class
+is inverted or the sequence is specified as the beginning or end of a
+range. In these cases, the only behavior change from before is a slight
+rewording of the fatal error message given when this class is part of a
+C<(?[...])> construct.  When the C<[...]> stands alone, the same
+non-fatal warning as before is raised, and only the first character in
+the sequence is used, again just as before.
+
=back
=head1 Known Problems
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index df94c98a19..80b60028be 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3362,15 +3362,23 @@ bracketed character class, for the same reason that C<.> in a character
class loses its specialness: it matches almost everything, which is
probably not what you want.
-=item \N{} in character class restricted to one character in regex; marked
+=item \N{} in inverted character class or as a range end-point is restricted to one character in regex; marked
by S<<-- HERE> in m/%s/
(F) Named Unicode character escapes C<(\N{...})> may return a
-multi-character sequence. Such an escape may not be used in
-a character class, because character classes always match one
-character of input. Check that the correct escape has been used,
-and the correct charname handler is in scope. The S<<-- HERE> shows
-whereabouts in the regular expression the problem was discovered.
+multi-character sequence. Even though a character class is supposed to
+match just one character of input, perl will match the whole thing
+correctly, except when the class is inverted (C<[^...]>), or the escape
+is the beginning or final end point of a range. The mathematically
+logical behavior for what matches when inverting is very different than
+what people expect, so we have decided to forbid it.
+Similarly unclear is what should be generated when the C<\N{...}> is
+used as one of the end points of the range, such as in
+
+ [\x{41}-\N{ARABIC SEQUENCE YEH WITH HAMZA ABOVE WITH AE}]
+
+What is meant here is unclear, as the C<\N{...}> escape is a sequence of
+code points, so this is made an error.
=item \N{NAME} must be resolved by the lexer in regex; marked by
S<<-- HERE> in m/%s/
@@ -6507,9 +6515,14 @@ You need to add either braces or blanks to disambiguate.
=item Using just the first character returned by \N{} in character class in
regex; marked by S<<-- HERE> in m/%s/
-(W regexp) A charnames handler may return a sequence of more than one
-character. Currently all but the first one are discarded when used in
-a regular expression pattern bracketed character class.
+(W regexp) Named Unicode character escapes C<(\N{...})> may return a
+multi-character sequence. Even though a character class is supposed to
+match just one character of input, perl will match the whole thing
+correctly, except when the class is inverted (C<[^...]>), or the escape
+is the beginning or final end point of a range. For these, what should
+happen isn't clear at all. In these circumstances, Perl discards all
+but the first character of the returned sequence, which is not likely
+what you want.
=item Using !~ with %s doesn't make sense
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index a8dda141a9..5cd0ae7aab 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -457,30 +457,59 @@ Examples:
-------
-* There is an exception to a bracketed character class matching a
-single character only. When the class is to match caselessly under C</i>
-matching rules, and a character that is explicitly mentioned inside the
-class matches a
+* There are two exceptions to a bracketed character class matching a
+single character only. Each requires special handling by Perl to make
+things work:
+
+=over
+
+=item *
+
+When the class is to match caselessly under C</i> matching rules, and a
+character that is explicitly mentioned inside the class matches a
multiple-character sequence caselessly under Unicode rules, the class
-(when not L<inverted|/Negation>) will also match that sequence. For
-example, Unicode says that the letter C<LATIN SMALL LETTER SHARP S>
-should match the sequence C<ss> under C</i> rules. Thus,
+will also match that sequence. For example, Unicode says that the
+letter C<LATIN SMALL LETTER SHARP S> should match the sequence C<ss>
+under C</i> rules. Thus,
'ss' =~ /\A\N{LATIN SMALL LETTER SHARP S}\z/i # Matches
'ss' =~ /\A[aeioust\N{LATIN SMALL LETTER SHARP S}]\z/i # Matches
-For this to happen, the character must be explicitly specified, and not
-be part of a multi-character range (not even as one of its endpoints).
-(L</Character Ranges> will be explained shortly.) Therefore,
+For this to happen, the class must not be inverted (see L</Negation>)
+and the character must be explicitly specified, and not be part of a
+multi-character range (not even as one of its endpoints). (L</Character
+Ranges> will be explained shortly.) Therefore,
'ss' =~ /\A[\0-\x{ff}]\z/i # Doesn't match
'ss' =~ /\A[\0-\N{LATIN SMALL LETTER SHARP S}]\z/i # No match
- 'ss' =~ /\A[\xDF-\xDF]\z/i # Matches on ASCII platforms, since \XDF
- # is LATIN SMALL LETTER SHARP S, and the
- # range is just a single element
+ 'ss' =~ /\A[\xDF-\xDF]\z/i # Matches on ASCII platforms, since
+ # \xDF is LATIN SMALL LETTER SHARP S,
+ # and the range is just a single
+ # element
Note that it isn't a good idea to specify these types of ranges anyway.
+=item *
+
+Some names known to C<\N{...}> refer to a sequence of multiple characters,
+instead of the usual single character. When one of these is included in
+the class, the entire sequence is matched. For example,
+
+ "\N{TAMIL LETTER KA}\N{TAMIL VOWEL SIGN AU}"
+ =~ / ^ [\N{TAMIL SYLLABLE KAU}] $ /x;
+
+matches, because C<\N{TAMIL SYLLABLE KAU}> is a named sequence
+consisting of the two characters matched against. Like the other
+instance where a bracketed class can match multiple characters, and for
+similar reasons, the class must not be inverted, and the named sequence
+may not appear in a range, even one where it is both endpoints. If
+these happen, it is a fatal error if the character class is within an
+extended L<C<(?[...])>|/Extended Bracketed Character Classes>
+class; and only the first code point is used (with
+a C<regexp>-type warning raised) otherwise.
+
+=back
+
=head3 Special Characters Inside a Bracketed Character Class
Most characters that are meta characters in regular expressions (that
@@ -597,9 +626,10 @@ the caret as one of the characters to match, either escape the caret or
else don't list it first.
In inverted bracketed character classes, Perl ignores the Unicode rules
-that normally say that certain characters should match a sequence of
-multiple characters under caseless C</i> matching. Following those
-rules could lead to highly confusing situations:
+that normally say that named sequences, and certain characters under
+caseless C</i> matching, should match a sequence of multiple
+characters.  Following those rules could lead to highly confusing
+situations:
"ss" =~ /^[^\xDF]+$/ui; # Matches!
@@ -608,7 +638,7 @@ what C<\xDF> matches under C</i>. C<"s"> isn't C<\xDF>, but Unicode
says that C<"ss"> is what C<\xDF> matches under C</i>. So which one
"wins"? Do you fail the match because the string has C<ss> or accept it
because it has an C<s> followed by another C<s>? Perl has chosen the
-latter.
+latter. (See note in L</Bracketed Character Classes> above.)
Examples:
diff --git a/proto.h b/proto.h
index e733e79fc3..82496b6294 100644
--- a/proto.h
+++ b/proto.h
@@ -6729,10 +6729,10 @@ STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, cons
#define PERL_ARGS_ASSERT_ADD_DATA \
assert(pRExC_state); assert(s)
-STATIC AV* S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+STATIC AV* S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_ADD_MULTI_MATCH \
- assert(multi_fold)
+ assert(multi_string)
PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point, bool downgradable)
__attribute__nonnull__(pTHX_1)
diff --git a/regcomp.c b/regcomp.c
index f531026d65..73ad315c29 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13329,24 +13329,28 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
}
STATIC AV *
-S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
- /* This adds the string scalar <multi_fold> to the array
- * <multi_char_matches>. <multi_fold> is known to have exactly
+ /* This adds the string scalar <multi_string> to the array
+ * <multi_char_matches>. <multi_string> is known to have exactly
* <cp_count> code points in it. This is used when constructing a
* bracketed character class and we find something that needs to match more
* than a single character.
*
- * <multi_char_matches> is actually an array of arrays. There will be one
- * or two top-level elements: [2], and/or [3]. The [2] element is an
- * array, each element thereof is a character which folds to TWO
- * characters; [3] is for folds to THREE characters. (Unicode guarantees a
- * maximum of 3 characters in any fold.) When we rewrite the character
- * class below, we will do so such that the longest folds are written
- * first, so that it prefers the longest matching strings first. This is
- * done even if it turns out that any quantifier is non-greedy, out of
- * programmer laziness. Tom Christiansen has agreed that this is ok. This
- * makes the test for the ligature 'ffi' come before the test for 'ff' */
+ * <multi_char_matches> is actually an array of arrays. Each top-level
+ * element is an array that contains all the strings known so far that are
+ * the same length. And that length (in number of code points) is the same
+ * as the index of the top-level array. Hence, the [2] element is an
+ * array, each element thereof is a string containing TWO code points;
+ * while element [3] is for strings of THREE code points, and so on. Since
+ * this is for multi-char strings there can never be a [0] nor [1] element.
+ *
+ * When we rewrite the character class below, we will do so such that the
+ * longest strings are written first, so that it prefers the longest
+ * matching strings first. This is done even if it turns out that any
+ * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
+ * Christiansen has agreed that this is ok. This makes the test for the
+ * ligature 'ffi' come before the test for 'ff', for example */
AV* this_array;
AV** this_array_ptr;
@@ -13366,7 +13370,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_
av_store(multi_char_matches, cp_count,
(SV*) this_array);
}
- av_push(this_array, multi_fold);
+ av_push(this_array, multi_string);
return multi_char_matches;
}
@@ -13650,23 +13654,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
}
}
else { /* cp_count > 1 */
- /* We only pay attention to the first char of
- * multichar strings being returned in char
- * classes. I kinda wonder if this makes sense as
- * it does change the behaviour from earlier
- * versions, OTOH that behaviour was broken as
- * well. XXX Solution is to recharacterize as
- * [rest-of-class]|multi1|multi2... */
+ if (! RExC_in_multi_char_class) {
+ if (invert || range || *RExC_parse == '-') {
if (strict) {
RExC_parse--;
- vFAIL("\\N{} in character class restricted to one character");
+ vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
}
else if (PASS2) {
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
}
+ }
+ else {
+ multi_char_matches
+ = add_multi_match(multi_char_matches,
+ as_text,
+ cp_count);
+ }
break; /* <value> contains the first code
point. Drop out of the switch to
process it */
+ }
} /* End of cp_count != 1 */
/* This element should not be processed further in this
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index 986eb87de4..fb30a9c085 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -998,9 +998,8 @@ sub run_tests {
#
my $w;
local $SIG {__WARN__} = sub {$w .= "@_"};
- eval 'q(xxWxx) =~ /[\N{WARN}]/';
- ok $w && $w =~ /Using just the first character returned by \\N\{} in character class/,
- "single character in [\\N{}] warning";
+ $result = eval 'q(WARN) =~ /[\N{WARN}]/';
+ ok !$@ && $result && ! $w, '\N{} returning multi-char works';
undef $w;
eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/,
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 9d75e397b6..78be9ee1fe 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -205,7 +205,7 @@ my @death =
'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" {#} m/(?[[a-\pM{#}]])/',
'm/(?[[\pM-x]])/' => 'False [] range "\pM-" {#} m/(?[[\pM-{#}x]])/',
- 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character {#} m/(?[[\N{U+100.300{#}}]])/',
+ 'm/(?[[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in inverted character class or as a range end-point is restricted to one character {#} m/(?[[^\N{U+100.300{#}}]])/',
'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?(...) not recognized {#} m/(?[ \p{Digit} & (?({#}?[ \p{Thai} | \p{Lao} ]))])/',
'm/(?[ \p{Digit} & (?:(?[ \p{Thai} | \p{Lao} ]))])/' => 'Expecting \'(?flags:(?[...\' {#} m/(?[ \p{Digit} & (?{#}:(?[ \p{Thai} | \p{Lao} ]))])/',
'm/\o{/' => 'Missing right brace on \o{ {#} m/\o{{#}/',
@@ -335,7 +335,9 @@ my @warning = (
'm/[\w-x]\x{100}/' => 'False [] range "\w-" {#} m/[\w-{#}x]\x{100}/',
'm/[a-\pM]\x{100}/' => 'False [] range "a-\pM" {#} m/[a-\pM{#}]\x{100}/',
'm/[\pM-x]\x{100}/' => 'False [] range "\pM-" {#} m/[\pM-{#}x]\x{100}/',
- 'm/[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\N{U+100.300}{#}]/',
+ 'm/[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[^\N{U+100.300}{#}]/',
+ 'm/[\x03-\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\x03-\N{U+100.300}{#}]/',
+ 'm/[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}-\x{10FFFF}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\N{U+100.300}{#}-\x{10FFFF}]/',
"m'\\y\\x{100}'" => 'Unrecognized escape \y passed through {#} m/\y{#}\x{100}/',
'/x{3,1}/' => 'Quantifier {n,m} with n > m can\'t match {#} m/x{3,1}{#}/',
'/\08/' => '\'\08\' resolved to \'\o{0}8\' {#} m/\08{#}/',