diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | lib/diagnostics.t | 2 | ||||
-rw-r--r-- | pod/perldelta.pod | 14 | ||||
-rw-r--r-- | pod/perldiag.pod | 31 | ||||
-rw-r--r-- | pod/perlrecharclass.pod | 64 | ||||
-rw-r--r-- | proto.h | 4 | ||||
-rw-r--r-- | regcomp.c | 51 | ||||
-rw-r--r-- | t/re/pat_advanced.t | 5 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 6 |
9 files changed, 122 insertions, 57 deletions
@@ -2099,7 +2099,7 @@ Es |void |set_ANYOF_arg |NN RExC_state_t* const pRExC_state \ |NULLOK SV* const swash \ |const bool has_user_defined_property Es |AV* |add_multi_match|NULLOK AV* multi_char_matches \ - |NN SV* multi_fold \ + |NN SV* multi_string \ |const STRLEN cp_count Es |regnode*|regclass |NN RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth|const bool stop_at_1 \ diff --git a/lib/diagnostics.t b/lib/diagnostics.t index 4ac2ebfe2b..0b35d16c06 100644 --- a/lib/diagnostics.t +++ b/lib/diagnostics.t @@ -106,7 +106,7 @@ seek STDERR, 0,0; $warning = ''; warn "Using just the first character returned by \\N{} in character class in regex; marked by <-- HERE in m/%s/"; like $warning, - qr/A charnames handler may return a sequence/s, + qr/Named Unicode character escapes/s, 'multi-line entries in perldiag.pod match'; # ; at end of entry in perldiag.pod diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 016c1bfd48..4d601dd5f2 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -443,6 +443,20 @@ Under some conditions a warning raised in compilation of regular expression patterns could be displayed multiple times. This is now fixed. +=item * + +C<qr/[\N{named sequence}]/> now works properly in many instances. Some +names known to C<\N{...}> refer to a sequence of multiple characters, +instead of the usual single character. Bracketed character classes +generally only match single characters, but now special handling has +been added so that they can match named sequences, but not if the class +is inverted or the sequence is specified as the beginning or end of a +range. In these cases, the only behavior change from before is a slight +rewording of the fatal error message given when this class is part of a +C<(?[...])> construct. When the C<[...]> stands alone, the same +non-fatal warning as before is raised, and only the first character in +the sequence is used, again just as before. 
+ =back =head1 Known Problems diff --git a/pod/perldiag.pod b/pod/perldiag.pod index df94c98a19..80b60028be 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3362,15 +3362,23 @@ bracketed character class, for the same reason that C<.> in a character class loses its specialness: it matches almost everything, which is probably not what you want. -=item \N{} in character class restricted to one character in regex; marked +=item \N{} in inverted character class or as a range end-point is restricted to one character in regex; marked by S<<-- HERE> in m/%s/ (F) Named Unicode character escapes C<(\N{...})> may return a -multi-character sequence. Such an escape may not be used in -a character class, because character classes always match one -character of input. Check that the correct escape has been used, -and the correct charname handler is in scope. The S<<-- HERE> shows -whereabouts in the regular expression the problem was discovered. +multi-character sequence. Even though a character class is supposed to +match just one character of input, perl will match the whole thing +correctly, except when the class is inverted (C<[^...]>), or the escape +is the beginning or final end point of a range. The mathematically +logical behavior for what matches when inverting is very different than +what people expect, so we have decided to forbid it. +Similarly unclear is what should be generated when the C<\N{...}> is +used as one of the end points of the range, such as in + + [\x{41}-\N{ARABIC SEQUENCE YEH WITH HAMZA ABOVE WITH AE}] + +What is meant here is unclear, as the C<\N{...}> escape is a sequence of +code points, so this is made an error. =item \N{NAME} must be resolved by the lexer in regex; marked by S<<-- HERE> in m/%s/ @@ -6507,9 +6515,14 @@ You need to add either braces or blanks to disambiguate. 
=item Using just the first character returned by \N{} in character class in regex; marked by S<<-- HERE> in m/%s/ -(W regexp) A charnames handler may return a sequence of more than one -character. Currently all but the first one are discarded when used in -a regular expression pattern bracketed character class. +(W regexp) Named Unicode character escapes C<(\N{...})> may return a +multi-character sequence. Even though a character class is supposed to +match just one character of input, perl will match the whole thing +correctly, except when the class is inverted (C<[^...]>), or the escape +is the beginning or final end point of a range. For these, what should +happen isn't clear at all. In these circumstances, Perl discards all +but the first character of the returned sequence, which is not likely +what you want. =item Using !~ with %s doesn't make sense diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index a8dda141a9..5cd0ae7aab 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -457,30 +457,59 @@ Examples: ------- -* There is an exception to a bracketed character class matching a -single character only. When the class is to match caselessly under C</i> -matching rules, and a character that is explicitly mentioned inside the -class matches a +* There are two exceptions to a bracketed character class matching a +single character only. Each requires special handling by Perl to make +things work: + +=over + +=item * + +When the class is to match caselessly under C</i> matching rules, and a +character that is explicitly mentioned inside the class matches a multiple-character sequence caselessly under Unicode rules, the class -(when not L<inverted|/Negation>) will also match that sequence. For -example, Unicode says that the letter C<LATIN SMALL LETTER SHARP S> -should match the sequence C<ss> under C</i> rules. Thus, +will also match that sequence. 
For example, Unicode says that the +letter C<LATIN SMALL LETTER SHARP S> should match the sequence C<ss> +under C</i> rules. Thus, 'ss' =~ /\A\N{LATIN SMALL LETTER SHARP S}\z/i # Matches 'ss' =~ /\A[aeioust\N{LATIN SMALL LETTER SHARP S}]\z/i # Matches -For this to happen, the character must be explicitly specified, and not -be part of a multi-character range (not even as one of its endpoints). -(L</Character Ranges> will be explained shortly.) Therefore, +For this to happen, the class must not be inverted (see L</Negation>) +and the character must be explicitly specified, and not be part of a +multi-character range (not even as one of its endpoints). (L</Character +Ranges> will be explained shortly.) Therefore, 'ss' =~ /\A[\0-\x{ff}]\z/i # Doesn't match 'ss' =~ /\A[\0-\N{LATIN SMALL LETTER SHARP S}]\z/i # No match - 'ss' =~ /\A[\xDF-\xDF]\z/i # Matches on ASCII platforms, since \XDF - # is LATIN SMALL LETTER SHARP S, and the - # range is just a single element + 'ss' =~ /\A[\xDF-\xDF]\z/i # Matches on ASCII platforms, since + # \xDF is LATIN SMALL LETTER SHARP S, + # and the range is just a single + # element Note that it isn't a good idea to specify these types of ranges anyway. +=item * + +Some names known to C<\N{...}> refer to a sequence of multiple characters, +instead of the usual single character. When one of these is included in +the class, the entire sequence is matched. For example, + + "\N{TAMIL LETTER KA}\N{TAMIL VOWEL SIGN AU}" + =~ / ^ [\N{TAMIL SYLLABLE KAU}] $ /x; + +matches, because C<\N{TAMIL SYLLABLE KAU}> is a named sequence +consisting of the two characters matched against. Like the other +instance where a bracketed class can match multiple characters, and for +similar reasons, the class must not be inverted, and the named sequence +may not appear in a range, even one where it is both endpoints. 
If +these happen, it is a fatal error if the character class is within an +extended L<C<(?[...])>|/Extended Bracketed Character Classes> +class; and only the first code point is used (with +a C<regexp>-type warning raised) otherwise. + +=back + =head3 Special Characters Inside a Bracketed Character Class Most characters that are meta characters in regular expressions (that @@ -597,9 +626,10 @@ the caret as one of the characters to match, either escape the caret or else don't list it first. In inverted bracketed character classes, Perl ignores the Unicode rules -that normally say that certain characters should match a sequence of -multiple characters under caseless C</i> matching. Following those -rules could lead to highly confusing situations: +that normally say that named sequences, and certain characters under +caseless C</i> matching, should match a sequence of multiple +characters. Following those rules could lead to highly confusing +situations: "ss" =~ /^[^\xDF]+$/ui; # Matches! @@ -608,7 +638,7 @@ what C<\xDF> matches under C</i>. C<"s"> isn't C<\xDF>, but Unicode says that C<"ss"> is what C<\xDF> matches under C</i>. So which one "wins"? Do you fail the match because the string has C<ss> or accept it because it has an C<s> followed by another C<s>? Perl has chosen the -latter. +latter. (See note in L</Bracketed Character Classes> above.) 
Examples: @@ -6729,10 +6729,10 @@ STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, cons #define PERL_ARGS_ASSERT_ADD_DATA \ assert(pRExC_state); assert(s) -STATIC AV* S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count) +STATIC AV* S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_ADD_MULTI_MATCH \ - assert(multi_fold) + assert(multi_string) PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point, bool downgradable) __attribute__nonnull__(pTHX_1) @@ -13329,24 +13329,28 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl } STATIC AV * -S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count) +S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) { - /* This adds the string scalar <multi_fold> to the array - * <multi_char_matches>. <multi_fold> is known to have exactly + /* This adds the string scalar <multi_string> to the array + * <multi_char_matches>. <multi_string> is known to have exactly * <cp_count> code points in it. This is used when constructing a * bracketed character class and we find something that needs to match more * than a single character. * - * <multi_char_matches> is actually an array of arrays. There will be one - * or two top-level elements: [2], and/or [3]. The [2] element is an - * array, each element thereof is a character which folds to TWO - * characters; [3] is for folds to THREE characters. (Unicode guarantees a - * maximum of 3 characters in any fold.) When we rewrite the character - * class below, we will do so such that the longest folds are written - * first, so that it prefers the longest matching strings first. 
This is - * done even if it turns out that any quantifier is non-greedy, out of - * programmer laziness. Tom Christiansen has agreed that this is ok. This - * makes the test for the ligature 'ffi' come before the test for 'ff' */ + * <multi_char_matches> is actually an array of arrays. Each top-level + * element is an array that contains all the strings known so far that are + * the same length. And that length (in number of code points) is the same + * as the index of the top-level array. Hence, the [2] element is an + * array, each element thereof is a string containing TWO code points; while element + * [3] is for strings of THREE characters, and so on. Since this is for + * multi-char strings there can never be a [0] nor [1] element. + * + * When we rewrite the character class below, we will do so such that the + * longest strings are written first, so that it prefers the longest + * matching strings first. This is done even if it turns out that any + * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom + * Christiansen has agreed that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff', for example */ AV* this_array; AV** this_array_ptr; @@ -13366,7 +13370,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_ av_store(multi_char_matches, cp_count, (SV*) this_array); } - av_push(this_array, multi_fold); + av_push(this_array, multi_string); return multi_char_matches; } @@ -13650,23 +13654,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } else { /* cp_count > 1 */ - /* We only pay attention to the first char of - * multichar strings being returned in char - * classes. I kinda wonder if this makes sense as - * it does change the behaviour from earlier - * versions, OTOH that behaviour was broken as - * well. XXX Solution is to recharacterize as - * [rest-of-class]|multi1|multi2... */ + if (! 
RExC_in_multi_char_class) { + if (invert || range || *RExC_parse == '-') { if (strict) { RExC_parse--; - vFAIL("\\N{} in character class restricted to one character"); + vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character"); } else if (PASS2) { ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); } + } + else { + multi_char_matches + = add_multi_match(multi_char_matches, + as_text, + cp_count); + } break; /* <value> contains the first code point. Drop out of the switch to process it */ + } } /* End of cp_count != 1 */ /* This element should not be processed further in this diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 986eb87de4..fb30a9c085 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -998,9 +998,8 @@ sub run_tests { # my $w; local $SIG {__WARN__} = sub {$w .= "@_"}; - eval 'q(xxWxx) =~ /[\N{WARN}]/'; - ok $w && $w =~ /Using just the first character returned by \\N\{} in character class/, - "single character in [\\N{}] warning"; + $result = eval 'q(WARN) =~ /[\N{WARN}]/'; + ok !$@ && $result && ! $w, '\N{} returning multi-char works'; undef $w; eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/, diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 9d75e397b6..78be9ee1fe 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -205,7 +205,7 @@ my @death = 'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/', 'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" {#} m/(?[[a-\pM{#}]])/', 'm/(?[[\pM-x]])/' => 'False [] range "\pM-" {#} m/(?[[\pM-{#}x]])/', - 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character {#} m/(?[[\N{U+100.300{#}}]])/', + 'm/(?[[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in inverted character class or as a range end-point is restricted to one character {#} m/(?[[^\N{U+100.300{#}}]])/', 'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?(...) 
not recognized {#} m/(?[ \p{Digit} & (?({#}?[ \p{Thai} | \p{Lao} ]))])/', 'm/(?[ \p{Digit} & (?:(?[ \p{Thai} | \p{Lao} ]))])/' => 'Expecting \'(?flags:(?[...\' {#} m/(?[ \p{Digit} & (?{#}:(?[ \p{Thai} | \p{Lao} ]))])/', 'm/\o{/' => 'Missing right brace on \o{ {#} m/\o{{#}/', @@ -335,7 +335,9 @@ my @warning = ( 'm/[\w-x]\x{100}/' => 'False [] range "\w-" {#} m/[\w-{#}x]\x{100}/', 'm/[a-\pM]\x{100}/' => 'False [] range "a-\pM" {#} m/[a-\pM{#}]\x{100}/', 'm/[\pM-x]\x{100}/' => 'False [] range "\pM-" {#} m/[\pM-{#}x]\x{100}/', - 'm/[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\N{U+100.300}{#}]/', + 'm/[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[^\N{U+100.300}{#}]/', + 'm/[\x03-\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\x03-\N{U+100.300}{#}]/', + 'm/[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}-\x{10FFFF}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\N{U+100.300}{#}-\x{10FFFF}]/', "m'\\y\\x{100}'" => 'Unrecognized escape \y passed through {#} m/\y{#}\x{100}/', '/x{3,1}/' => 'Quantifier {n,m} with n > m can\'t match {#} m/x{3,1}{#}/', '/\08/' => '\'\08\' resolved to \'\o{0}8\' {#} m/\08{#}/', |