9 files changed, 122 insertions, 57 deletions
diff --git a/embed.fnc b/embed.fnc
index d25c78ed47..88adce209b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2099,7 +2099,7 @@ Es	|void	 |set_ANYOF_arg	|NN RExC_state_t* const pRExC_state \
 				|NULLOK SV* const swash                    \
 				|const bool has_user_defined_property
 Es	|AV*	 |add_multi_match|NULLOK AV* multi_char_matches		    \
-				|NN SV* multi_fold			    \
+				|NN SV* multi_string			    \
 				|const STRLEN cp_count
 Es	|regnode*|regclass	|NN RExC_state_t *pRExC_state \
 				|NN I32 *flagp|U32 depth|const bool stop_at_1 \
diff --git a/lib/diagnostics.t b/lib/diagnostics.t
index 4ac2ebfe2b..0b35d16c06 100644
--- a/lib/diagnostics.t
+++ b/lib/diagnostics.t
@@ -106,7 +106,7 @@ seek STDERR, 0,0;
 $warning = '';
 warn "Using just the first character returned by \\N{} in character class in regex; marked by <-- HERE in m/%s/";
 like $warning,
-    qr/A charnames handler may return a sequence/s,
+    qr/Named Unicode character escapes/s,
     'multi-line entries in perldiag.pod match';
 
 # ; at end of entry in perldiag.pod
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 016c1bfd48..4d601dd5f2 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -443,6 +443,20 @@ Under some conditions a warning raised in compilation of regular
 expression patterns could be displayed multiple times.  This is now
 fixed.
 
+=item *
+
+C<qr/[\N{named sequence}]/> now works properly in many instances.  Some
+names known to C<\N{...}> refer to a sequence of multiple characters,
+instead of the usual single character.  Bracketed character classes
+generally only match single characters, but now special handling has
+been added so that they can match named sequences, but not if the class
+is inverted or the sequence is specified as the beginning or end of a
+range.  In these cases, the only behavior change from before is a slight
+rewording of the fatal error message given when this class is part of a
+C<(?[...])> construct.  When the C<[...]> stands alone, the same
+non-fatal warning as before is raised, and only the first character in
+the sequence is used, again just as before.
+
 =back
 
 =head1 Known Problems
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index df94c98a19..80b60028be 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3362,15 +3362,23 @@ bracketed character class, for the same reason that C<.> in a character
 class loses its specialness: it matches almost everything, which is
 probably not what you want.
 
-=item \N{} in character class restricted to one character in regex; marked
+=item \N{} in inverted character class or as a range end-point is restricted to one character in regex; marked
 by S<<-- HERE> in m/%s/
 
 (F) Named Unicode character escapes C<(\N{...})> may return a
-multi-character sequence.  Such an escape may not be used in
-a character class, because character classes always match one
-character of input.  Check that the correct escape has been used,
-and the correct charname handler is in scope.  The S<<-- HERE> shows
-whereabouts in the regular expression the problem was discovered.
+multi-character sequence.  Even though a character class is supposed to
+match just one character of input, perl will match the whole thing
+correctly, except when the class is inverted (C<[^...]>), or the escape
+is the beginning or final end point of a range.  The mathematically
+logical behavior for what matches when inverting is very different than
+what people expect, so we have decided to forbid it.
+Similarly unclear is what should be generated when the C<\N{...}> is
+used as one of the end points of the range, such as in
+
+ [\x{41}-\N{ARABIC SEQUENCE YEH WITH HAMZA ABOVE WITH AE}]
+
+What is meant here is unclear, as the C<\N{...}> escape is a sequence of
+code points, so this is made an error.
 
 =item \N{NAME} must be resolved by the lexer in regex; marked by
 S<<-- HERE> in m/%s/
@@ -6507,9 +6515,14 @@ You need to add either braces or blanks to disambiguate.
 =item Using just the first character returned by \N{} in character class in 
 regex; marked by S<<-- HERE> in m/%s/
 
-(W regexp) A charnames handler may return a sequence of more than one
-character.  Currently all but the first one are discarded when used in
-a regular expression pattern bracketed character class.
+(W regexp) Named Unicode character escapes C<(\N{...})> may return a
+multi-character sequence.  Even though a character class is supposed to
+match just one character of input, perl will match the whole thing
+correctly, except when the class is inverted (C<[^...]>), or the escape
+is the beginning or final end point of a range.  For these, what should
+happen isn't clear at all.  In these circumstances, Perl discards all
+but the first character of the returned sequence, which is not likely
+what you want.
 
 =item Using !~ with %s doesn't make sense
 
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index a8dda141a9..5cd0ae7aab 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -457,30 +457,59 @@ Examples:
 
  -------
 
-* There is an exception to a bracketed character class matching a
-single character only.  When the class is to match caselessly under C</i>
-matching rules, and a character that is explicitly mentioned inside the
-class matches a
+* There are two exceptions to a bracketed character class matching a
+single character only.  Each requires special handling by Perl to make
+things work:
+
+=over
+
+=item *
+
+When the class is to match caselessly under C</i> matching rules, and a
+character that is explicitly mentioned inside the class matches a
 multiple-character sequence caselessly under Unicode rules, the class
-(when not L<inverted|/Negation>) will also match that sequence.  For
-example, Unicode says that the letter C<LATIN SMALL LETTER SHARP S>
-should match the sequence C<ss> under C</i> rules.  Thus,
+will also match that sequence.  For example, Unicode says that the
+letter C<LATIN SMALL LETTER SHARP S> should match the sequence C<ss>
+under C</i> rules.  Thus,
 
  'ss' =~ /\A\N{LATIN SMALL LETTER SHARP S}\z/i             # Matches
  'ss' =~ /\A[aeioust\N{LATIN SMALL LETTER SHARP S}]\z/i    # Matches
 
-For this to happen, the character must be explicitly specified, and not
-be part of a multi-character range (not even as one of its endpoints).
-(L</Character Ranges> will be explained shortly.)  Therefore,
+For this to happen, the class must not be inverted (see L</Negation>)
+and the character must be explicitly specified, and not be part of a
+multi-character range (not even as one of its endpoints).  (L</Character
+Ranges> will be explained shortly.) Therefore,
 
  'ss' =~ /\A[\0-\x{ff}]\z/i        # Doesn't match
  'ss' =~ /\A[\0-\N{LATIN SMALL LETTER SHARP S}]\z/i    # No match
- 'ss' =~ /\A[\xDF-\xDF]\z/i    # Matches on ASCII platforms, since \XDF
-                               # is LATIN SMALL LETTER SHARP S, and the
-                               # range is just a single element
+ 'ss' =~ /\A[\xDF-\xDF]\z/i    # Matches on ASCII platforms, since
+                               # \xDF is LATIN SMALL LETTER SHARP S,
+                               # and the range is just a single
+                               # element
 
 Note that it isn't a good idea to specify these types of ranges anyway.
 
+=item *
+
+Some names known to C<\N{...}> refer to a sequence of multiple characters,
+instead of the usual single character.  When one of these is included in
+the class, the entire sequence is matched.  For example,
+
+  "\N{TAMIL LETTER KA}\N{TAMIL VOWEL SIGN AU}"
+                              =~ / ^ [\N{TAMIL SYLLABLE KAU}]  $ /x;
+
+matches, because C<\N{TAMIL SYLLABLE KAU}> is a named sequence
+consisting of the two characters matched against.  Like the other
+instance where a bracketed class can match multiple characters, and for
+similar reasons, the class must not be inverted, and the named sequence
+may not appear in a range, even one where it is both endpoints.  If
+these happen, it is a fatal error if the character class is within an
+extended L<C<(?[...])>|/Extended Bracketed Character Classes>
+class; and only the first code point is used (with
+a C<regexp>-type warning raised) otherwise.
+
+=back
+
 =head3 Special Characters Inside a Bracketed Character Class
 
 Most characters that are meta characters in regular expressions (that
@@ -597,9 +626,10 @@ the caret as one of the characters to match, either escape the caret or
 else don't list it first.
 
 In inverted bracketed character classes, Perl ignores the Unicode rules
-that normally say that certain characters should match a sequence of
-multiple characters under caseless C</i> matching.  Following those
-rules could lead to highly confusing situations:
+that normally say that a named sequence, and certain characters under
+caseless C</i> matching, should match a sequence of multiple
+characters.  Following those rules could lead to highly confusing
+situations:
 
  "ss" =~ /^[^\xDF]+$/ui;   # Matches!
 
@@ -608,7 +638,7 @@ what C<\xDF> matches under C</i>.  C<"s"> isn't C<\xDF>, but Unicode
 says that C<"ss"> is what C<\xDF> matches under C</i>.  So which one
 "wins"? Do you fail the match because the string has C<ss> or accept it
 because it has an C<s> followed by another C<s>?  Perl has chosen the
-latter.
+latter.  (See note in L</Bracketed Character Classes> above.)
 
 Examples:
 
diff --git a/proto.h b/proto.h
index e733e79fc3..82496b6294 100644
--- a/proto.h
+++ b/proto.h
@@ -6729,10 +6729,10 @@ STATIC U32	S_add_data(RExC_state_t* const pRExC_state, const char* const s, cons
 #define PERL_ARGS_ASSERT_ADD_DATA	\
 	assert(pRExC_state); assert(s)
 
-STATIC AV*	S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+STATIC AV*	S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
 			__attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_ADD_MULTI_MATCH	\
-	assert(multi_fold)
+	assert(multi_string)
 
 PERL_STATIC_INLINE void	S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point, bool downgradable)
 			__attribute__nonnull__(pTHX_1)
diff --git a/regcomp.c b/regcomp.c
index f531026d65..73ad315c29 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13329,24 +13329,28 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
 }
 
 STATIC AV *
-S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_count)
+S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
 {
-    /* This adds the string scalar <multi_fold> to the array
-     * <multi_char_matches>.  <multi_fold> is known to have exactly
+    /* This adds the string scalar <multi_string> to the array
+     * <multi_char_matches>.  <multi_string> is known to have exactly
      * <cp_count> code points in it.  This is used when constructing a
      * bracketed character class and we find something that needs to match more
      * than a single character.
      *
-     * <multi_char_matches> is actually an array of arrays.  There will be one
-     * or two top-level elements: [2], and/or [3].  The [2] element is an
-     * array, each element thereof is a character which folds to TWO
-     * characters; [3] is for folds to THREE characters.  (Unicode guarantees a
-     * maximum of 3 characters in any fold.)  When we rewrite the character
-     * class below, we will do so such that the longest folds are written
-     * first, so that it prefers the longest matching strings first.  This is
-     * done even if it turns out that any quantifier is non-greedy, out of
-     * programmer laziness.  Tom Christiansen has agreed that this is ok.  This
-     * makes the test for the ligature 'ffi' come before the test for 'ff' */
+     * <multi_char_matches> is actually an array of arrays.  Each top-level
+     * element is an array that contains all the strings known so far that are
+     * the same length.  And that length (in number of code points) is the same
+     * as the index of the top-level array.  Hence, the [2] element is an
+     * array, each element thereof is a string containing TWO code points; while element
+     * [3] is for strings of THREE characters, and so on.  Since this is for
+     * multi-char strings there can never be a [0] nor [1] element.
+     *
+     * When we rewrite the character class below, we will do so such that the
+     * longest strings are written first, so that it prefers the longest
+     * matching strings first.  This is done even if it turns out that any
+     * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
+     * Christiansen has agreed that this is ok.  This makes the test for the
+     * ligature 'ffi' come before the test for 'ff', for example */
 
     AV* this_array;
     AV** this_array_ptr;
@@ -13366,7 +13370,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_fold, const STRLEN cp_
         av_store(multi_char_matches, cp_count,
                  (SV*) this_array);
     }
-    av_push(this_array, multi_fold);
+    av_push(this_array, multi_string);
 
     return multi_char_matches;
 }
@@ -13650,23 +13654,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             }
                         }
                         else { /* cp_count > 1 */
-                            /* We only pay attention to the first char of
-                             * multichar strings being returned in char
-                             * classes. I kinda wonder if this makes sense as
-                             * it does change the behaviour from earlier
-                             * versions, OTOH that behaviour was broken as
-                             * well. XXX Solution is to recharacterize as
-                             * [rest-of-class]|multi1|multi2...  */
+                            if (! RExC_in_multi_char_class) {
+                                if (invert || range || *RExC_parse == '-') {
                                     if (strict) {
                                         RExC_parse--;
-                                        vFAIL("\\N{} in character class restricted to one character");
+                                        vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
                                     }
                                     else if (PASS2) {
                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
                                     }
+                                }
+                                else {
+                                    multi_char_matches
+                                        = add_multi_match(multi_char_matches,
+                                                          as_text,
+                                                          cp_count);
+                                }
                                 break; /* <value> contains the first code
                                           point. Drop out of the switch to
                                           process it */
+                            }
                         } /* End of cp_count != 1 */
 
                         /* This element should not be processed further in this
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index 986eb87de4..fb30a9c085 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -998,9 +998,8 @@ sub run_tests {
         #
         my $w;
         local $SIG {__WARN__} = sub {$w .= "@_"};
-        eval 'q(xxWxx) =~ /[\N{WARN}]/';
-        ok $w && $w =~ /Using just the first character returned by \\N\{} in character class/,
-                 "single character in [\\N{}] warning";
+        $result = eval 'q(WARN) =~ /[\N{WARN}]/';
+        ok !$@ && $result && ! $w,  '\N{} returning multi-char works';
 
         undef $w;
         eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/,
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 9d75e397b6..78be9ee1fe 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -205,7 +205,7 @@ my @death =
  'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
  'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" {#} m/(?[[a-\pM{#}]])/',
  'm/(?[[\pM-x]])/' => 'False [] range "\pM-" {#} m/(?[[\pM-{#}x]])/',
- 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character {#} m/(?[[\N{U+100.300{#}}]])/',
+ 'm/(?[[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in inverted character class or as a range end-point is restricted to one character {#} m/(?[[^\N{U+100.300{#}}]])/',
  'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?(...) not recognized {#} m/(?[ \p{Digit} & (?({#}?[ \p{Thai} | \p{Lao} ]))])/',
  'm/(?[ \p{Digit} & (?:(?[ \p{Thai} | \p{Lao} ]))])/' => 'Expecting \'(?flags:(?[...\' {#} m/(?[ \p{Digit} & (?{#}:(?[ \p{Thai} | \p{Lao} ]))])/',
  'm/\o{/' => 'Missing right brace on \o{ {#} m/\o{{#}/',
@@ -335,7 +335,9 @@ my @warning = (
     'm/[\w-x]\x{100}/' => 'False [] range "\w-" {#} m/[\w-{#}x]\x{100}/',
     'm/[a-\pM]\x{100}/' => 'False [] range "a-\pM" {#} m/[a-\pM{#}]\x{100}/',
     'm/[\pM-x]\x{100}/' => 'False [] range "\pM-" {#} m/[\pM-{#}x]\x{100}/',
-    'm/[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\N{U+100.300}{#}]/',
+    'm/[^\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[^\N{U+100.300}{#}]/',
+    'm/[\x03-\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\x03-\N{U+100.300}{#}]/',
+    'm/[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}-\x{10FFFF}]/' => 'Using just the first character returned by \N{} in character class {#} m/[\N{U+100.300}{#}-\x{10FFFF}]/',
     "m'\\y\\x{100}'"     => 'Unrecognized escape \y passed through {#} m/\y{#}\x{100}/',
     '/x{3,1}/'   => 'Quantifier {n,m} with n > m can\'t match {#} m/x{3,1}{#}/',
     '/\08/' => '\'\08\' resolved to \'\o{0}8\' {#} m/\08{#}/',