4 files changed, 47 insertions, 13 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 5fe56df39b..c7f2c1d65f 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -100,7 +100,8 @@ my $UTF8_GOT_SHORT              = $UTF8_ALLOW_SHORT;
 my $UTF8_ALLOW_LONG             = 0x0010;
 my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020;
 my $UTF8_GOT_LONG               = $UTF8_ALLOW_LONG;
-my $UTF8_GOT_OVERFLOW           = 0x0080;
+my $UTF8_ALLOW_OVERFLOW         = 0x0080;
+my $UTF8_GOT_OVERFLOW           = $UTF8_ALLOW_OVERFLOW;
 my $UTF8_DISALLOW_SURROGATE     = 0x0100;
 my $UTF8_GOT_SURROGATE          = $UTF8_DISALLOW_SURROGATE;
 my $UTF8_WARN_SURROGATE         = 0x0200;
@@ -1347,8 +1348,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
         [ "overflow malformation",
             "\xfe\x84\x80\x80\x80\x80\x80",  # Represents 2**32
             7,
-            0,  # There is no way to allow this malformation
-            $UTF8_GOT_OVERFLOW,
+            $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             7, 2,
             qr/overflows/
@@ -1356,8 +1356,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
         [ "overflow malformation",
             "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
             $max_bytes,
-            0,  # There is no way to allow this malformation
-            $UTF8_GOT_OVERFLOW,
+            $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             $max_bytes, 1,
             qr/overflows/
@@ -1399,8 +1398,7 @@ else { # 64-bit ASCII, or EBCDIC of any size.
             I8_to_native(
                     "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
             $max_bytes,
-            0,  # There is no way to allow this malformation
-            $UTF8_GOT_OVERFLOW,
+            $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             $max_bytes, 8,
             qr/overflows/
@@ -1414,8 +1412,7 @@ else { # 64-bit ASCII, or EBCDIC of any size.
                 : I8_to_native(
                     "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $max_bytes,
-                0,  # There is no way to allow this malformation
-                $UTF8_GOT_OVERFLOW,
+                $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
                 $REPLACEMENT,
                 $max_bytes, (isASCII) ? 3 : 2,
                 qr/overflows/
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 1c770d233a..372b5fe34b 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -355,6 +355,17 @@ This malformation is where the UTF-8 looks valid syntactically, but
 there is a shorter sequence that yields the same code point.  This has
 been forbidden since Unicode version 3.1.
 
+=item *
+
+The functions C<utf8n_to_uvchr> and its derivatives now accept an input
+flag to allow the overflow malformation.  This malformation is when the
+UTF-8 may be syntactically valid, but the code point it represents is
+not capable of being represented in the word length on the platform.
+What "allowed" means in this case is that the function doesn't return an
+error, and advances the parse pointer to beyond the UTF-8 in question,
+but it returns the Unicode REPLACEMENT CHARACTER as the value of the
+code point (since the real value is not representable).
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/utf8.c b/utf8.c
index d5e675b043..85432dcfb0 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1259,6 +1259,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                               /* isn't problematic if < this */
     if (   (   (   LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
             || (   UNLIKELY(possible_problems)
+
+                          /* if overflow, we know without looking further
+                           * precisely which of the problematic types it is,
+                           * and we deal with those in the overflow handling
+                           * code */
+                && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
                 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
 	&& ((flags & ( UTF8_DISALLOW_NONCHAR
                       |UTF8_DISALLOW_SURROGATE
@@ -1371,7 +1377,21 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     *errors |= UTF8_GOT_ABOVE_31_BIT;
                 }
 
-                disallowed = TRUE;
+                /* Disallow if any of the three categories say to */
+                if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+                    || (flags & ( UTF8_DISALLOW_SUPER
+                                 |UTF8_DISALLOW_ABOVE_31_BIT)))
+                {
+                    disallowed = TRUE;
+                }
+
+
+                /* Likewise, warn if any say to, plus if deprecation warnings
+                 * are on, because this code point is above IV_MAX */
+                if (  ckWARN_d(WARN_DEPRECATED)
+                    || ! (flags & UTF8_ALLOW_OVERFLOW)
+                    ||   (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+                {
 
                 /* The warnings code explicitly says it doesn't handle the case
                  * of packWARN2 and two categories which have parent-child
@@ -1391,6 +1411,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                         _byte_dump_string(s0, send - s0));
                     }
                 }
+                }
             }
             else if (possible_problems & UTF8_GOT_EMPTY) {
                 possible_problems &= ~UTF8_GOT_EMPTY;
diff --git a/utf8.h b/utf8.h
index 3dde45a1dd..d7c4e1ad30 100644
--- a/utf8.h
+++ b/utf8.h
@@ -745,8 +745,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 #define UTF8_ALLOW_LONG_AND_ITS_VALUE   (UTF8_ALLOW_LONG|0x0020)
 #define UTF8_GOT_LONG                   UTF8_ALLOW_LONG
 
-/* Currently no way to allow overflow */
-#define UTF8_GOT_OVERFLOW               0x0080
+#define UTF8_ALLOW_OVERFLOW             0x0080
+#define UTF8_GOT_OVERFLOW               UTF8_ALLOW_OVERFLOW
 
 #define UTF8_DISALLOW_SURROGATE		0x0100	/* Unicode surrogates */
 #define UTF8_GOT_SURROGATE		UTF8_DISALLOW_SURROGATE
@@ -790,10 +790,15 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 #define UTF8_WARN_ILLEGAL_INTERCHANGE \
                           (UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
 
+/* This is used typically for code that is willing to accept inputs of
+ * illformed UTF-8 sequences, for whatever reason.  However, all such sequences
+ * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are
+ * also present. */
 #define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION                                \
                         |UTF8_ALLOW_NON_CONTINUATION                            \
                         |UTF8_ALLOW_SHORT                                       \
-                        |UTF8_ALLOW_LONG)
+                        |UTF8_ALLOW_LONG                                        \
+                        |UTF8_ALLOW_OVERFLOW)
 
 /* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but
  * not any malformed.  This is the default.  (Note that UVs above IV_MAX are