summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/XS-APItest/t/utf8.t15
-rw-r--r--pod/perldelta.pod11
-rw-r--r--utf8.c23
-rw-r--r--utf8.h11
4 files changed, 47 insertions, 13 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 5fe56df39b..c7f2c1d65f 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -100,7 +100,8 @@ my $UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT;
my $UTF8_ALLOW_LONG = 0x0010;
my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020;
my $UTF8_GOT_LONG = $UTF8_ALLOW_LONG;
-my $UTF8_GOT_OVERFLOW = 0x0080;
+my $UTF8_ALLOW_OVERFLOW = 0x0080;
+my $UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW;
my $UTF8_DISALLOW_SURROGATE = 0x0100;
my $UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE;
my $UTF8_WARN_SURROGATE = 0x0200;
@@ -1347,8 +1348,7 @@ if (isASCII && ! $is64bit) { # 32-bit ASCII platform
[ "overflow malformation",
"\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32
7,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
7, 2,
qr/overflows/
@@ -1356,8 +1356,7 @@ if (isASCII && ! $is64bit) { # 32-bit ASCII platform
[ "overflow malformation",
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
$max_bytes,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
$max_bytes, 1,
qr/overflows/
@@ -1399,8 +1398,7 @@ else { # 64-bit ASCII, or EBCDIC of any size.
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
$max_bytes,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
$max_bytes, 8,
qr/overflows/
@@ -1414,8 +1412,7 @@ else { # 64-bit ASCII, or EBCDIC of any size.
: I8_to_native(
"\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$max_bytes,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
$max_bytes, (isASCII) ? 3 : 2,
qr/overflows/
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 1c770d233a..372b5fe34b 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -355,6 +355,17 @@ This malformation is where the UTF-8 looks valid syntactically, but
there is a shorter sequence that yields the same code point. This has
been forbidden since Unicode version 3.1.
+=item *
+
+The functions C<utf8n_to_uvchr> and its derivatives now accept an input
+flag to allow the overflow malformation. This malformation is when the
+UTF-8 may be syntactically valid, but the code point it represents is
+not capable of being represented in the word length on the platform.
+What "allowed" means in this case is that the function doesn't return an
+error, and advances the parse pointer to beyond the UTF-8 in question,
+but it returns the Unicode REPLACEMENT CHARACTER as the value of the
+code point (since the real value is not representable).
+
=back
=head1 Selected Bug Fixes
diff --git a/utf8.c b/utf8.c
index d5e675b043..85432dcfb0 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1259,6 +1259,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
/* isn't problematic if < this */
if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
|| ( UNLIKELY(possible_problems)
+
+ /* if overflow, we know without looking further
+ * precisely which of the problematic types it is,
+ * and we deal with those in the overflow handling
+ * code */
+ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
&& isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
&& ((flags & ( UTF8_DISALLOW_NONCHAR
|UTF8_DISALLOW_SURROGATE
@@ -1371,7 +1377,21 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
*errors |= UTF8_GOT_ABOVE_31_BIT;
}
- disallowed = TRUE;
+ /* Disallow if any of the three categories say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & ( UTF8_DISALLOW_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT)))
+ {
+ disallowed = TRUE;
+ }
+
+
+ /* Likewise, warn if any say to, plus if deprecation warnings
+ * are on, because this code point is above IV_MAX */
+ if ( ckWARN_d(WARN_DEPRECATED)
+ || ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+ {
/* The warnings code explicitly says it doesn't handle the case
* of packWARN2 and two categories which have parent-child
@@ -1391,6 +1411,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
_byte_dump_string(s0, send - s0));
}
}
+ }
}
else if (possible_problems & UTF8_GOT_EMPTY) {
possible_problems &= ~UTF8_GOT_EMPTY;
diff --git a/utf8.h b/utf8.h
index 3dde45a1dd..d7c4e1ad30 100644
--- a/utf8.h
+++ b/utf8.h
@@ -745,8 +745,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_ALLOW_LONG_AND_ITS_VALUE (UTF8_ALLOW_LONG|0x0020)
#define UTF8_GOT_LONG UTF8_ALLOW_LONG
-/* Currently no way to allow overflow */
-#define UTF8_GOT_OVERFLOW 0x0080
+#define UTF8_ALLOW_OVERFLOW 0x0080
+#define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW
#define UTF8_DISALLOW_SURROGATE 0x0100 /* Unicode surrogates */
#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
@@ -790,10 +790,15 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_WARN_ILLEGAL_INTERCHANGE \
(UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
+/* This is used typically for code that is willing to accept inputs of
+ * illformed UTF-8 sequences, for whatever reason. However, all such sequences
+ * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are
+ * also present. */
#define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
|UTF8_ALLOW_NON_CONTINUATION \
|UTF8_ALLOW_SHORT \
- |UTF8_ALLOW_LONG)
+ |UTF8_ALLOW_LONG \
+ |UTF8_ALLOW_OVERFLOW)
/* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but
* not any malformed. This is the default. (Note that UVs above IV_MAX are