diff options
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 15 | ||||
-rw-r--r-- | pod/perldelta.pod | 11 | ||||
-rw-r--r-- | utf8.c | 23 | ||||
-rw-r--r-- | utf8.h | 11 |
4 files changed, 47 insertions, 13 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 5fe56df39b..c7f2c1d65f 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -100,7 +100,8 @@ my $UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; my $UTF8_ALLOW_LONG = 0x0010; my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; my $UTF8_GOT_LONG = $UTF8_ALLOW_LONG; -my $UTF8_GOT_OVERFLOW = 0x0080; +my $UTF8_ALLOW_OVERFLOW = 0x0080; +my $UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW; my $UTF8_DISALLOW_SURROGATE = 0x0100; my $UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE; my $UTF8_WARN_SURROGATE = 0x0200; @@ -1347,8 +1348,7 @@ if (isASCII && ! $is64bit) { # 32-bit ASCII platform [ "overflow malformation", "\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32 7, - 0, # There is no way to allow this malformation - $UTF8_GOT_OVERFLOW, + $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, $REPLACEMENT, 7, 2, qr/overflows/ @@ -1356,8 +1356,7 @@ if (isASCII && ! $is64bit) { # 32-bit ASCII platform [ "overflow malformation", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", $max_bytes, - 0, # There is no way to allow this malformation - $UTF8_GOT_OVERFLOW, + $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, $REPLACEMENT, $max_bytes, 1, qr/overflows/ @@ -1399,8 +1398,7 @@ else { # 64-bit ASCII, or EBCDIC of any size. I8_to_native( "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"), $max_bytes, - 0, # There is no way to allow this malformation - $UTF8_GOT_OVERFLOW, + $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, $REPLACEMENT, $max_bytes, 8, qr/overflows/ @@ -1414,8 +1412,7 @@ else { # 64-bit ASCII, or EBCDIC of any size. : I8_to_native( "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), $max_bytes, - 0, # There is no way to allow this malformation - $UTF8_GOT_OVERFLOW, + $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW, $REPLACEMENT, $max_bytes, (isASCII) ? 3 : 2, qr/overflows/ diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 1c770d233a..372b5fe34b 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -355,6 +355,17 @@ This malformation is where the UTF-8 looks valid syntactically, but there is a shorter sequence that yields the same code point. This has been forbidden since Unicode version 3.1. +=item * + +The functions C<utf8n_to_uvchr> and its derivatives now accept an input +flag to allow the overflow malformation. This malformation is when the +UTF-8 may be syntactically valid, but the code point it represents is +not capable of being represented in the word length on the platform. +What "allowed" means in this case is that the function doesn't return an +error, and advances the parse pointer to beyond the UTF-8 in question, +but it returns the Unicode REPLACEMENT CHARACTER as the value of the +code point (since the real value is not representable). + =back =head1 Selected Bug Fixes @@ -1259,6 +1259,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* isn't problematic if < this */ if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST) || ( UNLIKELY(possible_problems) + + /* if overflow, we know without looking further + * precisely which of the problematic types it is, + * and we deal with those in the overflow handling + * code */ + && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0))) && ((flags & ( UTF8_DISALLOW_NONCHAR |UTF8_DISALLOW_SURROGATE @@ -1371,7 +1377,21 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_ABOVE_31_BIT; } - disallowed = TRUE; + /* Disallow if any of the three categories say to */ + if ( ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & ( UTF8_DISALLOW_SUPER + |UTF8_DISALLOW_ABOVE_31_BIT))) + { + disallowed = TRUE; + } + + + /* Likewise, warn if any say to, plus if deprecation warnings + * are on, because this code point is above IV_MAX */ + if ( ckWARN_d(WARN_DEPRECATED) + || ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT))) + { /* The warnings code explicitly says it doesn't handle the case * of packWARN2 and two categories which have parent-child @@ -1391,6 +1411,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, _byte_dump_string(s0, send - s0)); } } + } } else if (possible_problems & UTF8_GOT_EMPTY) { possible_problems &= ~UTF8_GOT_EMPTY; @@ -745,8 +745,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_ALLOW_LONG_AND_ITS_VALUE (UTF8_ALLOW_LONG|0x0020) #define UTF8_GOT_LONG UTF8_ALLOW_LONG -/* Currently no way to allow overflow */ -#define UTF8_GOT_OVERFLOW 0x0080 +#define UTF8_ALLOW_OVERFLOW 0x0080 +#define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW #define UTF8_DISALLOW_SURROGATE 0x0100 /* Unicode surrogates */ #define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE @@ -790,10 +790,15 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_WARN_ILLEGAL_INTERCHANGE \ (UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR) +/* This is used typically for code that is willing to accept inputs of + * illformed UTF-8 sequences, for whatever reason. However, all such sequences + * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are + * also present. */ #define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ |UTF8_ALLOW_NON_CONTINUATION \ |UTF8_ALLOW_SHORT \ - |UTF8_ALLOW_LONG) + |UTF8_ALLOW_LONG \ + |UTF8_ALLOW_OVERFLOW) /* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but * not any malformed. This is the default. (Note that UVs above IV_MAX are |