summary | refs | log | tree | commit | diff
path: root/utf8.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2016-10-05 19:09:02 -0600
committer Karl Williamson <khw@cpan.org> 2016-10-13 11:18:12 -0600
commit 2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930 (patch)
tree f634e7245187a35ca5b793e67350b3cb41f4377a /utf8.h
parent 1980a0f48b7a9b6e99cda0d5ae69cbb49da3cbf4 (diff)
download perl-2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930.tar.gz
utf8n_to_uvchr(): Note multiple malformations
Some UTF-8 sequences can have multiple malformations. For example, a sequence can be the start of an overlong representation of a code point, and still be incomplete. Until this commit, the function generally stopped looking once the first malformation was found. This was not correct behavior, as that malformation may be allowed, while another, disallowed one went unnoticed. (But this did not actually create security holes, as those allowed malformations replaced the input with a REPLACEMENT CHARACTER.) This commit refactors the error handling of this function to set a flag and keep going if a malformation is found that doesn't preclude others. Then each is handled in a loop at the end, warning if warranted. The result is that there is a warning for each malformation for which warnings should be generated, and an error return is made if any one is disallowed. Overflow doesn't happen except for very high code points, which are well above the Unicode range and too large to fit in 31 bits. Hence those latter two potential malformations (being above Unicode, and not fitting in 31 bits) are implied by overflow, so only one warning is output--the most dire. This will speed up the normal case slightly, as the test for overflow is pulled out of the loop, allowing the UV to overflow. Then a single test is done after the loop to see whether there was overflow.
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 21
1 files changed, 13 insertions, 8 deletions
diff --git a/utf8.h b/utf8.h
index 7cd163a241..c55ce26f8e 100644
--- a/utf8.h
+++ b/utf8.h
@@ -526,13 +526,6 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
| ((NATIVE_UTF8_TO_I8((U8)new)) \
& UTF_CONTINUATION_MASK))
-/* If a value is anded with this, and the result is non-zero, then using the
- * original value in UTF8_ACCUMULATE will overflow, shifting bits off the left
- * */
-#define UTF_ACCUMULATION_OVERFLOW_MASK \
- (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) \
- - UTF_ACCUMULATION_SHIFT))
-
/* This works in the face of malformed UTF-8. */
#define UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e) (UTF8_IS_DOWNGRADEABLE_START(*s) \
&& ( (e) - (s) > 1) \
@@ -718,26 +711,37 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */
+#define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY
/* Allow first byte to be a continuation byte */
#define UTF8_ALLOW_CONTINUATION 0x0002
+#define UTF8_GOT_CONTINUATION UTF8_ALLOW_CONTINUATION
-/* Allow second... bytes to be non-continuation bytes */
+/* Unexpected non-continuation byte */
#define UTF8_ALLOW_NON_CONTINUATION 0x0004
+#define UTF8_GOT_NON_CONTINUATION UTF8_ALLOW_NON_CONTINUATION
/* expecting more bytes than were available in the string */
#define UTF8_ALLOW_SHORT 0x0008
+#define UTF8_GOT_SHORT UTF8_ALLOW_SHORT
/* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
#define UTF8_ALLOW_LONG 0x0010
+#define UTF8_GOT_LONG UTF8_ALLOW_LONG
+
+/* Currently no way to allow overflow */
+#define UTF8_GOT_OVERFLOW 0x0020
#define UTF8_DISALLOW_SURROGATE 0x0040 /* Unicode surrogates */
+#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
#define UTF8_WARN_SURROGATE 0x0080
#define UTF8_DISALLOW_NONCHAR 0x0100 /* Unicode non-character */
+#define UTF8_GOT_NONCHAR UTF8_DISALLOW_NONCHAR
#define UTF8_WARN_NONCHAR 0x0200 /* code points */
#define UTF8_DISALLOW_SUPER 0x0400 /* Super-set of Unicode: code */
+#define UTF8_GOT_SUPER UTF8_DISALLOW_SUPER
#define UTF8_WARN_SUPER 0x0800 /* points above the legal max */
/* Code points which never were part of the original UTF-8 standard, which only
@@ -745,6 +749,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
* The first byte of these code points is FE or FF on ASCII platforms. If the
* first byte is FF, it will overflow a 32-bit word. */
#define UTF8_DISALLOW_ABOVE_31_BIT 0x1000
+#define UTF8_GOT_ABOVE_31_BIT UTF8_DISALLOW_ABOVE_31_BIT
#define UTF8_WARN_ABOVE_31_BIT 0x2000
/* For back compat, these old names are misleading for UTF_EBCDIC */