utf8n_to_uvchr(): Note multiple malformations

Some UTF-8 sequences can have multiple malformations. For example, a sequence can be the start of an overlong representation of a code point, and still be incomplete. Until this commit what was generally done was to stop looking when the first malformation was found. This was not correct behavior, as that malformation may be allowed, while another unallowed one went unnoticed. (But this did not actually create security holes, as those allowed malformations replaced the input with a REPLACEMENT CHARACTER.) This commit refactors the error handling of this function to set a flag and keep going if a malformation is found that doesn't preclude others. Then each is handled in a loop at the end, warning if warranted. The result is that there is a warning for each malformation for which warnings should be generated, and an error return is made if any one is disallowed. Overflow doesn't happen except for very high code points, well above the Unicode range, and too large to fit in 31 bits. Hence those latter two potential malformations (being above the Unicode range, and not fitting in 31 bits) are subsets of overflow, so only one warning is output--the most dire. This will speed up the normal case slightly, as the test for overflow is pulled out of the loop, allowing the UV to overflow. Then a single test is done after the loop to see if there was overflow or not.
author: Karl Williamson <khw@cpan.org> 2016-10-05 19:09:02 -0600
committer: Karl Williamson <khw@cpan.org> 2016-10-13 11:18:12 -0600
commit: 2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930 (patch)
tree: f634e7245187a35ca5b793e67350b3cb41f4377a /utf8.h
parent: 1980a0f48b7a9b6e99cda0d5ae69cbb49da3cbf4 (diff)
download: perl-2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930.tar.gz
1 files changed, 13 insertions, 8 deletions
diff --git a/utf8.h b/utf8.h
index 7cd163a241..c55ce26f8e 100644
--- a/utf8.h
+++ b/utf8.h
@@ -526,13 +526,6 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
                                    | ((NATIVE_UTF8_TO_I8((U8)new))             \
                                        & UTF_CONTINUATION_MASK))
 
-/* If a value is anded with this, and the result is non-zero, then using the
- * original value in UTF8_ACCUMULATE will overflow, shifting bits off the left
- * */
-#define UTF_ACCUMULATION_OVERFLOW_MASK					\
-    (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS)           \
-           - UTF_ACCUMULATION_SHIFT))
-
 /* This works in the face of malformed UTF-8. */
 #define UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e) (UTF8_IS_DOWNGRADEABLE_START(*s) \
                                                && ( (e) - (s) > 1)             \
@@ -718,26 +711,37 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 
 
 #define UTF8_ALLOW_EMPTY		0x0001	/* Allow a zero length string */
+#define UTF8_GOT_EMPTY                  UTF8_ALLOW_EMPTY
 
 /* Allow first byte to be a continuation byte */
 #define UTF8_ALLOW_CONTINUATION		0x0002
+#define UTF8_GOT_CONTINUATION		UTF8_ALLOW_CONTINUATION
 
-/* Allow second... bytes to be non-continuation bytes */
+/* Unexpected non-continuation byte */
 #define UTF8_ALLOW_NON_CONTINUATION	0x0004
+#define UTF8_GOT_NON_CONTINUATION	UTF8_ALLOW_NON_CONTINUATION
 
 /* expecting more bytes than were available in the string */
 #define UTF8_ALLOW_SHORT		0x0008
+#define UTF8_GOT_SHORT		        UTF8_ALLOW_SHORT
 
 /* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
 #define UTF8_ALLOW_LONG                 0x0010
+#define UTF8_GOT_LONG                   UTF8_ALLOW_LONG
+
+/* Currently no way to allow overflow */
+#define UTF8_GOT_OVERFLOW               0x0020
 
 #define UTF8_DISALLOW_SURROGATE		0x0040	/* Unicode surrogates */
+#define UTF8_GOT_SURROGATE		UTF8_DISALLOW_SURROGATE
 #define UTF8_WARN_SURROGATE		0x0080
 
 #define UTF8_DISALLOW_NONCHAR           0x0100	/* Unicode non-character */
+#define UTF8_GOT_NONCHAR                UTF8_DISALLOW_NONCHAR
 #define UTF8_WARN_NONCHAR               0x0200	/*  code points */
 
 #define UTF8_DISALLOW_SUPER		0x0400	/* Super-set of Unicode: code */
+#define UTF8_GOT_SUPER		        UTF8_DISALLOW_SUPER
 #define UTF8_WARN_SUPER		        0x0800	/* points above the legal max */
 
 /* Code points which never were part of the original UTF-8 standard, which only
@@ -745,6 +749,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  * The first byte of these code points is FE or FF on ASCII platforms.  If the
  * first byte is FF, it will overflow a 32-bit word. */
 #define UTF8_DISALLOW_ABOVE_31_BIT      0x1000
+#define UTF8_GOT_ABOVE_31_BIT           UTF8_DISALLOW_ABOVE_31_BIT
 #define UTF8_WARN_ABOVE_31_BIT          0x2000
 
 /* For back compat, these old names are misleading for UTF_EBCDIC */
author	Karl Williamson <khw@cpan.org>	2016-10-05 19:09:02 -0600
committer	Karl Williamson <khw@cpan.org>	2016-10-13 11:18:12 -0600
commit	2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930 (patch)
tree	f634e7245187a35ca5b793e67350b3cb41f4377a /utf8.h
parent	1980a0f48b7a9b6e99cda0d5ae69cbb49da3cbf4 (diff)
download	perl-2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930.tar.gz