summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2021-06-27 02:39:50 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:43 -0600
commit: d49e4ce1792863bdc3e344a52aef19110508e1ac (patch)
tree: 893d7f5c65c49f9669a922a57d26437521e0f605 /utf8.c
parent: 856dd4e17cf3619ca108f22071d97cdaed91b763 (diff)
download: perl-d49e4ce1792863bdc3e344a52aef19110508e1ac.tar.gz
utf8.c: Refactor internal function
The insight in the previous commit allows this function to become much more compact.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 60
1 files changed, 25 insertions, 35 deletions
diff --git a/utf8.c b/utf8.c
index 714d5b1c3a..cd8e84833e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -711,46 +711,32 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
* EBCDIC platforms C0, C1, C2, C3, C4, E0
*/
- const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
- const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+ U8 s1;
PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
assert(len > 1 && UTF8_IS_START(*s));
- /* Each platform has overlongs after the start bytes given above (expressed
- * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
- * the logic is the same, except the E0 overlong has already been excluded
- * on EBCDIC platforms. The values below were found by manually
- * inspecting the UTF-8 patterns. See the tables in utf8.h and
- * utfebcdic.h. */
+ s1 = NATIVE_UTF8_TO_I8(s[1]);
-#define F0_ABOVE_OVERLONG UTF_MIN_CONTINUATION_BYTE + 0x10
-#define F8_ABOVE_OVERLONG UTF_MIN_CONTINUATION_BYTE + 0x08
-#define FC_ABOVE_OVERLONG UTF_MIN_CONTINUATION_BYTE + 0x04
-#define FE_ABOVE_OVERLONG UTF_MIN_CONTINUATION_BYTE + 0x02
-# ifdef EBCDIC
-# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
- /* I8(0xfe) is FF */
-# else
-
- if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
- return 1;
- }
+ /* Each platform has overlongs after the start bytes given above (expressed
+ * in I8 for EBCDIC). The values below were found by manually inspecting
+ * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */
-# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
-# endif
+ switch (NATIVE_UTF8_TO_I8(s[0])) {
+ default:
+ return 0;
+#ifndef EBCDIC /* the E0 overlong has already been excluded on EBCDIC
+ platforms. */
+ case 0xE0: return s1 < 0xA0;
+#endif
- if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
- || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
- || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
- || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
- {
- return 1;
+ case 0xF0: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x10;
+ case 0xF8: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x08;
+ case 0xFC: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x04;
+ case 0xFE: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x02;
+ case 0xFF: return isFF_overlong(s, len);
}
-
- /* Check for the FF overlong */
- return isFF_overlong(s, len);
}
PERL_STATIC_INLINE int
@@ -765,6 +751,14 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
PERL_ARGS_ASSERT_ISFF_OVERLONG;
+#ifdef EBCDIC
+ /* This works on all three EBCDIC code pages traditionally supported by
+ * perl */
+# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+#else
+# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+#endif
+
/* To be an FF overlong, all the available bytes must match */
if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
@@ -918,10 +912,6 @@ S_does_utf8_overflow(const U8 * const s,
#endif
-#undef F0_ABOVE_OVERLONG
-#undef F8_ABOVE_OVERLONG
-#undef FC_ABOVE_OVERLONG
-#undef FE_ABOVE_OVERLONG
#undef FF_OVERLONG_PREFIX
STRLEN