From 784d4f31222f1bf7421b1aab87276f4878d60363 Mon Sep 17 00:00:00 2001
From: Karl Williamson
Date: Sat, 3 Sep 2016 14:12:27 -0600
Subject: isUTF8_CHAR(): Bring UTF-EBCDIC to parity with ASCII

This changes the macro isUTF8_CHAR to have the same number of code
points built in for EBCDIC as for ASCII.  This obsoletes the
IS_UTF8_CHAR_FAST macro, which is removed.

Previously, the code generated by regen/regcharclass.pl for ASCII
platforms was hand-copied into utf8.h, LIKELY() calls were added
manually, and the generating code was commented out.  Now the same has
been done for EBCDIC platforms, which makes regenerating regcharclass.h
faster.

The copied macro in utf8.h is moved by this commit to within the main
code section for non-EBCDIC compiles, cutting down the number of
#ifdefs, and the comments about it are changed somewhat.
---
 utf8.h | 88 ++++++++++++++++++++++++++++--------------------------------------
 1 file changed, 37 insertions(+), 51 deletions(-)

diff --git a/utf8.h b/utf8.h
index 62826adab9..7202dc4467 100644
--- a/utf8.h
+++ b/utf8.h
@@ -303,6 +303,33 @@ C is Unicode if above 255; otherwise is platform-native.
  * encounter */
 #define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
 
+/* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
+ * generated by regen/regcharclass.pl, and then moved here.  Then it was
+ * hand-edited to add some LIKELY() calls, presuming that malformations are
+ * unlikely.  The lines that generated it were then commented out.  This was
+ * done because it takes on the order of 10 minutes to generate, and is never
+ * going to change, unless the generated code is improved, and figuring out
+ * the LIKELYs there would be hard.
+ *
+	UTF8_CHAR: Matches legal UTF-8 variant code points up through 0x1FFFFF
+
+	0x80 - 0x1FFFFF
+*/
+/*** GENERATED CODE ***/
+#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
+    ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                    \
+: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                          \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
+
+/* The above macro handles UTF-8 that has this start byte as the maximum */
+#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
+
 #endif /* EBCDIC vs ASCII */
 
 /* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -857,48 +884,6 @@ point's representation.
 /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
  * log for earlier versions which gave details for these */
 
-/* A helper macro for isUTF8_CHAR, so use that one, and not this one.  This is
- * retained solely for backwards compatibility and may be deprecated and
- * removed in a future Perl version.
- *
- * regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to these
- * number of bytes.  So this has to be coordinated with that file */
-#ifdef EBCDIC
-#  define IS_UTF8_CHAR_FAST(n) ((n) <= 3)
-#else
-#  define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
-#endif
-
-#ifndef EBCDIC
-/* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
- * generated by regen/regcharclass.pl, and then moved here.  Then it was
- * hand-edited to add some LIKELY() calls, presuming that malformations are
- * unlikely.  The lines that generated it were then commented out.  This was
- * done because it takes on the order of 10 minutes to generate, and is never
- * going to change, unless the generated code is improved, and figuring out
- * there the LIKELYs would be hard.
- *
- * The EBCDIC versions have been cut to not cover all of legal Unicode,
- * otherwise they take too long to generate; besides there is a separate one
- * for each code page, so they are in regcharclass.h instead of here */
-/*
-	UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
-
-	0x80 - 0x1FFFFF
-*/
-/*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
-( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
-    ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                    \
-: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
-    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                          \
-    ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
-    ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
-#endif
-
 /*
 =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
@@ -925,15 +910,16 @@ is a valid UTF-8 character.
 
 =cut
 */
 
-#define isUTF8_CHAR(s, e) (UNLIKELY((e) <= (s))                             \
-                           ? 0                                              \
-                           : (UTF8_IS_INVARIANT(*s))                        \
-                             ? 1                                            \
-                             : UNLIKELY(((e) - (s)) < UTF8SKIP(s))          \
-                               ? 0                                          \
-                               : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s)))     \
-                                 ? is_UTF8_CHAR_utf8_no_length_checks(s)    \
-                                 : _is_utf8_char_slow(s, UTF8SKIP(s)))
+#define isUTF8_CHAR(s, e)                                                   \
+    (UNLIKELY((e) <= (s))                                                   \
+    ? 0                                                                     \
+    : (UTF8_IS_INVARIANT(*s))                                               \
+      ? 1                                                                   \
+      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
+        ? 0                                                                 \
+        : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE) \
+          ? is_UTF8_CHAR_utf8_no_length_checks(s)                           \
+          : _is_utf8_char_slow(s, UTF8SKIP(s)))
 
 #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
--
cgit v1.2.1
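For readers comparing the two isUTF8_CHAR definitions in the patch above, the
sketch below is a standalone C rendering of the idea behind
is_UTF8_CHAR_utf8_no_length_checks on an ASCII platform: dispatch on the start
byte, require each continuation byte to have the form 10xxxxxx, and return the
sequence length or 0.  It is illustrative only, not Perl's code; the function
name utf8_char_len_no_length_check and the test strings are invented here.
Like the generated macro, it assumes the caller has already handled the
invariant (ASCII) case and verified that enough bytes are readable, which is
what isUTF8_CHAR layers on top, along with the _is_utf8_char_slow() fallback
for start bytes above _IS_UTF8_CHAR_HIGHEST_START_BYTE.

#include <stdio.h>

/* Illustrative only -- not Perl's macro.  Returns the length (2-4) of the
 * UTF-8 sequence for a variant (non-ASCII) code point starting at s when the
 * start byte and continuation bytes are well formed, or 0 otherwise.  It
 * mirrors the structure of is_UTF8_CHAR_utf8_no_length_checks: classify the
 * start byte, then check that each continuation byte matches 10xxxxxx.  Like
 * that macro, it assumes at least UTF8SKIP(s) bytes are readable at s. */
static int
utf8_char_len_no_length_check(const unsigned char *s)
{
    if (s[0] >= 0xC2 && s[0] <= 0xDF)     /* 2 bytes: U+0080..U+07FF */
        return ((s[1] & 0xC0) == 0x80) ? 2 : 0;

    if (s[0] == 0xE0)                     /* 3 bytes; A0..BF rejects overlongs */
        return ((s[1] & 0xE0) == 0xA0 && (s[2] & 0xC0) == 0x80) ? 3 : 0;

    if (s[0] >= 0xE1 && s[0] <= 0xEF)     /* remaining 3-byte sequences */
        return ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80) ? 3 : 0;

    if (s[0] == 0xF0)                     /* 4 bytes; 90..BF rejects overlongs */
        return (s[1] >= 0x90 && s[1] <= 0xBF
                && (s[2] & 0xC0) == 0x80
                && (s[3] & 0xC0) == 0x80) ? 4 : 0;

    if (s[0] >= 0xF1 && s[0] <= 0xF7)     /* 4 bytes, up through U+1FFFFF */
        return ((s[1] & 0xC0) == 0x80
                && (s[2] & 0xC0) == 0x80
                && (s[3] & 0xC0) == 0x80) ? 4 : 0;

    return 0;    /* ASCII, bare continuation bytes, C0, C1, and F8..FF */
}

int
main(void)
{
    const unsigned char two[]   = { 0xC3, 0xA9, 0, 0 };       /* U+00E9  */
    const unsigned char three[] = { 0xE2, 0x82, 0xAC, 0 };    /* U+20AC  */
    const unsigned char four[]  = { 0xF0, 0x90, 0x8D, 0x88 }; /* U+10348 */
    const unsigned char bad[]   = { 0xE0, 0x80, 0x80, 0 };    /* overlong */

    printf("%d %d %d %d\n",
           utf8_char_len_no_length_check(two),
           utf8_char_len_no_length_check(three),
           utf8_char_len_no_length_check(four),
           utf8_char_len_no_length_check(bad));   /* prints: 2 3 4 0 */
    return 0;
}

The E0 and F0 branches are narrower than the neighboring ones so that overlong
encodings are rejected, which is the same reason the generated macro
special-cases those two start bytes.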