diff options
author | Karl Williamson <khw@cpan.org> | 2015-09-04 09:57:11 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-09-04 10:21:17 -0600 |
commit | e4fd731240d0d51e9cda61101a7d593dd9660e22 (patch) | |
tree | f650ee0ba38884ef21d804b61ead185af721db64 /utfebcdic.h | |
parent | 0f1913794d9137557b4ae7771a8a24ab8b5ee247 (diff) | |
download | perl-e4fd731240d0d51e9cda61101a7d593dd9660e22.tar.gz |
Change some UTF-EBCDIC macro handling defns
This commit changes the definitions of some macros for UTF-8 handling on
EBCDIC platforms. The previous definitions transformed the bytes into
I8 and did tests on the transformed values. The change is to use
previously unused bits in l1_char_class_tab.h so the transform isn't
needed, and generally only a single branch is required. These macros are called from
the inner loops of, for example, regex backtracking.
Diffstat (limited to 'utfebcdic.h')
-rw-r--r-- | utfebcdic.h | 33 |
1 files changed, 19 insertions, 14 deletions
diff --git a/utfebcdic.h b/utfebcdic.h index d9e1402ce2..1df7b3827f 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -172,23 +172,28 @@ END_EXTERN_C #define UNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) -/* UTF-EBCDIC semantic macros - transform back into I8 and then compare +/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then + * compare, but now only have to do a single lookup by using a bit in + * l1_char_class_tab.h. * Comments as to the meaning of each are given at their corresponding utf8.h * definitions. */ -#define UTF8_IS_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5 \ - && NATIVE_UTF8_TO_I8(c) != 0xE0) -#define UTF8_IS_CONTINUATION(c) ((NATIVE_UTF8_TO_I8(c) & 0xE0) == 0xA0) -#define UTF8_IS_CONTINUED(c) (NATIVE_UTF8_TO_I8(c) >= 0xA0) - -#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5 \ - && NATIVE_UTF8_TO_I8(c) <= 0xC7) -/* Saying it this way adds a runtime test, but removes 2 run-time lookups */ -/*#define UTF8_IS_DOWNGRADEABLE_START(c) ((c) == I8_TO_NATIVE_UTF8(0xC5) \ - || (c) == I8_TO_NATIVE_UTF8(0xC6) \ - || (c) == I8_TO_NATIVE_UTF8(0xC7)) -*/ -#define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_UTF8_TO_I8(c) >= 0xC8) +#define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START) +#define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION) + +/* Equivalent to ! UVCHR_IS_INVARIANT(c) */ +#define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \ + && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) + +#define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \ + _CC_UTF8_IS_DOWNGRADEABLE_START) + +/* Equivalent to (UTF8_IS_START(c) && ! 
UTF8_IS_DOWNGRADEABLE_START(c)) + * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */ +#define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c) \ + && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START) \ + |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START))) \ + == _CC_mask(_CC_UTF8_IS_START))) /* Can't exceed 7 on EBCDIC platforms */ #define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len)))) |