utf8.h: Use new macro type from previous commit

This allows for an efficient isUTF8_CHAR macro, which does its own length checking and uses the UTF8_INVARIANT macro for the first byte. On EBCDIC systems, this macro, which does a table lookup, is quite a bit more efficient than all the branches that would otherwise have to be done.
author: Karl Williamson <khw@cpan.org> 2014-05-05 22:17:33 -0600
committer: Karl Williamson <khw@cpan.org> 2014-05-31 11:52:06 -0600
commit: 5dca92787911972e6827cbb3173c9b1f44ea8613 (patch)
tree: 4441de42446a50729bec14d361bc769963294449 /utf8.h
parent: 40f914fd7fc2115d5df1c2b1ecc1d960d5f0a210 (diff)
download: perl-5dca92787911972e6827cbb3173c9b1f44ea8613.tar.gz
1 files changed, 25 insertions, 35 deletions
diff --git a/utf8.h b/utf8.h
index a18faa2e35..924380db85 100644
--- a/utf8.h
+++ b/utf8.h
@@ -606,48 +606,38 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * don't take too long to generate, and there is a separate one for each code
  * page, so they are in regcharclass.h instead of here */
 /*
-	UTF8_CHAR: Matches utf8 from 1 to 4 bytes
+	UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
 
-	0x0 - 0x1FFFFF
+	0x80 - 0x1FFFFF
 */
 /*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_safe(s,e)                                         \
-( ((e)-(s) > 3) ?                                                           \
-    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
-    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
-	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
-    : ( 0xE0 == ((U8*)s)[0] ) ?                                             \
-	( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-    : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                      \
-	( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-    : ( 0xF0 == ((U8*)s)[0] ) ?                                             \
-	( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-    : ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ((e)-(s) > 2) ?                                                           \
-    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
-    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
-	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
-    : ( 0xE0 == ((U8*)s)[0] ) ?                                             \
-	( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-    : ( ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ((e)-(s) > 1) ?                                                           \
-    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
-    : ( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) ? 2 : 0 )\
-: ((e)-(s) > 0) ?                                                           \
-    ( ( ((U8*)s)[0] & 0x80 ) == 0x00 )                                      \
-: 0 )
+#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
+    ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                          \
+: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                          \
+    ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
 #endif
 
 /*
- * =for apidoc isUTF8_CHAR
- *
- * Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
- * UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
- * C<s>.  Returns 0 if the sequence starting at C<s> through C<e - 1> is not
- * well-formed UTF-8
+=head1 Unicode Support
+
+=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
+
+Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
+UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
+C<s>.  Returns 0 if the sequence starting at C<s> through C<e - 1> is not
+well-formed UTF-8
 
 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character. */
+machines) is a valid UTF-8 character.
+
+=cut
+*/
 
 #define isUTF8_CHAR(s, e)   (((e) <= (s))                                   \
                              ? 0                                            \
@@ -656,7 +646,7 @@ machines) is a valid UTF-8 character. */
                                : (((e) - (s)) < UTF8SKIP(s))                \
                                  ? 0                                        \
                                  : (IS_UTF8_CHAR_FAST(UTF8SKIP(s)))         \
-                                   ? is_UTF8_CHAR_utf8_safe(s,e)            \
+                                   ? is_UTF8_CHAR_utf8_no_length_checks(s)  \
                                    : _is_utf8_char_slow(s, e))
 
 /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
author	Karl Williamson <khw@cpan.org>	2014-05-05 22:17:33 -0600
committer	Karl Williamson <khw@cpan.org>	2014-05-31 11:52:06 -0600
commit	5dca92787911972e6827cbb3173c9b1f44ea8613 (patch)
tree	4441de42446a50729bec14d361bc769963294449 /utf8.h
parent	40f914fd7fc2115d5df1c2b1ecc1d960d5f0a210 (diff)
download	perl-5dca92787911972e6827cbb3173c9b1f44ea8613.tar.gz