summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-03-19 19:29:17 -0600
committer: Karl Williamson <public@khwilliamson.com> 2011-03-20 12:16:13 -0600
commit: 90826b5cd27738a30509f332296d8b985731d3fc (patch)
tree: 0dfdc33edd5165afb20ffa9e6a9712d56d222e31
parent: e286af2d135c6b1b03be2bd322f22f89e1b1aa5d (diff)
download: perl-90826b5cd27738a30509f332296d8b985731d3fc.tar.gz
regcharclass: Add tricky fold characters.
The set of tricky fold characters needs to be expanded to include the other characters whose folds are the same as those of the original set. This isn't because the new characters have a length issue; rather, they get left out of comparisons because of the special regnodes generated for the tricky ones.
-rw-r--r-- regcharclass.h 82
-rwxr-xr-x regen/regcharclass.pl 5
2 files changed, 81 insertions, 6 deletions
diff --git a/regcharclass.h b/regcharclass.h
index ea5cb99733..47d4b41925 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -361,9 +361,12 @@
/*
TRICKYFOLD: Problematic fold case letters.
- 0x00DF # LATIN1 SMALL LETTER SHARP S
+ 0x00DF # LATIN SMALL LETTER SHARP S
0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
+ 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
+ 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
*/
/*** GENERATED CODE ***/
#define is_TRICKYFOLD(s,is_utf8) \
@@ -372,12 +375,32 @@
( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 ) \
: ( 0xCE == ((U8*)s)[0] ) ? \
( ( 0x90 == ((U8*)s)[1] || 0xB0 == ((U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? \
+ ( ( 0xBA == ((U8*)s)[1] ) ? \
+ ( ( 0x9E == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : ( 0xBF == ((U8*)s)[1] ) ? \
+ ( ( 0x93 == ((U8*)s)[2] || 0xA3 == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : 0 ) \
: 0 ) \
: ( 0xDF == ((U8*)s)[0] ) )
/*** GENERATED CODE ***/
#define is_TRICKYFOLD_safe(s,e,is_utf8) \
-( ((e)-(s) > 1) ? \
+( ((e)-(s) > 2) ? \
+ ( ( is_utf8 ) ? \
+ ( ( 0xC3 == ((U8*)s)[0] ) ? \
+ ( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xCE == ((U8*)s)[0] ) ? \
+ ( ( 0x90 == ((U8*)s)[1] || 0xB0 == ((U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? \
+ ( ( 0xBA == ((U8*)s)[1] ) ? \
+ ( ( 0x9E == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : ( 0xBF == ((U8*)s)[1] ) ? \
+ ( ( 0x93 == ((U8*)s)[2] || 0xA3 == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : 0 ) \
+ : 0 ) \
+ : ( 0xDF == ((U8*)s)[0] ) ) \
+: ((e)-(s) > 1) ? \
( ( is_utf8 ) ? \
( ( 0xC3 == ((U8*)s)[0] ) ? \
( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 ) \
@@ -395,7 +418,10 @@
#define is_TRICKYFOLD_cp(cp) \
( 0xDF == cp || ( 0xDF < cp && \
( 0x390 == cp || ( 0x390 < cp && \
-0x3B0 == cp ) ) ) )
+( 0x3B0 == cp || ( 0x3B0 < cp && \
+( 0x1E9E == cp || ( 0x1E9E < cp && \
+( 0x1FD3 == cp || ( 0x1FD3 < cp && \
+0x1FE3 == cp ) ) ) ) ) ) ) ) ) )
/*** GENERATED CODE ***/
#define what_TRICKYFOLD(s,is_utf8) \
@@ -405,12 +431,35 @@
: ( 0xCE == ((U8*)s)[0] ) ? \
( ( 0x90 == ((U8*)s)[1] ) ? 0x390 \
: ( 0xB0 == ((U8*)s)[1] ) ? 0x3B0 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? \
+ ( ( 0xBA == ((U8*)s)[1] ) ? \
+ ( ( 0x9E == ((U8*)s)[2] ) ? 0x1E9E : 0 ) \
+ : ( 0xBF == ((U8*)s)[1] ) ? \
+ ( ( 0x93 == ((U8*)s)[2] ) ? 0x1FD3 \
+ : ( 0xA3 == ((U8*)s)[2] ) ? 0x1FE3 : 0 ) \
+ : 0 ) \
: 0 ) \
: ( 0xDF == ((U8*)s)[0] ) ? 0xDF : 0 )
/*** GENERATED CODE ***/
#define what_TRICKYFOLD_safe(s,e,is_utf8) \
-( ((e)-(s) > 1) ? \
+( ((e)-(s) > 2) ? \
+ ( ( is_utf8 ) ? \
+ ( ( 0xC3 == ((U8*)s)[0] ) ? \
+ ( ( 0x9F == ((U8*)s)[1] ) ? 0xDF : 0 ) \
+ : ( 0xCE == ((U8*)s)[0] ) ? \
+ ( ( 0x90 == ((U8*)s)[1] ) ? 0x390 \
+ : ( 0xB0 == ((U8*)s)[1] ) ? 0x3B0 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? \
+ ( ( 0xBA == ((U8*)s)[1] ) ? \
+ ( ( 0x9E == ((U8*)s)[2] ) ? 0x1E9E : 0 ) \
+ : ( 0xBF == ((U8*)s)[1] ) ? \
+ ( ( 0x93 == ((U8*)s)[2] ) ? 0x1FD3 \
+ : ( 0xA3 == ((U8*)s)[2] ) ? 0x1FE3 : 0 ) \
+ : 0 ) \
+ : 0 ) \
+ : ( 0xDF == ((U8*)s)[0] ) ? 0xDF : 0 ) \
+: ((e)-(s) > 1) ? \
( ( is_utf8 ) ? \
( ( 0xC3 == ((U8*)s)[0] ) ? \
( ( 0x9F == ((U8*)s)[1] ) ? 0xDF : 0 ) \
@@ -431,12 +480,35 @@
: ( 0xCE == ((U8*)s)[0] ) ? \
( ( 0x90 == ((U8*)s)[1] ) ? len=2, 0x390 \
: ( 0xB0 == ((U8*)s)[1] ) ? len=2, 0x3B0 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? \
+ ( ( 0xBA == ((U8*)s)[1] ) ? \
+ ( ( 0x9E == ((U8*)s)[2] ) ? len=3, 0x1E9E : 0 ) \
+ : ( 0xBF == ((U8*)s)[1] ) ? \
+ ( ( 0x93 == ((U8*)s)[2] ) ? len=3, 0x1FD3 \
+ : ( 0xA3 == ((U8*)s)[2] ) ? len=3, 0x1FE3 : 0 ) \
+ : 0 ) \
: 0 ) \
: ( 0xDF == ((U8*)s)[0] ) ? len=1, 0xDF : 0 )
/*** GENERATED CODE ***/
#define what_len_TRICKYFOLD_safe(s,e,is_utf8,len) \
-( ((e)-(s) > 1) ? \
+( ((e)-(s) > 2) ? \
+ ( ( is_utf8 ) ? \
+ ( ( 0xC3 == ((U8*)s)[0] ) ? \
+ ( ( 0x9F == ((U8*)s)[1] ) ? len=2, 0xDF : 0 ) \
+ : ( 0xCE == ((U8*)s)[0] ) ? \
+ ( ( 0x90 == ((U8*)s)[1] ) ? len=2, 0x390 \
+ : ( 0xB0 == ((U8*)s)[1] ) ? len=2, 0x3B0 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? \
+ ( ( 0xBA == ((U8*)s)[1] ) ? \
+ ( ( 0x9E == ((U8*)s)[2] ) ? len=3, 0x1E9E : 0 ) \
+ : ( 0xBF == ((U8*)s)[1] ) ? \
+ ( ( 0x93 == ((U8*)s)[2] ) ? len=3, 0x1FD3 \
+ : ( 0xA3 == ((U8*)s)[2] ) ? len=3, 0x1FE3 : 0 ) \
+ : 0 ) \
+ : 0 ) \
+ : ( 0xDF == ((U8*)s)[0] ) ? len=1, 0xDF : 0 ) \
+: ((e)-(s) > 1) ? \
( ( is_utf8 ) ? \
( ( 0xC3 == ((U8*)s)[0] ) ? \
( ( 0x9F == ((U8*)s)[1] ) ? len=2, 0xDF : 0 ) \
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index c3ea8a62b0..2e89b2da28 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -731,6 +731,9 @@ VERTWS: Vertical Whitespace: \v \V
TRICKYFOLD: Problematic fold case letters.
=> generic cp generic-cp generic-both :fast safe
-0x00DF # LATIN1 SMALL LETTER SHARP S
+0x00DF # LATIN SMALL LETTER SHARP S
0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
+0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
+0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0