summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- MANIFEST 1
-rw-r--r-- Todo-5.005 4
-rw-r--r-- embed.h 80
-rwxr-xr-x embed.pl 21
-rw-r--r-- embedvar.h 12
-rw-r--r-- global.sym 18
-rw-r--r-- handy.h 55
-rw-r--r-- intrpvar.h 6
-rw-r--r-- lib/unicode/Bidirectional.pl 26
-rw-r--r-- lib/unicode/Block.pl 1
-rw-r--r-- lib/unicode/Category.pl 26
-rw-r--r-- lib/unicode/Is/ASCII.pl 3
-rw-r--r-- lib/unicode/Is/Alnum.pl 29
-rw-r--r-- lib/unicode/Is/Alpha.pl 26
-rw-r--r-- lib/unicode/Is/BidiL.pl 26
-rw-r--r-- lib/unicode/Is/Cntrl.pl 12
-rw-r--r-- lib/unicode/Is/Digit.pl 2
-rw-r--r-- lib/unicode/Is/Graph.pl 327
-rw-r--r-- lib/unicode/Is/L.pl 26
-rw-r--r-- lib/unicode/Is/Lo.pl 26
-rw-r--r-- lib/unicode/Is/Print.pl 26
-rw-r--r-- lib/unicode/Is/Punct.pl 70
-rw-r--r-- lib/unicode/Is/Space.pl 1
-rw-r--r-- lib/unicode/Is/Word.pl 250
-rw-r--r-- lib/unicode/Is/XDigit.pl 5
-rw-r--r-- lib/unicode/Is/Z.pl 1
-rw-r--r-- lib/unicode/Is/Zs.pl 1
-rw-r--r-- lib/unicode/Name.pl 691
-rw-r--r-- lib/unicode/To/Digit.pl 1
-rwxr-xr-x lib/unicode/mktables.PL 28
-rw-r--r-- objXSUB.h 88
-rw-r--r-- pod/perldelta.pod 7
-rw-r--r-- pod/perldiag.pod 20
-rw-r--r-- pod/perlre.pod 94
-rw-r--r-- proto.h 21
-rw-r--r-- regcomp.c 673
-rw-r--r-- regcomp.h 88
-rw-r--r-- regcomp.sym 72
-rw-r--r-- regexec.c 1465
-rw-r--r-- regnodes.h 498
-rwxr-xr-x t/op/pat.t 9
-rw-r--r-- t/op/re_tests 32
-rwxr-xr-x t/op/regexp.t 10
-rwxr-xr-x t/pragma/utf8.t 35
-rw-r--r-- t/pragma/warn/regcomp 26
-rw-r--r-- utf8.c 112
46 files changed, 4325 insertions, 726 deletions
diff --git a/MANIFEST b/MANIFEST
index 11543e1ae0..6aa3d5f4b2 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -826,6 +826,7 @@ lib/unicode/Is/So.pl Unicode character database
lib/unicode/Is/Space.pl Unicode character database
lib/unicode/Is/Syllable.pl Unicode character database
lib/unicode/Is/Upper.pl Unicode character database
+lib/unicode/Is/Word.pl Unicode character database
lib/unicode/Is/Z.pl Unicode character database
lib/unicode/Is/Zl.pl Unicode character database
lib/unicode/Is/Zp.pl Unicode character database
diff --git a/Todo-5.005 b/Todo-5.005
index a8831b1370..b70060350f 100644
--- a/Todo-5.005
+++ b/Todo-5.005
@@ -36,8 +36,8 @@ Locales
decimal separator (3,1415927 is Europeanese for an approximation of pi)
Regexen
- POSIX [:foo:] character classes
- ([=bar=] and [.zap.] would nice too but there's no API for them)
+ POSIX [=bar=] and [.zap.] would be nice too but there's no API for them
+ (=bar= could be done with Unicode, though)
approximate matching
Reliable Signals
diff --git a/embed.h b/embed.h
index 0871c6f34a..ba070961ea 100644
--- a/embed.h
+++ b/embed.h
@@ -203,35 +203,53 @@
#define io_close Perl_io_close
#define invert Perl_invert
#define is_uni_alnum Perl_is_uni_alnum
+#define is_uni_alnumc Perl_is_uni_alnumc
#define is_uni_idfirst Perl_is_uni_idfirst
#define is_uni_alpha Perl_is_uni_alpha
+#define is_uni_ascii Perl_is_uni_ascii
#define is_uni_space Perl_is_uni_space
+#define is_uni_cntrl Perl_is_uni_cntrl
+#define is_uni_graph Perl_is_uni_graph
#define is_uni_digit Perl_is_uni_digit
#define is_uni_upper Perl_is_uni_upper
#define is_uni_lower Perl_is_uni_lower
#define is_uni_print Perl_is_uni_print
+#define is_uni_punct Perl_is_uni_punct
+#define is_uni_xdigit Perl_is_uni_xdigit
#define to_uni_upper Perl_to_uni_upper
#define to_uni_title Perl_to_uni_title
#define to_uni_lower Perl_to_uni_lower
#define is_uni_alnum_lc Perl_is_uni_alnum_lc
+#define is_uni_alnumc_lc Perl_is_uni_alnumc_lc
#define is_uni_idfirst_lc Perl_is_uni_idfirst_lc
#define is_uni_alpha_lc Perl_is_uni_alpha_lc
+#define is_uni_ascii_lc Perl_is_uni_ascii_lc
#define is_uni_space_lc Perl_is_uni_space_lc
+#define is_uni_cntrl_lc Perl_is_uni_cntrl_lc
+#define is_uni_graph_lc Perl_is_uni_graph_lc
#define is_uni_digit_lc Perl_is_uni_digit_lc
#define is_uni_upper_lc Perl_is_uni_upper_lc
#define is_uni_lower_lc Perl_is_uni_lower_lc
#define is_uni_print_lc Perl_is_uni_print_lc
+#define is_uni_punct_lc Perl_is_uni_punct_lc
+#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc
#define to_uni_upper_lc Perl_to_uni_upper_lc
#define to_uni_title_lc Perl_to_uni_title_lc
#define to_uni_lower_lc Perl_to_uni_lower_lc
#define is_utf8_alnum Perl_is_utf8_alnum
+#define is_utf8_alnumc Perl_is_utf8_alnumc
#define is_utf8_idfirst Perl_is_utf8_idfirst
#define is_utf8_alpha Perl_is_utf8_alpha
+#define is_utf8_ascii Perl_is_utf8_ascii
#define is_utf8_space Perl_is_utf8_space
+#define is_utf8_cntrl Perl_is_utf8_cntrl
#define is_utf8_digit Perl_is_utf8_digit
+#define is_utf8_graph Perl_is_utf8_graph
#define is_utf8_upper Perl_is_utf8_upper
#define is_utf8_lower Perl_is_utf8_lower
#define is_utf8_print Perl_is_utf8_print
+#define is_utf8_punct Perl_is_utf8_punct
+#define is_utf8_xdigit Perl_is_utf8_xdigit
#define is_utf8_mark Perl_is_utf8_mark
#define jmaybe Perl_jmaybe
#define keyword Perl_keyword
@@ -846,6 +864,7 @@
#define add_data S_add_data
#define re_croak2 S_re_croak2
#define regpposixcc S_regpposixcc
+#define checkposixcc S_checkposixcc
#define clear_re S_clear_re
#endif
#if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
@@ -986,6 +1005,7 @@
#define ck_require Perl_ck_require
#define ck_rfun Perl_ck_rfun
#define ck_rvconst Perl_ck_rvconst
+#define ck_sassign Perl_ck_sassign
#define ck_scmp Perl_ck_scmp
#define ck_select Perl_ck_select
#define ck_shift Perl_ck_shift
@@ -1520,35 +1540,53 @@
#define io_close(a) Perl_io_close(aTHX_ a)
#define invert(a) Perl_invert(aTHX_ a)
#define is_uni_alnum(a) Perl_is_uni_alnum(aTHX_ a)
+#define is_uni_alnumc(a) Perl_is_uni_alnumc(aTHX_ a)
#define is_uni_idfirst(a) Perl_is_uni_idfirst(aTHX_ a)
#define is_uni_alpha(a) Perl_is_uni_alpha(aTHX_ a)
+#define is_uni_ascii(a) Perl_is_uni_ascii(aTHX_ a)
#define is_uni_space(a) Perl_is_uni_space(aTHX_ a)
+#define is_uni_cntrl(a) Perl_is_uni_cntrl(aTHX_ a)
+#define is_uni_graph(a) Perl_is_uni_graph(aTHX_ a)
#define is_uni_digit(a) Perl_is_uni_digit(aTHX_ a)
#define is_uni_upper(a) Perl_is_uni_upper(aTHX_ a)
#define is_uni_lower(a) Perl_is_uni_lower(aTHX_ a)
#define is_uni_print(a) Perl_is_uni_print(aTHX_ a)
+#define is_uni_punct(a) Perl_is_uni_punct(aTHX_ a)
+#define is_uni_xdigit(a) Perl_is_uni_xdigit(aTHX_ a)
#define to_uni_upper(a) Perl_to_uni_upper(aTHX_ a)
#define to_uni_title(a) Perl_to_uni_title(aTHX_ a)
#define to_uni_lower(a) Perl_to_uni_lower(aTHX_ a)
#define is_uni_alnum_lc(a) Perl_is_uni_alnum_lc(aTHX_ a)
+#define is_uni_alnumc_lc(a) Perl_is_uni_alnumc_lc(aTHX_ a)
#define is_uni_idfirst_lc(a) Perl_is_uni_idfirst_lc(aTHX_ a)
#define is_uni_alpha_lc(a) Perl_is_uni_alpha_lc(aTHX_ a)
+#define is_uni_ascii_lc(a) Perl_is_uni_ascii_lc(aTHX_ a)
#define is_uni_space_lc(a) Perl_is_uni_space_lc(aTHX_ a)
+#define is_uni_cntrl_lc(a) Perl_is_uni_cntrl_lc(aTHX_ a)
+#define is_uni_graph_lc(a) Perl_is_uni_graph_lc(aTHX_ a)
#define is_uni_digit_lc(a) Perl_is_uni_digit_lc(aTHX_ a)
#define is_uni_upper_lc(a) Perl_is_uni_upper_lc(aTHX_ a)
#define is_uni_lower_lc(a) Perl_is_uni_lower_lc(aTHX_ a)
#define is_uni_print_lc(a) Perl_is_uni_print_lc(aTHX_ a)
+#define is_uni_punct_lc(a) Perl_is_uni_punct_lc(aTHX_ a)
+#define is_uni_xdigit_lc(a) Perl_is_uni_xdigit_lc(aTHX_ a)
#define to_uni_upper_lc(a) Perl_to_uni_upper_lc(aTHX_ a)
#define to_uni_title_lc(a) Perl_to_uni_title_lc(aTHX_ a)
#define to_uni_lower_lc(a) Perl_to_uni_lower_lc(aTHX_ a)
#define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a)
+#define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a)
#define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a)
#define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a)
+#define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a)
#define is_utf8_space(a) Perl_is_utf8_space(aTHX_ a)
+#define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a)
#define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a)
+#define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a)
#define is_utf8_upper(a) Perl_is_utf8_upper(aTHX_ a)
#define is_utf8_lower(a) Perl_is_utf8_lower(aTHX_ a)
#define is_utf8_print(a) Perl_is_utf8_print(aTHX_ a)
+#define is_utf8_punct(a) Perl_is_utf8_punct(aTHX_ a)
+#define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a)
#define is_utf8_mark(a) Perl_is_utf8_mark(aTHX_ a)
#define jmaybe(a) Perl_jmaybe(aTHX_ a)
#define keyword(a,b) Perl_keyword(aTHX_ a,b)
@@ -2153,6 +2191,7 @@
#define study_chunk(a,b,c,d,e) S_study_chunk(aTHX_ a,b,c,d,e)
#define add_data(a,b) S_add_data(aTHX_ a,b)
#define regpposixcc(a) S_regpposixcc(aTHX_ a)
+#define checkposixcc() S_checkposixcc(aTHX)
#define clear_re(a) S_clear_re(aTHX_ a)
#endif
#if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
@@ -2293,6 +2332,7 @@
#define ck_require(a) Perl_ck_require(aTHX_ a)
#define ck_rfun(a) Perl_ck_rfun(aTHX_ a)
#define ck_rvconst(a) Perl_ck_rvconst(aTHX_ a)
+#define ck_sassign(a) Perl_ck_sassign(aTHX_ a)
#define ck_scmp(a) Perl_ck_scmp(aTHX_ a)
#define ck_select(a) Perl_ck_select(aTHX_ a)
#define ck_shift(a) Perl_ck_shift(aTHX_ a)
@@ -3014,12 +3054,20 @@
#define invert Perl_invert
#define Perl_is_uni_alnum CPerlObj::Perl_is_uni_alnum
#define is_uni_alnum Perl_is_uni_alnum
+#define Perl_is_uni_alnumc CPerlObj::Perl_is_uni_alnumc
+#define is_uni_alnumc Perl_is_uni_alnumc
#define Perl_is_uni_idfirst CPerlObj::Perl_is_uni_idfirst
#define is_uni_idfirst Perl_is_uni_idfirst
#define Perl_is_uni_alpha CPerlObj::Perl_is_uni_alpha
#define is_uni_alpha Perl_is_uni_alpha
+#define Perl_is_uni_ascii CPerlObj::Perl_is_uni_ascii
+#define is_uni_ascii Perl_is_uni_ascii
#define Perl_is_uni_space CPerlObj::Perl_is_uni_space
#define is_uni_space Perl_is_uni_space
+#define Perl_is_uni_cntrl CPerlObj::Perl_is_uni_cntrl
+#define is_uni_cntrl Perl_is_uni_cntrl
+#define Perl_is_uni_graph CPerlObj::Perl_is_uni_graph
+#define is_uni_graph Perl_is_uni_graph
#define Perl_is_uni_digit CPerlObj::Perl_is_uni_digit
#define is_uni_digit Perl_is_uni_digit
#define Perl_is_uni_upper CPerlObj::Perl_is_uni_upper
@@ -3028,6 +3076,10 @@
#define is_uni_lower Perl_is_uni_lower
#define Perl_is_uni_print CPerlObj::Perl_is_uni_print
#define is_uni_print Perl_is_uni_print
+#define Perl_is_uni_punct CPerlObj::Perl_is_uni_punct
+#define is_uni_punct Perl_is_uni_punct
+#define Perl_is_uni_xdigit CPerlObj::Perl_is_uni_xdigit
+#define is_uni_xdigit Perl_is_uni_xdigit
#define Perl_to_uni_upper CPerlObj::Perl_to_uni_upper
#define to_uni_upper Perl_to_uni_upper
#define Perl_to_uni_title CPerlObj::Perl_to_uni_title
@@ -3036,12 +3088,20 @@
#define to_uni_lower Perl_to_uni_lower
#define Perl_is_uni_alnum_lc CPerlObj::Perl_is_uni_alnum_lc
#define is_uni_alnum_lc Perl_is_uni_alnum_lc
+#define Perl_is_uni_alnumc_lc CPerlObj::Perl_is_uni_alnumc_lc
+#define is_uni_alnumc_lc Perl_is_uni_alnumc_lc
#define Perl_is_uni_idfirst_lc CPerlObj::Perl_is_uni_idfirst_lc
#define is_uni_idfirst_lc Perl_is_uni_idfirst_lc
#define Perl_is_uni_alpha_lc CPerlObj::Perl_is_uni_alpha_lc
#define is_uni_alpha_lc Perl_is_uni_alpha_lc
+#define Perl_is_uni_ascii_lc CPerlObj::Perl_is_uni_ascii_lc
+#define is_uni_ascii_lc Perl_is_uni_ascii_lc
#define Perl_is_uni_space_lc CPerlObj::Perl_is_uni_space_lc
#define is_uni_space_lc Perl_is_uni_space_lc
+#define Perl_is_uni_cntrl_lc CPerlObj::Perl_is_uni_cntrl_lc
+#define is_uni_cntrl_lc Perl_is_uni_cntrl_lc
+#define Perl_is_uni_graph_lc CPerlObj::Perl_is_uni_graph_lc
+#define is_uni_graph_lc Perl_is_uni_graph_lc
#define Perl_is_uni_digit_lc CPerlObj::Perl_is_uni_digit_lc
#define is_uni_digit_lc Perl_is_uni_digit_lc
#define Perl_is_uni_upper_lc CPerlObj::Perl_is_uni_upper_lc
@@ -3050,6 +3110,10 @@
#define is_uni_lower_lc Perl_is_uni_lower_lc
#define Perl_is_uni_print_lc CPerlObj::Perl_is_uni_print_lc
#define is_uni_print_lc Perl_is_uni_print_lc
+#define Perl_is_uni_punct_lc CPerlObj::Perl_is_uni_punct_lc
+#define is_uni_punct_lc Perl_is_uni_punct_lc
+#define Perl_is_uni_xdigit_lc CPerlObj::Perl_is_uni_xdigit_lc
+#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc
#define Perl_to_uni_upper_lc CPerlObj::Perl_to_uni_upper_lc
#define to_uni_upper_lc Perl_to_uni_upper_lc
#define Perl_to_uni_title_lc CPerlObj::Perl_to_uni_title_lc
@@ -3058,20 +3122,32 @@
#define to_uni_lower_lc Perl_to_uni_lower_lc
#define Perl_is_utf8_alnum CPerlObj::Perl_is_utf8_alnum
#define is_utf8_alnum Perl_is_utf8_alnum
+#define Perl_is_utf8_alnumc CPerlObj::Perl_is_utf8_alnumc
+#define is_utf8_alnumc Perl_is_utf8_alnumc
#define Perl_is_utf8_idfirst CPerlObj::Perl_is_utf8_idfirst
#define is_utf8_idfirst Perl_is_utf8_idfirst
#define Perl_is_utf8_alpha CPerlObj::Perl_is_utf8_alpha
#define is_utf8_alpha Perl_is_utf8_alpha
+#define Perl_is_utf8_ascii CPerlObj::Perl_is_utf8_ascii
+#define is_utf8_ascii Perl_is_utf8_ascii
#define Perl_is_utf8_space CPerlObj::Perl_is_utf8_space
#define is_utf8_space Perl_is_utf8_space
+#define Perl_is_utf8_cntrl CPerlObj::Perl_is_utf8_cntrl
+#define is_utf8_cntrl Perl_is_utf8_cntrl
#define Perl_is_utf8_digit CPerlObj::Perl_is_utf8_digit
#define is_utf8_digit Perl_is_utf8_digit
+#define Perl_is_utf8_graph CPerlObj::Perl_is_utf8_graph
+#define is_utf8_graph Perl_is_utf8_graph
#define Perl_is_utf8_upper CPerlObj::Perl_is_utf8_upper
#define is_utf8_upper Perl_is_utf8_upper
#define Perl_is_utf8_lower CPerlObj::Perl_is_utf8_lower
#define is_utf8_lower Perl_is_utf8_lower
#define Perl_is_utf8_print CPerlObj::Perl_is_utf8_print
#define is_utf8_print Perl_is_utf8_print
+#define Perl_is_utf8_punct CPerlObj::Perl_is_utf8_punct
+#define is_utf8_punct Perl_is_utf8_punct
+#define Perl_is_utf8_xdigit CPerlObj::Perl_is_utf8_xdigit
+#define is_utf8_xdigit Perl_is_utf8_xdigit
#define Perl_is_utf8_mark CPerlObj::Perl_is_utf8_mark
#define is_utf8_mark Perl_is_utf8_mark
#define Perl_jmaybe CPerlObj::Perl_jmaybe
@@ -4235,6 +4311,8 @@
#define re_croak2 S_re_croak2
#define S_regpposixcc CPerlObj::S_regpposixcc
#define regpposixcc S_regpposixcc
+#define S_checkposixcc CPerlObj::S_checkposixcc
+#define checkposixcc S_checkposixcc
#define S_clear_re CPerlObj::S_clear_re
#define clear_re S_clear_re
#endif
@@ -4489,6 +4567,8 @@
#define ck_rfun Perl_ck_rfun
#define Perl_ck_rvconst CPerlObj::Perl_ck_rvconst
#define ck_rvconst Perl_ck_rvconst
+#define Perl_ck_sassign CPerlObj::Perl_ck_sassign
+#define ck_sassign Perl_ck_sassign
#define Perl_ck_scmp CPerlObj::Perl_ck_scmp
#define ck_scmp Perl_ck_scmp
#define Perl_ck_select CPerlObj::Perl_ck_select
diff --git a/embed.pl b/embed.pl
index ed7f3e45b5..206dbbfaed 100755
--- a/embed.pl
+++ b/embed.pl
@@ -947,35 +947,53 @@ p |char* |instr |const char* big|const char* little
p |bool |io_close |IO* io
p |OP* |invert |OP* cmd
p |bool |is_uni_alnum |U32 c
+p |bool |is_uni_alnumc |U32 c
p |bool |is_uni_idfirst |U32 c
p |bool |is_uni_alpha |U32 c
+p |bool |is_uni_ascii |U32 c
p |bool |is_uni_space |U32 c
+p |bool |is_uni_cntrl |U32 c
+p |bool |is_uni_graph |U32 c
p |bool |is_uni_digit |U32 c
p |bool |is_uni_upper |U32 c
p |bool |is_uni_lower |U32 c
p |bool |is_uni_print |U32 c
+p |bool |is_uni_punct |U32 c
+p |bool |is_uni_xdigit |U32 c
p |U32 |to_uni_upper |U32 c
p |U32 |to_uni_title |U32 c
p |U32 |to_uni_lower |U32 c
p |bool |is_uni_alnum_lc|U32 c
+p |bool |is_uni_alnumc_lc|U32 c
p |bool |is_uni_idfirst_lc|U32 c
p |bool |is_uni_alpha_lc|U32 c
+p |bool |is_uni_ascii_lc|U32 c
p |bool |is_uni_space_lc|U32 c
+p |bool |is_uni_cntrl_lc|U32 c
+p |bool |is_uni_graph_lc|U32 c
p |bool |is_uni_digit_lc|U32 c
p |bool |is_uni_upper_lc|U32 c
p |bool |is_uni_lower_lc|U32 c
p |bool |is_uni_print_lc|U32 c
+p |bool |is_uni_punct_lc|U32 c
+p |bool |is_uni_xdigit_lc|U32 c
p |U32 |to_uni_upper_lc|U32 c
p |U32 |to_uni_title_lc|U32 c
p |U32 |to_uni_lower_lc|U32 c
p |bool |is_utf8_alnum |U8 *p
+p |bool |is_utf8_alnumc |U8 *p
p |bool |is_utf8_idfirst|U8 *p
p |bool |is_utf8_alpha |U8 *p
+p |bool |is_utf8_ascii |U8 *p
p |bool |is_utf8_space |U8 *p
+p |bool |is_utf8_cntrl |U8 *p
p |bool |is_utf8_digit |U8 *p
+p |bool |is_utf8_graph |U8 *p
p |bool |is_utf8_upper |U8 *p
p |bool |is_utf8_lower |U8 *p
p |bool |is_utf8_print |U8 *p
+p |bool |is_utf8_punct |U8 *p
+p |bool |is_utf8_xdigit |U8 *p
p |bool |is_utf8_mark |U8 *p
p |OP* |jmaybe |OP* arg
p |I32 |keyword |char* d|I32 len
@@ -1646,7 +1664,8 @@ s |I32 |study_chunk |regnode **scanp|I32 *deltap \
|regnode *last|scan_data_t *data|U32 flags
s |I32 |add_data |I32 n|char *s
rs |void|re_croak2 |const char* pat1|const char* pat2|...
-s |char*|regpposixcc |I32 value
+s |I32 |regpposixcc |I32 value
+s |void |checkposixcc
s |void |clear_re |void *r
#endif
diff --git a/embedvar.h b/embedvar.h
index f759b632ae..74e7ca50d3 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -508,16 +508,22 @@
#define PL_uid (PL_curinterp->Iuid)
#define PL_unsafe (PL_curinterp->Iunsafe)
#define PL_utf8_alnum (PL_curinterp->Iutf8_alnum)
+#define PL_utf8_alnumc (PL_curinterp->Iutf8_alnumc)
#define PL_utf8_alpha (PL_curinterp->Iutf8_alpha)
+#define PL_utf8_ascii (PL_curinterp->Iutf8_ascii)
+#define PL_utf8_cntrl (PL_curinterp->Iutf8_cntrl)
#define PL_utf8_digit (PL_curinterp->Iutf8_digit)
+#define PL_utf8_graph (PL_curinterp->Iutf8_graph)
#define PL_utf8_lower (PL_curinterp->Iutf8_lower)
#define PL_utf8_mark (PL_curinterp->Iutf8_mark)
#define PL_utf8_print (PL_curinterp->Iutf8_print)
+#define PL_utf8_punct (PL_curinterp->Iutf8_punct)
#define PL_utf8_space (PL_curinterp->Iutf8_space)
#define PL_utf8_tolower (PL_curinterp->Iutf8_tolower)
#define PL_utf8_totitle (PL_curinterp->Iutf8_totitle)
#define PL_utf8_toupper (PL_curinterp->Iutf8_toupper)
#define PL_utf8_upper (PL_curinterp->Iutf8_upper)
+#define PL_utf8_xdigit (PL_curinterp->Iutf8_xdigit)
#define PL_uudmap (PL_curinterp->Iuudmap)
#define PL_warnhook (PL_curinterp->Iwarnhook)
#define PL_xiv_arenaroot (PL_curinterp->Ixiv_arenaroot)
@@ -765,16 +771,22 @@
#define PL_Iuid PL_uid
#define PL_Iunsafe PL_unsafe
#define PL_Iutf8_alnum PL_utf8_alnum
+#define PL_Iutf8_alnumc PL_utf8_alnumc
#define PL_Iutf8_alpha PL_utf8_alpha
+#define PL_Iutf8_ascii PL_utf8_ascii
+#define PL_Iutf8_cntrl PL_utf8_cntrl
#define PL_Iutf8_digit PL_utf8_digit
+#define PL_Iutf8_graph PL_utf8_graph
#define PL_Iutf8_lower PL_utf8_lower
#define PL_Iutf8_mark PL_utf8_mark
#define PL_Iutf8_print PL_utf8_print
+#define PL_Iutf8_punct PL_utf8_punct
#define PL_Iutf8_space PL_utf8_space
#define PL_Iutf8_tolower PL_utf8_tolower
#define PL_Iutf8_totitle PL_utf8_totitle
#define PL_Iutf8_toupper PL_utf8_toupper
#define PL_Iutf8_upper PL_utf8_upper
+#define PL_Iutf8_xdigit PL_utf8_xdigit
#define PL_Iuudmap PL_uudmap
#define PL_Iwarnhook PL_warnhook
#define PL_Ixiv_arenaroot PL_xiv_arenaroot
diff --git a/global.sym b/global.sym
index 87ece3c083..c5597d1b57 100644
--- a/global.sym
+++ b/global.sym
@@ -177,35 +177,53 @@ Perl_instr
Perl_io_close
Perl_invert
Perl_is_uni_alnum
+Perl_is_uni_alnumc
Perl_is_uni_idfirst
Perl_is_uni_alpha
+Perl_is_uni_ascii
Perl_is_uni_space
+Perl_is_uni_cntrl
+Perl_is_uni_graph
Perl_is_uni_digit
Perl_is_uni_upper
Perl_is_uni_lower
Perl_is_uni_print
+Perl_is_uni_punct
+Perl_is_uni_xdigit
Perl_to_uni_upper
Perl_to_uni_title
Perl_to_uni_lower
Perl_is_uni_alnum_lc
+Perl_is_uni_alnumc_lc
Perl_is_uni_idfirst_lc
Perl_is_uni_alpha_lc
+Perl_is_uni_ascii_lc
Perl_is_uni_space_lc
+Perl_is_uni_cntrl_lc
+Perl_is_uni_graph_lc
Perl_is_uni_digit_lc
Perl_is_uni_upper_lc
Perl_is_uni_lower_lc
Perl_is_uni_print_lc
+Perl_is_uni_punct_lc
+Perl_is_uni_xdigit_lc
Perl_to_uni_upper_lc
Perl_to_uni_title_lc
Perl_to_uni_lower_lc
Perl_is_utf8_alnum
+Perl_is_utf8_alnumc
Perl_is_utf8_idfirst
Perl_is_utf8_alpha
+Perl_is_utf8_ascii
Perl_is_utf8_space
+Perl_is_utf8_cntrl
Perl_is_utf8_digit
+Perl_is_utf8_graph
Perl_is_utf8_upper
Perl_is_utf8_lower
Perl_is_utf8_print
+Perl_is_utf8_punct
+Perl_is_utf8_xdigit
Perl_is_utf8_mark
Perl_jmaybe
Perl_keyword
diff --git a/handy.h b/handy.h
index 851f348228..95bcec7e2a 100644
--- a/handy.h
+++ b/handy.h
@@ -215,13 +215,25 @@ typedef unsigned short U16;
/* In EBCDIC we do not do locales: therefore() isupper() is fine. */
# define isUPPER(c) isupper(c)
# define isLOWER(c) islower(c)
+# define isALNUMC(c) isalnum(c)
+# define isASCII(c) isascii(c)
+# define isCNTRL(c) iscntrl(c)
+# define isGRAPH(c) isgraph(c)
# define isPRINT(c) isprint(c)
+# define isPUNCT(c) ispunct(c)
+# define isXDIGIT(c) isxdigit(c)
# define toUPPER(c) toupper(c)
# define toLOWER(c) tolower(c)
#else
# define isUPPER(c) ((c) >= 'A' && (c) <= 'Z')
# define isLOWER(c) ((c) >= 'a' && (c) <= 'z')
+# define isALNUMC(c) (isALPHA(c) || isDIGIT(c))
+# define isASCII(c) ((c) <= 127)
+# define isCNTRL(c) ((c) < ' ')
+# define isGRAPH(c) (isALNUM(c) || isPUNCT(c))
# define isPRINT(c) (((c) > 32 && (c) < 127) || isSPACE(c))
+# define isPUNCT(c) (((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64) || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126))
+# define isXDIGIT(c) (isdigit(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F'))
# define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c))
# define toLOWER(c) (isUPPER(c) ? (c) + ('a' - 'A') : (c))
#endif
@@ -229,8 +241,7 @@ typedef unsigned short U16;
#ifdef USE_NEXT_CTYPE
# define isALNUM_LC(c) \
- (NXIsAlpha((unsigned int)(c)) || NXIsDigit((unsigned int)(c)) || \
- (char)(c) == '_')
+ (NXIsAlnum((unsigned int)(c)) || (char)(c) == '_')
# define isIDFIRST_LC(c) \
(NXIsAlpha((unsigned int)(c)) || (char)(c) == '_')
# define isALPHA_LC(c) NXIsAlpha((unsigned int)(c))
@@ -238,37 +249,47 @@ typedef unsigned short U16;
# define isDIGIT_LC(c) NXIsDigit((unsigned int)(c))
# define isUPPER_LC(c) NXIsUpper((unsigned int)(c))
# define isLOWER_LC(c) NXIsLower((unsigned int)(c))
+# define isALNUMC_LC(c) NXIsAlnum((unsigned int)(c))
+# define isCNTRL_LC(c) NXIsCntrl((unsigned int)(c))
+# define isGRAPH_LC(c) NXIsGraph((unsigned int)(c))
# define isPRINT_LC(c) NXIsPrint((unsigned int)(c))
+# define isPUNCT_LC(c) NXIsPunct((unsigned int)(c))
# define toUPPER_LC(c) NXToUpper((unsigned int)(c))
# define toLOWER_LC(c) NXToLower((unsigned int)(c))
#else /* !USE_NEXT_CTYPE */
+
# if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII))
-# define isALNUM_LC(c) \
- (isalpha((unsigned char)(c)) || \
- isdigit((unsigned char)(c)) || (char)(c) == '_')
+# define isALNUM_LC(c) (isalnum((unsigned char)(c)) || (char)(c) == '_')
# define isIDFIRST_LC(c) (isalpha((unsigned char)(c)) || (char)(c) == '_')
# define isALPHA_LC(c) isalpha((unsigned char)(c))
# define isSPACE_LC(c) isspace((unsigned char)(c))
# define isDIGIT_LC(c) isdigit((unsigned char)(c))
# define isUPPER_LC(c) isupper((unsigned char)(c))
# define isLOWER_LC(c) islower((unsigned char)(c))
+# define isALNUMC_LC(c) isalnum((unsigned char)(c))
+# define isCNTRL_LC(c) iscntrl((unsigned char)(c))
+# define isGRAPH_LC(c) isgraph((unsigned char)(c))
# define isPRINT_LC(c) isprint((unsigned char)(c))
+# define isPUNCT_LC(c) ispunct((unsigned char)(c))
# define toUPPER_LC(c) toupper((unsigned char)(c))
# define toLOWER_LC(c) tolower((unsigned char)(c))
# else
-# define isALNUM_LC(c) \
- (isascii(c) && (isalpha(c) || isdigit(c) || (c) == '_'))
+# define isALNUM_LC(c) (isascii(c) && (isalnum(c) || (c) == '_'))
# define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_'))
# define isALPHA_LC(c) (isascii(c) && isalpha(c))
# define isSPACE_LC(c) (isascii(c) && isspace(c))
# define isDIGIT_LC(c) (isascii(c) && isdigit(c))
# define isUPPER_LC(c) (isascii(c) && isupper(c))
# define isLOWER_LC(c) (isascii(c) && islower(c))
+# define isALNUMC_LC(c) (isascii(c) && isalnum(c))
+# define isCNTRL_LC(c) (isascii(c) && iscntrl(c))
+# define isGRAPH_LC(c) (isascii(c) && isgraph(c))
# define isPRINT_LC(c) (isascii(c) && isprint(c))
+# define isPUNCT_LC(c) (isascii(c) && ispunct(c))
# define toUPPER_LC(c) toupper(c)
# define toLOWER_LC(c) tolower(c)
@@ -282,7 +303,13 @@ typedef unsigned short U16;
#define isDIGIT_uni(c) is_uni_digit(c)
#define isUPPER_uni(c) is_uni_upper(c)
#define isLOWER_uni(c) is_uni_lower(c)
+#define isALNUMC_uni(c) is_uni_alnumc(c)
+#define isASCII_uni(c) is_uni_ascii(c)
+#define isCNTRL_uni(c) is_uni_cntrl(c)
+#define isGRAPH_uni(c) is_uni_graph(c)
#define isPRINT_uni(c) is_uni_print(c)
+#define isPUNCT_uni(c) is_uni_punct(c)
+#define isXDIGIT_uni(c) is_uni_xdigit(c)
#define toUPPER_uni(c) to_uni_upper(c)
#define toTITLE_uni(c) to_uni_title(c)
#define toLOWER_uni(c) to_uni_lower(c)
@@ -294,7 +321,11 @@ typedef unsigned short U16;
#define isDIGIT_LC_uni(c) (c < 256 ? isDIGIT_LC(c) : is_uni_digit_lc(c))
#define isUPPER_LC_uni(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c))
#define isLOWER_LC_uni(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c))
+#define isALNUMC_LC_uni(c) (c < 256 ? isALNUMC_LC(c) : is_uni_alnumc_lc(c))
+#define isCNTRL_LC_uni(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c))
+#define isGRAPH_LC_uni(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c))
#define isPRINT_LC_uni(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c))
+#define isPUNCT_LC_uni(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c))
#define toUPPER_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c))
#define toTITLE_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c))
#define toLOWER_LC_uni(c) (c < 256 ? toLOWER_LC(c) : to_uni_lower_lc(c))
@@ -306,7 +337,13 @@ typedef unsigned short U16;
#define isDIGIT_utf8(p) is_utf8_digit(p)
#define isUPPER_utf8(p) is_utf8_upper(p)
#define isLOWER_utf8(p) is_utf8_lower(p)
+#define isALNUMC_utf8(p) is_utf8_alnumc(p)
+#define isASCII_utf8(p) is_utf8_ascii(p)
+#define isCNTRL_utf8(p) is_utf8_cntrl(p)
+#define isGRAPH_utf8(p) is_utf8_graph(p)
#define isPRINT_utf8(p) is_utf8_print(p)
+#define isPUNCT_utf8(p) is_utf8_punct(p)
+#define isXDIGIT_utf8(p) is_utf8_xdigit(p)
#define toUPPER_utf8(p) to_utf8_upper(p)
#define toTITLE_utf8(p) to_utf8_title(p)
#define toLOWER_utf8(p) to_utf8_lower(p)
@@ -318,7 +355,11 @@ typedef unsigned short U16;
#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, 0))
#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, 0))
#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, 0))
+#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, 0))
+#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, 0))
+#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, 0))
#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, 0))
+#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, 0))
#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, 0))
#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0))
#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0))
diff --git a/intrpvar.h b/intrpvar.h
index 5cff858675..e2d1d5f85d 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -322,12 +322,18 @@ PERLVAR(Inumeric_radix, char)
/* utf8 character classes */
PERLVAR(Iutf8_alnum, SV *)
+PERLVAR(Iutf8_alnumc, SV *)
+PERLVAR(Iutf8_ascii, SV *)
PERLVAR(Iutf8_alpha, SV *)
PERLVAR(Iutf8_space, SV *)
+PERLVAR(Iutf8_cntrl, SV *)
+PERLVAR(Iutf8_graph, SV *)
PERLVAR(Iutf8_digit, SV *)
PERLVAR(Iutf8_upper, SV *)
PERLVAR(Iutf8_lower, SV *)
PERLVAR(Iutf8_print, SV *)
+PERLVAR(Iutf8_punct, SV *)
+PERLVAR(Iutf8_xdigit, SV *)
PERLVAR(Iutf8_mark, SV *)
PERLVAR(Iutf8_toupper, SV *)
PERLVAR(Iutf8_totitle, SV *)
diff --git a/lib/unicode/Bidirectional.pl b/lib/unicode/Bidirectional.pl
index 1523d505c1..e10210dc31 100644
--- a/lib/unicode/Bidirectional.pl
+++ b/lib/unicode/Bidirectional.pl
@@ -233,6 +233,32 @@ return <<'END';
1100 1159 L
115f 11a2 L
11a8 11f9 L
+1200 1206 L
+1208 1246 L
+1248 L
+124a 124d L
+1250 1256 L
+1258 L
+125a 125d L
+1260 1286 L
+1288 L
+128a 128d L
+1290 12ae L
+12b0 L
+12b2 12b5 L
+12b8 12be L
+12c0 L
+12c2 12c5 L
+12c8 12ce L
+12d0 12d6 L
+12d8 12ee L
+12f0 130e L
+1310 L
+1312 1315 L
+1318 131e L
+1320 1346 L
+1348 135a L
+1361 137c L
1e00 1e9b L
1ea0 1ef9 L
1f00 1f15 L
diff --git a/lib/unicode/Block.pl b/lib/unicode/Block.pl
index ce9289aa93..1c0b280f4c 100644
--- a/lib/unicode/Block.pl
+++ b/lib/unicode/Block.pl
@@ -27,7 +27,6 @@ return <<'END';
1100 11FF Hangul Jamo
1E00 1EFF Latin Extended Additional
1F00 1FFF Greek Extended
-1200 137F Ethiopic
2000 206F General Punctuation
2070 209F Superscripts and Subscripts
20A0 20CF Currency Symbols
diff --git a/lib/unicode/Category.pl b/lib/unicode/Category.pl
index 3b47570d1a..5c0842c8bf 100644
--- a/lib/unicode/Category.pl
+++ b/lib/unicode/Category.pl
@@ -762,7 +762,31 @@ return <<'END';
1100 1159 Lo
115f 11a2 Lo
11a8 11f9 Lo
-1200 135a Lo
+1200 1206 Lo
+1208 1246 Lo
+1248 Lo
+124a 124d Lo
+1250 1256 Lo
+1258 Lo
+125a 125d Lo
+1260 1286 Lo
+1288 Lo
+128a 128d Lo
+1290 12ae Lo
+12b0 Lo
+12b2 12b5 Lo
+12b8 12be Lo
+12c0 Lo
+12c2 12c5 Lo
+12c8 12ce Lo
+12d0 12d6 Lo
+12d8 12ee Lo
+12f0 130e Lo
+1310 Lo
+1312 1315 Lo
+1318 131e Lo
+1320 1346 Lo
+1348 135a Lo
1361 1368 Po
1369 1371 Nd
1372 137c No
diff --git a/lib/unicode/Is/ASCII.pl b/lib/unicode/Is/ASCII.pl
new file mode 100644
index 0000000000..b7843e932f
--- /dev/null
+++ b/lib/unicode/Is/ASCII.pl
@@ -0,0 +1,3 @@
+return <<'END';
+0000 007f
+END
diff --git a/lib/unicode/Is/Alnum.pl b/lib/unicode/Is/Alnum.pl
index aa82e4ff89..ac48257e9a 100644
--- a/lib/unicode/Is/Alnum.pl
+++ b/lib/unicode/Is/Alnum.pl
@@ -1,7 +1,6 @@
return <<'END';
0030 0039
0041 005a
-005f
0061 007a
00aa
00b5
@@ -156,8 +155,32 @@ return <<'END';
1100 1159
115f 11a2
11a8 11f9
-1200 135a
-1369 137c
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
+1369 1371
1e00 1e9b
1ea0 1ef9
1f00 1f15
diff --git a/lib/unicode/Is/Alpha.pl b/lib/unicode/Is/Alpha.pl
index ea6fa7f8b6..9de0521776 100644
--- a/lib/unicode/Is/Alpha.pl
+++ b/lib/unicode/Is/Alpha.pl
@@ -141,7 +141,31 @@ return <<'END';
1100 1159
115f 11a2
11a8 11f9
-1200 135a
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
1e00 1e9b
1ea0 1ef9
1f00 1f15
diff --git a/lib/unicode/Is/BidiL.pl b/lib/unicode/Is/BidiL.pl
index 85de325625..c17ef10a5b 100644
--- a/lib/unicode/Is/BidiL.pl
+++ b/lib/unicode/Is/BidiL.pl
@@ -186,6 +186,32 @@ return <<'END';
1100 1159
115f 11a2
11a8 11f9
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
+1361 137c
1e00 1e9b
1ea0 1ef9
1f00 1f15
diff --git a/lib/unicode/Is/Cntrl.pl b/lib/unicode/Is/Cntrl.pl
new file mode 100644
index 0000000000..274239f9d2
--- /dev/null
+++ b/lib/unicode/Is/Cntrl.pl
@@ -0,0 +1,12 @@
+return <<'END';
+0000 001f
+007f 009f
+200c 200f
+202a 202e
+206a 206f
+d800 db7f
+db80 dbff
+dc00 dfff
+e000 f8ff
+feff
+END
diff --git a/lib/unicode/Is/Digit.pl b/lib/unicode/Is/Digit.pl
index 2181f150ff..a25e28f9e9 100644
--- a/lib/unicode/Is/Digit.pl
+++ b/lib/unicode/Is/Digit.pl
@@ -14,6 +14,6 @@ return <<'END';
0e50 0e59
0ed0 0ed9
0f20 0f29
-1369 137c
+1369 1371
ff10 ff19
END
diff --git a/lib/unicode/Is/Graph.pl b/lib/unicode/Is/Graph.pl
new file mode 100644
index 0000000000..7a8c225176
--- /dev/null
+++ b/lib/unicode/Is/Graph.pl
@@ -0,0 +1,327 @@
+return <<'END';
+0021 007e
+00a0 01f5
+01fa 0217
+0250 02a8
+02b0 02de
+02e0 02e9
+0300 0345
+0360 0361
+0374 0375
+037a
+037e
+0384 038a
+038c
+038e 03a1
+03a3 03ce
+03d0 03d6
+03da
+03dc
+03de
+03e0
+03e2 03f3
+0401 040c
+040e 044f
+0451 045c
+045e 0486
+0490 04c4
+04c7 04c8
+04cb 04cc
+04d0 04eb
+04ee 04f5
+04f8 04f9
+0531 0556
+0559 055f
+0561 0587
+0589
+0591 05a1
+05a3 05b9
+05bb 05c4
+05d0 05ea
+05f0 05f4
+060c
+061b
+061f
+0621 063a
+0640 0652
+0660 066d
+0670 06b7
+06ba 06be
+06c0 06ce
+06d0 06ed
+06f0 06f9
+0901 0903
+0905 0939
+093c 094d
+0950 0954
+0958 0970
+0981 0983
+0985 098c
+098f 0990
+0993 09a8
+09aa 09b0
+09b2
+09b6 09b9
+09bc
+09be 09c4
+09c7 09c8
+09cb 09cd
+09d7
+09dc 09dd
+09df 09e3
+09e6 09fa
+0a02
+0a05 0a0a
+0a0f 0a10
+0a13 0a28
+0a2a 0a30
+0a32 0a33
+0a35 0a36
+0a38 0a39
+0a3c
+0a3e 0a42
+0a47 0a48
+0a4b 0a4d
+0a59 0a5c
+0a5e
+0a66 0a74
+0a81 0a83
+0a85 0a8b
+0a8d
+0a8f 0a91
+0a93 0aa8
+0aaa 0ab0
+0ab2 0ab3
+0ab5 0ab9
+0abc 0ac5
+0ac7 0ac9
+0acb 0acd
+0ad0
+0ae0
+0ae6 0aef
+0b01 0b03
+0b05 0b0c
+0b0f 0b10
+0b13 0b28
+0b2a 0b30
+0b32 0b33
+0b36 0b39
+0b3c 0b43
+0b47 0b48
+0b4b 0b4d
+0b56 0b57
+0b5c 0b5d
+0b5f 0b61
+0b66 0b70
+0b82 0b83
+0b85 0b8a
+0b8e 0b90
+0b92 0b95
+0b99 0b9a
+0b9c
+0b9e 0b9f
+0ba3 0ba4
+0ba8 0baa
+0bae 0bb5
+0bb7 0bb9
+0bbe 0bc2
+0bc6 0bc8
+0bca 0bcd
+0bd7
+0be7 0bf2
+0c01 0c03
+0c05 0c0c
+0c0e 0c10
+0c12 0c28
+0c2a 0c33
+0c35 0c39
+0c3e 0c44
+0c46 0c48
+0c4a 0c4d
+0c55 0c56
+0c60 0c61
+0c66 0c6f
+0c82 0c83
+0c85 0c8c
+0c8e 0c90
+0c92 0ca8
+0caa 0cb3
+0cb5 0cb9
+0cbe 0cc4
+0cc6 0cc8
+0cca 0ccd
+0cd5 0cd6
+0cde
+0ce0 0ce1
+0ce6 0cef
+0d02 0d03
+0d05 0d0c
+0d0e 0d10
+0d12 0d28
+0d2a 0d39
+0d3e 0d43
+0d46 0d48
+0d4a 0d4d
+0d57
+0d60 0d61
+0d66 0d6f
+0e01 0e3a
+0e3f 0e5b
+0e81 0e82
+0e84
+0e87 0e88
+0e8a
+0e8d
+0e94 0e97
+0e99 0e9f
+0ea1 0ea3
+0ea5
+0ea7
+0eaa 0eab
+0ead 0eb9
+0ebb 0ebd
+0ec0 0ec4
+0ec6
+0ec8 0ecd
+0ed0 0ed9
+0edc 0edd
+0f00 0f47
+0f49 0f69
+0f71 0f8b
+0f90 0f95
+0f97
+0f99 0fad
+0fb1 0fb7
+0fb9
+10a0 10c5
+10d0 10f6
+10fb
+1100 1159
+115f 11a2
+11a8 11f9
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
+1361 137c
+1e00 1e9b
+1ea0 1ef9
+1f00 1f15
+1f18 1f1d
+1f20 1f45
+1f48 1f4d
+1f50 1f57
+1f59
+1f5b
+1f5d
+1f5f 1f7d
+1f80 1fb4
+1fb6 1fc4
+1fc6 1fd3
+1fd6 1fdb
+1fdd 1fef
+1ff2 1ff4
+1ff6 1ffe
+2000 200b
+2010 2029
+2030 2046
+2070
+2074 208e
+20a0 20ac
+20d0 20e1
+2100 2138
+2153 2182
+2190 21ea
+2200 22f1
+2300
+2302 237a
+2400 2424
+2440 244a
+2460 24ea
+2500 2595
+25a0 25ef
+2600 2613
+261a 266f
+2701 2704
+2706 2709
+270c 2727
+2729 274b
+274d
+274f 2752
+2756
+2758 275e
+2761 2767
+2776 2794
+2798 27af
+27b1 27be
+3000 3037
+303f
+3041 3094
+3099 309e
+30a1 30fe
+3105 312c
+3131 318e
+3190 319f
+3200 321c
+3220 3243
+3260 327b
+327f 32b0
+32c0 32cb
+32d0 32fe
+3300 3376
+337b 33dd
+33e0 33fe
+4e00 9fa5
+ac00 d7a3
+f900 fa2d
+fb00 fb06
+fb13 fb17
+fb1e fb36
+fb38 fb3c
+fb3e
+fb40 fb41
+fb43 fb44
+fb46 fbb1
+fbd3 fd3f
+fd50 fd8f
+fd92 fdc7
+fdf0 fdfb
+fe20 fe23
+fe30 fe44
+fe49 fe52
+fe54 fe66
+fe68 fe6b
+fe70 fe72
+fe74
+fe76 fefc
+ff01 ff5e
+ff61 ffbe
+ffc2 ffc7
+ffca ffcf
+ffd2 ffd7
+ffda ffdc
+ffe0 ffe6
+ffe8 ffee
+fffc fffd
+END
diff --git a/lib/unicode/Is/L.pl b/lib/unicode/Is/L.pl
index 9c8e3cf21f..06796fd1bb 100644
--- a/lib/unicode/Is/L.pl
+++ b/lib/unicode/Is/L.pl
@@ -145,7 +145,31 @@ return <<'END';
1100 1159
115f 11a2
11a8 11f9
-1200 135a
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
1e00 1e9b
1ea0 1ef9
1f00 1f15
diff --git a/lib/unicode/Is/Lo.pl b/lib/unicode/Is/Lo.pl
index 30f776da52..07da29ed1b 100644
--- a/lib/unicode/Is/Lo.pl
+++ b/lib/unicode/Is/Lo.pl
@@ -107,7 +107,31 @@ return <<'END';
1100 1159
115f 11a2
11a8 11f9
-1200 135a
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
2135 2138
3041 3094
30a1 30fa
diff --git a/lib/unicode/Is/Print.pl b/lib/unicode/Is/Print.pl
index eef2d314c0..c2e607006b 100644
--- a/lib/unicode/Is/Print.pl
+++ b/lib/unicode/Is/Print.pl
@@ -199,6 +199,32 @@ return <<'END';
1100 1159
115f 11a2
11a8 11f9
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
+1361 137c
1e00 1e9b
1ea0 1ef9
1f00 1f15
diff --git a/lib/unicode/Is/Punct.pl b/lib/unicode/Is/Punct.pl
new file mode 100644
index 0000000000..0d52205b0a
--- /dev/null
+++ b/lib/unicode/Is/Punct.pl
@@ -0,0 +1,70 @@
+return <<'END';
+0021 0023
+0025 002a
+002c 002f
+003a 003b
+003f 0040
+005b 005d
+005f
+007b
+007d
+00a1
+00ab
+00ad
+00b7
+00bb
+00bf
+0374 0375
+037e
+0387
+055a 055f
+0589
+05be
+05c0
+05c3
+05f3 05f4
+060c
+061b
+061f
+066a 066d
+06d4
+0964 0965
+0970
+0e2f
+0e5a 0e5b
+0eaf
+0f04 0f12
+0f3a 0f3d
+0f85
+10fb
+1361 1368
+2010 2027
+2030 2043
+2045 2046
+207d 207e
+208d 208e
+2329 232a
+3001 3003
+3006
+3008 3011
+3014 301f
+3030
+30fb
+fd3e fd3f
+fe30 fe44
+fe49 fe52
+fe54 fe61
+fe63
+fe68
+fe6a fe6b
+ff01 ff03
+ff05 ff0a
+ff0c ff0f
+ff1a ff1b
+ff1f ff20
+ff3b ff3d
+ff3f
+ff5b
+ff5d
+ff61 ff65
+END
diff --git a/lib/unicode/Is/Space.pl b/lib/unicode/Is/Space.pl
index 903f854416..715afc3ef3 100644
--- a/lib/unicode/Is/Space.pl
+++ b/lib/unicode/Is/Space.pl
@@ -3,7 +3,6 @@ return <<'END';
000c 000d
0020
00a0
-1361
2000 200b
2028 2029
3000
diff --git a/lib/unicode/Is/Word.pl b/lib/unicode/Is/Word.pl
new file mode 100644
index 0000000000..6a30246b20
--- /dev/null
+++ b/lib/unicode/Is/Word.pl
@@ -0,0 +1,250 @@
+return <<'END';
+0030 0039
+0041 005a
+005f
+0061 007a
+00aa
+00b5
+00ba
+00c0 00d6
+00d8 00f6
+00f8 01c4
+01c6 01c7
+01c9 01ca
+01cc 01f1
+01f3 01f5
+01fa 0217
+0250 02a8
+0386
+0388 038a
+038c
+038e 03a1
+03a3 03ce
+03d0 03d6
+03da
+03dc
+03de
+03e0
+03e2 03f3
+0401 040c
+040e 044f
+0451 045c
+045e 0481
+0490 04c4
+04c7 04c8
+04cb 04cc
+04d0 04eb
+04ee 04f5
+04f8 04f9
+0531 0556
+0561 0587
+05d0 05ea
+05f0 05f2
+0621 063a
+0641 064a
+0660 0669
+0671 06b7
+06ba 06be
+06c0 06ce
+06d0 06d3
+06d5
+06f0 06f9
+0905 0939
+093d
+0958 0961
+0966 096f
+0985 098c
+098f 0990
+0993 09a8
+09aa 09b0
+09b2
+09b6 09b9
+09dc 09dd
+09df 09e1
+09e6 09f1
+0a05 0a0a
+0a0f 0a10
+0a13 0a28
+0a2a 0a30
+0a32 0a33
+0a35 0a36
+0a38 0a39
+0a59 0a5c
+0a5e
+0a66 0a6f
+0a72 0a74
+0a85 0a8b
+0a8d
+0a8f 0a91
+0a93 0aa8
+0aaa 0ab0
+0ab2 0ab3
+0ab5 0ab9
+0abd
+0ae0
+0ae6 0aef
+0b05 0b0c
+0b0f 0b10
+0b13 0b28
+0b2a 0b30
+0b32 0b33
+0b36 0b39
+0b3d
+0b5c 0b5d
+0b5f 0b61
+0b66 0b6f
+0b85 0b8a
+0b8e 0b90
+0b92 0b95
+0b99 0b9a
+0b9c
+0b9e 0b9f
+0ba3 0ba4
+0ba8 0baa
+0bae 0bb5
+0bb7 0bb9
+0be7 0bef
+0c05 0c0c
+0c0e 0c10
+0c12 0c28
+0c2a 0c33
+0c35 0c39
+0c60 0c61
+0c66 0c6f
+0c85 0c8c
+0c8e 0c90
+0c92 0ca8
+0caa 0cb3
+0cb5 0cb9
+0cde
+0ce0 0ce1
+0ce6 0cef
+0d05 0d0c
+0d0e 0d10
+0d12 0d28
+0d2a 0d39
+0d60 0d61
+0d66 0d6f
+0e01 0e2e
+0e30
+0e32 0e33
+0e40 0e45
+0e50 0e59
+0e81 0e82
+0e84
+0e87 0e88
+0e8a
+0e8d
+0e94 0e97
+0e99 0e9f
+0ea1 0ea3
+0ea5
+0ea7
+0eaa 0eab
+0ead 0eae
+0eb0
+0eb2 0eb3
+0ebd
+0ec0 0ec4
+0ed0 0ed9
+0edc 0edd
+0f20 0f29
+0f40 0f47
+0f49 0f69
+10a0 10c5
+10d0 10f6
+1100 1159
+115f 11a2
+11a8 11f9
+1200 1206
+1208 1246
+1248
+124a 124d
+1250 1256
+1258
+125a 125d
+1260 1286
+1288
+128a 128d
+1290 12ae
+12b0
+12b2 12b5
+12b8 12be
+12c0
+12c2 12c5
+12c8 12ce
+12d0 12d6
+12d8 12ee
+12f0 130e
+1310
+1312 1315
+1318 131e
+1320 1346
+1348 135a
+1369 1371
+1e00 1e9b
+1ea0 1ef9
+1f00 1f15
+1f18 1f1d
+1f20 1f45
+1f48 1f4d
+1f50 1f57
+1f59
+1f5b
+1f5d
+1f5f 1f7d
+1f80 1fb4
+1fb6 1fbc
+1fbe
+1fc2 1fc4
+1fc6 1fcc
+1fd0 1fd3
+1fd6 1fdb
+1fe0 1fec
+1ff2 1ff4
+1ff6 1ffc
+207f
+2102
+2107
+210a 2113
+2115
+2118 211d
+2124
+2126
+2128
+212a 2131
+2133 2138
+3041 3094
+30a1 30fa
+3105 312c
+3131 318e
+4e00 9fa5
+ac00 d7a3
+f900 fa2d
+fb00 fb06
+fb13 fb17
+fb1f fb28
+fb2a fb36
+fb38 fb3c
+fb3e
+fb40 fb41
+fb43 fb44
+fb46 fbb1
+fbd3 fd3d
+fd50 fd8f
+fd92 fdc7
+fdf0 fdfb
+fe70 fe72
+fe74
+fe76 fefc
+ff10 ff19
+ff21 ff3a
+ff41 ff5a
+ff66 ff6f
+ff71 ff9d
+ffa0 ffbe
+ffc2 ffc7
+ffca ffcf
+ffd2 ffd7
+ffda ffdc
+END
diff --git a/lib/unicode/Is/XDigit.pl b/lib/unicode/Is/XDigit.pl
new file mode 100644
index 0000000000..f0b7044eb6
--- /dev/null
+++ b/lib/unicode/Is/XDigit.pl
@@ -0,0 +1,5 @@
+return <<'END';
+0030 0039
+0041 0046
+0061 0066
+END
diff --git a/lib/unicode/Is/Z.pl b/lib/unicode/Is/Z.pl
index af595da354..9e83d9427f 100644
--- a/lib/unicode/Is/Z.pl
+++ b/lib/unicode/Is/Z.pl
@@ -1,7 +1,6 @@
return <<'END';
0020
00a0
-1361
2000 200b
2028 2029
3000
diff --git a/lib/unicode/Is/Zs.pl b/lib/unicode/Is/Zs.pl
index 403728c0db..87d4455d02 100644
--- a/lib/unicode/Is/Zs.pl
+++ b/lib/unicode/Is/Zs.pl
@@ -1,7 +1,6 @@
return <<'END';
0020
00a0
-1361
2000 200b
3000
END
diff --git a/lib/unicode/Name.pl b/lib/unicode/Name.pl
index 0925bad71a..45099acd4d 100644
--- a/lib/unicode/Name.pl
+++ b/lib/unicode/Name.pl
@@ -2740,352 +2740,351 @@ return <<'END';
11f7 HANGUL JONGSEONG HIEUH-MIEUM
11f8 HANGUL JONGSEONG HIEUH-PIEUP
11f9 HANGUL JONGSEONG YEORINHIEUH
-1200 ETHIOPIC SYLLABLE HA
-1201 ETHIOPIC SYLLABLE HU
-1202 ETHIOPIC SYLLABLE HI
-1203 ETHIOPIC SYLLABLE HAA
-1204 ETHIOPIC SYLLABLE HEE
-1205 ETHIOPIC SYLLABLE HE
-1206 ETHIOPIC SYLLABLE HO
-1208 ETHIOPIC SYLLABLE LA
-1209 ETHIOPIC SYLLABLE LU
-120A ETHIOPIC SYLLABLE LI
-120B ETHIOPIC SYLLABLE LAA
-120C ETHIOPIC SYLLABLE LEE
-120D ETHIOPIC SYLLABLE LE
-120E ETHIOPIC SYLLABLE LO
-120F ETHIOPIC SYLLABLE LWA
-1210 ETHIOPIC SYLLABLE HHA
-1211 ETHIOPIC SYLLABLE HHU
-1212 ETHIOPIC SYLLABLE HHI
-1213 ETHIOPIC SYLLABLE HHAA
-1214 ETHIOPIC SYLLABLE HHEE
-1215 ETHIOPIC SYLLABLE HHE
-1217 ETHIOPIC SYLLABLE HHWA
-1218 ETHIOPIC SYLLABLE MA
-1219 ETHIOPIC SYLLABLE MU
-121A ETHIOPIC SYLLABLE MI
-121B ETHIOPIC SYLLABLE MAA
-121C ETHIOPIC SYLLABLE MEE
-121D ETHIOPIC SYLLABLE ME
-121E ETHIOPIC SYLLABLE MO
-121F ETHIOPIC SYLLABLE MWAA
-1220 ETHIOPIC SYLLABLE SZA
-1221 ETHIOPIC SYLLABLE SZU
-1222 ETHIOPIC SYLLABLE SZI
-1223 ETHIOPIC SYLLABLE SZAA
-1224 ETHIOPIC SYLLABLE SZEE
-1225 ETHIOPIC SYLLABLE SZE
-1226 ETHIOPIC SYLLABLE SZO
-1227 ETHIOPIC SYLLABLE SZWA
-1228 ETHIOPIC SYLLABLE RA
-1229 ETHIOPIC SYLLABLE RU
-122A ETHIOPIC SYLLABLE RI
-122B ETHIOPIC SYLLABLE RAA
-122C ETHIOPIC SYLLABLE REE
-122D ETHIOPIC SYLLABLE RE
-122E ETHIOPIC SYLLABLE RO
-122F ETHIOPIC SYLLABLE RWA
-1230 ETHIOPIC SYLLABLE SA
-1231 ETHIOPIC SYLLABLE SU
-1232 ETHIOPIC SYLLABLE SI
-1233 ETHIOPIC SYLLABLE SAA
-1234 ETHIOPIC SYLLABLE SEE
-1235 ETHIOPIC SYLLABLE SE
-1236 ETHIOPIC SYLLABLE SO
-1237 ETHIOPIC SYLLABLE SWA
-1238 ETHIOPIC SYLLABLE SHA
-1239 ETHIOPIC SYLLABLE SHU
-123A ETHIOPIC SYLLABLE SHI
-123B ETHIOPIC SYLLABLE SHAA
-123C ETHIOPIC SYLLABLE SHEE
-123D ETHIOPIC SYLLABLE SHE
-123E ETHIOPIC SYLLABLE SHO
-123F ETHIOPIC SYLLABLE SHWA
-1240 ETHIOPIC SYLLABLE QA
-1241 ETHIOPIC SYLLABLE QU
-1242 ETHIOPIC SYLLABLE QI
-1243 ETHIOPIC SYLLABLE QAA
-1244 ETHIOPIC SYLLABLE QEE
-1245 ETHIOPIC SYLLABLE QE
-1246 ETHIOPIC SYLLABLE QO
-1248 ETHIOPIC SYLLABLE QWA
-124A ETHIOPIC SYLLABLE QWI
-124B ETHIOPIC SYLLABLE QWAA
-124C ETHIOPIC SYLLABLE QWEE
-124D ETHIOPIC SYLLABLE QWE
-1250 ETHIOPIC SYLLABLE QHA
-1251 ETHIOPIC SYLLABLE QHU
-1252 ETHIOPIC SYLLABLE QHI
-1253 ETHIOPIC SYLLABLE QHAA
-1254 ETHIOPIC SYLLABLE QHEE
-1255 ETHIOPIC SYLLABLE QHE
-1256 ETHIOPIC SYLLABLE QHO
-1258 ETHIOPIC SYLLABLE QHWA
-125A ETHIOPIC SYLLABLE QHWI
-125B ETHIOPIC SYLLABLE QHWAA
-125C ETHIOPIC SYLLABLE QHWEE
-125D ETHIOPIC SYLLABLE QHWE
-1260 ETHIOPIC SYLLABLE BA
-1261 ETHIOPIC SYLLABLE BU
-1262 ETHIOPIC SYLLABLE BI
-1263 ETHIOPIC SYLLABLE BAA
-1264 ETHIOPIC SYLLABLE BEE
-1265 ETHIOPIC SYLLABLE BE
-1266 ETHIOPIC SYLLABLE BO
-1267 ETHIOPIC SYLLABLE BWAA
-1268 ETHIOPIC SYLLABLE VA
-1269 ETHIOPIC SYLLABLE VU
-126A ETHIOPIC SYLLABLE VI
-126B ETHIOPIC SYLLABLE VAA
-126C ETHIOPIC SYLLABLE VEE
-126D ETHIOPIC SYLLABLE VE
-126E ETHIOPIC SYLLABLE VO
-126F ETHIOPIC SYLLABLE VWA
-1270 ETHIOPIC SYLLABLE TA
-1271 ETHIOPIC SYLLABLE TU
-1272 ETHIOPIC SYLLABLE TI
-1273 ETHIOPIC SYLLABLE TAA
-1274 ETHIOPIC SYLLABLE TEE
-1275 ETHIOPIC SYLLABLE TE
-1276 ETHIOPIC SYLLABLE TO
-1277 ETHIOPIC SYLLABLE TWA
-1278 ETHIOPIC SYLLABLE CA
-1279 ETHIOPIC SYLLABLE CU
-127A ETHIOPIC SYLLABLE CI
-127B ETHIOPIC SYLLABLE CAA
-127C ETHIOPIC SYLLABLE CEE
-127D ETHIOPIC SYLLABLE CE
-127E ETHIOPIC SYLLABLE CO
-127F ETHIOPIC SYLLABLE CWA
-1280 ETHIOPIC SYLLABLE XA
-1281 ETHIOPIC SYLLABLE XU
-1282 ETHIOPIC SYLLABLE XI
-1283 ETHIOPIC SYLLABLE XAA
-1284 ETHIOPIC SYLLABLE XEE
-1285 ETHIOPIC SYLLABLE XE
-1286 ETHIOPIC SYLLABLE XO
-1288 ETHIOPIC SYLLABLE XWA
-128A ETHIOPIC SYLLABLE XWI
-128B ETHIOPIC SYLLABLE XWAA
-128C ETHIOPIC SYLLABLE XWEE
-128D ETHIOPIC SYLLABLE XWE
-1290 ETHIOPIC SYLLABLE NA
-1291 ETHIOPIC SYLLABLE NU
-1292 ETHIOPIC SYLLABLE NI
-1293 ETHIOPIC SYLLABLE NAA
-1294 ETHIOPIC SYLLABLE NEE
-1295 ETHIOPIC SYLLABLE NE
-1296 ETHIOPIC SYLLABLE NO
-1297 ETHIOPIC SYLLABLE NWA
-1298 ETHIOPIC SYLLABLE NYA
-1299 ETHIOPIC SYLLABLE NYU
-129A ETHIOPIC SYLLABLE NYI
-129B ETHIOPIC SYLLABLE NYAA
-129C ETHIOPIC SYLLABLE NYEE
-129D ETHIOPIC SYLLABLE NYE
-129E ETHIOPIC SYLLABLE NYO
-129F ETHIOPIC SYLLABLE NYWA
-12A0 ETHIOPIC SYLLABLE GLOTTAL A
-12A1 ETHIOPIC SYLLABLE GLOTTAL U
-12A2 ETHIOPIC SYLLABLE GLOTTAL I
-12A3 ETHIOPIC SYLLABLE GLOTTAL AA
-12A4 ETHIOPIC SYLLABLE GLOTTAL EE
-12A5 ETHIOPIC SYLLABLE GLOTTAL E
-12A6 ETHIOPIC SYLLABLE GLOTTAL O
-12A7 ETHIOPIC SYLLABLE GLOTTAL WA
-12A8 ETHIOPIC SYLLABLE KA
-12A9 ETHIOPIC SYLLABLE KU
-12AA ETHIOPIC SYLLABLE KI
-12AB ETHIOPIC SYLLABLE KAA
-12AC ETHIOPIC SYLLABLE KEE
-12AD ETHIOPIC SYLLABLE KE
-12AE ETHIOPIC SYLLABLE KO
-12B0 ETHIOPIC SYLLABLE KWA
-12B2 ETHIOPIC SYLLABLE KWI
-12B3 ETHIOPIC SYLLABLE KWAA
-12B4 ETHIOPIC SYLLABLE KWEE
-12B5 ETHIOPIC SYLLABLE KWE
-12B8 ETHIOPIC SYLLABLE KXA
-12B9 ETHIOPIC SYLLABLE KXU
-12BA ETHIOPIC SYLLABLE KXI
-12BB ETHIOPIC SYLLABLE KXAA
-12BC ETHIOPIC SYLLABLE KXEE
-12BD ETHIOPIC SYLLABLE KXE
-12BE ETHIOPIC SYLLABLE KXO
-12C0 ETHIOPIC SYLLABLE KXWA
-12C2 ETHIOPIC SYLLABLE KXWI
-12C3 ETHIOPIC SYLLABLE KXWAA
-12C4 ETHIOPIC SYLLABLE KXWEE
-12C5 ETHIOPIC SYLLABLE KXWE
-12C8 ETHIOPIC SYLLABLE WA
-12C9 ETHIOPIC SYLLABLE WU
-12CA ETHIOPIC SYLLABLE WI
-12CB ETHIOPIC SYLLABLE WAA
-12CC ETHIOPIC SYLLABLE WEE
-12CD ETHIOPIC SYLLABLE WE
-12CE ETHIOPIC SYLLABLE WO
-12D0 ETHIOPIC SYLLABLE PHARYNGEAL A
-12D1 ETHIOPIC SYLLABLE PHARYNGEAL U
-12D2 ETHIOPIC SYLLABLE PHARYNGEAL I
-12D3 ETHIOPIC SYLLABLE PHARYNGEAL AA
-12D4 ETHIOPIC SYLLABLE PHARYNGEAL EE
-12D5 ETHIOPIC SYLLABLE PHARYNGEAL E
-12D6 ETHIOPIC SYLLABLE PHARYNGEAL O
-12D8 ETHIOPIC SYLLABLE ZA
-12D9 ETHIOPIC SYLLABLE ZU
-12DA ETHIOPIC SYLLABLE ZI
-12DB ETHIOPIC SYLLABLE ZAA
-12DC ETHIOPIC SYLLABLE ZEE
-12DD ETHIOPIC SYLLABLE ZE
-12DE ETHIOPIC SYLLABLE ZO
-12DF ETHIOPIC SYLLABLE ZWA
-12E0 ETHIOPIC SYLLABLE ZHA
-12E1 ETHIOPIC SYLLABLE ZHU
-12E2 ETHIOPIC SYLLABLE ZHI
-12E3 ETHIOPIC SYLLABLE ZHAA
-12E4 ETHIOPIC SYLLABLE ZHEE
-12E5 ETHIOPIC SYLLABLE ZHE
-12E6 ETHIOPIC SYLLABLE ZHO
-12E7 ETHIOPIC SYLLABLE ZHWA
-12E8 ETHIOPIC SYLLABLE YA
-12E9 ETHIOPIC SYLLABLE YU
-12EA ETHIOPIC SYLLABLE YI
-12EB ETHIOPIC SYLLABLE YAA
-12EC ETHIOPIC SYLLABLE YEE
-12ED ETHIOPIC SYLLABLE YE
-12EE ETHIOPIC SYLLABLE YO
-12EF ETHIOPIC SYLLABLE YWA
-12F0 ETHIOPIC SYLLABLE DA
-12F1 ETHIOPIC SYLLABLE DU
-12F2 ETHIOPIC SYLLABLE DI
-12F3 ETHIOPIC SYLLABLE DAA
-12F4 ETHIOPIC SYLLABLE DEE
-12F5 ETHIOPIC SYLLABLE DE
-12F6 ETHIOPIC SYLLABLE DO
-12F7 ETHIOPIC SYLLABLE DWA
-12F8 ETHIOPIC SYLLABLE DDA
-12F9 ETHIOPIC SYLLABLE DDU
-12FA ETHIOPIC SYLLABLE DDI
-12FB ETHIOPIC SYLLABLE DDAA
-12FC ETHIOPIC SYLLABLE DDEE
-12FD ETHIOPIC SYLLABLE DDE
-12FE ETHIOPIC SYLLABLE DDO
-12FF ETHIOPIC SYLLABLE DDWA
-1300 ETHIOPIC SYLLABLE JA
-1301 ETHIOPIC SYLLABLE JU
-1302 ETHIOPIC SYLLABLE JI
-1303 ETHIOPIC SYLLABLE JAA
-1304 ETHIOPIC SYLLABLE JEE
-1305 ETHIOPIC SYLLABLE JE
-1306 ETHIOPIC SYLLABLE JO
-1307 ETHIOPIC SYLLABLE JWA
-1308 ETHIOPIC SYLLABLE GA
-1309 ETHIOPIC SYLLABLE GU
-130A ETHIOPIC SYLLABLE GI
-130B ETHIOPIC SYLLABLE GAA
-130C ETHIOPIC SYLLABLE GEE
-130D ETHIOPIC SYLLABLE GE
-130E ETHIOPIC SYLLABLE GO
-1310 ETHIOPIC SYLLABLE GWA
-1312 ETHIOPIC SYLLABLE GWI
-1313 ETHIOPIC SYLLABLE GWAA
-1314 ETHIOPIC SYLLABLE GWEE
-1315 ETHIOPIC SYLLABLE GWE
-1318 ETHIOPIC SYLLABLE GGA
-1319 ETHIOPIC SYLLABLE GGU
-131A ETHIOPIC SYLLABLE GGI
-131B ETHIOPIC SYLLABLE GGAA
-131C ETHIOPIC SYLLABLE GGEE
-131D ETHIOPIC SYLLABLE GGE
-131E ETHIOPIC SYLLABLE GGO
-131F ETHIOPIC SYLLABLE GGWAA
-1320 ETHIOPIC SYLLABLE THA
-1321 ETHIOPIC SYLLABLE THU
-1322 ETHIOPIC SYLLABLE THI
-1323 ETHIOPIC SYLLABLE THAA
-1324 ETHIOPIC SYLLABLE THEE
-1325 ETHIOPIC SYLLABLE THE
-1326 ETHIOPIC SYLLABLE THO
-1327 ETHIOPIC SYLLABLE THWA
-1328 ETHIOPIC SYLLABLE CHA
-1329 ETHIOPIC SYLLABLE CHU
-132A ETHIOPIC SYLLABLE CHI
-132B ETHIOPIC SYLLABLE CHAA
-132C ETHIOPIC SYLLABLE CHEE
-132D ETHIOPIC SYLLABLE CHE
-132E ETHIOPIC SYLLABLE CHO
-132F ETHIOPIC SYLLABLE CHWA
-1330 ETHIOPIC SYLLABLE PHA
-1331 ETHIOPIC SYLLABLE PHU
-1332 ETHIOPIC SYLLABLE PHI
-1333 ETHIOPIC SYLLABLE PHAA
-1334 ETHIOPIC SYLLABLE PHEE
-1335 ETHIOPIC SYLLABLE PHE
-1336 ETHIOPIC SYLLABLE PHO
-1337 ETHIOPIC SYLLABLE PHWA
-1338 ETHIOPIC SYLLABLE TSA
-1339 ETHIOPIC SYLLABLE TSU
-133A ETHIOPIC SYLLABLE TSI
-133B ETHIOPIC SYLLABLE TSAA
-133C ETHIOPIC SYLLABLE TSEE
-133D ETHIOPIC SYLLABLE TSE
-133E ETHIOPIC SYLLABLE TSO
-133F ETHIOPIC SYLLABLE TSWA
-1340 ETHIOPIC SYLLABLE TZA
-1341 ETHIOPIC SYLLABLE TZU
-1342 ETHIOPIC SYLLABLE TZI
-1343 ETHIOPIC SYLLABLE TZAA
-1344 ETHIOPIC SYLLABLE TZEE
-1345 ETHIOPIC SYLLABLE TZE
-1346 ETHIOPIC SYLLABLE TZO
-1348 ETHIOPIC SYLLABLE FA
-1349 ETHIOPIC SYLLABLE FU
-134A ETHIOPIC SYLLABLE FI
-134B ETHIOPIC SYLLABLE FAA
-134C ETHIOPIC SYLLABLE FEE
-134D ETHIOPIC SYLLABLE FE
-134E ETHIOPIC SYLLABLE FO
-134F ETHIOPIC SYLLABLE FWAA
-1350 ETHIOPIC SYLLABLE PA
-1351 ETHIOPIC SYLLABLE PU
-1352 ETHIOPIC SYLLABLE PI
-1353 ETHIOPIC SYLLABLE PAA
-1354 ETHIOPIC SYLLABLE PEE
-1355 ETHIOPIC SYLLABLE PE
-1356 ETHIOPIC SYLLABLE PO
-1357 ETHIOPIC SYLLABLE PWAA
-1358 ETHIOPIC SYLLABLE MYA
-1359 ETHIOPIC SYLLABLE RYA
-135A ETHIOPIC SYLLABLE FYA
-1361 ETHIOPIC WORDSPACE
-1362 ETHIOPIC FULL STOP
-1363 ETHIOPIC COMMA
-1364 ETHIOPIC SEMICOLON
-1365 ETHIOPIC COLON
-1366 ETHIOPIC PREFACE COLON
-1367 ETHIOPIC QUESTION MARK
-1368 ETHIOPIC PARAGRAPH SEPARATOR
-1369 ETHIOPIC DIGIT ONE
-136A ETHIOPIC DIGIT TWO
-136B ETHIOPIC DIGIT THREE
-136C ETHIOPIC DIGIT FOUR
-136D ETHIOPIC DIGIT FIVE
-136E ETHIOPIC DIGIT SIX
-136F ETHIOPIC DIGIT SEVEN
-1370 ETHIOPIC DIGIT EIGHT
-1371 ETHIOPIC DIGIT NINE
-1372 ETHIOPIC NUMBER TEN
-1373 ETHIOPIC NUMBER TWENTY
-1374 ETHIOPIC NUMBER THIRTY
-1375 ETHIOPIC NUMBER FORTY
-1376 ETHIOPIC NUMBER FIFTY
-1377 ETHIOPIC NUMBER SIXTY
-1378 ETHIOPIC NUMBER SEVENTY
-1379 ETHIOPIC NUMBER EIGHTY
-137A ETHIOPIC NUMBER NINETY
-137B ETHIOPIC NUMBER HUNDRED
-137C ETHIOPIC NUMBER TEN THOUSAND
+1200 ETHIOPIC SYLLABLE HA
+1201 ETHIOPIC SYLLABLE HU
+1202 ETHIOPIC SYLLABLE HI
+1203 ETHIOPIC SYLLABLE HAA
+1204 ETHIOPIC SYLLABLE HEE
+1205 ETHIOPIC SYLLABLE HE
+1206 ETHIOPIC SYLLABLE HO
+1208 ETHIOPIC SYLLABLE LA
+1209 ETHIOPIC SYLLABLE LU
+120a ETHIOPIC SYLLABLE LI
+120b ETHIOPIC SYLLABLE LAA
+120c ETHIOPIC SYLLABLE LEE
+120d ETHIOPIC SYLLABLE LE
+120e ETHIOPIC SYLLABLE LO
+120f ETHIOPIC SYLLABLE LWA
+1210 ETHIOPIC SYLLABLE HHA
+1211 ETHIOPIC SYLLABLE HHU
+1212 ETHIOPIC SYLLABLE HHI
+1213 ETHIOPIC SYLLABLE HHAA
+1214 ETHIOPIC SYLLABLE HHEE
+1215 ETHIOPIC SYLLABLE HHE
+1216 ETHIOPIC SYLLABLE HHO
+1217 ETHIOPIC SYLLABLE HHWA
+1218 ETHIOPIC SYLLABLE MA
+1219 ETHIOPIC SYLLABLE MU
+121a ETHIOPIC SYLLABLE MI
+121b ETHIOPIC SYLLABLE MAA
+121c ETHIOPIC SYLLABLE MEE
+121d ETHIOPIC SYLLABLE ME
+121e ETHIOPIC SYLLABLE MO
+121f ETHIOPIC SYLLABLE MWA
+1220 ETHIOPIC SYLLABLE SZA
+1221 ETHIOPIC SYLLABLE SZU
+1222 ETHIOPIC SYLLABLE SZI
+1223 ETHIOPIC SYLLABLE SZAA
+1224 ETHIOPIC SYLLABLE SZEE
+1225 ETHIOPIC SYLLABLE SZE
+1226 ETHIOPIC SYLLABLE SZO
+1227 ETHIOPIC SYLLABLE SZWA
+1228 ETHIOPIC SYLLABLE RA
+1229 ETHIOPIC SYLLABLE RU
+122a ETHIOPIC SYLLABLE RI
+122b ETHIOPIC SYLLABLE RAA
+122c ETHIOPIC SYLLABLE REE
+122d ETHIOPIC SYLLABLE RE
+122e ETHIOPIC SYLLABLE RO
+122f ETHIOPIC SYLLABLE RWA
+1230 ETHIOPIC SYLLABLE SA
+1231 ETHIOPIC SYLLABLE SU
+1232 ETHIOPIC SYLLABLE SI
+1233 ETHIOPIC SYLLABLE SAA
+1234 ETHIOPIC SYLLABLE SEE
+1235 ETHIOPIC SYLLABLE SE
+1236 ETHIOPIC SYLLABLE SO
+1237 ETHIOPIC SYLLABLE SWA
+1238 ETHIOPIC SYLLABLE SHA
+1239 ETHIOPIC SYLLABLE SHU
+123a ETHIOPIC SYLLABLE SHI
+123b ETHIOPIC SYLLABLE SHAA
+123c ETHIOPIC SYLLABLE SHEE
+123d ETHIOPIC SYLLABLE SHE
+123e ETHIOPIC SYLLABLE SHO
+123f ETHIOPIC SYLLABLE SHWA
+1240 ETHIOPIC SYLLABLE QA
+1241 ETHIOPIC SYLLABLE QU
+1242 ETHIOPIC SYLLABLE QI
+1243 ETHIOPIC SYLLABLE QAA
+1244 ETHIOPIC SYLLABLE QEE
+1245 ETHIOPIC SYLLABLE QE
+1246 ETHIOPIC SYLLABLE QO
+1248 ETHIOPIC SYLLABLE QWA
+124a ETHIOPIC SYLLABLE QWI
+124b ETHIOPIC SYLLABLE QWAA
+124c ETHIOPIC SYLLABLE QWEE
+124d ETHIOPIC SYLLABLE QWE
+1250 ETHIOPIC SYLLABLE QHA
+1251 ETHIOPIC SYLLABLE QHU
+1252 ETHIOPIC SYLLABLE QHI
+1253 ETHIOPIC SYLLABLE QHAA
+1254 ETHIOPIC SYLLABLE QHEE
+1255 ETHIOPIC SYLLABLE QHE
+1256 ETHIOPIC SYLLABLE QHO
+1258 ETHIOPIC SYLLABLE QHWA
+125a ETHIOPIC SYLLABLE QHWI
+125b ETHIOPIC SYLLABLE QHWAA
+125c ETHIOPIC SYLLABLE QHWEE
+125d ETHIOPIC SYLLABLE QHWE
+1260 ETHIOPIC SYLLABLE BA
+1261 ETHIOPIC SYLLABLE BU
+1262 ETHIOPIC SYLLABLE BI
+1263 ETHIOPIC SYLLABLE BAA
+1264 ETHIOPIC SYLLABLE BEE
+1265 ETHIOPIC SYLLABLE BE
+1266 ETHIOPIC SYLLABLE BO
+1267 ETHIOPIC SYLLABLE BWA
+1268 ETHIOPIC SYLLABLE VA
+1269 ETHIOPIC SYLLABLE VU
+126a ETHIOPIC SYLLABLE VI
+126b ETHIOPIC SYLLABLE VAA
+126c ETHIOPIC SYLLABLE VEE
+126d ETHIOPIC SYLLABLE VE
+126e ETHIOPIC SYLLABLE VO
+126f ETHIOPIC SYLLABLE VWA
+1270 ETHIOPIC SYLLABLE TA
+1271 ETHIOPIC SYLLABLE TU
+1272 ETHIOPIC SYLLABLE TI
+1273 ETHIOPIC SYLLABLE TAA
+1274 ETHIOPIC SYLLABLE TEE
+1275 ETHIOPIC SYLLABLE TE
+1276 ETHIOPIC SYLLABLE TO
+1277 ETHIOPIC SYLLABLE TWA
+1278 ETHIOPIC SYLLABLE CA
+1279 ETHIOPIC SYLLABLE CU
+127a ETHIOPIC SYLLABLE CI
+127b ETHIOPIC SYLLABLE CAA
+127c ETHIOPIC SYLLABLE CEE
+127d ETHIOPIC SYLLABLE CE
+127e ETHIOPIC SYLLABLE CO
+127f ETHIOPIC SYLLABLE CWA
+1280 ETHIOPIC SYLLABLE XA
+1281 ETHIOPIC SYLLABLE XU
+1282 ETHIOPIC SYLLABLE XI
+1283 ETHIOPIC SYLLABLE XAA
+1284 ETHIOPIC SYLLABLE XEE
+1285 ETHIOPIC SYLLABLE XE
+1286 ETHIOPIC SYLLABLE XO
+1288 ETHIOPIC SYLLABLE XWA
+128a ETHIOPIC SYLLABLE XWI
+128b ETHIOPIC SYLLABLE XWAA
+128c ETHIOPIC SYLLABLE XWEE
+128d ETHIOPIC SYLLABLE XWE
+1290 ETHIOPIC SYLLABLE NA
+1291 ETHIOPIC SYLLABLE NU
+1292 ETHIOPIC SYLLABLE NI
+1293 ETHIOPIC SYLLABLE NAA
+1294 ETHIOPIC SYLLABLE NEE
+1295 ETHIOPIC SYLLABLE NE
+1296 ETHIOPIC SYLLABLE NO
+1297 ETHIOPIC SYLLABLE NWA
+1298 ETHIOPIC SYLLABLE NYA
+1299 ETHIOPIC SYLLABLE NYU
+129a ETHIOPIC SYLLABLE NYI
+129b ETHIOPIC SYLLABLE NYAA
+129c ETHIOPIC SYLLABLE NYEE
+129d ETHIOPIC SYLLABLE NYE
+129e ETHIOPIC SYLLABLE NYO
+129f ETHIOPIC SYLLABLE NYWA
+12a0 ETHIOPIC SYLLABLE GLOTTAL A
+12a1 ETHIOPIC SYLLABLE GLOTTAL U
+12a2 ETHIOPIC SYLLABLE GLOTTAL I
+12a3 ETHIOPIC SYLLABLE GLOTTAL AA
+12a4 ETHIOPIC SYLLABLE GLOTTAL EE
+12a5 ETHIOPIC SYLLABLE GLOTTAL E
+12a6 ETHIOPIC SYLLABLE GLOTTAL O
+12a7 ETHIOPIC SYLLABLE GLOTTAL WA
+12a8 ETHIOPIC SYLLABLE KA
+12a9 ETHIOPIC SYLLABLE KU
+12aa ETHIOPIC SYLLABLE KI
+12ab ETHIOPIC SYLLABLE KAA
+12ac ETHIOPIC SYLLABLE KEE
+12ad ETHIOPIC SYLLABLE KE
+12ae ETHIOPIC SYLLABLE KO
+12b0 ETHIOPIC SYLLABLE KWA
+12b2 ETHIOPIC SYLLABLE KWI
+12b3 ETHIOPIC SYLLABLE KWAA
+12b4 ETHIOPIC SYLLABLE KWEE
+12b5 ETHIOPIC SYLLABLE KWE
+12b8 ETHIOPIC SYLLABLE KXA
+12b9 ETHIOPIC SYLLABLE KXU
+12ba ETHIOPIC SYLLABLE KXI
+12bb ETHIOPIC SYLLABLE KXAA
+12bc ETHIOPIC SYLLABLE KXEE
+12bd ETHIOPIC SYLLABLE KXE
+12be ETHIOPIC SYLLABLE KXO
+12c0 ETHIOPIC SYLLABLE KXWA
+12c2 ETHIOPIC SYLLABLE KXWI
+12c3 ETHIOPIC SYLLABLE KXWAA
+12c4 ETHIOPIC SYLLABLE KXWEE
+12c5 ETHIOPIC SYLLABLE KXWE
+12c8 ETHIOPIC SYLLABLE WA
+12c9 ETHIOPIC SYLLABLE WU
+12ca ETHIOPIC SYLLABLE WI
+12cb ETHIOPIC SYLLABLE WAA
+12cc ETHIOPIC SYLLABLE WEE
+12cd ETHIOPIC SYLLABLE WE
+12ce ETHIOPIC SYLLABLE WO
+12d0 ETHIOPIC SYLLABLE PHARYNGEAL A
+12d1 ETHIOPIC SYLLABLE PHARYNGEAL U
+12d2 ETHIOPIC SYLLABLE PHARYNGEAL I
+12d3 ETHIOPIC SYLLABLE PHARYNGEAL AA
+12d4 ETHIOPIC SYLLABLE PHARYNGEAL EE
+12d5 ETHIOPIC SYLLABLE PHARYNGEAL E
+12d6 ETHIOPIC SYLLABLE PHARYNGEAL O
+12d8 ETHIOPIC SYLLABLE ZA
+12d9 ETHIOPIC SYLLABLE ZU
+12da ETHIOPIC SYLLABLE ZI
+12db ETHIOPIC SYLLABLE ZAA
+12dc ETHIOPIC SYLLABLE ZEE
+12dd ETHIOPIC SYLLABLE ZE
+12de ETHIOPIC SYLLABLE ZO
+12df ETHIOPIC SYLLABLE ZWA
+12e0 ETHIOPIC SYLLABLE ZHA
+12e1 ETHIOPIC SYLLABLE ZHU
+12e2 ETHIOPIC SYLLABLE ZHI
+12e3 ETHIOPIC SYLLABLE ZHAA
+12e4 ETHIOPIC SYLLABLE ZHEE
+12e5 ETHIOPIC SYLLABLE ZHE
+12e6 ETHIOPIC SYLLABLE ZHO
+12e7 ETHIOPIC SYLLABLE ZHWA
+12e8 ETHIOPIC SYLLABLE YA
+12e9 ETHIOPIC SYLLABLE YU
+12ea ETHIOPIC SYLLABLE YI
+12eb ETHIOPIC SYLLABLE YAA
+12ec ETHIOPIC SYLLABLE YEE
+12ed ETHIOPIC SYLLABLE YE
+12ee ETHIOPIC SYLLABLE YO
+12f0 ETHIOPIC SYLLABLE DA
+12f1 ETHIOPIC SYLLABLE DU
+12f2 ETHIOPIC SYLLABLE DI
+12f3 ETHIOPIC SYLLABLE DAA
+12f4 ETHIOPIC SYLLABLE DEE
+12f5 ETHIOPIC SYLLABLE DE
+12f6 ETHIOPIC SYLLABLE DO
+12f7 ETHIOPIC SYLLABLE DWA
+12f8 ETHIOPIC SYLLABLE DDA
+12f9 ETHIOPIC SYLLABLE DDU
+12fa ETHIOPIC SYLLABLE DDI
+12fb ETHIOPIC SYLLABLE DDAA
+12fc ETHIOPIC SYLLABLE DDEE
+12fd ETHIOPIC SYLLABLE DDE
+12fe ETHIOPIC SYLLABLE DDO
+12ff ETHIOPIC SYLLABLE DDWA
+1300 ETHIOPIC SYLLABLE JA
+1301 ETHIOPIC SYLLABLE JU
+1302 ETHIOPIC SYLLABLE JI
+1303 ETHIOPIC SYLLABLE JAA
+1304 ETHIOPIC SYLLABLE JEE
+1305 ETHIOPIC SYLLABLE JE
+1306 ETHIOPIC SYLLABLE JO
+1307 ETHIOPIC SYLLABLE JWA
+1308 ETHIOPIC SYLLABLE GA
+1309 ETHIOPIC SYLLABLE GU
+130a ETHIOPIC SYLLABLE GI
+130b ETHIOPIC SYLLABLE GAA
+130c ETHIOPIC SYLLABLE GEE
+130d ETHIOPIC SYLLABLE GE
+130e ETHIOPIC SYLLABLE GO
+1310 ETHIOPIC SYLLABLE GWA
+1312 ETHIOPIC SYLLABLE GWI
+1313 ETHIOPIC SYLLABLE GWAA
+1314 ETHIOPIC SYLLABLE GWEE
+1315 ETHIOPIC SYLLABLE GWE
+1318 ETHIOPIC SYLLABLE GGA
+1319 ETHIOPIC SYLLABLE GGU
+131a ETHIOPIC SYLLABLE GGI
+131b ETHIOPIC SYLLABLE GGAA
+131c ETHIOPIC SYLLABLE GGEE
+131d ETHIOPIC SYLLABLE GGE
+131e ETHIOPIC SYLLABLE GGO
+1320 ETHIOPIC SYLLABLE THA
+1321 ETHIOPIC SYLLABLE THU
+1322 ETHIOPIC SYLLABLE THI
+1323 ETHIOPIC SYLLABLE THAA
+1324 ETHIOPIC SYLLABLE THEE
+1325 ETHIOPIC SYLLABLE THE
+1326 ETHIOPIC SYLLABLE THO
+1327 ETHIOPIC SYLLABLE THWA
+1328 ETHIOPIC SYLLABLE CHA
+1329 ETHIOPIC SYLLABLE CHU
+132a ETHIOPIC SYLLABLE CHI
+132b ETHIOPIC SYLLABLE CHAA
+132c ETHIOPIC SYLLABLE CHEE
+132d ETHIOPIC SYLLABLE CHE
+132e ETHIOPIC SYLLABLE CHO
+132f ETHIOPIC SYLLABLE CHWA
+1330 ETHIOPIC SYLLABLE PHA
+1331 ETHIOPIC SYLLABLE PHU
+1332 ETHIOPIC SYLLABLE PHI
+1333 ETHIOPIC SYLLABLE PHAA
+1334 ETHIOPIC SYLLABLE PHEE
+1335 ETHIOPIC SYLLABLE PHE
+1336 ETHIOPIC SYLLABLE PHO
+1337 ETHIOPIC SYLLABLE PHWA
+1338 ETHIOPIC SYLLABLE TSA
+1339 ETHIOPIC SYLLABLE TSU
+133a ETHIOPIC SYLLABLE TSI
+133b ETHIOPIC SYLLABLE TSAA
+133c ETHIOPIC SYLLABLE TSEE
+133d ETHIOPIC SYLLABLE TSE
+133e ETHIOPIC SYLLABLE TSO
+133f ETHIOPIC SYLLABLE TSWA
+1340 ETHIOPIC SYLLABLE TZA
+1341 ETHIOPIC SYLLABLE TZU
+1342 ETHIOPIC SYLLABLE TZI
+1343 ETHIOPIC SYLLABLE TZAA
+1344 ETHIOPIC SYLLABLE TZEE
+1345 ETHIOPIC SYLLABLE TZE
+1346 ETHIOPIC SYLLABLE TZO
+1348 ETHIOPIC SYLLABLE FA
+1349 ETHIOPIC SYLLABLE FU
+134a ETHIOPIC SYLLABLE FI
+134b ETHIOPIC SYLLABLE FAA
+134c ETHIOPIC SYLLABLE FEE
+134d ETHIOPIC SYLLABLE FE
+134e ETHIOPIC SYLLABLE FO
+134f ETHIOPIC SYLLABLE FWA
+1350 ETHIOPIC SYLLABLE PA
+1351 ETHIOPIC SYLLABLE PU
+1352 ETHIOPIC SYLLABLE PI
+1353 ETHIOPIC SYLLABLE PAA
+1354 ETHIOPIC SYLLABLE PEE
+1355 ETHIOPIC SYLLABLE PE
+1356 ETHIOPIC SYLLABLE PO
+1357 ETHIOPIC SYLLABLE PWA
+1358 ETHIOPIC SYLLABLE RYA
+1359 ETHIOPIC SYLLABLE MYA
+135a ETHIOPIC SYLLABLE FYA
+1361 ETHIOPIC WORDSPACE
+1362 ETHIOPIC FULL STOP
+1363 ETHIOPIC COMMA
+1364 ETHIOPIC SEMICOLON
+1365 ETHIOPIC COLON
+1366 ETHIOPIC PREFACE COLON
+1367 ETHIOPIC QUESTION MARK
+1368 ETHIOPIC PARAGRAPH SEPARATOR
+1369 ETHIOPIC DIGIT ONE
+136a ETHIOPIC DIGIT TWO
+136b ETHIOPIC DIGIT THREE
+136c ETHIOPIC DIGIT FOUR
+136d ETHIOPIC DIGIT FIVE
+136e ETHIOPIC DIGIT SIX
+136f ETHIOPIC DIGIT SEVEN
+1370 ETHIOPIC DIGIT EIGHT
+1371 ETHIOPIC DIGIT NINE
+1372 ETHIOPIC NUMBER TEN
+1373 ETHIOPIC NUMBER TWENTY
+1374 ETHIOPIC NUMBER THIRTY
+1375 ETHIOPIC NUMBER FORTY
+1376 ETHIOPIC NUMBER FIFTY
+1377 ETHIOPIC NUMBER SIXTY
+1378 ETHIOPIC NUMBER SEVENTY
+1379 ETHIOPIC NUMBER EIGHTY
+137a ETHIOPIC NUMBER NINETY
+137b ETHIOPIC NUMBER HUNDRED
+137c ETHIOPIC NUMBER TEN THOUSAND
1e00 LATIN CAPITAL LETTER A WITH RING BELOW
1e01 LATIN SMALL LETTER A WITH RING BELOW
1e02 LATIN CAPITAL LETTER B WITH DOT ABOVE
diff --git a/lib/unicode/To/Digit.pl b/lib/unicode/To/Digit.pl
index 8f60c4f3b7..7ccd849700 100644
--- a/lib/unicode/To/Digit.pl
+++ b/lib/unicode/To/Digit.pl
@@ -16,6 +16,7 @@ return <<'END';
0e50 0e59 0000
0ed0 0ed9 0000
0f20 0f29 0000
+1369 1371 0001
2070 0000
2074 2079 0004
2080 2089 0000
diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL
index 306f2a43c5..82d83077d9 100755
--- a/lib/unicode/mktables.PL
+++ b/lib/unicode/mktables.PL
@@ -9,17 +9,23 @@ mkdir "To", 0777;
@todo = (
# typical
- ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
- ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
- ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
- ['IsDigit', '$cat =~ /^Nd$/', ''],
- ['IsUpper', '$cat =~ /^Lu$/', ''],
- ['IsLower', '$cat =~ /^Ll$/', ''],
- ['IsPrint', '$cat =~ /^[^C]/', ''],
- ['ToUpper', '$up', '$up'],
- ['ToLower', '$down', '$down'],
- ['ToTitle', '$title', '$title'],
- ['ToDigit', '$dec ne ""', '$dec'],
+ ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
+ ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
+ ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
+ ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ ['IsDigit', '$cat =~ /^Nd$/', ''],
+ ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsLower', '$cat =~ /^Ll$/', ''],
+ ['IsASCII', 'hex $code <= 127', ''],
+ ['IsCntrl', '$cat =~ /^C/', ''],
+ ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
+ ['IsPrint', '$cat =~ /^[^C]/', ''],
+ ['IsPunct', '$cat =~ /^P/', ''],
+ ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
+ ['ToUpper', '$up', '$up'],
+ ['ToLower', '$down', '$down'],
+ ['ToTitle', '$title', '$title'],
+ ['ToDigit', '$dec ne ""', '$dec'],
# Name
diff --git a/objXSUB.h b/objXSUB.h
index d91f84d0ee..6f201dca3d 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -742,16 +742,26 @@
#define PL_unsafe pPerl->PL_unsafe
#undef PL_utf8_alnum
#define PL_utf8_alnum pPerl->PL_utf8_alnum
+#undef PL_utf8_alnumc
+#define PL_utf8_alnumc pPerl->PL_utf8_alnumc
#undef PL_utf8_alpha
#define PL_utf8_alpha pPerl->PL_utf8_alpha
+#undef PL_utf8_ascii
+#define PL_utf8_ascii pPerl->PL_utf8_ascii
+#undef PL_utf8_cntrl
+#define PL_utf8_cntrl pPerl->PL_utf8_cntrl
#undef PL_utf8_digit
#define PL_utf8_digit pPerl->PL_utf8_digit
+#undef PL_utf8_graph
+#define PL_utf8_graph pPerl->PL_utf8_graph
#undef PL_utf8_lower
#define PL_utf8_lower pPerl->PL_utf8_lower
#undef PL_utf8_mark
#define PL_utf8_mark pPerl->PL_utf8_mark
#undef PL_utf8_print
#define PL_utf8_print pPerl->PL_utf8_print
+#undef PL_utf8_punct
+#define PL_utf8_punct pPerl->PL_utf8_punct
#undef PL_utf8_space
#define PL_utf8_space pPerl->PL_utf8_space
#undef PL_utf8_tolower
@@ -762,6 +772,8 @@
#define PL_utf8_toupper pPerl->PL_utf8_toupper
#undef PL_utf8_upper
#define PL_utf8_upper pPerl->PL_utf8_upper
+#undef PL_utf8_xdigit
+#define PL_utf8_xdigit pPerl->PL_utf8_xdigit
#undef PL_uudmap
#define PL_uudmap pPerl->PL_uudmap
#undef PL_warnhook
@@ -1505,6 +1517,10 @@
#define Perl_is_uni_alnum pPerl->Perl_is_uni_alnum
#undef is_uni_alnum
#define is_uni_alnum Perl_is_uni_alnum
+#undef Perl_is_uni_alnumc
+#define Perl_is_uni_alnumc pPerl->Perl_is_uni_alnumc
+#undef is_uni_alnumc
+#define is_uni_alnumc Perl_is_uni_alnumc
#undef Perl_is_uni_idfirst
#define Perl_is_uni_idfirst pPerl->Perl_is_uni_idfirst
#undef is_uni_idfirst
@@ -1513,10 +1529,22 @@
#define Perl_is_uni_alpha pPerl->Perl_is_uni_alpha
#undef is_uni_alpha
#define is_uni_alpha Perl_is_uni_alpha
+#undef Perl_is_uni_ascii
+#define Perl_is_uni_ascii pPerl->Perl_is_uni_ascii
+#undef is_uni_ascii
+#define is_uni_ascii Perl_is_uni_ascii
#undef Perl_is_uni_space
#define Perl_is_uni_space pPerl->Perl_is_uni_space
#undef is_uni_space
#define is_uni_space Perl_is_uni_space
+#undef Perl_is_uni_cntrl
+#define Perl_is_uni_cntrl pPerl->Perl_is_uni_cntrl
+#undef is_uni_cntrl
+#define is_uni_cntrl Perl_is_uni_cntrl
+#undef Perl_is_uni_graph
+#define Perl_is_uni_graph pPerl->Perl_is_uni_graph
+#undef is_uni_graph
+#define is_uni_graph Perl_is_uni_graph
#undef Perl_is_uni_digit
#define Perl_is_uni_digit pPerl->Perl_is_uni_digit
#undef is_uni_digit
@@ -1533,6 +1561,14 @@
#define Perl_is_uni_print pPerl->Perl_is_uni_print
#undef is_uni_print
#define is_uni_print Perl_is_uni_print
+#undef Perl_is_uni_punct
+#define Perl_is_uni_punct pPerl->Perl_is_uni_punct
+#undef is_uni_punct
+#define is_uni_punct Perl_is_uni_punct
+#undef Perl_is_uni_xdigit
+#define Perl_is_uni_xdigit pPerl->Perl_is_uni_xdigit
+#undef is_uni_xdigit
+#define is_uni_xdigit Perl_is_uni_xdigit
#undef Perl_to_uni_upper
#define Perl_to_uni_upper pPerl->Perl_to_uni_upper
#undef to_uni_upper
@@ -1549,6 +1585,10 @@
#define Perl_is_uni_alnum_lc pPerl->Perl_is_uni_alnum_lc
#undef is_uni_alnum_lc
#define is_uni_alnum_lc Perl_is_uni_alnum_lc
+#undef Perl_is_uni_alnumc_lc
+#define Perl_is_uni_alnumc_lc pPerl->Perl_is_uni_alnumc_lc
+#undef is_uni_alnumc_lc
+#define is_uni_alnumc_lc Perl_is_uni_alnumc_lc
#undef Perl_is_uni_idfirst_lc
#define Perl_is_uni_idfirst_lc pPerl->Perl_is_uni_idfirst_lc
#undef is_uni_idfirst_lc
@@ -1557,10 +1597,22 @@
#define Perl_is_uni_alpha_lc pPerl->Perl_is_uni_alpha_lc
#undef is_uni_alpha_lc
#define is_uni_alpha_lc Perl_is_uni_alpha_lc
+#undef Perl_is_uni_ascii_lc
+#define Perl_is_uni_ascii_lc pPerl->Perl_is_uni_ascii_lc
+#undef is_uni_ascii_lc
+#define is_uni_ascii_lc Perl_is_uni_ascii_lc
#undef Perl_is_uni_space_lc
#define Perl_is_uni_space_lc pPerl->Perl_is_uni_space_lc
#undef is_uni_space_lc
#define is_uni_space_lc Perl_is_uni_space_lc
+#undef Perl_is_uni_cntrl_lc
+#define Perl_is_uni_cntrl_lc pPerl->Perl_is_uni_cntrl_lc
+#undef is_uni_cntrl_lc
+#define is_uni_cntrl_lc Perl_is_uni_cntrl_lc
+#undef Perl_is_uni_graph_lc
+#define Perl_is_uni_graph_lc pPerl->Perl_is_uni_graph_lc
+#undef is_uni_graph_lc
+#define is_uni_graph_lc Perl_is_uni_graph_lc
#undef Perl_is_uni_digit_lc
#define Perl_is_uni_digit_lc pPerl->Perl_is_uni_digit_lc
#undef is_uni_digit_lc
@@ -1577,6 +1629,14 @@
#define Perl_is_uni_print_lc pPerl->Perl_is_uni_print_lc
#undef is_uni_print_lc
#define is_uni_print_lc Perl_is_uni_print_lc
+#undef Perl_is_uni_punct_lc
+#define Perl_is_uni_punct_lc pPerl->Perl_is_uni_punct_lc
+#undef is_uni_punct_lc
+#define is_uni_punct_lc Perl_is_uni_punct_lc
+#undef Perl_is_uni_xdigit_lc
+#define Perl_is_uni_xdigit_lc pPerl->Perl_is_uni_xdigit_lc
+#undef is_uni_xdigit_lc
+#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc
#undef Perl_to_uni_upper_lc
#define Perl_to_uni_upper_lc pPerl->Perl_to_uni_upper_lc
#undef to_uni_upper_lc
@@ -1593,6 +1653,10 @@
#define Perl_is_utf8_alnum pPerl->Perl_is_utf8_alnum
#undef is_utf8_alnum
#define is_utf8_alnum Perl_is_utf8_alnum
+#undef Perl_is_utf8_alnumc
+#define Perl_is_utf8_alnumc pPerl->Perl_is_utf8_alnumc
+#undef is_utf8_alnumc
+#define is_utf8_alnumc Perl_is_utf8_alnumc
#undef Perl_is_utf8_idfirst
#define Perl_is_utf8_idfirst pPerl->Perl_is_utf8_idfirst
#undef is_utf8_idfirst
@@ -1601,14 +1665,26 @@
#define Perl_is_utf8_alpha pPerl->Perl_is_utf8_alpha
#undef is_utf8_alpha
#define is_utf8_alpha Perl_is_utf8_alpha
+#undef Perl_is_utf8_ascii
+#define Perl_is_utf8_ascii pPerl->Perl_is_utf8_ascii
+#undef is_utf8_ascii
+#define is_utf8_ascii Perl_is_utf8_ascii
#undef Perl_is_utf8_space
#define Perl_is_utf8_space pPerl->Perl_is_utf8_space
#undef is_utf8_space
#define is_utf8_space Perl_is_utf8_space
+#undef Perl_is_utf8_cntrl
+#define Perl_is_utf8_cntrl pPerl->Perl_is_utf8_cntrl
+#undef is_utf8_cntrl
+#define is_utf8_cntrl Perl_is_utf8_cntrl
#undef Perl_is_utf8_digit
#define Perl_is_utf8_digit pPerl->Perl_is_utf8_digit
#undef is_utf8_digit
#define is_utf8_digit Perl_is_utf8_digit
+#undef Perl_is_utf8_graph
+#define Perl_is_utf8_graph pPerl->Perl_is_utf8_graph
+#undef is_utf8_graph
+#define is_utf8_graph Perl_is_utf8_graph
#undef Perl_is_utf8_upper
#define Perl_is_utf8_upper pPerl->Perl_is_utf8_upper
#undef is_utf8_upper
@@ -1621,6 +1697,14 @@
#define Perl_is_utf8_print pPerl->Perl_is_utf8_print
#undef is_utf8_print
#define is_utf8_print Perl_is_utf8_print
+#undef Perl_is_utf8_punct
+#define Perl_is_utf8_punct pPerl->Perl_is_utf8_punct
+#undef is_utf8_punct
+#define is_utf8_punct Perl_is_utf8_punct
+#undef Perl_is_utf8_xdigit
+#define Perl_is_utf8_xdigit pPerl->Perl_is_utf8_xdigit
+#undef is_utf8_xdigit
+#define is_utf8_xdigit Perl_is_utf8_xdigit
#undef Perl_is_utf8_mark
#define Perl_is_utf8_mark pPerl->Perl_is_utf8_mark
#undef is_utf8_mark
@@ -3509,6 +3593,10 @@
#define Perl_ck_rvconst pPerl->Perl_ck_rvconst
#undef ck_rvconst
#define ck_rvconst Perl_ck_rvconst
+#undef Perl_ck_sassign
+#define Perl_ck_sassign pPerl->Perl_ck_sassign
+#undef ck_sassign
+#define ck_sassign Perl_ck_sassign
#undef Perl_ck_scmp
#define Perl_ck_scmp pPerl->Perl_ck_scmp
#undef ck_scmp
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index de727db487..2278a549cf 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -121,6 +121,13 @@ Unix and UNICOS also have 64-bit support.
=head2 Better syntax checks on parenthesized unary operators
+TODO
+
+=head2 POSIX character class syntax [: :] supported
+
+For example to match alphabetic characters use /[[:alpha:]]/.
+See L<perlre> for details.
+
Expressions such as:
print defined(&foo,&bar,&baz);
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index d7b9024998..b352e9c3c2 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1000,21 +1000,23 @@ there is no builtin with the name C<word>.
opposed to a subroutine reference): no such method callable via the
package. If method name is C<???>, this is an internal error.
-=item Character class syntax [. .] is reserved for future extensions
+=item Character class [:%s:] unknown
-(W) Within regular expression character classes ([]) the syntax beginning
-with "[." and ending with ".]" is reserved for future extensions.
-If you need to represent those character sequences inside a regular
-expression character class, just quote the square brackets with the
-backslash: "\[." and ".\]".
+(F) The class in the character class [: :] syntax is unknown.
-=item Character class syntax [: :] is reserved for future extensions
+=item Character class syntax [%s] belongs inside character classes
+
+(W) The character class constructs [: :], [= =], and [. .] go
+I<inside> character classes; the [] are part of the construct. For
+example: /[[:alpha:]]/
+
+=item Character class syntax [. .] is reserved for future extensions
(W) Within regular expression character classes ([]) the syntax beginning
-with "[:" and ending with ":]" is reserved for future extensions.
+with "[." and ending with ".]" is reserved for future extensions.
If you need to represent those character sequences inside a regular
expression character class, just quote the square brackets with the
-backslash: "\[:" and ":\]".
+backslash: "\[." and ".\]".
=item Character class syntax [= =] is reserved for future extensions
diff --git a/pod/perlre.pod b/pod/perlre.pod
index ca95638605..470c5934ff 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -186,6 +186,100 @@ current locale. See L<perllocale>. You may use C<\w>, C<\W>, C<\s>, C<\S>,
C<\d>, and C<\D> within character classes (though not as either end of
a range). See L<utf8> for details about C<\pP>, C<\PP>, and C<\X>.
+The POSIX character class syntax
+
+ [:class:]
+
+is also available. The available classes and their \-equivalents
+(if any) are as follows:
+
+ alpha
+ alnum
+ ascii
+ cntrl
+ digit \d
+ graph
+ lower
+ print
+ punct
+ space \s
+ upper
+ word \w
+ xdigit
+
+Note that the [] are part of the [::] construct, not part of the whole
+character class. For example:
+
+ [01[:alpha:]%]
+
+matches one, zero, any alphabetic character, and the percentage sign.
+
+The exact meanings of the above classes depend on several things:
+if the C<utf8> pragma is used, the following equivalences to Unicode
+\p{} constructs hold:
+
+ alpha IsAlpha
+ alnum IsAlnum
+ ascii IsASCII
+ cntrl IsCntrl
+ digit IsDigit
+ graph IsGraph
+ lower IsLower
+ print IsPrint
+ punct IsPunct
+ space IsSpace
+ upper IsUpper
+ word IsWord
+ xdigit IsXDigit
+
+For example, [:lower:] and \p{IsLower} are equivalent.
+
+If the C<utf8> pragma is not used but the C<locale> pragma is, the
+classes correlate with the isalpha(3) interface (except for `word',
+which is a Perl extension).
+
+The classes whose names may not be self-explanatory are:
+
+=over 4
+
+=item cntrl
+
+ Any control character. Usually characters that don't produce
+ output as such but instead control the terminal somehow:
+ for example newline and backspace are control characters.
+
+=item graph
+
+ Any alphanumeric or punctuation character.
+
+=item print
+
+ Any alphanumeric or punctuation character or space.
+
+=item punct
+
+ Any punctuation character.
+
+=item xdigit
+
+ Any hexadecimal digit. Though this may feel silly
+ (/[0-9a-f]/i would work just fine) it is included
+ for completeness.
+
+=item
+
+=back
+
+You can negate the [::] character classes by prefixing the class name
+with a '^'. This is a Perl extension. For example:
+
+ ^digit \D \P{IsDigit}
+ ^space \S \P{IsSpace}
+ ^word \W \P{IsWord}
+
+The POSIX character classes [.cc.] and [=cc=] are B<not> supported;
+trying to use them triggers a "reserved for future extensions" warning.
+
Perl defines the following zero-width assertions:
\b Match a word boundary
diff --git a/proto.h b/proto.h
index 7fa642405b..402876ac05 100644
--- a/proto.h
+++ b/proto.h
@@ -196,35 +196,53 @@ VIRTUAL char* Perl_instr(pTHX_ const char* big, const char* little);
VIRTUAL bool Perl_io_close(pTHX_ IO* io);
VIRTUAL OP* Perl_invert(pTHX_ OP* cmd);
VIRTUAL bool Perl_is_uni_alnum(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_alnumc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_idfirst(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_alpha(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_ascii(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_space(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_cntrl(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_graph(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_digit(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_upper(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_lower(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_print(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_punct(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_xdigit(pTHX_ U32 c);
VIRTUAL U32 Perl_to_uni_upper(pTHX_ U32 c);
VIRTUAL U32 Perl_to_uni_title(pTHX_ U32 c);
VIRTUAL U32 Perl_to_uni_lower(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_alnum_lc(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_alnumc_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_idfirst_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_alpha_lc(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_ascii_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_space_lc(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_cntrl_lc(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_graph_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_digit_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_upper_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_lower_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_uni_print_lc(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_punct_lc(pTHX_ U32 c);
+VIRTUAL bool Perl_is_uni_xdigit_lc(pTHX_ U32 c);
VIRTUAL U32 Perl_to_uni_upper_lc(pTHX_ U32 c);
VIRTUAL U32 Perl_to_uni_title_lc(pTHX_ U32 c);
VIRTUAL U32 Perl_to_uni_lower_lc(pTHX_ U32 c);
VIRTUAL bool Perl_is_utf8_alnum(pTHX_ U8 *p);
+VIRTUAL bool Perl_is_utf8_alnumc(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_idfirst(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_alpha(pTHX_ U8 *p);
+VIRTUAL bool Perl_is_utf8_ascii(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_space(pTHX_ U8 *p);
+VIRTUAL bool Perl_is_utf8_cntrl(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_digit(pTHX_ U8 *p);
+VIRTUAL bool Perl_is_utf8_graph(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_upper(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_lower(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_print(pTHX_ U8 *p);
+VIRTUAL bool Perl_is_utf8_punct(pTHX_ U8 *p);
+VIRTUAL bool Perl_is_utf8_xdigit(pTHX_ U8 *p);
VIRTUAL bool Perl_is_utf8_mark(pTHX_ U8 *p);
VIRTUAL OP* Perl_jmaybe(pTHX_ OP* arg);
VIRTUAL I32 Perl_keyword(pTHX_ char* d, I32 len);
@@ -854,7 +872,8 @@ STATIC void S_scan_commit(pTHX_ scan_data_t *data);
STATIC I32 S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags);
STATIC I32 S_add_data(pTHX_ I32 n, char *s);
STATIC void S_re_croak2(pTHX_ const char* pat1, const char* pat2, ...) __attribute__((noreturn));
-STATIC char* S_regpposixcc(pTHX_ I32 value);
+STATIC I32 S_regpposixcc(pTHX_ I32 value);
+STATIC void S_checkposixcc(pTHX);
STATIC void S_clear_re(pTHX_ void *r);
#endif
#if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
diff --git a/regcomp.c b/regcomp.c
index 59fe5a7d9f..3569b3bbf1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -163,6 +163,9 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define LOC (PL_regflags & PMf_LOCALE)
#define FOLD (PL_regflags & PMf_FOLD)
+#define OOB_CHAR8 1234
+#define OOB_UTF8 123456
+
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
@@ -2093,12 +2096,17 @@ S_regwhite(pTHX_ char *p, char *e)
return p;
}
-/* parse POSIX character classes like [[:foo:]] */
-STATIC char*
+/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
+ Character classes ([:foo:]) can also be negated ([:^foo:]).
+ Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
+ Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
+ but trigger warnings because they are currently unimplemented. */
+STATIC I32
S_regpposixcc(pTHX_ I32 value)
{
dTHR;
char *posixcc = 0;
+ I32 namedclass = -1;
if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
@@ -2114,26 +2122,120 @@ S_regpposixcc(pTHX_ I32 value)
/* Grandfather lone [:, [=, [. */
PL_regcomp_parse = s;
else {
- PL_regcomp_parse++; /* skip over the c */
- if (*PL_regcomp_parse == ']') {
- /* Not Implemented Yet.
- * (POSIX Extended Character Classes, that is)
- * The text between e.g. [: and :] would start
- * at s + 1 and stop at regcomp_parse - 2. */
- if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
+ char* t = PL_regcomp_parse++; /* skip over the c */
+
+ if (*PL_regcomp_parse == ']') {
+ PL_regcomp_parse++; /* skip over the ending ] */
+ posixcc = s + 1;
+ if (*s == ':') {
+ I32 complement = *posixcc == '^' ? *posixcc++ : 0;
+ I32 skip = 5; /* the most common skip */
+
+ switch (*posixcc) {
+ case 'a':
+ if (strnEQ(posixcc, "alnum", 5))
+ namedclass =
+ complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
+ else if (strnEQ(posixcc, "alpha", 5))
+ namedclass =
+ complement ? ANYOF_NALPHA : ANYOF_ALPHA;
+ else if (strnEQ(posixcc, "ascii", 5))
+ namedclass =
+ complement ? ANYOF_NASCII : ANYOF_ASCII;
+ break;
+ case 'c':
+ if (strnEQ(posixcc, "cntrl", 5))
+ namedclass =
+ complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
+ break;
+ case 'd':
+ if (strnEQ(posixcc, "digit", 5))
+ namedclass =
+ complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
+ break;
+ case 'g':
+ if (strnEQ(posixcc, "graph", 5))
+ namedclass =
+ complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
+ break;
+ case 'l':
+ if (strnEQ(posixcc, "lower", 5))
+ namedclass =
+ complement ? ANYOF_NLOWER : ANYOF_LOWER;
+ break;
+ case 'p':
+ if (strnEQ(posixcc, "print", 5))
+ namedclass =
+ complement ? ANYOF_NPRINT : ANYOF_PRINT;
+ else if (strnEQ(posixcc, "punct", 5))
+ namedclass =
+ complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
+ break;
+ case 's':
+ if (strnEQ(posixcc, "space", 5))
+ namedclass =
+ complement ? ANYOF_NSPACE : ANYOF_SPACE;
+ case 'u':
+ if (strnEQ(posixcc, "upper", 5))
+ namedclass =
+ complement ? ANYOF_NUPPER : ANYOF_UPPER;
+ break;
+ case 'w': /* this is not POSIX, this is the Perl \w */
+ if (strnEQ(posixcc, "word", 4)) {
+ namedclass =
+ complement ? ANYOF_NALNUM : ANYOF_ALNUM;
+ skip = 4;
+ }
+ break;
+ case 'x':
+ if (strnEQ(posixcc, "xdigit", 6)) {
+ namedclass =
+ complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
+ skip = 6;
+ }
+ break;
+ }
+ if ((namedclass == -1 ||
+ !(posixcc + skip + 2 < PL_regxend &&
+ (posixcc[skip] == ':' &&
+ posixcc[skip + 1] == ']'))))
+ Perl_croak(aTHX_ "Character class [:%.*s:] unknown",
+ t - s - 1, s + 1);
+ } else if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
+ /* [[=foo=]] and [[.foo.]] are still future. */
Perl_warner(aTHX_ WARN_UNSAFE,
- "Character class syntax [%c %c] is reserved for future extensions", c, c);
- PL_regcomp_parse++; /* skip over the ending ] */
- posixcc = s + 1;
- }
- else {
- /* maternal grandfather */
+ "Character class syntax [%c %c] is reserved for future extensions", c, c);
+ } else {
+ /* Maternal grandfather:
+ * "[:" ending in ":" but not in ":]" */
PL_regcomp_parse = s;
}
}
}
- return posixcc;
+ return namedclass;
+}
+
+STATIC void
+S_checkposixcc(pTHX)
+{
+ if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY &&
+ (*PL_regcomp_parse == ':' ||
+ *PL_regcomp_parse == '=' ||
+ *PL_regcomp_parse == '.')) {
+ char *s = PL_regcomp_parse;
+ char c = *s++;
+
+ while(*s && isALNUM(*s))
+ s++;
+ if (*s && c == *s && s[1] == ']') {
+ Perl_warner(aTHX_ WARN_UNSAFE,
+ "Character class syntax [%c %c] belongs inside character classes", c, c);
+ if (c == '=' || c == '.')
+ Perl_warner(aTHX_ WARN_UNSAFE,
+ "Character class syntax [%c %c] is reserved for future extensions", c, c);
+ }
+ }
}
STATIC regnode *
@@ -2142,142 +2244,319 @@ S_regclass(pTHX)
dTHR;
register char *opnd, *s;
register I32 value;
- register I32 lastvalue = 1234;
+ register I32 lastvalue = OOB_CHAR8;
register I32 range = 0;
register regnode *ret;
register I32 def;
I32 numlen;
+ I32 namedclass;
s = opnd = (char *) OPERAND(PL_regcode);
ret = reg_node(ANYOF);
- for (value = 0; value < 33; value++)
+ for (value = 0; value < ANYOF_SIZE; value++)
regc(0, s++);
if (*PL_regcomp_parse == '^') { /* Complement of range. */
PL_regnaughty++;
PL_regcomp_parse++;
if (!SIZE_ONLY)
- *opnd |= ANYOF_INVERT;
+ ANYOF_FLAGS(opnd) |= ANYOF_INVERT;
}
if (!SIZE_ONLY) {
PL_regcode += ANY_SKIP;
if (FOLD)
- *opnd |= ANYOF_FOLD;
+ ANYOF_FLAGS(opnd) |= ANYOF_FOLD;
if (LOC)
- *opnd |= ANYOF_LOCALE;
+ ANYOF_FLAGS(opnd) |= ANYOF_LOCALE;
}
else {
PL_regsize += ANY_SKIP;
}
+
+ checkposixcc();
+
if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
goto skipcond; /* allow 1st char to be ] or - */
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
+ namedclass = -1;
value = UCHARAT(PL_regcomp_parse++);
if (value == '[')
- (void)regpposixcc(value); /* ignore the return value for now */
+ namedclass = regpposixcc(value);
else if (value == '\\') {
value = UCHARAT(PL_regcomp_parse++);
switch (value) {
- case 'w':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_ALNUML;
- else {
- for (value = 0; value < 256; value++)
- if (isALNUM(value))
- ANYOF_SET(opnd, value);
- }
+ case 'w': namedclass = ANYOF_ALNUM; break;
+ case 'W': namedclass = ANYOF_NALNUM; break;
+ case 's': namedclass = ANYOF_SPACE; break;
+ case 'S': namedclass = ANYOF_NSPACE; break;
+ case 'd': namedclass = ANYOF_DIGIT; break;
+ case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'n': value = '\n'; break;
+ case 'r': value = '\r'; break;
+ case 't': value = '\t'; break;
+ case 'f': value = '\f'; break;
+ case 'b': value = '\b'; break;
+ case 'e': value = '\033'; break;
+ case 'a': value = '\007'; break;
+ case 'x':
+ value = scan_hex(PL_regcomp_parse, 2, &numlen);
+ PL_regcomp_parse += numlen;
+ break;
+ case 'c':
+ value = UCHARAT(PL_regcomp_parse++);
+ value = toCTRL(value);
+ break;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ value = scan_oct(--PL_regcomp_parse, 3, &numlen);
+ PL_regcomp_parse += numlen;
+ break;
+ }
+ }
+ if (!SIZE_ONLY && namedclass > -1) {
+ switch (namedclass) {
+ case ANYOF_ALNUM:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ALNUM);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isALNUM(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'W':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_NALNUML;
- else {
- for (value = 0; value < 256; value++)
- if (!isALNUM(value))
- ANYOF_SET(opnd, value);
- }
+ break;
+ case ANYOF_NALNUM:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NALNUM);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isALNUM(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 's':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_SPACEL;
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_SET(opnd, value);
- }
+ break;
+ case ANYOF_SPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_SPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isSPACE(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'S':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_NSPACEL;
- else {
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_SET(opnd, value);
- }
+ break;
+ case ANYOF_NSPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NSPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isSPACE(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'd':
- if (!SIZE_ONLY) {
+ break;
+ case ANYOF_DIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_DIGIT);
+ else {
for (value = '0'; value <= '9'; value++)
- ANYOF_SET(opnd, value);
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'D':
- if (!SIZE_ONLY) {
+ break;
+ case ANYOF_NDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NDIGIT);
+ else {
for (value = 0; value < '0'; value++)
- ANYOF_SET(opnd, value);
+ ANYOF_BITMAP_SET(opnd, value);
for (value = '9' + 1; value < 256; value++)
- ANYOF_SET(opnd, value);
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NALNUMC:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NALNUMC);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isALNUMC(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_ALNUMC:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ALNUMC);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isALNUMC(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_ALPHA:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ALPHA);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isALPHA(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'n':
- value = '\n';
break;
- case 'r':
- value = '\r';
+ case ANYOF_NALPHA:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NALPHA);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isALPHA(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 't':
- value = '\t';
+ case ANYOF_ASCII:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ASCII);
+ else {
+ for (value = 0; value < 128; value++)
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'f':
- value = '\f';
+ case ANYOF_NASCII:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NASCII);
+ else {
+ for (value = 128; value < 256; value++)
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'b':
- value = '\b';
+ case ANYOF_CNTRL:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_CNTRL);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isCNTRL(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ lastvalue = OOB_CHAR8;
break;
- case 'e':
- value = '\033';
+ case ANYOF_NCNTRL:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NCNTRL);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isCNTRL(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'a':
- value = '\007';
+ case ANYOF_GRAPH:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_GRAPH);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isGRAPH(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'x':
- value = scan_hex(PL_regcomp_parse, 2, &numlen);
- PL_regcomp_parse += numlen;
+ case ANYOF_NGRAPH:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NGRAPH);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isGRAPH(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'c':
- value = UCHARAT(PL_regcomp_parse++);
- value = toCTRL(value);
+ case ANYOF_LOWER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_LOWER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isLOWER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- value = scan_oct(--PL_regcomp_parse, 3, &numlen);
- PL_regcomp_parse += numlen;
+ case ANYOF_NLOWER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NLOWER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isLOWER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_PRINT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_PRINT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isPRINT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NPRINT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NPRINT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isPRINT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_PUNCT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_PUNCT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isPUNCT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NPUNCT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NPUNCT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isPUNCT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_UPPER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_UPPER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isUPPER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NUPPER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NUPPER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isUPPER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_XDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_XDIGIT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isXDIGIT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NXDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NXDIGIT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isXDIGIT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ default:
+ FAIL("invalid [::] class in regexp");
break;
}
+ if (LOC)
+ ANYOF_FLAGS(opnd) |= ANYOF_CLASS;
+ lastvalue = OOB_CHAR8;
}
+ else
if (range) {
if (lastvalue > value)
FAIL("invalid [] range in regexp");
@@ -2301,35 +2580,36 @@ S_regclass(pTHX)
if (isLOWER(lastvalue)) {
for (i = lastvalue; i <= value; i++)
if (isLOWER(i))
- ANYOF_SET(opnd, i);
+ ANYOF_BITMAP_SET(opnd, i);
} else {
for (i = lastvalue; i <= value; i++)
if (isUPPER(i))
- ANYOF_SET(opnd, i);
+ ANYOF_BITMAP_SET(opnd, i);
}
}
else
#endif
for ( ; lastvalue <= value; lastvalue++)
- ANYOF_SET(opnd, lastvalue);
+ ANYOF_BITMAP_SET(opnd, lastvalue);
}
lastvalue = value;
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
- if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+ if (!SIZE_ONLY &&
+ (ANYOF_FLAGS(opnd) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
for (value = 0; value < 256; ++value) {
- if (ANYOF_TEST(opnd, value)) {
+ if (ANYOF_BITMAP_TEST(opnd, value)) {
I32 cf = PL_fold[value];
- ANYOF_SET(opnd, cf);
+ ANYOF_BITMAP_SET(opnd, cf);
}
}
- *opnd &= ~ANYOF_FOLD;
+ ANYOF_FLAGS(opnd) &= ~ANYOF_FOLD;
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && (*opnd & 0xFF) == ANYOF_INVERT) {
- for (value = 0; value < 32; ++value)
- opnd[1 + value] ^= 0xFF;
- *opnd = 0;
+ if (!SIZE_ONLY && (ANYOF_FLAGS(opnd) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+ opnd[ANYOF_BITMAP_OFFSET + value] ^= ANYOF_FLAGS_ALL;
+ ANYOF_FLAGS(opnd) = 0;
}
return ret;
}
@@ -2337,16 +2617,17 @@ S_regclass(pTHX)
STATIC regnode *
S_regclassutf8(pTHX)
{
+ dTHR;
register char *opnd, *e;
register U32 value;
- register U32 lastvalue = 123456;
+ register U32 lastvalue = OOB_UTF8;
register I32 range = 0;
register regnode *ret;
I32 numlen;
I32 n;
SV *listsv;
U8 flags = 0;
- dTHR;
+ I32 namedclass;
if (*PL_regcomp_parse == '^') { /* Complement of range. */
PL_regnaughty++;
@@ -2362,75 +2643,29 @@ S_regclassutf8(pTHX)
listsv = newSVpvn("# comment\n",10);
}
+ checkposixcc();
+
if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
goto skipcond; /* allow 1st char to be ] or - */
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
+ namedclass = -1;
value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
PL_regcomp_parse += numlen;
if (value == '[')
- (void)regpposixcc(value); /* ignore the return value for now */
+ namedclass = regpposixcc(value);
else if (value == '\\') {
value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
PL_regcomp_parse += numlen;
switch (value) {
- case 'w':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_ALNUML;
-
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
- }
- lastvalue = 123456;
- continue;
- case 'W':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_NALNUML;
-
- Perl_sv_catpvf(aTHX_ listsv,
- "-utf8::IsAlpha\n-utf8::IsDigit\n0000\t%04x\n%04x\tffff\n",
- '_' - 1,
- '_' + 1);
- }
- lastvalue = 123456;
- continue;
- case 's':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_SPACEL;
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
- if (!PL_utf8_space)
- is_utf8_space((U8*)" ");
- }
- lastvalue = 123456;
- continue;
- case 'S':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_NSPACEL;
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::IsSpace\n");
- if (!PL_utf8_space)
- is_utf8_space((U8*)" ");
- }
- lastvalue = 123456;
- continue;
- case 'd':
- if (!SIZE_ONLY) {
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
- }
- lastvalue = 123456;
- continue;
- case 'D':
- if (!SIZE_ONLY) {
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::IsDigit\n");
- }
- lastvalue = 123456;
- continue;
+ case 'w': namedclass = ANYOF_ALNUM; break;
+ case 'W': namedclass = ANYOF_NALNUM; break;
+ case 's': namedclass = ANYOF_SPACE; break;
+ case 'S': namedclass = ANYOF_NSPACE; break;
+ case 'd': namedclass = ANYOF_DIGIT; break;
+ case 'D': namedclass = ANYOF_NDIGIT; break;
case 'p':
case 'P':
if (*PL_regcomp_parse == '{') {
@@ -2445,41 +2680,30 @@ S_regclassutf8(pTHX)
}
if (!SIZE_ONLY) {
if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::%.*s\n", n, PL_regcomp_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", n, PL_regcomp_parse);
else
Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", n, PL_regcomp_parse);
+ "!utf8::%.*s\n", n, PL_regcomp_parse);
}
PL_regcomp_parse = e + 1;
- lastvalue = 123456;
+ lastvalue = OOB_UTF8;
continue;
- case 'n':
- value = '\n';
- break;
- case 'r':
- value = '\r';
- break;
- case 't':
- value = '\t';
- break;
- case 'f':
- value = '\f';
- break;
- case 'b':
- value = '\b';
- break;
- case 'e':
- value = '\033';
- break;
- case 'a':
- value = '\007';
- break;
+ case 'n': value = '\n'; break;
+ case 'r': value = '\r'; break;
+ case 't': value = '\t'; break;
+ case 'f': value = '\f'; break;
+ case 'b': value = '\b'; break;
+ case 'e': value = '\033'; break;
+ case 'a': value = '\007'; break;
case 'x':
if (*PL_regcomp_parse == '{') {
e = strchr(PL_regcomp_parse++, '}');
if (!e)
FAIL("Missing right brace on \\x{}");
- value = scan_hex(PL_regcomp_parse, e - PL_regcomp_parse, &numlen);
+ value = scan_hex(PL_regcomp_parse,
+ e - PL_regcomp_parse,
+ &numlen);
PL_regcomp_parse = e + 1;
}
else {
@@ -2498,7 +2722,64 @@ S_regclassutf8(pTHX)
break;
}
}
- if (range) {
+ if (!SIZE_ONLY && namedclass > -1) {
+ switch (namedclass) {
+ case ANYOF_ALNUM:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
+ case ANYOF_NALNUM:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break;
+ case ANYOF_ALNUMC:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break;
+ case ANYOF_NALNUMC:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break;
+ case ANYOF_ALPHA:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break;
+ case ANYOF_NALPHA:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break;
+ case ANYOF_ASCII:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break;
+ case ANYOF_NASCII:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break;
+ case ANYOF_CNTRL:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break;
+ case ANYOF_NCNTRL:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break;
+ case ANYOF_GRAPH:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break;
+ case ANYOF_NGRAPH:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break;
+ case ANYOF_DIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break;
+ case ANYOF_NDIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break;
+ case ANYOF_LOWER:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break;
+ case ANYOF_NLOWER:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break;
+ case ANYOF_PRINT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break;
+ case ANYOF_NPRINT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break;
+ case ANYOF_PUNCT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break;
+ case ANYOF_NPUNCT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
+ case ANYOF_SPACE:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
+ case ANYOF_NSPACE:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
+ case ANYOF_UPPER:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
+ case ANYOF_NUPPER:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break;
+ case ANYOF_XDIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break;
+ case ANYOF_NXDIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
+ }
+ }
+ else
+ if (range) {
if (lastvalue > value)
FAIL("invalid [] range in regexp");
if (!SIZE_ONLY)
diff --git a/regcomp.h b/regcomp.h
index 518add0309..c679ca4d46 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -154,24 +154,76 @@ struct regnode_2 {
#define SIZE_ONLY (PL_regcode == &PL_regdummy)
-/* Flags for first parameter byte of ANYOF */
-#define ANYOF_INVERT 0x40
-#define ANYOF_FOLD 0x20
-#define ANYOF_LOCALE 0x10
-#define ANYOF_ISA 0x0F
-#define ANYOF_ALNUML 0x08
-#define ANYOF_NALNUML 0x04
-#define ANYOF_SPACEL 0x02
-#define ANYOF_NSPACEL 0x01
-
-/* Utility macros for bitmap of ANYOF */
-#define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)]
-#define ANYOF_BIT(c) (1 << ((c) & 7))
-#define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c))
-#define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c))
-#define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c))
-
-#define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1)
+/* Flags for first parameter byte [0] of ANYOF */
+
+#define ANYOF_CLASS 0x08
+#define ANYOF_INVERT 0x04
+#define ANYOF_FOLD 0x02
+#define ANYOF_LOCALE 0x01
+
+/* Character classes for bytes [1..4] of ANYOF */
+
+#define ANYOF_ALNUM 0 /* \w, utf8::IsWord, isALNUM() */
+#define ANYOF_NALNUM 1
+#define ANYOF_SPACE 2
+#define ANYOF_NSPACE 3
+#define ANYOF_DIGIT 4
+#define ANYOF_NDIGIT 5
+#define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, isALNUMC() */
+#define ANYOF_NALNUMC 7
+#define ANYOF_ALPHA 8
+#define ANYOF_NALPHA 9
+#define ANYOF_ASCII 10
+#define ANYOF_NASCII 11
+#define ANYOF_CNTRL 12
+#define ANYOF_NCNTRL 13
+#define ANYOF_GRAPH 14
+#define ANYOF_NGRAPH 15
+#define ANYOF_LOWER 16
+#define ANYOF_NLOWER 17
+#define ANYOF_PRINT 18
+#define ANYOF_NPRINT 19
+#define ANYOF_PUNCT 20
+#define ANYOF_NPUNCT 21
+#define ANYOF_UPPER 22
+#define ANYOF_NUPPER 23
+#define ANYOF_XDIGIT 24
+#define ANYOF_NXDIGIT 25
+
+#define ANYOF_MAX 31
+
+/* Backward source code compatibility. */
+
+#define ANYOF_ALNUML ANYOF_ALNUM
+#define ANYOF_NALNUML ANYOF_NALNUM
+#define ANYOF_SPACEL ANYOF_SPACE
+#define ANYOF_NSPACEL ANYOF_NSPACE
+
+/* Utility macros for the bitmap and classes of ANYOF */
+
+#define ANYOF_OPND_SIZE 1
+#define ANYOF_CLASS_SIZE 4
+#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
+#define ANYOF_SIZE (ANYOF_OPND_SIZE+ANYOF_CLASS_SIZE+ANYOF_BITMAP_SIZE)
+
+#define ANYOF_FLAGS(p) ((p)[0])
+#define ANYOF_FLAGS_ALL 0xff
+
+#define ANYOF_BIT(c) (1 << ((c) & 7))
+
+#define ANYOF_CLASS_OFFSET ANYOF_OPND_SIZE
+#define ANYOF_CLASS_BYTE(p, c) ((p)[ANYOF_CLASS_OFFSET + (((c) >> 3) & 3)])
+#define ANYOF_CLASS_SET(p, c) (ANYOF_CLASS_BYTE(p, c) |= ANYOF_BIT(c))
+#define ANYOF_CLASS_CLEAR(p, c) (ANYOF_CLASS_BYTE(p, c) &= ~ANYOF_BIT(c))
+#define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) & ANYOF_BIT(c))
+
+#define ANYOF_BITMAP_OFFSET (ANYOF_CLASS_OFFSET+ANYOF_CLASS_SIZE)
+#define ANYOF_BITMAP_BYTE(p, c) ((p)[ANYOF_BITMAP_OFFSET + (((c) >> 3) & 31)])
+#define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c))
+#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c))
+#define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c))
+
+#define ANY_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode) + 1)
/*
* Utility definitions.
diff --git a/regcomp.sym b/regcomp.sym
index 1391dfb22c..4e5c1c1ab2 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -50,8 +50,80 @@ NSPACEL NSPACE, no Match any non-whitespace char in locale
NSPACELUTF8 NSPACE, no Match any non-whitespace char in locale
DIGIT DIGIT, no Match any numeric character
DIGITUTF8 DIGIT, no Match any numeric character
+DIGITL DIGIT, no Match any numeric character in locale
+DIGITLUTF8 DIGIT, no Match any numeric character in locale
NDIGIT NDIGIT, no Match any non-numeric character
NDIGITUTF8 NDIGIT, no Match any non-numeric character
+NDIGITL NDIGIT, no Match any non-numeric character in locale
+NDIGITLUTF8 NDIGIT, no Match any non-numeric character in locale
+ALNUMC ALNUMC, no Match any alphanumeric character
+ALNUMCUTF8 ALNUMC, no Match any alphanumeric character
+ALNUMCL ALNUMC, no Match any alphanumeric character in locale
+ALNUMCLUTF8 ALNUMC, no Match any alphanumeric character in locale
+NALNUMC NALNUMC, no Match any non-alphanumeric character
+NALNUMCUTF8 NALNUMC, no Match any non-alphanumeric character
+NALNUMCL NALNUMC, no Match any non-alphanumeric character in locale
+NALNUMCLUTF8 NALNUMC, no Match any non-alphanumeric character in locale
+ALPHA ALPHA, no Match any alphabetic character
+ALPHAUTF8 ALPHA, no Match any alphabetic character
+ALPHAL ALPHA, no Match any alphabetic character in locale
+ALPHALUTF8 ALPHA, no Match any alphabetic character in locale
+NALPHA NALPHA, no Match any non-alphabetic character
+NALPHAUTF8 NALPHA, no Match any non-alphabetic character
+NALPHAL NALPHA, no Match any non-alphabetic character in locale
+NALPHALUTF8 NALPHA, no Match any non-alphabetic character in locale
+ASCII ASCII, no Match any ASCII character
+NASCII NASCII, no Match any non-ASCII character
+CNTRL CNTRL, no Match any control character
+CNTRLUTF8 CNTRL, no Match any control character
+CNTRLL CNTRL, no Match any control character in locale
+CNTRLLUTF8 CNTRL, no Match any control character in locale
+NCNTRL NCNTRL, no Match any non-control character
+NCNTRLUTF8 NCNTRL, no Match any non-control character
+NCNTRLL NCNTRL, no Match any non-control character in locale
+NCNTRLLUTF8 NCNTRL, no Match any non-control character in locale
+GRAPH GRAPH, no Match any graphical character
+GRAPHUTF8 GRAPH, no Match any graphical character
+GRAPHL GRAPH, no Match any graphical character in locale
+GRAPHLUTF8 GRAPH, no Match any graphical character in locale
+NGRAPH NGRAPH, no Match any non-graphical character
+NGRAPHUTF8 NGRAPH, no Match any non-graphical character
+NGRAPHL NGRAPH, no Match any non-graphical character in locale
+NGRAPHLUTF8 NGRAPH, no Match any non-graphical character in locale
+LOWER LOWER, no Match any lowercase character
+LOWERUTF8 LOWER, no Match any lowercase character
+LOWERL LOWER, no Match any lowercase character in locale
+LOWERLUTF8 LOWER, no Match any lowercase character in locale
+NLOWER NLOWER, no Match any non-lowercase character
+NLOWERUTF8 NLOWER, no Match any non-lowercase character
+NLOWERL NLOWER, no Match any non-lowercase character in locale
+NLOWERLUTF8 NLOWER, no Match any non-lowercase character in locale
+PRINT PRINT, no Match any printable character
+PRINTUTF8 PRINT, no Match any printable character
+PRINTL PRINT, no Match any printable character in locale
+PRINTLUTF8 PRINT, no Match any printable character in locale
+NPRINT NPRINT, no Match any non-printable character
+NPRINTUTF8 NPRINT, no Match any non-printable character
+NPRINTL NPRINT, no Match any non-printable character in locale
+NPRINTLUTF8 NPRINT, no Match any non-printable character in locale
+PUNCT PUNCT, no Match any punctuation character
+PUNCTUTF8 PUNCT, no Match any punctuation character
+PUNCTL PUNCT, no Match any punctuation character in locale
+PUNCTLUTF8 PUNCT, no Match any punctuation character in locale
+NPUNCT NPUNCT, no Match any non-punctuation character
+NPUNCTUTF8 NPUNCT, no Match any non-punctuation character
+NPUNCTL NPUNCT, no Match any non-punctuation character in locale
+NPUNCTLUTF8 NPUNCT, no Match any non-punctuation character in locale
+UPPER UPPER, no Match any uppercase character
+UPPERUTF8 UPPER, no Match any uppercase character
+UPPERL UPPER, no Match any uppercase character in locale
+UPPERLUTF8 UPPER, no Match any uppercase character in locale
+NUPPER NUPPER, no Match any non-uppercase character
+NUPPERUTF8 NUPPER, no Match any non-uppercase character
+NUPPERL NUPPER, no Match any non-uppercase character in locale
+NUPPERLUTF8 NUPPER, no Match any non-uppercase character in locale
+XDIGIT XDIGIT, no Match any hexdigit character
+NXDIGIT NXDIGIT, no Match any non-hexdigit character
CLUMP CLUMP, no Match any combining character sequence
# BRANCH The set of branches constituting a single choice are hooked
diff --git a/regexec.c b/regexec.c
index c97f89efa7..75f3873ce7 100644
--- a/regexec.c
+++ b/regexec.c
@@ -97,7 +97,7 @@
* Forwards.
*/
-#define REGINCLASS(p,c) (*(p) ? reginclass(p,c) : ANYOF_TEST(p,c))
+#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
#define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
@@ -1062,6 +1062,34 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
s += UTF8SKIP(s);
}
break;
+ case DIGITL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isDIGIT_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case DIGITLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isDIGIT_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
case NDIGIT:
while (s < strend) {
if (!isDIGIT(*s)) {
@@ -1088,6 +1116,842 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
s += UTF8SKIP(s);
}
break;
+ case NDIGITL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isDIGIT_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NDIGITLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isDIGIT_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case ALNUMC:
+ while (s < strend) {
+ if (isALNUMC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case ALNUMCUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_alnumc, (U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case ALNUMCL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isALNUMC_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case ALNUMCLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isALNUMC_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NALNUMC:
+ while (s < strend) {
+ if (!isALNUMC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NALNUMCUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_alnumc, (U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NALNUMCL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isALNUMC_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NALNUMCLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isALNUMC_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case ASCII:
+ while (s < strend) {
+ if (isASCII(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NASCII:
+ while (s < strend) {
+ if (!isASCII(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case CNTRL:
+ while (s < strend) {
+ if (isCNTRL(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case CNTRLUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_cntrl,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case CNTRLL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isCNTRL_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case CNTRLLUTF8: /* scan forward for a start position at a locale control character */
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isCNTRL_LC_utf8((U8*)s)) { /* no ' ' exemption: a space is not a control character */
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NCNTRL:
+ while (s < strend) {
+ if (!isCNTRL(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NCNTRLUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_cntrl,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NCNTRLL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isCNTRL_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NCNTRLLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isCNTRL_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case GRAPH:
+ while (s < strend) {
+ if (isGRAPH(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case GRAPHUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_graph,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case GRAPHL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isGRAPH_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case GRAPHLUTF8: /* scan forward for a start position at a locale graphical character */
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isGRAPH_LC_utf8((U8*)s)) { /* no ' ' exemption: a space is not graphical */
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NGRAPH:
+ while (s < strend) {
+ if (!isGRAPH(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NGRAPHUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_graph,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NGRAPHL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isGRAPH_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NGRAPHLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isGRAPH_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case LOWER:
+ while (s < strend) {
+ if (isLOWER(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case LOWERUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_lower,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case LOWERL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isLOWER_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case LOWERLUTF8: /* scan forward for a start position at a locale lowercase character */
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isLOWER_LC_utf8((U8*)s)) { /* no ' ' exemption: a space is not lowercase */
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NLOWER:
+ while (s < strend) {
+ if (!isLOWER(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NLOWERUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_lower,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NLOWERL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isLOWER_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NLOWERLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isLOWER_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case PRINT:
+ while (s < strend) {
+ if (isPRINT(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case PRINTUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_print,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case PRINTL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isPRINT_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case PRINTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (*s == ' ' || isPRINT_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NPRINT:
+ while (s < strend) {
+ if (!isPRINT(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NPRINTUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_print,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NPRINTL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isPRINT_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NPRINTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isPRINT_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case PUNCT:
+ while (s < strend) {
+ if (isPUNCT(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case PUNCTUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_punct,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case PUNCTL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isPUNCT_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case PUNCTLUTF8: /* scan forward for a start position at a locale punctuation character */
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isPUNCT_LC_utf8((U8*)s)) { /* no ' ' exemption: a space is not punctuation */
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NPUNCT:
+ while (s < strend) {
+ if (!isPUNCT(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NPUNCTUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_punct,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NPUNCTL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isPUNCT_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NPUNCTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isPUNCT_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case UPPER:
+ while (s < strend) {
+ if (isUPPER(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case UPPERUTF8:
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_upper,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case UPPERL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isUPPER_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case UPPERLUTF8: /* scan forward for a start position at a locale uppercase character */
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (isUPPER_LC_utf8((U8*)s)) { /* no ' ' exemption: a space is not uppercase */
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NUPPER:
+ while (s < strend) {
+ if (!isUPPER(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NUPPERUTF8:
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_upper,(U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case NUPPERL:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isUPPER_LC(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NUPPERLUTF8:
+ PL_reg_flags |= RF_tainted;
+ while (s < strend) {
+ if (!isUPPER_LC_utf8((U8*)s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s += UTF8SKIP(s);
+ }
+ break;
+ case XDIGIT:
+ while (s < strend) {
+ if (isXDIGIT(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
+ case NXDIGIT:
+ while (s < strend) {
+ if (!isXDIGIT(*s)) {
+ if (tmp && regtry(prog, s))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
+ s++;
+ }
+ break;
}
}
else {
@@ -1707,15 +2571,30 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
nextchr = UCHARAT(++locinput);
break;
+ case DIGITL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
case DIGIT:
- if (!isDIGIT(nextchr))
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (!(OP(scan) == DIGIT
+ ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
sayNO;
nextchr = UCHARAT(++locinput);
break;
+ case DIGITLUTF8: /* locale variant: mark the match tainted, then share the DIGITUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
case DIGITUTF8:
+ if (!nextchr)
+ sayNO;
if (nextchr & 0x80) {
- if (!(swash_fetch(PL_utf8_digit,(U8*)locinput)))
+ if (!(OP(scan) == DIGITUTF8 /* fail when the char is NOT a digit; the opcode here is DIGITUTF8 or DIGITLUTF8 */
+ ? swash_fetch(PL_utf8_digit,(U8*)locinput)
+ : isDIGIT_LC_utf8((U8*)locinput)))
+ {
sayNO;
+ }
locinput += PL_utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
@@ -1724,13 +2603,20 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
nextchr = UCHARAT(++locinput);
break;
+ case NDIGITL: /* locale variant: mark the match tainted, then share the NDIGIT code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
case NDIGIT:
- if (!nextchr && locinput >= PL_regeol)
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is a non-digit */
sayNO;
- if (isDIGIT(nextchr))
+ if (OP(scan) == NDIGIT /* the opcode here is NDIGIT or NDIGITL, never DIGIT */
+ ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
sayNO;
nextchr = UCHARAT(++locinput);
break;
+ case NDIGITLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
case NDIGITUTF8:
if (!nextchr && locinput >= PL_regeol)
sayNO;
@@ -1745,6 +2631,522 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
nextchr = UCHARAT(++locinput);
break;
+ case ALNUMCL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case ALNUMC:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == ALNUMC
+ ? isALNUMC(nextchr) : isALNUMC_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case ALNUMCLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case ALNUMCUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == ALNUMCUTF8
+ ? swash_fetch(PL_utf8_alnumc, (U8*)locinput)
+ : isALNUMC_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == ALNUMCUTF8
+ ? isALNUMC(nextchr) : isALNUMC_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NALNUMCL: /* locale variant: mark the match tainted, then share the NALNUMC code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NALNUMC:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is non-alphanumeric */
+ sayNO;
+ if (OP(scan) == NALNUMC /* the opcode here is NALNUMC or NALNUMCL, never ALNUMC */
+ ? isALNUMC(nextchr) : isALNUMC_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NALNUMCLUTF8: /* locale variant: mark the match tainted, then share the NALNUMCUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NALNUMCUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (OP(scan) == NALNUMCUTF8 ? swash_fetch(PL_utf8_alnumc,(U8*)locinput) : isALNUMC_LC_utf8((U8*)locinput)) /* locale opcode uses the locale test, as in ALNUMCUTF8 */
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (OP(scan) == NALNUMCUTF8 ? isALNUMC(nextchr) : isALNUMC_LC(nextchr)) /* single-byte char: same opcode-based choice */
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case ALPHAL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case ALPHA:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == ALPHA
+ ? isALPHA(nextchr) : isALPHA_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case ALPHALUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case ALPHAUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == ALPHAUTF8
+ ? swash_fetch(PL_utf8_alpha, (U8*)locinput)
+ : isALPHA_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == ALPHAUTF8
+ ? isALPHA(nextchr) : isALPHA_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NALPHAL: /* locale variant: mark the match tainted, then share the NALPHA code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NALPHA:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is non-alphabetic */
+ sayNO;
+ if (OP(scan) == NALPHA /* the opcode here is NALPHA or NALPHAL, never ALPHA */
+ ? isALPHA(nextchr) : isALPHA_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NALPHALUTF8: /* locale variant: mark the match tainted, then share the NALPHAUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NALPHAUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (OP(scan) == NALPHAUTF8 ? swash_fetch(PL_utf8_alpha,(U8*)locinput) : isALPHA_LC_utf8((U8*)locinput)) /* locale opcode uses the locale test, as in ALPHAUTF8 */
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (OP(scan) == NALPHAUTF8 ? isALPHA(nextchr) : isALPHA_LC(nextchr)) /* single-byte char: same opcode-based choice */
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case ASCII:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (!isASCII(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NASCII:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (isASCII(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case CNTRLL: /* locale variant: mark the match tainted, then share the CNTRL code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case CNTRL:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is itself a control character */
+ sayNO;
+ if (!(OP(scan) == CNTRL
+ ? isCNTRL(nextchr) : isCNTRL_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case CNTRLLUTF8: /* locale variant: mark the match tainted, then share the CNTRLUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case CNTRLUTF8:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is itself a control character */
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == CNTRLUTF8
+ ? swash_fetch(PL_utf8_cntrl, (U8*)locinput)
+ : isCNTRL_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == CNTRLUTF8
+ ? isCNTRL(nextchr) : isCNTRL_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NCNTRLL: /* locale variant: mark the match tainted, then share the NCNTRL code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NCNTRL:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; the class test below decides for an embedded NUL */
+ sayNO;
+ if (OP(scan) == NCNTRL /* the opcode here is NCNTRL or NCNTRLL, never CNTRL */
+ ? isCNTRL(nextchr) : isCNTRL_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NCNTRLLUTF8: /* locale variant: mark the match tainted, then share the NCNTRLUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NCNTRLUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (OP(scan) == NCNTRLUTF8 ? swash_fetch(PL_utf8_cntrl,(U8*)locinput) : isCNTRL_LC_utf8((U8*)locinput)) /* locale opcode uses the locale test, as in CNTRLUTF8 */
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (OP(scan) == NCNTRLUTF8 ? isCNTRL(nextchr) : isCNTRL_LC(nextchr)) /* single-byte char: same opcode-based choice */
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case GRAPHL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case GRAPH:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == GRAPH
+ ? isGRAPH(nextchr) : isGRAPH_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case GRAPHLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case GRAPHUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == GRAPHUTF8
+ ? swash_fetch(PL_utf8_graph, (U8*)locinput)
+ : isGRAPH_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == GRAPHUTF8
+ ? isGRAPH(nextchr) : isGRAPH_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NGRAPHL: /* locale variant: mark the match tainted, then share the NGRAPH code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NGRAPH:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is non-graphical */
+ sayNO;
+ if (OP(scan) == NGRAPH /* the opcode here is NGRAPH or NGRAPHL, never GRAPH */
+ ? isGRAPH(nextchr) : isGRAPH_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NGRAPHLUTF8: /* locale variant: mark the match tainted, then share the NGRAPHUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NGRAPHUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (OP(scan) == NGRAPHUTF8 ? swash_fetch(PL_utf8_graph,(U8*)locinput) : isGRAPH_LC_utf8((U8*)locinput)) /* locale opcode uses the locale test, as in GRAPHUTF8 */
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (OP(scan) == NGRAPHUTF8 ? isGRAPH(nextchr) : isGRAPH_LC(nextchr)) /* single-byte char: same opcode-based choice */
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case LOWERL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case LOWER:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == LOWER
+ ? isLOWER(nextchr) : isLOWER_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case LOWERLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case LOWERUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == LOWERUTF8
+ ? swash_fetch(PL_utf8_lower, (U8*)locinput)
+ : isLOWER_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == LOWERUTF8
+ ? isLOWER(nextchr) : isLOWER_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NLOWERL: /* locale variant: mark the match tainted, then share the NLOWER code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NLOWER:
+ if (!nextchr && locinput >= PL_regeol) /* fail only at end of input; an embedded NUL is non-lowercase */
+ sayNO;
+ if (OP(scan) == NLOWER /* the opcode here is NLOWER or NLOWERL, never LOWER */
+ ? isLOWER(nextchr) : isLOWER_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NLOWERLUTF8: /* locale variant: mark the match tainted, then share the NLOWERUTF8 code */
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NLOWERUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (OP(scan) == NLOWERUTF8 ? swash_fetch(PL_utf8_lower,(U8*)locinput) : isLOWER_LC_utf8((U8*)locinput)) /* locale opcode uses the locale test, as in LOWERUTF8 */
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (OP(scan) == NLOWERUTF8 ? isLOWER(nextchr) : isLOWER_LC(nextchr)) /* single-byte char: same opcode-based choice */
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case PRINTL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case PRINT:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == PRINT
+ ? isPRINT(nextchr) : isPRINT_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case PRINTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case PRINTUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == PRINTUTF8
+ ? swash_fetch(PL_utf8_print, (U8*)locinput)
+ : isPRINT_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == PRINTUTF8
+ ? isPRINT(nextchr) : isPRINT_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NPRINTL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH: NPRINTL shares the test below and takes the _LC branch */
+ case NPRINT:
+ if (!nextchr && locinput >= PL_regeol) /* fail only once input is exhausted; an embedded NUL is non-printable */
+ sayNO;
+ if (OP(scan) == NPRINT /* plain NPRINT uses the locale-independent test */
+ ? isPRINT(nextchr) : isPRINT_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NPRINTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NPRINTUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (swash_fetch(PL_utf8_print,(U8*)locinput))
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (isPRINT(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case PUNCTL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case PUNCT:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == PUNCT
+ ? isPUNCT(nextchr) : isPUNCT_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case PUNCTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case PUNCTUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == PUNCTUTF8
+ ? swash_fetch(PL_utf8_punct, (U8*)locinput)
+ : isPUNCT_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == PUNCTUTF8
+ ? isPUNCT(nextchr) : isPUNCT_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NPUNCTL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH: NPUNCTL shares the test below and takes the _LC branch */
+ case NPUNCT:
+ if (!nextchr && locinput >= PL_regeol) /* fail only once input is exhausted; an embedded NUL is not punctuation */
+ sayNO;
+ if (OP(scan) == NPUNCT /* plain NPUNCT uses the locale-independent test */
+ ? isPUNCT(nextchr) : isPUNCT_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NPUNCTLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NPUNCTUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (swash_fetch(PL_utf8_punct,(U8*)locinput))
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (isPUNCT(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case UPPERL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case UPPER:
+ if (!nextchr)
+ sayNO;
+ if (!(OP(scan) == UPPER
+ ? isUPPER(nextchr) : isUPPER_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case UPPERLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case UPPERUTF8:
+ if (!nextchr)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == UPPERUTF8
+ ? swash_fetch(PL_utf8_upper, (U8*)locinput)
+ : isUPPER_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (!(OP(scan) == UPPERUTF8
+ ? isUPPER(nextchr) : isUPPER_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NUPPERL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH: NUPPERL shares the test below and takes the _LC branch */
+ case NUPPER:
+ if (!nextchr && locinput >= PL_regeol) /* fail only once input is exhausted; an embedded NUL is not uppercase */
+ sayNO;
+ if (OP(scan) == NUPPER /* plain NUPPER uses the locale-independent test */
+ ? isUPPER(nextchr) : isUPPER_LC(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NUPPERLUTF8:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
+ case NUPPERUTF8:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (nextchr & 0x80) {
+ if (swash_fetch(PL_utf8_upper,(U8*)locinput))
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
+ }
+ if (isUPPER(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case XDIGIT:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (!isXDIGIT(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NXDIGIT:
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ if (isXDIGIT(nextchr))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ break;
case CLUMP:
if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
sayNO;
@@ -2920,11 +4322,11 @@ STATIC bool
S_reginclass(pTHX_ register char *p, register I32 c)
{
dTHR;
- char flags = *p;
+ char flags = ANYOF_FLAGS(p);
bool match = FALSE;
c &= 0xFF;
- if (ANYOF_TEST(p, c))
+ if (ANYOF_BITMAP_TEST(p, c))
match = TRUE;
else if (flags & ANYOF_FOLD) {
I32 cf;
@@ -2934,17 +4336,40 @@ S_reginclass(pTHX_ register char *p, register I32 c)
}
else
cf = PL_fold[c];
- if (ANYOF_TEST(p, cf))
+ if (ANYOF_BITMAP_TEST(p, cf))
match = TRUE;
}
- if (!match && (flags & ANYOF_ISA)) {
+ if (!match && (flags & ANYOF_CLASS)) {
PL_reg_flags |= RF_tainted;
-
- if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) ||
- ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) ||
- ((flags & ANYOF_SPACEL) && isSPACE_LC(c)) ||
- ((flags & ANYOF_NSPACEL) && !isSPACE_LC(c)))
+ if (
+ (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
+ (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
+ ) /* How's that for a conditional? */
{
match = TRUE;
}
@@ -2976,17 +4401,7 @@ S_reginclassutf8(pTHX_ regnode *f, U8 *p)
match = TRUE;
}
- if (!match && (flags & ANYOF_ISA)) {
- PL_reg_flags |= RF_tainted;
-
- if (((flags & ANYOF_ALNUML) && isALNUM_LC_utf8(p)) ||
- ((flags & ANYOF_NALNUML) && !isALNUM_LC_utf8(p)) ||
- ((flags & ANYOF_SPACEL) && isSPACE_LC_utf8(p)) ||
- ((flags & ANYOF_NSPACEL) && !isSPACE_LC_utf8(p)))
- {
- match = TRUE;
- }
- }
+ /* UTF8 combined with ANYOF_CLASS is ill-defined. */
return (flags & ANYOF_INVERT) ? !match : match;
}
diff --git a/regnodes.h b/regnodes.h
index 030fa1a2c0..cdc6dd4744 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -45,40 +45,112 @@
#define NSPACELUTF8 39 /* 0x27 Match any non-whitespace char in locale */
#define DIGIT 40 /* 0x28 Match any numeric character */
#define DIGITUTF8 41 /* 0x29 Match any numeric character */
-#define NDIGIT 42 /* 0x2a Match any non-numeric character */
-#define NDIGITUTF8 43 /* 0x2b Match any non-numeric character */
-#define CLUMP 44 /* 0x2c Match any combining character sequence */
-#define BRANCH 45 /* 0x2d Match this alternative, or the next... */
-#define BACK 46 /* 0x2e Match "", "next" ptr points backward. */
-#define EXACT 47 /* 0x2f Match this string (preceded by length). */
-#define EXACTF 48 /* 0x30 Match this string, folded (prec. by length). */
-#define EXACTFL 49 /* 0x31 Match this string, folded in locale (w/len). */
-#define NOTHING 50 /* 0x32 Match empty string. */
-#define TAIL 51 /* 0x33 Match empty string. Can jump here from outside. */
-#define STAR 52 /* 0x34 Match this (simple) thing 0 or more times. */
-#define PLUS 53 /* 0x35 Match this (simple) thing 1 or more times. */
-#define CURLY 54 /* 0x36 Match this simple thing {n,m} times. */
-#define CURLYN 55 /* 0x37 Match next-after-this simple thing */
-#define CURLYM 56 /* 0x38 Match this medium-complex thing {n,m} times. */
-#define CURLYX 57 /* 0x39 Match this complex thing {n,m} times. */
-#define WHILEM 58 /* 0x3a Do curly processing and see if rest matches. */
-#define OPEN 59 /* 0x3b Mark this point in input as start of #n. */
-#define CLOSE 60 /* 0x3c Analogous to OPEN. */
-#define REF 61 /* 0x3d Match some already matched string */
-#define REFF 62 /* 0x3e Match already matched string, folded */
-#define REFFL 63 /* 0x3f Match already matched string, folded in loc. */
-#define IFMATCH 64 /* 0x40 Succeeds if the following matches. */
-#define UNLESSM 65 /* 0x41 Fails if the following matches. */
-#define SUSPEND 66 /* 0x42 "Independent" sub-RE. */
-#define IFTHEN 67 /* 0x43 Switch, should be preceeded by switcher . */
-#define GROUPP 68 /* 0x44 Whether the group matched. */
-#define LONGJMP 69 /* 0x45 Jump far away. */
-#define BRANCHJ 70 /* 0x46 BRANCH with long offset. */
-#define EVAL 71 /* 0x47 Execute some Perl code. */
-#define MINMOD 72 /* 0x48 Next operator is not greedy. */
-#define LOGICAL 73 /* 0x49 Next opcode should set the flag only. */
-#define RENUM 74 /* 0x4a Group with independently numbered parens. */
-#define OPTIMIZED 75 /* 0x4b Placeholder for dump. */
+#define DIGITL 42 /* 0x2a Match any numeric character in locale */
+#define DIGITLUTF8 43 /* 0x2b Match any numeric character in locale */
+#define NDIGIT 44 /* 0x2c Match any non-numeric character */
+#define NDIGITUTF8 45 /* 0x2d Match any non-numeric character */
+#define NDIGITL 46 /* 0x2e Match any non-numeric character in locale */
+#define NDIGITLUTF8 47 /* 0x2f Match any non-numeric character in locale */
+#define ALNUMC 48 /* 0x30 Match any alphanumeric character */
+#define ALNUMCUTF8 49 /* 0x31 Match any alphanumeric character */
+#define ALNUMCL 50 /* 0x32 Match any alphanumeric character in locale */
+#define ALNUMCLUTF8 51 /* 0x33 Match any alphanumeric character in locale */
+#define NALNUMC 52 /* 0x34 Match any non-alphanumeric character */
+#define NALNUMCUTF8 53 /* 0x35 Match any non-alphanumeric character */
+#define NALNUMCL 54 /* 0x36 Match any non-alphanumeric character in locale */
+#define NALNUMCLUTF8 55 /* 0x37 Match any non-alphanumeric character in locale */
+#define ALPHA 56 /* 0x38 Match any alphabetic character */
+#define ALPHAUTF8 57 /* 0x39 Match any alphabetic character */
+#define ALPHAL 58 /* 0x3a Match any alphabetic character in locale */
+#define ALPHALUTF8 59 /* 0x3b Match any alphabetic character in locale */
+#define NALPHA 60 /* 0x3c Match any non-alphabetic character */
+#define NALPHAUTF8 61 /* 0x3d Match any non-alphabetic character */
+#define NALPHAL 62 /* 0x3e Match any non-alphabetic character in locale */
+#define NALPHALUTF8 63 /* 0x3f Match any non-alphabetic character in locale */
+#define ASCII 64 /* 0x40 Match any ASCII character */
+#define NASCII 65 /* 0x41 Match any non-ASCII character */
+#define CNTRL 66 /* 0x42 Match any control character */
+#define CNTRLUTF8 67 /* 0x43 Match any control character */
+#define CNTRLL 68 /* 0x44 Match any control character in locale */
+#define CNTRLLUTF8 69 /* 0x45 Match any control character in locale */
+#define NCNTRL 70 /* 0x46 Match any non-control character */
+#define NCNTRLUTF8 71 /* 0x47 Match any non-control character */
+#define NCNTRLL 72 /* 0x48 Match any non-control character in locale */
+#define NCNTRLLUTF8 73 /* 0x49 Match any non-control character in locale */
+#define GRAPH 74 /* 0x4a Match any graphical character */
+#define GRAPHUTF8 75 /* 0x4b Match any graphical character */
+#define GRAPHL 76 /* 0x4c Match any graphical character in locale */
+#define GRAPHLUTF8 77 /* 0x4d Match any graphical character in locale */
+#define NGRAPH 78 /* 0x4e Match any non-graphical character */
+#define NGRAPHUTF8 79 /* 0x4f Match any non-graphical character */
+#define NGRAPHL 80 /* 0x50 Match any non-graphical character in locale */
+#define NGRAPHLUTF8 81 /* 0x51 Match any non-graphical character in locale */
+#define LOWER 82 /* 0x52 Match any lowercase character */
+#define LOWERUTF8 83 /* 0x53 Match any lowercase character */
+#define LOWERL 84 /* 0x54 Match any lowercase character in locale */
+#define LOWERLUTF8 85 /* 0x55 Match any lowercase character in locale */
+#define NLOWER 86 /* 0x56 Match any non-lowercase character */
+#define NLOWERUTF8 87 /* 0x57 Match any non-lowercase character */
+#define NLOWERL 88 /* 0x58 Match any non-lowercase character in locale */
+#define NLOWERLUTF8 89 /* 0x59 Match any non-lowercase character in locale */
+#define PRINT 90 /* 0x5a Match any printable character */
+#define PRINTUTF8 91 /* 0x5b Match any printable character */
+#define PRINTL 92 /* 0x5c Match any printable character in locale */
+#define PRINTLUTF8 93 /* 0x5d Match any printable character in locale */
+#define NPRINT 94 /* 0x5e Match any non-printable character */
+#define NPRINTUTF8 95 /* 0x5f Match any non-printable character */
+#define NPRINTL 96 /* 0x60 Match any non-printable character in locale */
+#define NPRINTLUTF8 97 /* 0x61 Match any non-printable character in locale */
+#define PUNCT 98 /* 0x62 Match any punctuation character */
+#define PUNCTUTF8 99 /* 0x63 Match any punctuation character */
+#define PUNCTL 100 /* 0x64 Match any punctuation character in locale */
+#define PUNCTLUTF8 101 /* 0x65 Match any punctuation character in locale */
+#define NPUNCT 102 /* 0x66 Match any non-punctuation character */
+#define NPUNCTUTF8 103 /* 0x67 Match any non-punctuation character */
+#define NPUNCTL 104 /* 0x68 Match any non-punctuation character in locale */
+#define NPUNCTLUTF8 105 /* 0x69 Match any non-punctuation character in locale */
+#define UPPER 106 /* 0x6a Match any uppercase character */
+#define UPPERUTF8 107 /* 0x6b Match any uppercase character */
+#define UPPERL 108 /* 0x6c Match any uppercase character in locale */
+#define UPPERLUTF8 109 /* 0x6d Match any uppercase character in locale */
+#define NUPPER 110 /* 0x6e Match any non-uppercase character */
+#define NUPPERUTF8 111 /* 0x6f Match any non-uppercase character */
+#define NUPPERL 112 /* 0x70 Match any non-uppercase character in locale */
+#define NUPPERLUTF8 113 /* 0x71 Match any non-uppercase character in locale */
+#define XDIGIT 114 /* 0x72 Match any hexdigit character */
+#define NXDIGIT 115 /* 0x73 Match any non-hexdigit character */
+#define CLUMP 116 /* 0x74 Match any combining character sequence */
+#define BRANCH 117 /* 0x75 Match this alternative, or the next... */
+#define BACK 118 /* 0x76 Match "", "next" ptr points backward. */
+#define EXACT 119 /* 0x77 Match this string (preceded by length). */
+#define EXACTF 120 /* 0x78 Match this string, folded (prec. by length). */
+#define EXACTFL 121 /* 0x79 Match this string, folded in locale (w/len). */
+#define NOTHING 122 /* 0x7a Match empty string. */
+#define TAIL 123 /* 0x7b Match empty string. Can jump here from outside. */
+#define STAR 124 /* 0x7c Match this (simple) thing 0 or more times. */
+#define PLUS 125 /* 0x7d Match this (simple) thing 1 or more times. */
+#define CURLY 126 /* 0x7e Match this simple thing {n,m} times. */
+#define CURLYN 127 /* 0x7f Match next-after-this simple thing */
+#define CURLYM 128 /* 0x80 Match this medium-complex thing {n,m} times. */
+#define CURLYX 129 /* 0x81 Match this complex thing {n,m} times. */
+#define WHILEM 130 /* 0x82 Do curly processing and see if rest matches. */
+#define OPEN 131 /* 0x83 Mark this point in input as start of #n. */
+#define CLOSE 132 /* 0x84 Analogous to OPEN. */
+#define REF 133 /* 0x85 Match some already matched string */
+#define REFF 134 /* 0x86 Match already matched string, folded */
+#define REFFL 135 /* 0x87 Match already matched string, folded in loc. */
+#define IFMATCH 136 /* 0x88 Succeeds if the following matches. */
+#define UNLESSM 137 /* 0x89 Fails if the following matches. */
+#define SUSPEND 138 /* 0x8a "Independent" sub-RE. */
+#define IFTHEN 139 /* 0x8b Switch, should be preceeded by switcher . */
+#define GROUPP 140 /* 0x8c Whether the group matched. */
+#define LONGJMP 141 /* 0x8d Jump far away. */
+#define BRANCHJ 142 /* 0x8e BRANCH with long offset. */
+#define EVAL 143 /* 0x8f Execute some Perl code. */
+#define MINMOD 144 /* 0x90 Next operator is not greedy. */
+#define LOGICAL 145 /* 0x91 Next opcode should set the flag only. */
+#define RENUM 146 /* 0x92 Group with independently numbered parens. */
+#define OPTIMIZED 147 /* 0x93 Placeholder for dump. */
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
@@ -126,8 +198,80 @@ EXTCONST U8 PL_regkind[] = {
NSPACE, /* NSPACELUTF8 */
DIGIT, /* DIGIT */
DIGIT, /* DIGITUTF8 */
+ DIGIT, /* DIGITL */
+ DIGIT, /* DIGITLUTF8 */
NDIGIT, /* NDIGIT */
NDIGIT, /* NDIGITUTF8 */
+ NDIGIT, /* NDIGITL */
+ NDIGIT, /* NDIGITLUTF8 */
+ ALNUMC, /* ALNUMC */
+ ALNUMC, /* ALNUMCUTF8 */
+ ALNUMC, /* ALNUMCL */
+ ALNUMC, /* ALNUMCLUTF8 */
+ NALNUMC, /* NALNUMC */
+ NALNUMC, /* NALNUMCUTF8 */
+ NALNUMC, /* NALNUMCL */
+ NALNUMC, /* NALNUMCLUTF8 */
+ ALPHA, /* ALPHA */
+ ALPHA, /* ALPHAUTF8 */
+ ALPHA, /* ALPHAL */
+ ALPHA, /* ALPHALUTF8 */
+ NALPHA, /* NALPHA */
+ NALPHA, /* NALPHAUTF8 */
+ NALPHA, /* NALPHAL */
+ NALPHA, /* NALPHALUTF8 */
+ ASCII, /* ASCII */
+ NASCII, /* NASCII */
+ CNTRL, /* CNTRL */
+ CNTRL, /* CNTRLUTF8 */
+ CNTRL, /* CNTRLL */
+ CNTRL, /* CNTRLLUTF8 */
+ NCNTRL, /* NCNTRL */
+ NCNTRL, /* NCNTRLUTF8 */
+ NCNTRL, /* NCNTRLL */
+ NCNTRL, /* NCNTRLLUTF8 */
+ GRAPH, /* GRAPH */
+ GRAPH, /* GRAPHUTF8 */
+ GRAPH, /* GRAPHL */
+ GRAPH, /* GRAPHLUTF8 */
+ NGRAPH, /* NGRAPH */
+ NGRAPH, /* NGRAPHUTF8 */
+ NGRAPH, /* NGRAPHL */
+ NGRAPH, /* NGRAPHLUTF8 */
+ LOWER, /* LOWER */
+ LOWER, /* LOWERUTF8 */
+ LOWER, /* LOWERL */
+ LOWER, /* LOWERLUTF8 */
+ NLOWER, /* NLOWER */
+ NLOWER, /* NLOWERUTF8 */
+ NLOWER, /* NLOWERL */
+ NLOWER, /* NLOWERLUTF8 */
+ PRINT, /* PRINT */
+ PRINT, /* PRINTUTF8 */
+ PRINT, /* PRINTL */
+ PRINT, /* PRINTLUTF8 */
+ NPRINT, /* NPRINT */
+ NPRINT, /* NPRINTUTF8 */
+ NPRINT, /* NPRINTL */
+ NPRINT, /* NPRINTLUTF8 */
+ PUNCT, /* PUNCT */
+ PUNCT, /* PUNCTUTF8 */
+ PUNCT, /* PUNCTL */
+ PUNCT, /* PUNCTLUTF8 */
+ NPUNCT, /* NPUNCT */
+ NPUNCT, /* NPUNCTUTF8 */
+ NPUNCT, /* NPUNCTL */
+ NPUNCT, /* NPUNCTLUTF8 */
+ UPPER, /* UPPER */
+ UPPER, /* UPPERUTF8 */
+ UPPER, /* UPPERL */
+ UPPER, /* UPPERLUTF8 */
+ NUPPER, /* NUPPER */
+ NUPPER, /* NUPPERUTF8 */
+ NUPPER, /* NUPPERL */
+ NUPPER, /* NUPPERLUTF8 */
+ XDIGIT, /* XDIGIT */
+ NXDIGIT, /* NXDIGIT */
CLUMP, /* CLUMP */
BRANCH, /* BRANCH */
BACK, /* BACK */
@@ -208,8 +352,80 @@ const static U8 regarglen[] = {
0, /* NSPACELUTF8 */
0, /* DIGIT */
0, /* DIGITUTF8 */
+ 0, /* DIGITL */
+ 0, /* DIGITLUTF8 */
0, /* NDIGIT */
0, /* NDIGITUTF8 */
+ 0, /* NDIGITL */
+ 0, /* NDIGITLUTF8 */
+ 0, /* ALNUMC */
+ 0, /* ALNUMCUTF8 */
+ 0, /* ALNUMCL */
+ 0, /* ALNUMCLUTF8 */
+ 0, /* NALNUMC */
+ 0, /* NALNUMCUTF8 */
+ 0, /* NALNUMCL */
+ 0, /* NALNUMCLUTF8 */
+ 0, /* ALPHA */
+ 0, /* ALPHAUTF8 */
+ 0, /* ALPHAL */
+ 0, /* ALPHALUTF8 */
+ 0, /* NALPHA */
+ 0, /* NALPHAUTF8 */
+ 0, /* NALPHAL */
+ 0, /* NALPHALUTF8 */
+ 0, /* ASCII */
+ 0, /* NASCII */
+ 0, /* CNTRL */
+ 0, /* CNTRLUTF8 */
+ 0, /* CNTRLL */
+ 0, /* CNTRLLUTF8 */
+ 0, /* NCNTRL */
+ 0, /* NCNTRLUTF8 */
+ 0, /* NCNTRLL */
+ 0, /* NCNTRLLUTF8 */
+ 0, /* GRAPH */
+ 0, /* GRAPHUTF8 */
+ 0, /* GRAPHL */
+ 0, /* GRAPHLUTF8 */
+ 0, /* NGRAPH */
+ 0, /* NGRAPHUTF8 */
+ 0, /* NGRAPHL */
+ 0, /* NGRAPHLUTF8 */
+ 0, /* LOWER */
+ 0, /* LOWERUTF8 */
+ 0, /* LOWERL */
+ 0, /* LOWERLUTF8 */
+ 0, /* NLOWER */
+ 0, /* NLOWERUTF8 */
+ 0, /* NLOWERL */
+ 0, /* NLOWERLUTF8 */
+ 0, /* PRINT */
+ 0, /* PRINTUTF8 */
+ 0, /* PRINTL */
+ 0, /* PRINTLUTF8 */
+ 0, /* NPRINT */
+ 0, /* NPRINTUTF8 */
+ 0, /* NPRINTL */
+ 0, /* NPRINTLUTF8 */
+ 0, /* PUNCT */
+ 0, /* PUNCTUTF8 */
+ 0, /* PUNCTL */
+ 0, /* PUNCTLUTF8 */
+ 0, /* NPUNCT */
+ 0, /* NPUNCTUTF8 */
+ 0, /* NPUNCTL */
+ 0, /* NPUNCTLUTF8 */
+ 0, /* UPPER */
+ 0, /* UPPERUTF8 */
+ 0, /* UPPERL */
+ 0, /* UPPERLUTF8 */
+ 0, /* NUPPER */
+ 0, /* NUPPERUTF8 */
+ 0, /* NUPPERL */
+ 0, /* NUPPERLUTF8 */
+ 0, /* XDIGIT */
+ 0, /* NXDIGIT */
0, /* CLUMP */
0, /* BRANCH */
0, /* BACK */
@@ -287,8 +503,80 @@ const static char reg_off_by_arg[] = {
0, /* NSPACELUTF8 */
0, /* DIGIT */
0, /* DIGITUTF8 */
+ 0, /* DIGITL */
+ 0, /* DIGITLUTF8 */
0, /* NDIGIT */
0, /* NDIGITUTF8 */
+ 0, /* NDIGITL */
+ 0, /* NDIGITLUTF8 */
+ 0, /* ALNUMC */
+ 0, /* ALNUMCUTF8 */
+ 0, /* ALNUMCL */
+ 0, /* ALNUMCLUTF8 */
+ 0, /* NALNUMC */
+ 0, /* NALNUMCUTF8 */
+ 0, /* NALNUMCL */
+ 0, /* NALNUMCLUTF8 */
+ 0, /* ALPHA */
+ 0, /* ALPHAUTF8 */
+ 0, /* ALPHAL */
+ 0, /* ALPHALUTF8 */
+ 0, /* NALPHA */
+ 0, /* NALPHAUTF8 */
+ 0, /* NALPHAL */
+ 0, /* NALPHALUTF8 */
+ 0, /* ASCII */
+ 0, /* NASCII */
+ 0, /* CNTRL */
+ 0, /* CNTRLUTF8 */
+ 0, /* CNTRLL */
+ 0, /* CNTRLLUTF8 */
+ 0, /* NCNTRL */
+ 0, /* NCNTRLUTF8 */
+ 0, /* NCNTRLL */
+ 0, /* NCNTRLLUTF8 */
+ 0, /* GRAPH */
+ 0, /* GRAPHUTF8 */
+ 0, /* GRAPHL */
+ 0, /* GRAPHLUTF8 */
+ 0, /* NGRAPH */
+ 0, /* NGRAPHUTF8 */
+ 0, /* NGRAPHL */
+ 0, /* NGRAPHLUTF8 */
+ 0, /* LOWER */
+ 0, /* LOWERUTF8 */
+ 0, /* LOWERL */
+ 0, /* LOWERLUTF8 */
+ 0, /* NLOWER */
+ 0, /* NLOWERUTF8 */
+ 0, /* NLOWERL */
+ 0, /* NLOWERLUTF8 */
+ 0, /* PRINT */
+ 0, /* PRINTUTF8 */
+ 0, /* PRINTL */
+ 0, /* PRINTLUTF8 */
+ 0, /* NPRINT */
+ 0, /* NPRINTUTF8 */
+ 0, /* NPRINTL */
+ 0, /* NPRINTLUTF8 */
+ 0, /* PUNCT */
+ 0, /* PUNCTUTF8 */
+ 0, /* PUNCTL */
+ 0, /* PUNCTLUTF8 */
+ 0, /* NPUNCT */
+ 0, /* NPUNCTUTF8 */
+ 0, /* NPUNCTL */
+ 0, /* NPUNCTLUTF8 */
+ 0, /* UPPER */
+ 0, /* UPPERUTF8 */
+ 0, /* UPPERL */
+ 0, /* UPPERLUTF8 */
+ 0, /* NUPPER */
+ 0, /* NUPPERUTF8 */
+ 0, /* NUPPERL */
+ 0, /* NUPPERLUTF8 */
+ 0, /* XDIGIT */
+ 0, /* NXDIGIT */
0, /* CLUMP */
0, /* BRANCH */
0, /* BACK */
@@ -367,43 +655,115 @@ const static char * const reg_name[] = {
"NSPACELUTF8", /* 0x27 */
"DIGIT", /* 0x28 */
"DIGITUTF8", /* 0x29 */
- "NDIGIT", /* 0x2a */
- "NDIGITUTF8", /* 0x2b */
- "CLUMP", /* 0x2c */
- "BRANCH", /* 0x2d */
- "BACK", /* 0x2e */
- "EXACT", /* 0x2f */
- "EXACTF", /* 0x30 */
- "EXACTFL", /* 0x31 */
- "NOTHING", /* 0x32 */
- "TAIL", /* 0x33 */
- "STAR", /* 0x34 */
- "PLUS", /* 0x35 */
- "CURLY", /* 0x36 */
- "CURLYN", /* 0x37 */
- "CURLYM", /* 0x38 */
- "CURLYX", /* 0x39 */
- "WHILEM", /* 0x3a */
- "OPEN", /* 0x3b */
- "CLOSE", /* 0x3c */
- "REF", /* 0x3d */
- "REFF", /* 0x3e */
- "REFFL", /* 0x3f */
- "IFMATCH", /* 0x40 */
- "UNLESSM", /* 0x41 */
- "SUSPEND", /* 0x42 */
- "IFTHEN", /* 0x43 */
- "GROUPP", /* 0x44 */
- "LONGJMP", /* 0x45 */
- "BRANCHJ", /* 0x46 */
- "EVAL", /* 0x47 */
- "MINMOD", /* 0x48 */
- "LOGICAL", /* 0x49 */
- "RENUM", /* 0x4a */
- "OPTIMIZED", /* 0x4b */
+ "DIGITL", /* 0x2a */
+ "DIGITLUTF8", /* 0x2b */
+ "NDIGIT", /* 0x2c */
+ "NDIGITUTF8", /* 0x2d */
+ "NDIGITL", /* 0x2e */
+ "NDIGITLUTF8", /* 0x2f */
+ "ALNUMC", /* 0x30 */
+ "ALNUMCUTF8", /* 0x31 */
+ "ALNUMCL", /* 0x32 */
+ "ALNUMCLUTF8", /* 0x33 */
+ "NALNUMC", /* 0x34 */
+ "NALNUMCUTF8", /* 0x35 */
+ "NALNUMCL", /* 0x36 */
+ "NALNUMCLUTF8", /* 0x37 */
+ "ALPHA", /* 0x38 */
+ "ALPHAUTF8", /* 0x39 */
+ "ALPHAL", /* 0x3a */
+ "ALPHALUTF8", /* 0x3b */
+ "NALPHA", /* 0x3c */
+ "NALPHAUTF8", /* 0x3d */
+ "NALPHAL", /* 0x3e */
+ "NALPHALUTF8", /* 0x3f */
+ "ASCII", /* 0x40 */
+ "NASCII", /* 0x41 */
+ "CNTRL", /* 0x42 */
+ "CNTRLUTF8", /* 0x43 */
+ "CNTRLL", /* 0x44 */
+ "CNTRLLUTF8", /* 0x45 */
+ "NCNTRL", /* 0x46 */
+ "NCNTRLUTF8", /* 0x47 */
+ "NCNTRLL", /* 0x48 */
+ "NCNTRLLUTF8", /* 0x49 */
+ "GRAPH", /* 0x4a */
+ "GRAPHUTF8", /* 0x4b */
+ "GRAPHL", /* 0x4c */
+ "GRAPHLUTF8", /* 0x4d */
+ "NGRAPH", /* 0x4e */
+ "NGRAPHUTF8", /* 0x4f */
+ "NGRAPHL", /* 0x50 */
+ "NGRAPHLUTF8", /* 0x51 */
+ "LOWER", /* 0x52 */
+ "LOWERUTF8", /* 0x53 */
+ "LOWERL", /* 0x54 */
+ "LOWERLUTF8", /* 0x55 */
+ "NLOWER", /* 0x56 */
+ "NLOWERUTF8", /* 0x57 */
+ "NLOWERL", /* 0x58 */
+ "NLOWERLUTF8", /* 0x59 */
+ "PRINT", /* 0x5a */
+ "PRINTUTF8", /* 0x5b */
+ "PRINTL", /* 0x5c */
+ "PRINTLUTF8", /* 0x5d */
+ "NPRINT", /* 0x5e */
+ "NPRINTUTF8", /* 0x5f */
+ "NPRINTL", /* 0x60 */
+ "NPRINTLUTF8", /* 0x61 */
+ "PUNCT", /* 0x62 */
+ "PUNCTUTF8", /* 0x63 */
+ "PUNCTL", /* 0x64 */
+ "PUNCTLUTF8", /* 0x65 */
+ "NPUNCT", /* 0x66 */
+ "NPUNCTUTF8", /* 0x67 */
+ "NPUNCTL", /* 0x68 */
+ "NPUNCTLUTF8", /* 0x69 */
+ "UPPER", /* 0x6a */
+ "UPPERUTF8", /* 0x6b */
+ "UPPERL", /* 0x6c */
+ "UPPERLUTF8", /* 0x6d */
+ "NUPPER", /* 0x6e */
+ "NUPPERUTF8", /* 0x6f */
+ "NUPPERL", /* 0x70 */
+ "NUPPERLUTF8", /* 0x71 */
+ "XDIGIT", /* 0x72 */
+ "NXDIGIT", /* 0x73 */
+ "CLUMP", /* 0x74 */
+ "BRANCH", /* 0x75 */
+ "BACK", /* 0x76 */
+ "EXACT", /* 0x77 */
+ "EXACTF", /* 0x78 */
+ "EXACTFL", /* 0x79 */
+ "NOTHING", /* 0x7a */
+ "TAIL", /* 0x7b */
+ "STAR", /* 0x7c */
+ "PLUS", /* 0x7d */
+ "CURLY", /* 0x7e */
+ "CURLYN", /* 0x7f */
+ "CURLYM", /* 0x80 */
+ "CURLYX", /* 0x81 */
+ "WHILEM", /* 0x82 */
+ "OPEN", /* 0x83 */
+ "CLOSE", /* 0x84 */
+ "REF", /* 0x85 */
+ "REFF", /* 0x86 */
+ "REFFL", /* 0x87 */
+ "IFMATCH", /* 0x88 */
+ "UNLESSM", /* 0x89 */
+ "SUSPEND", /* 0x8a */
+ "IFTHEN", /* 0x8b */
+ "GROUPP", /* 0x8c */
+ "LONGJMP", /* 0x8d */
+ "BRANCHJ", /* 0x8e */
+ "EVAL", /* 0x8f */
+ "MINMOD", /* 0x90 */
+ "LOGICAL", /* 0x91 */
+ "RENUM", /* 0x92 */
+ "OPTIMIZED", /* 0x93 */
};
-const static int reg_num = 76;
+const static int reg_num = 148;
#endif /* DEBUGGING */
#endif /* REG_COMP_C */
diff --git a/t/op/pat.t b/t/op/pat.t
index a086c12eaf..6312c75cea 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -282,14 +282,7 @@ eval qq("${context}y" =~ /(?<=$context)y/);
print "not " if $@ !~ m%^\Q/(?<=\Ex+/: lookbehind longer than 255 not%;
print "ok 71\n";
-# This one will fail when POSIX character classes do get implemented
-{
- my $w;
- local $^W = 1;
- local $SIG{__WARN__} = sub{$w = shift};
- eval q('a' =~ /[[:alpha:]]/);
- print "not " if $w !~ /^\QCharacter class syntax [: :] is reserved/;
-}
+# removed test
print "ok 72\n";
# Long Monsters
diff --git a/t/op/re_tests b/t/op/re_tests
index 466fc856c9..cbcb7251b1 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -474,9 +474,37 @@ $(?<=^(a)) a y $1 a
([[=]+) a=[b]= y $1 =[
([[.]+) a.[b]. y $1 .[
[a[:xyz: - c - /[a[:xyz:/: unmatched [] in regexp
-[a[:xyz:] - c - /[a[:xyz:]/: unmatched [] in regexp
+[a[:xyz:] - c - Character class [:xyz:] unknown
[a[:]b[:c] abc y $& abc
-([a[:xyz:]b]+) pbaq y $1 ba
+([a[:xyz:]b]+) pbaq c - Character class [:xyz:] unknown
+[a[:]b[:c] abc y $& abc
+([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
+([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
+([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul}
+([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}
+([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
+([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
+([[:lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd
+([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
+([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __--
+([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1
+([[:word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__
+([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
+([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01
+([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
+([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff}
+([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff}
+([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
+([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
+([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
+([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff}
+([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
+([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
+([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 -- ${nulnul}${ffff}
+([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01
+([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff}
+[[:foo:]] - c - Character class [:foo:] unknown
+[[:^foo:]] - c - Character class [:^foo:] unknown
((?>a+)b) aaab y $1 aaab
(?>(a+))b aaab y $1 aaa
((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x
diff --git a/t/op/regexp.t b/t/op/regexp.t
index 66b2d1c116..4ffe1362c6 100755
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -47,6 +47,8 @@ seek(TESTS,0,0);
$. = 0;
$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
+$ffff = chr(0xff) x 2;
+$nulnul = "\0" x 2;
$| = 1;
print "1..$numtests\n# $iters iterations\n";
@@ -59,12 +61,16 @@ while (<TESTS>) {
infty_subst(\$pat);
infty_subst(\$expect);
$pat = "'$pat'" unless $pat =~ /^[:']/;
- $pat =~ s/\\n/\n/g;
$pat =~ s/(\$\{\w+\})/$1/eeg;
+ $pat =~ s/\\n/\n/g;
+ $subject =~ s/(\$\{\w+\})/$1/eeg;
$subject =~ s/\\n/\n/g;
+ $expect =~ s/(\$\{\w+\})/$1/eeg;
$expect =~ s/\\n/\n/g;
$expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
$skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
+ # Certain tests don't work with utf8 (the re_test should be in UTF8); only read hint bit 0x08 here, never clear it
+ $skip = 1 if ($^H & 0x00000008) && $pat =~ /\[:\^(alnum|print|word):\]/;
$result =~ s/B//i unless $skip;
for $study ('', 'study \$subject') {
$c = $iters;
@@ -75,7 +81,7 @@ while (<TESTS>) {
last; # no need to study a syntax error
}
elsif ( $skip ) {
- print "ok $. # Skipped: not fixed yet\n"; next TEST;
+ print "ok $. # skipped\n"; next TEST;
}
elsif ($@) {
print "not ok $. $input => error `$err'\n"; next TEST;
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index e5b3bb5cd3..5e467ae053 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -6,7 +6,7 @@ BEGIN {
$ENV{PERL5LIB} = '../lib';
}
-print "1..3\n";
+print "1..9\n";
my $test = 1;
@@ -34,4 +34,37 @@ sub ok {
s/([$rx])/"&#".ord($1).";"/eg;
ok $_, '>&#9786;<';
$test++;
+
+ $_ = "alpha,numeric";
+ m/([[:alpha:]]+)/;
+ ok $1, 'alpha';
+ $test++;
+
+ $_ = "alphaNUMERICstring";
+ m/([[:^lower:]]+)/;
+ ok $1, 'NUMERIC';
+ $test++;
+
+ $_ = "alphaNUMERICstring";
+ m/(\p{Ll}+)/;
+ ok $1, 'alpha';
+ $test++;
+
+ $_ = "alphaNUMERICstring";
+ m/(\p{Lu}+)/;
+ ok $1, 'NUMERIC';
+ $test++;
+
+ $_ = "alpha,numeric";
+ m/([\p{IsAlpha}]+)/;
+ ok $1, 'alpha';
+ $test++;
+
+ $_ = "alphaNUMERICstring";
+ m/([^\p{IsLower}]+)/;
+ ok $1, 'NUMERIC';
+ $test++;
+
}
+
+
diff --git a/t/pragma/warn/regcomp b/t/pragma/warn/regcomp
index 52a163a2f5..0f48c67f92 100644
--- a/t/pragma/warn/regcomp
+++ b/t/pragma/warn/regcomp
@@ -8,9 +8,6 @@
/(?=a)?/
- Character class syntax [: :] is reserved for future extensions
- /[a[:xyz:]b]/
-
Character class syntax [. .] is reserved for future extensions
Character class syntax [= =] is reserved for future extensions
@@ -32,22 +29,21 @@ Strange *+?{} on zero-length expression at - line 4.
# regcomp.c
use warning 'unsafe' ;
$_ = "" ;
-/[a[:xyz:]b]/;
/[a[.xyz.]b]/;
/[a[=xyz=]b]/;
EXPECT
-Character class syntax [: :] is reserved for future extensions at - line 4.
-Character class syntax [. .] is reserved for future extensions at - line 5.
-Character class syntax [= =] is reserved for future extensions at - line 6.
+Character class syntax [. .] is reserved for future extensions at - line 4.
+Character class syntax [= =] is reserved for future extensions at - line 5.
########
# regcomp.c
-use warning 'unsafe' ;
-# use utf8 ; # Note this line should be uncommented when utf8 gets fixed.
+use warning 'unsafe' ;
$_ = "" ;
-/[a[:xyz:]b]/;
-/[a[.xyz.]b]/;
-/[a[=xyz=]b]/;
+/[:foo:]/;
+/[.bar.]/;
+/[=zog=]/;
EXPECT
-Character class syntax [: :] is reserved for future extensions at - line 5.
-Character class syntax [. .] is reserved for future extensions at - line 6.
-Character class syntax [= =] is reserved for future extensions at - line 7.
+Character class syntax [: :] belongs inside character classes at - line 4.
+Character class syntax [. .] belongs inside character classes at - line 5.
+Character class syntax [. .] is reserved for future extensions at - line 5.
+Character class syntax [= =] belongs inside character classes at - line 6.
+Character class syntax [= =] is reserved for future extensions at - line 6.
diff --git a/utf8.c b/utf8.c
index 8c7aee2d89..0e52f211f6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -255,6 +255,14 @@ Perl_is_uni_alnum(pTHX_ U32 c)
}
bool
+Perl_is_uni_alnumc(pTHX_ U32 c)
+{
+    /* Encode the code point c as UTF-8 into a scratch buffer, then let
+     * the UTF-8 predicate is_utf8_alnumc() classify the encoded form. */
+    U8 encoded[10];
+
+    uv_to_utf8(encoded, (UV)c);
+    return is_utf8_alnumc(encoded);
+}
+
+bool
Perl_is_uni_idfirst(pTHX_ U32 c)
{
U8 tmpbuf[10];
@@ -303,6 +311,22 @@ Perl_is_uni_lower(pTHX_ U32 c)
}
bool
+Perl_is_uni_cntrl(pTHX_ U32 c)
+{
+    /* Encode the code point c as UTF-8 into a scratch buffer, then let
+     * the UTF-8 predicate is_utf8_cntrl() classify the encoded form. */
+    U8 encoded[10];
+
+    uv_to_utf8(encoded, (UV)c);
+    return is_utf8_cntrl(encoded);
+}
+
+bool
+Perl_is_uni_graph(pTHX_ U32 c)
+{
+    /* Encode the code point c as UTF-8 into a scratch buffer, then let
+     * the UTF-8 predicate is_utf8_graph() classify the encoded form. */
+    U8 encoded[10];
+
+    uv_to_utf8(encoded, (UV)c);
+    return is_utf8_graph(encoded);
+}
+
+bool
Perl_is_uni_print(pTHX_ U32 c)
{
U8 tmpbuf[10];
@@ -310,6 +334,14 @@ Perl_is_uni_print(pTHX_ U32 c)
return is_utf8_print(tmpbuf);
}
+/* Encode the code point c as UTF-8 and return what is_utf8_punct() says
+ * about the encoded form.
+ *
+ * Defined as Perl_is_uni_punct(pTHX_ ...) like every sibling predicate in
+ * this file, so that it takes the same context parameter as the others
+ * and callers such as Perl_is_uni_punct_lc() can keep writing
+ * is_uni_punct(c). */
+bool
+Perl_is_uni_punct(pTHX_ U32 c)
+{
+    U8 tmpbuf[10];
+
+    uv_to_utf8(tmpbuf, (UV)c);
+    return is_utf8_punct(tmpbuf);
+}
+
U32
Perl_to_uni_upper(pTHX_ U32 c)
{
@@ -343,6 +375,12 @@ Perl_is_uni_alnum_lc(pTHX_ U32 c)
}
bool
+Perl_is_uni_alnumc_lc(pTHX_ U32 c)
+{
+    /* XXX no locale support yet: simply forwards to is_uni_alnumc(). */
+    const bool verdict = is_uni_alnumc(c);
+
+    return verdict;
+}
+
+bool
Perl_is_uni_idfirst_lc(pTHX_ U32 c)
{
return is_uni_idfirst(c); /* XXX no locale support yet */
@@ -379,11 +417,29 @@ Perl_is_uni_lower_lc(pTHX_ U32 c)
}
bool
+Perl_is_uni_cntrl_lc(pTHX_ U32 c)
+{
+    /* XXX no locale support yet: simply forwards to is_uni_cntrl(). */
+    const bool verdict = is_uni_cntrl(c);
+
+    return verdict;
+}
+
+bool
+Perl_is_uni_graph_lc(pTHX_ U32 c)
+{
+    /* XXX no locale support yet: simply forwards to is_uni_graph(). */
+    const bool verdict = is_uni_graph(c);
+
+    return verdict;
+}
+
+bool
Perl_is_uni_print_lc(pTHX_ U32 c)
{
return is_uni_print(c); /* XXX no locale support yet */
}
+bool
+Perl_is_uni_punct_lc(pTHX_ U32 c)
+{
+    /* XXX no locale support yet: simply forwards to is_uni_punct(). */
+    const bool verdict = is_uni_punct(c);
+
+    return verdict;
+}
+
U32
Perl_to_uni_upper_lc(pTHX_ U32 c)
{
@@ -402,7 +458,6 @@ Perl_to_uni_lower_lc(pTHX_ U32 c)
return to_uni_lower(c); /* XXX no locale support yet */
}
-
bool
Perl_is_utf8_alnum(pTHX_ U8 *p)
{
@@ -419,6 +474,21 @@ Perl_is_utf8_alnum(pTHX_ U8 *p)
}
bool
+Perl_is_utf8_alnumc(pTHX_ U8 *p)
+{
+    /* Look p up in the "IsAlnumC" swash, loading the swash on first use.
+     *
+     * The table is cached in its own slot, PL_utf8_alnumc.  Caching it in
+     * PL_utf8_alnum would share one slot between two differently named
+     * tables (presumably Perl_is_utf8_alnum() fills that slot too), so
+     * whichever predicate ran first would decide the answers of both.
+     *
+     * NOTE(review): relies on PL_utf8_alnumc being declared alongside the
+     * other per-class swash variables (intrpvar.h) -- confirm.
+     * NOTE(review): confirm a table named "IsAlnumC" is actually
+     * installed; this change appears to ship Is/Alnum.pl and Is/Word.pl.
+     */
+    if (!PL_utf8_alnumc)
+        PL_utf8_alnumc = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
+    return swash_fetch(PL_utf8_alnumc, p);
+}
+
+bool
Perl_is_utf8_idfirst(pTHX_ U8 *p)
{
return *p == '_' || is_utf8_alpha(p);
@@ -433,6 +503,14 @@ Perl_is_utf8_alpha(pTHX_ U8 *p)
}
bool
+Perl_is_utf8_ascii(pTHX_ U8 *p)
+{
+    /* Load the "IsAscii" swash on first use and keep it in PL_utf8_ascii;
+     * the answer for p is whatever swash_fetch() finds in that table.
+     * NOTE(review): the table file in this change is listed as
+     * Is/ASCII.pl -- confirm the "IsAscii" lookup resolves to it on
+     * case-sensitive file systems. */
+    if (!PL_utf8_ascii) {
+        PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
+    }
+
+    return swash_fetch(PL_utf8_ascii, p);
+}
+
+bool
Perl_is_utf8_space(pTHX_ U8 *p)
{
if (!PL_utf8_space)
@@ -465,6 +543,22 @@ Perl_is_utf8_lower(pTHX_ U8 *p)
}
bool
+Perl_is_utf8_cntrl(pTHX_ U8 *p)
+{
+    /* Load the "IsCntrl" swash on first use and keep it in PL_utf8_cntrl;
+     * the answer for p is whatever swash_fetch() finds in that table. */
+    if (!PL_utf8_cntrl) {
+        PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
+    }
+
+    return swash_fetch(PL_utf8_cntrl, p);
+}
+
+bool
+Perl_is_utf8_graph(pTHX_ U8 *p)
+{
+    /* Load the "IsGraph" swash on first use and keep it in PL_utf8_graph;
+     * the answer for p is whatever swash_fetch() finds in that table. */
+    if (!PL_utf8_graph) {
+        PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
+    }
+
+    return swash_fetch(PL_utf8_graph, p);
+}
+
+bool
Perl_is_utf8_print(pTHX_ U8 *p)
{
if (!PL_utf8_print)
@@ -473,6 +567,22 @@ Perl_is_utf8_print(pTHX_ U8 *p)
}
bool
+Perl_is_utf8_punct(pTHX_ U8 *p)
+{
+    /* Load the "IsPunct" swash on first use and keep it in PL_utf8_punct;
+     * the answer for p is whatever swash_fetch() finds in that table. */
+    if (!PL_utf8_punct) {
+        PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
+    }
+
+    return swash_fetch(PL_utf8_punct, p);
+}
+
+bool
+Perl_is_utf8_xdigit(pTHX_ U8 *p)
+{
+    /* Load the "IsXDigit" swash on first use and keep it in
+     * PL_utf8_xdigit; the answer for p is whatever swash_fetch() finds in
+     * that table. */
+    if (!PL_utf8_xdigit) {
+        PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
+    }
+
+    return swash_fetch(PL_utf8_xdigit, p);
+}
+
+bool
Perl_is_utf8_mark(pTHX_ U8 *p)
{
if (!PL_utf8_mark)