diff options
-rw-r--r-- | lib/utf8_heavy.pl | 1 | ||||
-rw-r--r-- | pod/perlunicode.pod | 26 | ||||
-rw-r--r-- | regcomp.c | 4 | ||||
-rwxr-xr-x | t/op/pat.t | 32 |
4 files changed, 52 insertions, 11 deletions
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 3e145deb09..025a70bd12 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -53,6 +53,7 @@ sub SWASHNEW { { $list ||= do "$file.pl" + || do "unicore/Is/$type.pl" || croak("Can't find Unicode character property \"$type\""); } diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4864909e35..4d6be20ffa 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -169,17 +169,27 @@ character with the Unicode uppercase property, while C<\p{M}> matches any mark character. Single letter properties may omit the brackets, so that can be written C<\pM> also. Many predefined character classes are available, such as C<\p{IsMirrored}> and C<\p{InTibetan}>. -The recommended naming convention of the C<In> classes are the -official Unicode script and block names, but with all non-alphanumeric -characters removed, for example the block name C<"Latin-1 Supplement"> -becomes C<\p{InLatin1Supplement}>. Perl will ignore the case of -letters, and any space or dash can be a space, dash, underbar, or be -missing altogether, so C<\p{ in latin 1 supplement }> will work, too. + +The C<\p{Is...}> classes test for "general properties" such as "letter", +"digit", while the C<\p{In...}> classes test for Unicode scripts and blocks. + +The official Unicode script and block names have spaces and +dashes as separators, but for convenience you can have +dashes, spaces, and underbars at every word division, and +you need not care about correct casing. It is recommended, +however, that for consistency you use the following naming: +the official Unicode script or block name (see below for +the additional rules that apply to block names), with the whitespace +and dashes removed, and the words "uppercase-first-lowercase-otherwise". +That is, "Latin-1 Supplement" becomes "Latin1Supplement". + You can also negate both C<\p{}> and C<\P{}> by introducing a caret (^) between the first curly and the property name: C<\p{^InTamil}> is -equal to C<\P{Tamil}>. 
+equal to C<\P{InTamil}>. + +The C<In> can be left out: C<\p{Greek}> is equal to C<\p{InGreek}>. -Here is the list as of Unicode 3.1.0 (the two-letter classes) and +Here is the list as of Unicode 3.1.1 (the two-letter classes) and as defined by Perl (the one-letter classes) (in Unicode materials what Perl calls C<L> is often called C<L&>): diff --git a/regcomp.c b/regcomp.c --- a/regcomp.c +++ b/regcomp.c @@ -2881,7 +2881,7 @@ tryagain: if (!RExC_end) { RExC_parse += 2; RExC_end = oldregxend; - vFAIL("Missing right brace on \\p{}"); + vFAIL2("Missing right brace on \\%c{}", UCHARAT(RExC_parse - 2)); } RExC_end++; } @@ -3085,7 +3085,7 @@ tryagain: /* FALL THROUGH */ default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) - vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); + vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; diff --git a/t/op/pat.t b/t/op/pat.t index a3f652230c..fa4d1b38ee 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..717\n"; +print "1..722\n"; BEGIN { chdir 't' if -d 't'; @@ -2127,12 +2127,42 @@ sub ok ($$) { print "ok 715\n"; } +print "# some Unicode properties\n"; + { + # Dashes, underbars, case. print "not " unless "\x80" =~ /\p{in-latin1_SUPPLEMENT}/; print "ok 716\n"; } { + # Complement, leading and trailing whitespace. print "not " unless "\x80" =~ /\P{ ^ In Latin 1 Supplement }/; print "ok 717\n"; } + +{ + # No ^In, dashes, case. + print "not " unless "\x80" =~ /\p{latin-1-supplement}/; + print "ok 718\n"; +} + +{ + print "not " unless "a" =~ /\pL/; + print "ok 719\n"; +} + +{ + print "not " unless "a" =~ /\p{IsLl}/; + print "ok 720\n"; +} + +{ + print "not " unless "A" =~ /\pL/; + print "ok 721\n"; +} + +{ + print "not " unless "A" =~ /\p{IsLu}/; + print "ok 722\n"; +} |