summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/utf8_heavy.pl1
-rw-r--r--pod/perlunicode.pod26
-rw-r--r--regcomp.c4
-rwxr-xr-xt/op/pat.t32
4 files changed, 52 insertions, 11 deletions
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 3e145deb09..025a70bd12 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -53,6 +53,7 @@ sub SWASHNEW {
{
$list ||= do "$file.pl"
+ || do "unicore/Is/$type.pl"
|| croak("Can't find Unicode character property \"$type\"");
}
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 4864909e35..4d6be20ffa 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -169,17 +169,27 @@ character with the Unicode uppercase property, while C<\p{M}> matches
any mark character. Single letter properties may omit the brackets,
so that can be written C<\pM> also. Many predefined character classes
are available, such as C<\p{IsMirrored}> and C<\p{InTibetan}>.
-The recommended naming convention of the C<In> classes are the
-official Unicode script and block names, but with all non-alphanumeric
-characters removed, for example the block name C<"Latin-1 Supplement">
-becomes C<\p{InLatin1Supplement}>. Perl will ignore the case of
-letters, and any space or dash can be a space, dash, underbar, or be
-missing altogether, so C<\p{ in latin 1 supplement }> will work, too.
+
+The C<\p{Is...}> classes test for "general properties" such as "letter"
+and "digit", while the C<\p{In...}> classes test for Unicode scripts and blocks.
+
+The official Unicode script and block names have spaces and
+dashes as separators, but for convenience you can have
+dashes, spaces, and underbars at every word division, and
+you need not care about correct casing. It is recommended,
+however, that for consistency you use the following naming:
+the official Unicode script or block name (see below for the
+additional rules that apply to block names), with whitespace and
+dashes removed and each word capitalized (first letter uppercase, the
+rest lowercase). For example, "Latin-1 Supplement" becomes "Latin1Supplement".
+
You can also negate both C<\p{}> and C<\P{}> by introducing a caret
(^) between the first curly and the property name: C<\p{^InTamil}> is
-equal to C<\P{Tamil}>.
+equal to C<\P{InTamil}>.
+
+The C<In> prefix can be left out: C<\p{Greek}> is equal to C<\p{InGreek}>.
-Here is the list as of Unicode 3.1.0 (the two-letter classes) and
+Here is the list as of Unicode 3.1.1 (the two-letter classes) and
as defined by Perl (the one-letter classes) (in Unicode materials
what Perl calls C<L> is often called C<L&>):
diff --git a/regcomp.c b/regcomp.c
index dda273d7bd..96bafd3162 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2881,7 +2881,7 @@ tryagain:
if (!RExC_end) {
RExC_parse += 2;
RExC_end = oldregxend;
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", UCHARAT(RExC_parse - 2));
}
RExC_end++;
}
@@ -3085,7 +3085,7 @@ tryagain:
/* FALL THROUGH */
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
- vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+ vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
diff --git a/t/op/pat.t b/t/op/pat.t
index a3f652230c..fa4d1b38ee 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
$| = 1;
-print "1..717\n";
+print "1..722\n";
BEGIN {
chdir 't' if -d 't';
@@ -2127,12 +2127,42 @@ sub ok ($$) {
print "ok 715\n";
}
+print "# some Unicode properties\n";
+
{
+ # Dashes, underbars, case.
print "not " unless "\x80" =~ /\p{in-latin1_SUPPLEMENT}/;
print "ok 716\n";
}
{
+ # Complement, leading and trailing whitespace.
print "not " unless "\x80" =~ /\P{ ^ In Latin 1 Supplement }/;
print "ok 717\n";
}
+
+{
+ # No leading "In" prefix, dashes, case.
+ print "not " unless "\x80" =~ /\p{latin-1-supplement}/;
+ print "ok 718\n";
+}
+
+{
+ print "not " unless "a" =~ /\pL/;
+ print "ok 719\n";
+}
+
+{
+ print "not " unless "a" =~ /\p{IsLl}/;
+ print "ok 720\n";
+}
+
+{
+ print "not " unless "A" =~ /\pL/;
+ print "ok 721\n";
+}
+
+{
+ print "not " unless "A" =~ /\p{IsLu}/;
+ print "ok 722\n";
+}