summaryrefslogtreecommitdiff
path: root/cpan
diff options
context:
space:
mode:
author Steve Hay <steve.m.hay@googlemail.com> 2015-10-13 14:17:08 +0100
committer Steve Hay <steve.m.hay@googlemail.com> 2015-10-13 14:17:08 +0100
commit 3ece07bcb5b3313741afeab4751444de186b8349 (patch)
tree 386687d858c98e2da289903a033b9b00df524583 /cpan
parent bfcc951916d5b5da0cfb24827ed20e7e8cdc3e8d (diff)
download perl-3ece07bcb5b3313741afeab4751444de186b8349.tar.gz
Upgrade Unicode::Normalize from version 1.19 to 1.21
Diffstat (limited to 'cpan')
-rw-r--r-- cpan/Unicode-Normalize/Normalize.pm 83
-rw-r--r-- cpan/Unicode-Normalize/t/func.t 77
2 files changed, 99 insertions, 61 deletions
diff --git a/cpan/Unicode-Normalize/Normalize.pm b/cpan/Unicode-Normalize/Normalize.pm
index 27514f2f83..b828543766 100644
--- a/cpan/Unicode-Normalize/Normalize.pm
+++ b/cpan/Unicode-Normalize/Normalize.pm
@@ -16,7 +16,7 @@ use Carp;
no warnings 'utf8';
-our $VERSION = '1.19';
+our $VERSION = '1.21';
our $PACKAGE = __PACKAGE__;
our @EXPORT = qw( NFC NFD NFKC NFKD );
@@ -45,9 +45,29 @@ sub pack_U {
}
sub unpack_U {
+
+ # The empty pack returns an empty UTF-8 string, so the effect is to force
+ # the shifted parameter into being UTF-8. This shouldn't matter; the
+ # commit messages seem to point to an attempt to get things to work in
+ # EBCDIC in 5.8.
return unpack('U*', shift(@_).pack('U*'));
}
+BEGIN {
+ # Starting in v5.20, the tables in lib/unicore are built using the
+ # platform's native character set for code points 0-255. Things like the
+ # combining class and compositions exclusions are all above 255, so it
+ # doesn't matter for them.
+
+ *pack_unicore = ($] ge 5.020)
+ ? sub { return pack('W*', @_); }
+ : \&pack_U;
+
+ *unpack_unicore = ($] ge 5.020)
+ ? sub { return unpack('W*', $_[0]); }
+ : \&unpack_U;
+}
+
require Exporter;
our @ISA = qw(Exporter);
@@ -70,7 +90,9 @@ our $Decomp = do "unicore/Decomposition.pl"
|| do "unicode/Decomposition.pl"
|| croak "$PACKAGE: Decomposition.pl not found";
-# CompositionExclusions.txt since Unicode 3.2.0
+# CompositionExclusions.txt since Unicode 3.2.0. Modern perl versions allow
+# one to get this table from Unicode::UCD, so if it ever changes, it might be
+# better to retrieve it from there, rather than hard-coding it here.
our @CompEx = qw(
0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
@@ -106,7 +128,7 @@ sub decomposeHangul {
VBase + $vindex,
$tindex ? (TBase + $tindex) : (),
);
- return wantarray ? @ret : pack_U(@ret);
+ return wantarray ? @ret : pack_unicore(@ret);
}
########## getting full decomposition ##########
@@ -223,7 +245,7 @@ sub getCombinClass ($) {
sub getCanon ($) {
my $uv = 0 + shift;
return exists $Canon{$uv}
- ? pack_U(@{ $Canon{$uv} })
+ ? pack_unicore(@{ $Canon{$uv} })
: (SBase <= $uv && $uv <= SFinal)
? scalar decomposeHangul($uv)
: undef;
@@ -232,7 +254,7 @@ sub getCanon ($) {
sub getCompat ($) {
my $uv = 0 + shift;
return exists $Compat{$uv}
- ? pack_U(@{ $Compat{$uv} })
+ ? pack_unicore(@{ $Compat{$uv} })
: (SBase <= $uv && $uv <= SFinal)
? scalar decomposeHangul($uv)
: undef;
@@ -310,10 +332,10 @@ sub isNFKC_NO ($) {
sub decompose ($;$)
{
my $hash = $_[1] ? \%Compat : \%Canon;
- return pack_U map {
+ return pack_unicore map {
$hash->{ $_ } ? @{ $hash->{ $_ } } :
(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) : $_
- } unpack_U($_[0]);
+ } unpack_unicore($_[0]);
}
##
@@ -321,7 +343,7 @@ sub decompose ($;$)
##
sub reorder ($)
{
- my @src = unpack_U($_[0]);
+ my @src = unpack_unicore($_[0]);
for (my $i=0; $i < @src;) {
$i++, next if ! $Combin{ $src[$i] };
@@ -335,7 +357,7 @@ sub reorder ($)
@src[ $ini .. $i - 1 ] = @src[ @tmp ];
}
- return pack_U(@src);
+ return pack_unicore(@src);
}
@@ -350,7 +372,7 @@ sub reorder ($)
##
sub compose ($)
{
- my @src = unpack_U($_[0]);
+ my @src = unpack_unicore($_[0]);
for (my $s = 0; $s+1 < @src; $s++) {
next unless defined $src[$s] && ! $Combin{ $src[$s] };
@@ -377,7 +399,7 @@ sub compose ($)
if ($blocked) { $blocked = 0 } else { -- $uncomposed_cc }
}
}
- return pack_U(grep defined, @src);
+ return pack_unicore(grep defined, @src);
}
@@ -386,7 +408,7 @@ sub compose ($)
##
sub composeContiguous ($)
{
- my @src = unpack_U($_[0]);
+ my @src = unpack_unicore($_[0]);
for (my $s = 0; $s+1 < @src; $s++) {
next unless defined $src[$s] && ! $Combin{ $src[$s] };
@@ -402,7 +424,7 @@ sub composeContiguous ($)
$src[$s] = $c; $src[$j] = undef;
}
}
- return pack_U(grep defined, @src);
+ return pack_unicore(grep defined, @src);
}
@@ -426,7 +448,7 @@ sub checkNFD ($)
{
my $preCC = 0;
my $curCC;
- for my $uv (unpack_U($_[0])) {
+ for my $uv (unpack_unicore($_[0])) {
$curCC = $Combin{ $uv } || 0;
return '' if $preCC > $curCC && $curCC != 0;
return '' if exists $Canon{$uv} || (SBase <= $uv && $uv <= SFinal);
@@ -439,7 +461,7 @@ sub checkNFKD ($)
{
my $preCC = 0;
my $curCC;
- for my $uv (unpack_U($_[0])) {
+ for my $uv (unpack_unicore($_[0])) {
$curCC = $Combin{ $uv } || 0;
return '' if $preCC > $curCC && $curCC != 0;
return '' if exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
@@ -452,7 +474,7 @@ sub checkNFC ($)
{
my $preCC = 0;
my($curCC, $isMAYBE);
- for my $uv (unpack_U($_[0])) {
+ for my $uv (unpack_unicore($_[0])) {
$curCC = $Combin{ $uv } || 0;
return '' if $preCC > $curCC && $curCC != 0;
@@ -470,7 +492,7 @@ sub checkNFKC ($)
{
my $preCC = 0;
my($curCC, $isMAYBE);
- for my $uv (unpack_U($_[0])) {
+ for my $uv (unpack_unicore($_[0])) {
$curCC = $Combin{ $uv } || 0;
return '' if $preCC > $curCC && $curCC != 0;
@@ -488,7 +510,7 @@ sub checkFCD ($)
{
my $preCC = 0;
my $curCC;
- for my $uv (unpack_U($_[0])) {
+ for my $uv (unpack_unicore($_[0])) {
# Hangul syllable need not decomposed since cc[any Jamo] == 0;
my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
@@ -503,7 +525,7 @@ sub checkFCC ($)
{
my $preCC = 0;
my($curCC, $isMAYBE);
- for my $uv (unpack_U($_[0])) {
+ for my $uv (unpack_unicore($_[0])) {
# Hangul syllable need not decomposed since cc[any Jamo] == 0;
my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
@@ -527,7 +549,7 @@ sub checkFCC ($)
sub splitOnLastStarter
{
- my $str = pack_U(unpack_U(shift));
+ my $str = pack_unicore(unpack_unicore(shift));
if ($str eq '') {
return ('', '');
}
@@ -537,7 +559,9 @@ sub splitOnLastStarter
do {
$ch = chop($str);
$unproc = $ch.$unproc;
- } while (getCombinClass(unpack 'U', $ch) && $str ne "");
+ } # Relies on the fact that the combining class for code points < 256 is
+ # 0, so don't have to worry about EBCDIC issues
+ while (getCombinClass(unpack 'U', $ch) && $str ne "");
return ($str, $unproc);
}
@@ -1019,22 +1043,29 @@ C<normalize> and other some functions: on request.
Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
-normalization implemented by this module depends on your perl's version.
+normalization implemented by this module depends on what has been
+compiled into your perl. The following table lists the default Unicode
+version that comes with various perl versions. (It is possible to change
+the Unicode version in any perl version to be any earlier Unicode version,
+so one could cause Unicode 3.2 to be used in any perl version starting with
+5.8.0. See C<$Config{privlib}>/F<unicore/README.perl>.)
perl's version implemented Unicode version
5.6.1 3.0.1
5.7.2 3.1.0
5.7.3 3.1.1 (normalization is same as 3.1.0)
5.8.0 3.2.0
- 5.8.1-5.8.3 4.0.0
- 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
- 5.8.7-5.8.8 4.1.0
+ 5.8.1-5.8.3 4.0.0
+ 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
+ 5.8.7-5.8.8 4.1.0
5.10.0 5.0.0
- 5.8.9, 5.10.1 5.1.0
+ 5.8.9, 5.10.1 5.1.0
5.12.x 5.2.0
5.14.x 6.0.0
5.16.x 6.1.0
5.18.x 6.2.0
+ 5.20.x 6.3.0
+ 5.22.x 7.0.0
=item Correction of decomposition mapping
diff --git a/cpan/Unicode-Normalize/t/func.t b/cpan/Unicode-Normalize/t/func.t
index b92ada7f21..7295b4792e 100644
--- a/cpan/Unicode-Normalize/t/func.t
+++ b/cpan/Unicode-Normalize/t/func.t
@@ -39,19 +39,25 @@ ok(1);
sub _pack_U { Unicode::Normalize::pack_U(@_) }
sub hexU { _pack_U map hex, split ' ', shift }
+# This won't work on EBCDIC platforms prior to v5.8.0, which is when this
+# translation function was defined
+*to_native = (defined &utf8::unicode_to_native)
+ ? \&utf8::unicode_to_native
+ : sub { return shift };
+
#########################
-ok(getCombinClass( 0), 0);
-ok(getCombinClass( 41), 0);
-ok(getCombinClass( 65), 0);
+ok(getCombinClass( to_native(0)), 0);
+ok(getCombinClass(to_native(41)), 0);
+ok(getCombinClass(to_native(65)), 0);
ok(getCombinClass( 768), 230);
ok(getCombinClass(1809), 36);
-ok(getCanon( 0), undef);
-ok(getCanon(0x29), undef);
-ok(getCanon(0x41), undef);
-ok(getCanon(0x00C0), _pack_U(0x0041, 0x0300));
-ok(getCanon(0x00EF), _pack_U(0x0069, 0x0308));
+ok(getCanon(to_native( 0)), undef);
+ok(getCanon(to_native(0x29)), undef);
+ok(getCanon(to_native(0x41)), undef);
+ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300));
+ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308));
ok(getCanon(0x304C), _pack_U(0x304B, 0x3099));
ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
@@ -64,11 +70,11 @@ ok(getCanon(0xFA2D), _pack_U(0x9DB4));
# 20
-ok(getCompat( 0), undef);
-ok(getCompat(0x29), undef);
-ok(getCompat(0x41), undef);
-ok(getCompat(0x00C0), _pack_U(0x0041, 0x0300));
-ok(getCompat(0x00EF), _pack_U(0x0069, 0x0308));
+ok(getCompat(to_native( 0)), undef);
+ok(getCompat(to_native(0x29)), undef);
+ok(getCompat(to_native(0x41)), undef);
+ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300));
+ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308));
ok(getCompat(0x304C), _pack_U(0x304B, 0x3099));
ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
@@ -81,17 +87,17 @@ ok(getCompat(0xFA2D), _pack_U(0x9DB4));
# 34
-ok(getComposite( 0, 0), undef);
-ok(getComposite( 0, 0x29), undef);
-ok(getComposite(0x29, 0), undef);
-ok(getComposite(0x29, 0x29), undef);
-ok(getComposite( 0, 0x41), undef);
-ok(getComposite(0x41, 0), undef);
-ok(getComposite(0x41, 0x41), undef);
-ok(getComposite(12, 0x0300), undef);
-ok(getComposite(0x0055, 0xFF00), undef);
-ok(getComposite(0x0041, 0x0300), 0x00C0);
-ok(getComposite(0x0055, 0x0300), 0x00D9);
+ok(getComposite(to_native( 0), to_native( 0)), undef);
+ok(getComposite(to_native( 0), to_native(0x29)), undef);
+ok(getComposite(to_native(0x29), to_native( 0)), undef);
+ok(getComposite(to_native(0x29), to_native(0x29)), undef);
+ok(getComposite(to_native( 0), to_native(0x41)), undef);
+ok(getComposite(to_native(0x41), to_native( 0)), undef);
+ok(getComposite(to_native(0x41), to_native(0x41)), undef);
+ok(getComposite(to_native(12), to_native(0x0300)), undef);
+ok(getComposite(to_native(0x0055), 0xFF00), undef);
+ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0));
+ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9));
ok(getComposite(0x0112, 0x0300), 0x1E14);
ok(getComposite(0x1100, 0x1161), 0xAC00);
ok(getComposite(0x1100, 0x1173), 0xADF8);
@@ -120,11 +126,11 @@ sub uprops {
return $r;
}
-ok(uprops(0x0000), 'xsnfbdmckyg'); # NULL
-ok(uprops(0x0029), 'xsnfbdmckyg'); # RIGHT PARENTHESIS
-ok(uprops(0x0041), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A
-ok(uprops(0x00A0), 'xsnfbdmcKyG'); # NO-BREAK SPACE
-ok(uprops(0x00C0), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE
+ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL
+ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS
+ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A
+ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE
+ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE
ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT
ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS
ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA
@@ -266,12 +272,13 @@ ok(normalize('NFC', $2), "ABC");
# a string with initial zero should be treated like a number
# LATIN CAPITAL LETTER A WITH GRAVE
-ok(getCombinClass("0192"), 0);
-ok(getCanon ("0192"), _pack_U(0x41, 0x300));
-ok(getCompat("0192"), _pack_U(0x41, 0x300));
-ok(getComposite("065", "0768"), 192);
-ok(isNFD_NO ("0192"));
-ok(isNFKD_NO("0192"));
+ok(getCombinClass(sprintf("0%d", to_native(192))), 0);
+ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300));
+ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300));
+my $lead_zero = sprintf "0%d", to_native(65);
+ok(getComposite($lead_zero, "0768"), to_native(192));
+ok(isNFD_NO (sprintf("0%d", to_native(192))));
+ok(isNFKD_NO(sprintf("0%d", to_native(192))));
# DEVANAGARI LETTER QA
ok(isExclusion("02392"));