summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorRafael Garcia-Suarez <rgarciasuarez@gmail.com>2003-09-07 19:12:05 +0000
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2003-09-07 19:12:05 +0000
commit06c8fc8f09dc8f7e52006b1a902e84e1587b786f (patch)
treeef1fc4371fd44769715bdf5c7ed3f4b8f5a0ee6a /lib/Unicode
parent35ed0d3c8f1120489361bf37c4e66472d2262576 (diff)
downloadperl-06c8fc8f09dc8f7e52006b1a902e84e1587b786f.tar.gz
Upgrade to Unicode::Collate 0.28
p4raw-id: //depot/perl@21064
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/Collate.pm84
-rw-r--r--lib/Unicode/Collate/Changes10
-rw-r--r--lib/Unicode/Collate/README2
-rw-r--r--lib/Unicode/Collate/t/contract.t146
-rw-r--r--lib/Unicode/Collate/t/test.t12
5 files changed, 220 insertions, 34 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 2bcc3155d2..18ed44626c 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
require Exporter;
-our $VERSION = '0.27';
+our $VERSION = '0.28';
our $PACKAGE = __PACKAGE__;
our @ISA = qw(Exporter);
@@ -53,7 +53,7 @@ use constant NOMATCHPOS => -1;
# A coderef to get combining class imported from Unicode::Normalize
# (i.e. \&Unicode::Normalize::getCombinClass).
# This is also used as a HAS_UNICODE_NORMALIZE flag.
-our $getCombinClass;
+our $CVgetCombinClass;
# Supported Levels
use constant MinLevel => 1;
@@ -225,17 +225,16 @@ sub checkCollator {
croak "Unicode/Normalize.pm is required to normalize strings: $@"
if $@;
- $getCombinClass = \&Unicode::Normalize::getCombinClass
- if ! $getCombinClass;
+ $CVgetCombinClass = \&Unicode::Normalize::getCombinClass
+ if ! $CVgetCombinClass;
- my $norm = $self->{normalization};
- $self->{normCode} = sub {
+ if ($self->{normalization} ne 'prenormalized') {
+ my $norm = $self->{normalization};
+ $self->{normCode} = sub {
Unicode::Normalize::normalize($norm, shift);
};
-
- eval { $self->{normCode}->("") }; # try
- if ($@) {
- croak "$PACKAGE unknown normalization form name: $norm";
+ eval { $self->{normCode}->("") }; # try
+ $@ and croak "$PACKAGE unknown normalization form name: $norm";
}
}
return;
@@ -261,7 +260,7 @@ sub new
if ! exists $self->{overrideHangul};
$self->{overrideCJK} = ''
if ! exists $self->{overrideCJK};
- $self->{normalization} = 'D'
+ $self->{normalization} = 'NFD'
if ! exists $self->{normalization};
$self->{alternate} = $self->{alternateTable} || 'shifted'
if ! exists $self->{alternate};
@@ -490,19 +489,31 @@ sub splitCE
$i = $p;
}
}
- }
- # with Combining Char (UTS#10, 4.2.1).
- # requires Unicode::Normalize.
- # Not be $wLen, as not croaked due to $norm.
- if ($getCombinClass) {
- for (my $p = $i + 1; $p < @src; $p++) {
- next if ! defined $src[$p];
- last unless $getCombinClass->($src[$p]);
- my $tail = CODE_SEP . $src[$p];
- if ($ent->{$ce.$tail}) {
- $ce .= $tail;
- $src[$p] = undef;
+ # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
+ # This process requires Unicode::Normalize.
+ # If "normalize" is undef, here should be skipped *always*
+ # (in spite of bool value of $CVgetCombinClass),
+ # since canonical ordering cannot be expected.
+ # Blocked combining character should not be contracted.
+
+ if ($self->{normalization})
+ # $self->{normCode} is false in the case of "prenormalized".
+ {
+ my $preCC = 0;
+ my $curCC = 0;
+
+ for (my $p = $i + 1; $p < @src; $p++) {
+ next if ! defined $src[$p];
+ $curCC = $CVgetCombinClass->($src[$p]);
+ last unless $curCC;
+ my $tail = CODE_SEP . $src[$p];
+ if ($preCC != $curCC && $ent->{$ce.$tail}) {
+ $ce .= $tail;
+ $src[$p] = undef;
+ } else {
+ $preCC = $curCC;
+ }
}
}
}
@@ -1128,16 +1139,37 @@ If specified, strings are normalized before preparation of sort keys
A form name C<Unicode::Normalize::normalize()> accepts will be applied
as C<$normalization_form>.
+Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
See C<Unicode::Normalize::normalize()> for detail.
If omitted, C<'NFD'> is used.
L<normalization> is performed after L<preprocess> (if defined).
-If C<undef> is passed explicitly as the value for this key,
+Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
+though they are not concerned with C<Unicode::Normalize::normalize()>.
+
+If C<undef> (not a string C<"undef">) is passed explicitly
+as the value for this key,
any normalization is not carried out (this may make tailoring easier
if any normalization is not desired).
-
-see B<CAVEAT>.
+Under C<(normalization =E<gt> undef)>, only contiguous contractions
+are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
+even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
+In this point,
+C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
+B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
+
+In the case of C<(normalization =E<gt> "prenormalized")>,
+any normalization is not performed, but
+non-contiguous contractions with combining characters are performed.
+Therefore
+C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
+B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
+If source strings are finely prenormalized,
+C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
+
+Except C<(normalization =E<gt> undef)>,
+B<Unicode::Normalize> is required (see also B<CAVEAT>).
=item overrideCJK
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes
index 4f61b8332f..3d39bbe248 100644
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,8 +1,16 @@
Revision history for Perl module Unicode::Collate.
+0.28 Sat Sep 06 20:16:01 2003
+ - Fixed another inconsistency under (normalization => undef):
+ Non-contiguous contraction is always neglected.
+ - Fixed: according to S2.1 in UTS #10, a blocked combining character
+ should not be contracted. One test in test.t was wrong, then removed.
+ - Added contract.t.
+ - (normalization => "prenormalized") is able to be used.
+
0.27 Sun Aug 31 22:23:17 2003
some improvements:
- - The maximum length of contracted CE was not checked.
+ - The maximum length of contracted CE was not checked (v0.22 to v0.26).
Collation of a large string including a first letter of a contraction
that is not a part of that contraction (say, 'c' of 'ca'
where 'ch' is defined) was too slow, inefficient.
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README
index 21e1ff8b48..3c86573ec3 100644
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.27
+Unicode/Collate version 0.28
===============================
NAME
diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t
new file mode 100644
index 0000000000..c2aaecfaa7
--- /dev/null
+++ b/lib/Unicode/Collate/t/contract.t
@@ -0,0 +1,146 @@
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 40 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+use vars qw($IsEBCDIC);
+$IsEBCDIC = ord("A") != 0x41;
+
+our $kjeEntry = <<'ENTRIES';
+0301 ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT
+0334 ; [.0000.008B.0002.0334] # COMBINING TILDE OVERLAY
+043A ; [.0D31.0020.0002.043A] # CYRILLIC SMALL LETTER KA
+041A ; [.0D31.0020.0008.041A] # CYRILLIC CAPITAL LETTER KA
+045C ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE
+043A 0301 ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE
+040C ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE
+041A 0301 ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE
+ENTRIES
+
+our $aaEntry = <<'ENTRIES';
+0304 ; [.0000.005A.0002.0304] # COMBINING MACRON (cc = 230)
+030A ; [.0000.0043.0002.030A] # COMBINING RING ABOVE (cc = 230)
+0327 ; [.0000.0055.0002.0327] # COMBINING CEDILLA (cc = 202)
+031A ; [.0000.006B.0002.031A] # COMBINING LEFT ANGLE ABOVE (cc = 232)
+0061 ; [.0A15.0020.0002.0061] # LATIN SMALL LETTER A
+0041 ; [.0A15.0020.0008.0041] # LATIN CAPITAL LETTER A
+007A ; [.0C13.0020.0002.007A] # LATIN SMALL LETTER Z
+005A ; [.0C13.0020.0008.005A] # LATIN CAPITAL LETTER Z
+00E5 ; [.0C25.0020.0002.00E5] # LATIN SMALL LETTER A WITH RING ABOVE; QQCM
+00C5 ; [.0C25.0020.0008.00C5] # LATIN CAPITAL LETTER A WITH RING ABOVE; QQCM
+0061 030A ; [.0C25.0020.0002.0061] # LATIN SMALL LETTER A WITH RING ABOVE
+0041 030A ; [.0C25.0020.0008.0041] # LATIN CAPITAL LETTER A WITH RING ABOVE
+ENTRIES
+
+#########################
+
+ok(1); # If we made it this far, we're ok.
+
+my $kjeNoN = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => undef,
+ entry => $kjeEntry,
+);
+
+ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+our %sortkeys;
+
+$sortkeys{'KAac'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}");
+$sortkeys{'KAta'} = $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}");
+$sortkeys{'KAat'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}");
+
+eval { require Unicode::Normalize };
+if (!$@ && !$IsEBCDIC) {
+ my $kjeNFD = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ entry => $kjeEntry,
+ );
+ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+ my $aaNFD = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ entry => $aaEntry,
+ );
+
+ok($aaNFD->lt("Z", "A\x{30A}\x{304}"));
+ok($aaNFD->eq("A", "A\x{304}\x{30A}"));
+ok($aaNFD->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaNFD->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{327}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{30A}\x{327}"));
+ok($aaNFD->lt("Z", "A\x{31A}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{30A}\x{31A}"));
+
+ my $aaPre = Unicode::Collate->new(
+ level => 1,
+ normalization => "prenormalized",
+ table => undef,
+ entry => $aaEntry,
+ );
+
+ok($aaPre->lt("Z", "A\x{30A}\x{304}"));
+ok($aaPre->eq("A", "A\x{304}\x{30A}"));
+ok($aaPre->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaPre->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{327}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{30A}\x{327}"));
+ok($aaPre->lt("Z", "A\x{31A}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{30A}\x{31A}"));
+}
+else {
+ ok(1) for 1..20;
+}
+
+# again: loading Unicode::Normalize should not affect $kjeNoN.
+ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+ok($sortkeys{'KAac'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}"));
+ok($sortkeys{'KAta'}, $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}"));
+ok($sortkeys{'KAat'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}"));
+
+my $aaNoN = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ entry => $aaEntry,
+ normalization => undef,
+);
+
+ok($aaNoN->lt("Z", "A\x{30A}\x{304}"));
+ok($aaNoN->eq("A", "A\x{304}\x{30A}"));
+ok($aaNoN->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaNoN->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaNoN->eq("A", "A\x{327}\x{30A}"));
+ok($aaNoN->lt("Z", "A\x{30A}\x{327}"));
+ok($aaNoN->eq("A", "A\x{31A}\x{30A}"));
+ok($aaNoN->lt("Z", "A\x{30A}\x{31A}"));
+
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t
index de4ca59d17..0c170e422a 100644
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -15,7 +15,7 @@ BEGIN {
}
use Test;
-BEGIN { plan tests => 199 };
+BEGIN { plan tests => 200 };
use strict;
use warnings;
@@ -86,7 +86,8 @@ eval { require Unicode::Normalize };
if (!$@ && !$IsEBCDIC) {
my $NFD = Unicode::Collate->new(
- table => undef,
+ table => 'keys.txt',
+ level => 1,
entry => <<'ENTRIES',
0430 ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A
0410 ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A
@@ -101,14 +102,11 @@ ENTRIES
ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}"));
ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B"));
ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A"));
- ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}",
- "\x{0430}\x{309A}\x{3099}\x{0308}") );
}
else {
ok(1);
ok(1);
ok(1);
- ok(1);
}
##############
@@ -117,7 +115,7 @@ my $trad = Unicode::Collate->new(
table => 'keys.txt',
normalization => undef,
ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/,
- level => 4,
+ level => 3,
entry => << 'ENTRIES',
0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish
0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish
@@ -138,6 +136,8 @@ ok(
);
ok($trad->eq("ocho", "oc\cAho")); # UCA v9
ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9
+ok($trad->eq("-", "")); # also UCA v8
+ok($trad->lt("oc-ho", "ocho")); # also UCA v8
my $hiragana = "\x{3042}\x{3044}";
my $katakana = "\x{30A2}\x{30A4}";