Upgrade to Unicode::Collate 0.28

p4raw-id: //depot/perl@21064
author: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2003-09-07 19:12:05 +0000
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2003-09-07 19:12:05 +0000
commit: 06c8fc8f09dc8f7e52006b1a902e84e1587b786f (patch)
tree: ef1fc4371fd44769715bdf5c7ed3f4b8f5a0ee6a /lib/Unicode
parent: 35ed0d3c8f1120489361bf37c4e66472d2262576 (diff)
download: perl-06c8fc8f09dc8f7e52006b1a902e84e1587b786f.tar.gz
5 files changed, 220 insertions, 34 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 2bcc3155d2..18ed44626c 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
 
 require Exporter;
 
-our $VERSION = '0.27';
+our $VERSION = '0.28';
 our $PACKAGE = __PACKAGE__;
 
 our @ISA = qw(Exporter);
@@ -53,7 +53,7 @@ use constant NOMATCHPOS => -1;
 # A coderef to get combining class imported from Unicode::Normalize
 # (i.e. \&Unicode::Normalize::getCombinClass).
 # This is also used as a HAS_UNICODE_NORMALIZE flag.
-our $getCombinClass;
+our $CVgetCombinClass;
 
 # Supported Levels
 use constant MinLevel => 1;
@@ -225,17 +225,16 @@ sub checkCollator {
 	croak "Unicode/Normalize.pm is required to normalize strings: $@"
 	    if $@;
 
-	$getCombinClass = \&Unicode::Normalize::getCombinClass
-	    if ! $getCombinClass;
+	$CVgetCombinClass = \&Unicode::Normalize::getCombinClass
+	    if ! $CVgetCombinClass;
 
-	my $norm = $self->{normalization};
-	$self->{normCode} = sub {
+	if ($self->{normalization} ne 'prenormalized') {
+	    my $norm = $self->{normalization};
+	    $self->{normCode} = sub {
 		Unicode::Normalize::normalize($norm, shift);
 	    };
-
-	eval { $self->{normCode}->("") }; # try
-	if ($@) {
-	    croak "$PACKAGE unknown normalization form name: $norm";
+	    eval { $self->{normCode}->("") }; # try
+	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
 	}
     }
     return;
@@ -261,7 +260,7 @@ sub new
 	if ! exists $self->{overrideHangul};
     $self->{overrideCJK} = ''
 	if ! exists $self->{overrideCJK};
-    $self->{normalization} = 'D'
+    $self->{normalization} = 'NFD'
 	if ! exists $self->{normalization};
     $self->{alternate} = $self->{alternateTable} || 'shifted'
 	if ! exists $self->{alternate};
@@ -490,19 +489,31 @@ sub splitCE
 		    $i = $p;
 		}
 	    }
-	}
 
-	# with Combining Char (UTS#10, 4.2.1).
-	# requires Unicode::Normalize.
-	# Not be $wLen, as not croaked due to $norm.
-	if ($getCombinClass) {
-	    for (my $p = $i + 1; $p < @src; $p++) {
-		next if ! defined $src[$p];
-		last unless $getCombinClass->($src[$p]);
-		my $tail = CODE_SEP . $src[$p];
-		if ($ent->{$ce.$tail}) {
-		    $ce .= $tail;
-		    $src[$p] = undef;
+	# not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
+	# This process requires Unicode::Normalize.
+	# If "normalize" is undef, here should be skipped *always*
+	# (in spite of bool value of $CVgetCombinClass),
+	# since canonical ordering cannot be expected.
+	# Blocked combining character should not be contracted.
+
+	    if ($self->{normalization})
+	    # $self->{normCode} is false in the case of "prenormalized".
+	    {
+		my $preCC = 0;
+		my $curCC = 0;
+
+		for (my $p = $i + 1; $p < @src; $p++) {
+		    next if ! defined $src[$p];
+		    $curCC = $CVgetCombinClass->($src[$p]);
+		    last unless $curCC;
+		    my $tail = CODE_SEP . $src[$p];
+		    if ($preCC != $curCC && $ent->{$ce.$tail}) {
+			$ce .= $tail;
+			$src[$p] = undef;
+		    } else {
+			$preCC = $curCC;
+		    }
 		}
 	    }
 	}
@@ -1128,16 +1139,37 @@ If specified, strings are normalized before preparation of sort keys
 
 A form name C<Unicode::Normalize::normalize()> accepts will be applied
 as C<$normalization_form>.
+Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
 See C<Unicode::Normalize::normalize()> for detail.
 If omitted, C<'NFD'> is used.
 
 L<normalization> is performed after L<preprocess> (if defined).
 
-If C<undef> is passed explicitly as the value for this key,
+Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
+though they are not concerned with C<Unicode::Normalize::normalize()>.
+
+If C<undef> (not a string C<"undef">) is passed explicitly
+as the value for this key,
 any normalization is not carried out (this may make tailoring easier
 if any normalization is not desired).
-
-see B<CAVEAT>.
+Under C<(normalization =E<gt> undef)>, only contiguous contractions
+are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
+even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
+In this point,
+C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
+B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
+
+In the case of C<(normalization =E<gt> "prenormalized")>,
+any normalization is not performed, but
+non-contiguous contractions with combining characters are performed.
+Therefore
+C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
+B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
+If source strings are finely prenormalized,
+C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
+
+Except C<(normalization =E<gt> undef)>,
+B<Unicode::Normalize> is required (see also B<CAVEAT>).
 
 =item overrideCJK
 
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes
index 4f61b8332f..3d39bbe248 100644
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,8 +1,16 @@
 Revision history for Perl module Unicode::Collate.
 
+0.28  Sat Sep 06 20:16:01 2003
+    - Fixed another inconsistency under (normalization => undef):
+      Non-contiguous contraction is always neglected.
+    - Fixed: according to S2.1 in UTS #10, a blocked combining character
+      should not be contracted. One test in test.t was wrong, then removed.
+    - Added contract.t.
+    - (normalization => "prenormalized") is able to be used.
+
 0.27  Sun Aug 31 22:23:17 2003
       some improvements:
-    - The maximum length of contracted CE was not checked.
+    - The maximum length of contracted CE was not checked (v0.22 to v0.26).
       Collation of a large string including a first letter of a contraction
       that is not a part of that contraction (say, 'c' of 'ca'
       where 'ch' is defined) was too slow, inefficient.
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README
index 21e1ff8b48..3c86573ec3 100644
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.27
+Unicode/Collate version 0.28
 ===============================
 
 NAME
diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t
new file mode 100644
index 0000000000..c2aaecfaa7
--- /dev/null
+++ b/lib/Unicode/Collate/t/contract.t
@@ -0,0 +1,146 @@
+BEGIN {
+    unless ("A" eq pack('U', 0x41)) {
+	print "1..0 # Unicode::Collate " .
+	    "cannot stringify a Unicode code point\n";
+	exit 0;
+    }
+}
+
+BEGIN {
+    if ($ENV{PERL_CORE}) {
+        chdir('t') if -d 't';
+        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
+use Test;
+BEGIN { plan tests => 40 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+use vars qw($IsEBCDIC);
+$IsEBCDIC = ord("A") != 0x41;
+
+our $kjeEntry = <<'ENTRIES';
+0301  ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT
+0334  ; [.0000.008B.0002.0334] # COMBINING TILDE OVERLAY
+043A  ; [.0D31.0020.0002.043A] # CYRILLIC SMALL LETTER KA
+041A  ; [.0D31.0020.0008.041A] # CYRILLIC CAPITAL LETTER KA
+045C  ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE
+043A 0301 ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE
+040C  ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE
+041A 0301 ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE
+ENTRIES
+
+our $aaEntry = <<'ENTRIES';
+0304  ; [.0000.005A.0002.0304] # COMBINING MACRON (cc = 230)
+030A  ; [.0000.0043.0002.030A] # COMBINING RING ABOVE (cc = 230)
+0327  ; [.0000.0055.0002.0327] # COMBINING CEDILLA (cc = 202)
+031A  ; [.0000.006B.0002.031A] # COMBINING LEFT ANGLE ABOVE (cc = 232)
+0061  ; [.0A15.0020.0002.0061] # LATIN SMALL LETTER A
+0041  ; [.0A15.0020.0008.0041] # LATIN CAPITAL LETTER A
+007A  ; [.0C13.0020.0002.007A] # LATIN SMALL LETTER Z
+005A  ; [.0C13.0020.0008.005A] # LATIN CAPITAL LETTER Z
+00E5  ; [.0C25.0020.0002.00E5] # LATIN SMALL LETTER A WITH RING ABOVE; QQCM
+00C5  ; [.0C25.0020.0008.00C5] # LATIN CAPITAL LETTER A WITH RING ABOVE; QQCM
+0061 030A ; [.0C25.0020.0002.0061] # LATIN SMALL LETTER A WITH RING ABOVE
+0041 030A ; [.0C25.0020.0008.0041] # LATIN CAPITAL LETTER A WITH RING ABOVE
+ENTRIES
+
+#########################
+
+ok(1); # If we made it this far, we're ok.
+
+my $kjeNoN = Unicode::Collate->new(
+    level => 1,
+    table => undef,
+    normalization => undef,
+    entry => $kjeEntry,
+);
+
+ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+our %sortkeys;
+
+$sortkeys{'KAac'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}");
+$sortkeys{'KAta'} = $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}");
+$sortkeys{'KAat'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}");
+
+eval { require Unicode::Normalize };
+if (!$@ && !$IsEBCDIC) {
+    my $kjeNFD = Unicode::Collate->new(
+	level => 1,
+	table => undef,
+	entry => $kjeEntry,
+    );
+ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+    my $aaNFD = Unicode::Collate->new(
+	level => 1,
+	table => undef,
+	entry => $aaEntry,
+    );
+
+ok($aaNFD->lt("Z", "A\x{30A}\x{304}"));
+ok($aaNFD->eq("A", "A\x{304}\x{30A}"));
+ok($aaNFD->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaNFD->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{327}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{30A}\x{327}"));
+ok($aaNFD->lt("Z", "A\x{31A}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{30A}\x{31A}"));
+
+    my $aaPre = Unicode::Collate->new(
+	level => 1,
+	normalization => "prenormalized",
+	table => undef,
+	entry => $aaEntry,
+    );
+
+ok($aaPre->lt("Z", "A\x{30A}\x{304}"));
+ok($aaPre->eq("A", "A\x{304}\x{30A}"));
+ok($aaPre->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaPre->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{327}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{30A}\x{327}"));
+ok($aaPre->lt("Z", "A\x{31A}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{30A}\x{31A}"));
+}
+else {
+  ok(1) for 1..20;
+}
+
+# again: loading Unicode::Normalize should not affect $kjeNoN.
+ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+ok($sortkeys{'KAac'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}"));
+ok($sortkeys{'KAta'}, $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}"));
+ok($sortkeys{'KAat'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}"));
+
+my $aaNoN = Unicode::Collate->new(
+    level => 1,
+    table => undef,
+    entry => $aaEntry,
+    normalization => undef,
+);
+
+ok($aaNoN->lt("Z", "A\x{30A}\x{304}"));
+ok($aaNoN->eq("A", "A\x{304}\x{30A}"));
+ok($aaNoN->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaNoN->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaNoN->eq("A", "A\x{327}\x{30A}"));
+ok($aaNoN->lt("Z", "A\x{30A}\x{327}"));
+ok($aaNoN->eq("A", "A\x{31A}\x{30A}"));
+ok($aaNoN->lt("Z", "A\x{30A}\x{31A}"));
+
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t
index de4ca59d17..0c170e422a 100644
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -15,7 +15,7 @@ BEGIN {
 }
 
 use Test;
-BEGIN { plan tests => 199 };
+BEGIN { plan tests => 200 };
 
 use strict;
 use warnings;
@@ -86,7 +86,8 @@ eval { require Unicode::Normalize };
 
 if (!$@ && !$IsEBCDIC) {
   my $NFD = Unicode::Collate->new(
-    table => undef,
+    table => 'keys.txt',
+    level => 1,
     entry => <<'ENTRIES',
 0430  ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A
 0410  ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A
@@ -101,14 +102,11 @@ ENTRIES
   ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}"));
   ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B"));
   ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A"));
-  ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}",
-              "\x{0430}\x{309A}\x{3099}\x{0308}") );
 }
 else {
   ok(1);
   ok(1);
   ok(1);
-  ok(1);
 }
 
 ##############
@@ -117,7 +115,7 @@ my $trad = Unicode::Collate->new(
   table => 'keys.txt',
   normalization => undef,
   ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/,
-  level => 4,
+  level => 3,
   entry => << 'ENTRIES',
  0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish
  0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish
@@ -138,6 +136,8 @@ ok(
 );
 ok($trad->eq("ocho", "oc\cAho")); # UCA v9
 ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9
+ok($trad->eq("-", "")); # also UCA v8
+ok($trad->lt("oc-ho", "ocho")); # also UCA v8
 
 my $hiragana = "\x{3042}\x{3044}";
 my $katakana = "\x{30A2}\x{30A4}";
author	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2003-09-07 19:12:05 +0000
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2003-09-07 19:12:05 +0000
commit	06c8fc8f09dc8f7e52006b1a902e84e1587b786f (patch)
tree	ef1fc4371fd44769715bdf5c7ed3f4b8f5a0ee6a /lib/Unicode
parent	35ed0d3c8f1120489361bf37c4e66472d2262576 (diff)
download	perl-06c8fc8f09dc8f7e52006b1a902e84e1587b786f.tar.gz