From 4d36a948d441eba21c37ec6739bbfa44cf858f85 Mon Sep 17 00:00:00 2001 From: SADAHIRO Tomoyuki Date: Fri, 6 Sep 2002 08:28:32 +0900 Subject: Unicode::Collate 0.23 Released Message-Id: <20020905232316.8151.BQW10602@nifty.com> p4raw-id: //depot/perl@17865 --- lib/Unicode/Collate.pm | 865 ++++++++++++++++++++++++++++-------------- lib/Unicode/Collate/Changes | 14 + lib/Unicode/Collate/README | 6 +- lib/Unicode/Collate/t/index.t | 399 +++++++++++++++++++ lib/Unicode/Collate/t/test.t | 166 ++++---- 5 files changed, 1072 insertions(+), 378 deletions(-) create mode 100644 lib/Unicode/Collate/t/index.t (limited to 'lib/Unicode') diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 61f12010ea..5193559105 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,11 @@ use File::Spec; require Exporter; -our $VERSION = '0.21'; +# Supporting on EBCDIC platform is not tested. +# Tester(s) welcome! +our $IsEBCDIC = ord("A") != 0x41; + +our $VERSION = '0.23'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -45,18 +49,64 @@ else { # XXX, Perl 5.6.1 } } -our $getCombinClass; # coderef for combining class from Unicode::Normalize - -use constant Min2 => 0x20; # minimum weight at level 2 -use constant Min3 => 0x02; # minimum weight at level 3 - -# format for pack -use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels - -# values of variable +# Perl's boolean +use constant TRUE => 1; +use constant FALSE => ""; +use constant NOMATCHPOS => -1; + +# A coderef to get combining class imported from Unicode::Normalize +# (i.e. \&Unicode::Normalize::getCombinClass). +# This is also used as a HAS_UNICODE_NORMALIZE flag. +our $getCombinClass; + +# Minimum weights at level 2 and 3, respectively +use constant Min2 => 0x20; +use constant Min3 => 0x02; + +# Shifted weight at 4th level +use constant Shift4 => 0xFFFF; + +# Variable weight at 1st level. +# This is a negative value but should be regarded as zero on collation. +# This is for distinction of variable chars from level 3 ignorable chars. +use constant Var1 => -1; + + +# A boolean for Variable and 16-bit weights at 4 levels of Collation Element +# PROBLEM: The Default Unicode Collation Element Table +# has weights over 0xFFFF at the 4th level. +# The tie-breaking in the variable weights +# other than "shift" (as well as "shift-trimmed") is unreliable. +use constant VCE_TEMPLATE => 'Cn4'; + +# Unicode encoding of strings to be collated +# TODO: 'N*' for UTF-32BE, 'V*' for UTF-32LE. +use constant UTF_TEMPLATE => 'U*'; + +# A sort key: 16-bit weights +# See also the PROBLEM on VCE_TEMPLATE above. +use constant KEY_TEMPLATE => 'n*'; + +# Level separator in a sort key: +# i.e. pack(KEY_TEMPLATE, 0) +use constant LEVEL_SEP => "\0\0"; + +# As Unicode code point separator for hash keys. +# A joined code point string (denoted by JCPS below) +# like "65;768" is used for internal processing +# instead of Perl's Unicode string like "\x41\x{300}", +# as the native code point is different from the Unicode code point +# on EBCDIC platform. +# This character must not be included in any stringified +# representation of an integer. +use constant CODE_SEP => ';'; + +# boolean values of variable weights use constant NON_VAR => 0; # Non-Variable character use constant VAR => 1; # Variable character +# Logical_Order_Exception in PropList.txt +# TODO: synchronization with change of PropList.txt. our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; sub UCA_Version { "9" } @@ -78,9 +128,10 @@ our @ChangeNG = qw/ entry entries table combining maxlength ignoreChar ignoreName undefChar undefName versionTable alternateTable backwardsTable forwardsTable rearrangeTable - derivCode normCode rearrangeHash isShift L3ignorable + derivCode normCode rearrangeHash L3_ignorable /; # The hash key 'ignored' is deleted at VERSION 0.21. +# The hash key 'isShift' are deleted at VERSION 0.23. my (%ChangeOK, %ChangeNG); @ChangeOK{ @ChangeOK } = (); @@ -112,18 +163,14 @@ sub checkCollator { if 4 < $self->{level}; $self->{derivCode} = - $self->{UCA_Version} == -1 ? \&broken_derivCE : - $self->{UCA_Version} == 8 ? \&derivCE_8 : - $self->{UCA_Version} == 9 ? \&derivCE_9 : + $self->{UCA_Version} == 8 ? \&_derivCE_8 : + $self->{UCA_Version} == 9 ? \&_derivCE_9 : croak "Illegal UCA version (passed $self->{UCA_Version})."; $self->{alternate} = lc($self->{alternate}); croak "$PACKAGE unknown alternate tag name: $self->{alternate}" unless exists $AlternateOK{ $self->{alternate} }; - $self->{isShift} = $self->{alternate} eq 'shifted' || - $self->{alternate} eq 'shift-trimmed'; - $self->{backwards} = [] if ! defined $self->{backwards}; $self->{backwards} = [ $self->{backwards} ] @@ -238,7 +285,7 @@ sub parseEntry { my $self = shift; my $line = shift; - my($name, $ele, @key); + my($name, $entry, @uv, @key); return if $line !~ /^\s*[0-9A-Fa-f]/; @@ -252,57 +299,77 @@ sub parseEntry croak "Wrong Entry: must be separated by ';' from " if ! $k; - my @e = _getHexArray($e); - return if !@e; + @uv = _getHexArray($e); + return if !@uv; + + $entry = join(CODE_SEP, @uv); # in JCPS - $ele = pack('U*', @e); - return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; + if (defined $self->{undefChar} || defined $self->{ignoreChar}) { + # Do not use UTF_TEMPLATE; Perl' RE is only for utf8. + my $ele = $IsEBCDIC + ? pack('U*', map utf8::unicode_to_native($_), @uv) + : pack('U*', @uv); - my $combining = 1; # primary = 0, secondary != 0; - my $level3ignore; + # regarded as if it were not entried in the table + return + if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; - # replace with completely ignorable - if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ || - defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/) - { - $k = '[.0000.0000.0000.0000]'; + # replaced as completely ignorable + $k = '[.0000.0000.0000.0000]' + if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; } + # replaced as completely ignorable + $k = '[.0000.0000.0000.0000]' + if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; + + my $combining = TRUE; # primary = 0, secondary != 0; + my $is_L3_ignorable; + foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. - my @arr = _getHexArray($arr); - push @key, pack(VCE_FORMAT, $var, @arr); - $combining = 0 unless $arr[0] == 0 && $arr[1] != 0; - $level3ignore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0; + my @wt = _getHexArray($arr); + push @key, pack(VCE_TEMPLATE, $var, @wt); + $combining = FALSE + unless $wt[0] == 0 && $wt[1] != 0; + $is_L3_ignorable = TRUE + if $wt[0] + $wt[1] + $wt[2] == 0; + # if $arr !~ /[1-9A-Fa-f]/; NG + # Conformance Test shows L3-ignorable is completely ignorable. } - $self->{entries}{$ele} = \@key; + $self->{entries}{$entry} = \@key; - $self->{combining}{$ele} = 1 + $self->{combining}{$entry} = TRUE if $combining; - $self->{L3ignorable}{$e[0]} = 1 - if @e == 1 && $level3ignore; + # The key is a string representing a numeral code point. + + $self->{L3_ignorable}{$uv[0]} = TRUE + if @uv == 1 && $is_L3_ignorable; - $self->{maxlength}{ord $ele} = scalar @e if @e > 1; + # Contraction is to be considered in the range of this maxlength. + $self->{maxlength}{$uv[0]} = scalar @uv + if @uv > 1; } ## -## arrayref CE = altCE(bool variable?, list[num] weights) +## arrayref[weights] = altCE(bool variable?, list[num] weights) ## sub altCE { my $self = shift; - my($var, @c) = unpack(VCE_FORMAT, shift); + my($var, @wt) = unpack(VCE_TEMPLATE, shift); $self->{alternate} eq 'blanked' ? - $var ? [0,0,0,$c[3]] : \@c : + $var ? [Var1, 0, 0, $wt[3]] : \@wt : $self->{alternate} eq 'non-ignorable' ? - \@c : + \@wt : $self->{alternate} eq 'shifted' ? - $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] : + $var ? [Var1, 0, 0, $wt[0] ] + : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4 : 0 ] : $self->{alternate} eq 'shift-trimmed' ? - $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] : + $var ? [Var1, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] : croak "$PACKAGE unknown alternate name: $self->{alternate}"; } @@ -312,7 +379,8 @@ sub viewSortKey my $ver = $self->{UCA_Version}; my $key = $self->getSortKey(@_); - my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key; + my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, $key); + if ($ver <= 8) { $view =~ s/ ?0000 ?/|/g; } else { @@ -323,27 +391,46 @@ sub viewSortKey ## -## list[strings] elements = splitCE(string arg) +## arrayref of JCPS = splitCE(string to be collated) +## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true) ## sub splitCE { my $self = shift; + my $wLen = $_[1]; + my $code = $self->{preprocess}; my $norm = $self->{normCode}; my $ent = $self->{entries}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; - my $L3i = $self->{L3ignorable}; + my $ign = $self->{L3_ignorable}; my $ver9 = $self->{UCA_Version} > 8; - my $str = ref $code ? &$code(shift) : shift; - $str = &$norm($str) if ref $norm; + my ($str, @buf); - my @src = unpack('U*', $str); - my @buf; + if ($wLen) { + $code and croak "Preprocess breaks character positions. " + . "Don't use with index(), match(), etc."; + $norm and croak "Normalization breaks character positions. " + . "Don't use with index(), match(), etc."; + $str = $_[0]; + } + else { + $str = $_[0]; + $str = &$code($str) if ref $code; + $str = &$norm($str) if ref $norm; + } - # rearrangement - if ($reH) { + # get array of Unicode code point of string. + my @src = $IsEBCDIC + ? map(utf8::native_to_unicode($_), unpack UTF_TEMPLATE, $str) + : unpack(UTF_TEMPLATE, $str); + + # rearrangement: + # Character positions are not kept if rearranged, + # then neglected if $wLen is true. + if ($reH && ! $wLen) { for (my $i = 0; $i < @src; $i++) { if (exists $reH->{ $src[$i] } && $i + 1 < @src) { ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); @@ -353,75 +440,85 @@ sub splitCE } if ($ver9) { - @src = grep ! $L3i->{$_}, @src; + # To remove a character marked as a completely ignorable. + for (my $i = 0; $i < @src; $i++) { + $src[$i] = undef if $ign->{ $src[$i] }; + } } for (my $i = 0; $i < @src; $i++) { - my $ch; - my $u = $src[$i]; - - # non-characters - next if ! defined $u - || ($u < 0 || 0x10FFFF < $u) # out of range - || (($u & 0xFFFE) == 0xFFFE) # ??FFFE or ??FFFF (cf. utf8.c) - || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates - || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character - ; - - if ($max->{$u}) { # contract - for (my $j = $max->{$u}; $j >= 1; $j--) { - next unless $i+$j-1 < @src; - $ch = pack 'U*', @src[$i .. $i+$j-1]; - $i += $j-1, last if $ent->{$ch}; + next if _isNonCharacter($src[$i]); + + my $i_orig = $i; + my $ce = $src[$i]; + + if ($max->{$ce}) { # contract + my $temp_ce = $ce; + + for (my $p = $i + 1; $p < @src; $p++) { + next if ! defined $src[$p]; + $temp_ce .= CODE_SEP . $src[$p]; + if ($ent->{$temp_ce}) { + $ce = $temp_ce; + $i = $p; + } + } + } + + # with Combining Char (UTS#10, 4.2.1). + # requires Unicode::Normalize. + # Not be $wLen, as not croaked due to $norm. + if ($getCombinClass) { + for (my $p = $i + 1; $p < @src; $p++) { + next if ! defined $src[$p]; + last unless $getCombinClass->($src[$p]); + my $tail = CODE_SEP . $src[$p]; + if ($ent->{$ce.$tail}) { + $ce .= $tail; + $src[$p] = undef; + } } - } else { - $ch = pack('U', $u); } - # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize. - if ($getCombinClass && defined $ch) { - for (my $j = $i+1; $j < @src; $j++) { - next unless defined $src[$j]; - last unless $getCombinClass->( $src[$j] ); - my $comb = pack 'U', $src[$j]; - next if ! $ent->{ $ch.$comb }; - $ch .= $comb; - $src[$j] = undef; + if ($wLen) { + for (my $p = $i + 1; $p < @src; $p++) { + last if defined $src[$p]; + $i = $p; } } - push @buf, $ch; + + push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce; } - wantarray ? @buf : \@buf; + return \@buf; } ## -## list[arrayrefs] weight = getWt(string element) +## list of arrayrefs of weights = getWt(JCPS) ## sub getWt { my $self = shift; - my $ch = shift; + my $ce = shift; my $ent = $self->{entries}; my $cjk = $self->{overrideCJK}; my $hang = $self->{overrideHangul}; my $der = $self->{derivCode}; - return if !defined $ch; - return map($self->altCE($_), @{ $ent->{$ch} }) - if $ent->{$ch}; + return if !defined $ce; + return map($self->altCE($_), @{ $ent->{$ce} }) + if $ent->{$ce}; - my $u = unpack('U', $ch); + # CE must not be a contraction, then it's a code point. + my $u = $ce; - if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul + if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale return map $self->altCE($_), $hang - ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u)) + ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)) : defined $hang ? map({ - my $v = $_; - my $vCE = $ent->{pack('U', $v)}; - $vCE ? @$vCE : $der->($v); + $ent->{$_} ? @{ $ent->{$_} } : $der->($_); } _decompHangul($u)) : $der->($u); } @@ -430,9 +527,9 @@ sub getWt 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph return map $self->altCE($_), $cjk - ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u)) + ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000 - ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u) + ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2, Min3, $u) : $der->($u); } else { @@ -440,89 +537,6 @@ sub getWt } } -## -## int = index(string, substring) -## -sub index -{ - my $self = shift; - my $lev = $self->{level}; - my $comb = $self->{combining}; - my $str = $self->splitCE(shift); - my $sub = $self->splitCE(shift); - - return wantarray ? (0,0) : 0 if ! @$sub; - return wantarray ? () : -1 if ! @$str; - - my @subWt = grep _ignorableAtLevel($_,$lev), - map $self->getWt($_), @$sub; - - my(@strWt,@strPt); - my $count = 0; - for (my $i = 0; $i < @$str; $i++) { - my $go_ahead = 0; - - my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]); - $go_ahead += length $str->[$i]; - - # /*XXX*/ still broken. - # index("e\x{300}", "e") should be 'no match' at level 2 or higher - # as "e\x{300}" is a *single* grapheme cluster and not equal to "e". - - # go ahead as far as we find a combining character; - while ($i + 1 < @$str && - (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) { - $i++; - next if ! defined $str->[$i]; - $go_ahead += length $str->[$i]; - push @tmp, - grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]); - } - - push @strWt, @tmp; - push @strPt, ($count) x @tmp; - $count += $go_ahead; - - while (@strWt >= @subWt) { - if (_eqArray(\@strWt, \@subWt, $lev)) { - my $pos = $strPt[0]; - return wantarray ? ($pos, $count-$pos) : $pos; - } - shift @strWt; - shift @strPt; - } - } - return wantarray ? () : -1; -} - -## -## bool _eqArray(arrayref, arrayref, level) -## -sub _eqArray($$$) -{ - my $a = shift; # length $a >= length $b; - my $b = shift; - my $lev = shift; - for my $v (0..$lev-1) { - for my $c (0..@$b-1){ - return if $a->[$c][$v] != $b->[$c][$v]; - } - } - return 1; -} - - -## -## bool _ignorableAtLevel(CE, level) -## -sub _ignorableAtLevel($$) -{ - my $ce = shift; - return unless defined $ce; - my $lv = shift; - return ! grep { ! $ce->[$_] } 0..$lv-1; -} - ## ## string sortkey = getSortKey(string arg) @@ -531,34 +545,30 @@ sub getSortKey { my $self = shift; my $lev = $self->{level}; - my $rCE = $self->splitCE(shift); # get an arrayref + my $rCE = $self->splitCE(shift); # get an arrayref of JCPS my $ver9 = $self->{UCA_Version} > 8; - my $sht = $self->{isShift}; + my $v2i = $self->{alternate} ne 'non-ignorable'; # weight arrays my (@buf, $last_is_variable); - foreach my $ce (@$rCE) { - my @t = $self->getWt($ce); - if ($sht && $ver9) { - if (@t == 1 && $t[0][0] == 0) { - if ($t[0][1] == 0 && $t[0][2] == 0) { - $last_is_variable = 1; - } else { - next if $last_is_variable; - } + foreach my $wt (map $self->getWt($_), @$rCE) { + if ($v2i && $ver9) { + if ($wt->[0] == 0) { # ignorable + next if $last_is_variable; } else { - $last_is_variable = 0; + $last_is_variable = ($wt->[0] == Var1); } } - push @buf, @t; + push @buf, $wt; } # make sort key my @ret = ([],[],[],[]); foreach my $v (0..$lev-1) { foreach my $b (@buf) { - push @{ $ret[$v] }, $b->[$v] if $b->[$v]; + push @{ $ret[$v] }, $b->[$v] + if 0 < $b->[$v]; } } foreach (@{ $self->{backwards} }) { @@ -581,7 +591,7 @@ sub getSortKey elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana } } - join "\0\0", map pack('n*', @$_), @ret; + join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret; } @@ -608,37 +618,29 @@ sub sort { } -sub derivCE_9 { +sub _derivCE_9 { my $u = shift; my $base = - (0x4E00 <= $u && $u <= 0x9FA5) # CJK - ? 0xFB40 : + (0x4E00 <= $u && $u <= 0x9FA5) + ? 0xFB40 : # CJK (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6) - ? 0xFB80 : 0xFBC0; + ? 0xFB80 # CJK ext. + : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return - pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u), - pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $u); + pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2, Min3, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); } -sub derivCE_8 { +sub _derivCE_8 { my $code = shift; my $aaaa = 0xFF80 + ($code >> 15); my $bbbb = ($code & 0x7FFF) | 0x8000; return - pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code), - pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code); -} - -sub broken_derivCE { # NG - my $code = shift; - my $aaaa = 0xFFC2 + ($code >> 15); - my $bbbb = $code & 0x7FFF | 0x1000; - return - pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code), - pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code); + pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); } ## @@ -647,7 +649,7 @@ sub broken_derivCE { # NG sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } # -# $code must be in Hangul syllable. +# $code *must* be in Hangul syllable. # Check it before you enter here. # sub _decompHangul { @@ -663,6 +665,253 @@ sub _decompHangul { ); } +sub _isNonCharacter { + my $code = shift; + return ! defined $code # removed + || ($code < 0 || 0x10FFFF < $code) # out of range + || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) + || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates + || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters + ; +} + + +## +## bool _nonIgnorAtLevel(arrayref weights, int level) +## +sub _nonIgnorAtLevel($$) +{ + my $wt = shift; + return if ! defined $wt; + my $lv = shift; + return grep($wt->[$_] != 0, 0..$lv-1) ? TRUE : FALSE; +} + +## +## bool _eqArray( +## arrayref of arrayref[weights] source, +## arrayref of arrayref[weights] substr, +## int level) +## * comparison of graphemes vs graphemes. +## @$source >= @$substr must be true (check it before call this); +## +sub _eqArray($$$) +{ + my $source = shift; + my $substr = shift; + my $lev = shift; + + for my $g (0..@$substr-1){ + # Do the $g'th graphemes have the same number of AV weigths? + return if @{ $source->[$g] } != @{ $substr->[$g] }; + + for my $w (0..@{ $substr->[$g] }-1) { + for my $v (0..$lev-1) { + return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; + } + } + } + return 1; +} + +## +## (int position, int length) +## int position = index(string, substring, position, [undoc'ed grobal]) +## +## With "grobal" (only for the list context), +## returns list of arrayref[position, length]. +## +sub index +{ + my $self = shift; + my $str = shift; + my $len = length($str); + my $subCE = $self->splitCE(shift); + my $pos = @_ ? shift : 0; + $pos = 0 if $pos < 0; + my $grob = shift; + + my $comb = $self->{combining}; + my $lev = $self->{level}; + my $ver9 = $self->{UCA_Version} > 8; + my $v2i = $self->{alternate} ne 'non-ignorable'; + + if (! @$subCE) { + my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; + return $grob + ? map([$_, 0], $temp..$len) + : wantarray ? ($temp,0) : $temp; + } + if ($len < $pos) { + return wantarray ? () : NOMATCHPOS; + } + my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE); + if (! @$strCE) { + return wantarray ? () : NOMATCHPOS; + } + my $last_is_variable; + my(@strWt, @iniPos, @finPos, @subWt, @g_ret); + + $last_is_variable = FALSE; + for my $wt (map $self->getWt($_), @$subCE) { + my $to_be_pushed = _nonIgnorAtLevel($wt,$lev); + + if ($v2i && $ver9) { + if ($wt->[0] == 0) { + $to_be_pushed = FALSE if $last_is_variable; + } else { + $last_is_variable = ($wt->[0] == Var1); + } + } + + if (@subWt && $wt->[0] == 0) { + push @{ $subWt[-1] }, $wt if $to_be_pushed; + } else { + $wt->[0] = 0 if $wt->[0] == Var1; + push @subWt, [ $wt ]; + } + } + + my $count = 0; + my $end = @$strCE - 1; + + $last_is_variable = FALSE; + + for (my $i = 0; $i <= $end; ) { # no $i++ + my $found_base = 0; + + # fetch a grapheme + while ($i <= $end && $found_base == 0) { + for my $wt ($self->getWt($strCE->[$i][0])) { + my $to_be_pushed = _nonIgnorAtLevel($wt,$lev); + + if ($v2i && $ver9) { + if ($wt->[0] == 0) { + $to_be_pushed = FALSE if $last_is_variable; + } else { + $last_is_variable = ($wt->[0] == Var1); + } + } + + if (@strWt && $wt->[0] == 0) { + push @{ $strWt[-1] }, $wt if $to_be_pushed; + $finPos[-1] = $strCE->[$i][2]; + } elsif ($to_be_pushed) { + $wt->[0] = 0 if $wt->[0] == Var1; + push @strWt, [ $wt ]; + push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1]; + $finPos[-1] = NOMATCHPOS if $found_base; + push @finPos, $strCE->[$i][2]; + $found_base++; + } + # else ===> no-op + } + $i++; + } + + # try to match + while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { + if ($iniPos[0] != NOMATCHPOS && + $finPos[$#subWt] != NOMATCHPOS && + _eqArray(\@strWt, \@subWt, $lev)) { + my $temp = $iniPos[0] + $pos; + + if ($grob) { + push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; + splice @strWt, 0, $#subWt; + splice @iniPos, 0, $#subWt; + splice @finPos, 0, $#subWt; + } + else { + return wantarray + ? ($temp, $finPos[$#subWt] - $iniPos[0]) + : $temp; + } + } + shift @strWt; + shift @iniPos; + shift @finPos; + } + } + + return $grob + ? @g_ret + : wantarray ? () : NOMATCHPOS; +} + +## +## scalarref to matching part = match(string, substring) +## +sub match +{ + my $self = shift; + if (my($pos,$len) = $self->index($_[0], $_[1])) { + my $temp = substr($_[0], $pos, $len); + return wantarray ? $temp : \$temp; + # An lvalue ref \substr should be avoided, + # since its value is affected by modification of its referent. + } + else { + return; + } +} + +## +## arrayref matching parts = gmatch(string, substring) +## +sub gmatch +{ + my $self = shift; + my $str = shift; + my $sub = shift; + return map substr($str, $_->[0], $_->[1]), + $self->index($str, $sub, 0, 'g'); +} + +## +## bool subst'ed = subst(string, substring, replace) +## +sub subst +{ + my $self = shift; + my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; + + if (my($pos,$len) = $self->index($_[0], $_[1])) { + if ($code) { + my $mat = substr($_[0], $pos, $len); + substr($_[0], $pos, $len, $code->($mat)); + } else { + substr($_[0], $pos, $len, $_[2]); + } + return TRUE; + } + else { + return FALSE; + } +} + +## +## int count = gsubst(string, substring, replace) +## +sub gsubst +{ + my $self = shift; + my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; + my $cnt = 0; + + # Replacement is carried out from the end, then use reverse. + for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { + if ($code) { + my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); + substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); + } else { + substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); + } + $cnt++; + } + return $cnt; +} + 1; __END__ @@ -685,6 +934,10 @@ Unicode::Collate - Unicode Collation Algorithm =head1 DESCRIPTION +This module is an implementation +of Unicode Technical Standard #10 (UTS #10) +"Unicode Collation Algorithm." + =head2 Constructor and Tailoring The C method returns a collator object. @@ -726,7 +979,7 @@ as switching the algorithm would affect the performance.> =item alternate --- see 3.2.2 Variable Weighting, UTR #10. +-- see 3.2.2 Variable Weighting, UTS #10. (the title in UCA version 8: Alternate Weighting) @@ -739,12 +992,12 @@ which are marked with an ASTERISK in the table These names are case-insensitive. By default (if specification is omitted), 'shifted' is adopted. - 'Blanked' Variable elements are ignorable at levels 1 through 3; + 'Blanked' Variable elements are made ignorable at levels 1 through 3; considered at the 4th level. 'Non-ignorable' Variable elements are not reset to ignorable. - 'Shifted' Variable elements are ignorable at levels 1 through 3 + 'Shifted' Variable elements are made ignorable at levels 1 through 3 their level 4 weight is replaced by the old level 1 weight. Level 4 weight for Non-Variable elements is 0xFFFF. @@ -753,7 +1006,7 @@ By default (if specification is omitted), 'shifted' is adopted. =item backwards --- see 3.1.2 French Accents, UTR #10. +-- see 3.1.2 French Accents, UTS #10. backwards => $levelNumber or \@levelNumbers @@ -762,7 +1015,7 @@ If omitted, forwards at all the levels. =item entry --- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10. +-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10. Overrides a default order or defines additional collation elements @@ -772,11 +1025,16 @@ Overrides a default order or defines additional collation elements 0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish ENTRIES +B The code point in the UCA file format (before C<';'>) +B be a Unicode code point, but not a native code point. +So C<0063> must always denote C, +but not a character of C<"\x63">. + =item ignoreName =item ignoreChar --- see Completely Ignorable, 3.2.2 Variable Weighting, UTR #10. +-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10. Makes the entry in the table completely ignorable; i.e. as if the weights were zero at all level. @@ -786,7 +1044,7 @@ E.g. when 'a' and 'e' are ignorable, =item level --- see 4.3 Form a sort key for each string, UTR #10. +-- see 4.3 Form a sort key for each string, UTS #10. Set the maximum level. Any higher levels than the specified one are ignored. @@ -802,7 +1060,7 @@ If omitted, the maximum is the 4th. =item normalization --- see 4.1 Normalize each input string, UTR #10. +-- see 4.1 Normalize each input string, UTS #10. If specified, strings are normalized before preparation of sort keys (the normalization is executed after preprocess). @@ -824,7 +1082,7 @@ see B. =item overrideCJK --- see 7.1 Derived Collation Elements, UTR #10. +-- see 7.1 Derived Collation Elements, UTS #10. By default, mapping of CJK Unified Ideographs uses the Unicode codepoint order. @@ -854,7 +1112,7 @@ in table or L is still valid. =item overrideHangul --- see 7.1 Derived Collation Elements, UTR #10. +-- see 7.1 Derived Collation Elements, UTS #10. By default, Hangul Syllables are decomposed into Hangul Jamo. But the mapping of Hangul Syllables may be overrided. @@ -873,7 +1131,7 @@ in table or L is still valid. =item preprocess --- see 5.1 Preprocessing, UTR #10. +-- see 5.1 Preprocessing, UTS #10. If specified, the coderef is used to preprocess before the formation of sort keys. @@ -889,7 +1147,7 @@ Then, "the pen" is before "a pencil". =item rearrange --- see 3.1.3 Rearrangement, UTR #10. +-- see 3.1.3 Rearrangement, UTS #10. Characters that are not coded in logical order and to be rearranged. By default, @@ -905,7 +1163,7 @@ but it is not warned at present.> =item table --- see 3.2 Default Unicode Collation Element Table, UTR #10. +-- see 3.2 Default Unicode Collation Element Table, UTS #10. You can use another element table if desired. The table file must be in your C directory. @@ -934,7 +1192,7 @@ ENTRIES =item undefChar --- see 6.3.4 Reducing the Repertoire, UTR #10. +-- see 6.3.4 Reducing the Repertoire, UTS #10. Undefines the collation element as if it were unassigned in the table. This reduces the size of the table. @@ -950,7 +1208,7 @@ unfamiliar to you and maybe never used. =item upper_before_lower --- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10. +-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10. By default, lowercase is before uppercase and hiragana is before katakana. @@ -960,9 +1218,9 @@ If the tag is made true, this is reversed. B: These tags simplemindedly assume any lowercase/uppercase or hiragana/katakana distinctions should occur in level 3, and their weights at level 3 -should be same as those mentioned in 7.3.1, UTR #10. +should be same as those mentioned in 7.3.1, UTS #10. If you define your collation elements which violates this, -these tags doesn't work validly. +these tags don't work validly. =back @@ -1003,7 +1261,7 @@ They works like the same name operators as theirs. =item C<$sortKey = $Collator-EgetSortKey($string)> --- see 4.3 Form a sort key for each string, UTR #10. +-- see 4.3 Form a sort key for each string, UTS #10. Returns a sort key. @@ -1028,27 +1286,32 @@ and get the result of the comparison of the strings using UCA. (If C is 8, the output is slightly different.) -=item C<$position = $Collator-Eindex($string, $substring)> +=back + +=head2 Methods for Searching -=item C<($position, $length) = $Collator-Eindex($string, $substring)> +B If C or C tag is true +for C<$Collator>, calling these methods (C, C, C, +C, C) is croaked, +as the position and the length might differ +from those on the specified string. +(And the C tag is neglected.) --- see 6.8 Searching, UTR #10. +The C, C, C, C methods work +like C, C, C, C, respectively, +but they are not aware of any pattern, but only a literal substring. + +=over 4 + +=item C<$position = $Collator-Eindex($string, $substring[, $position])> + +=item C<($position, $length) = $Collator-Eindex($string, $substring[, $position])> If C<$substring> matches a part of C<$string>, returns the position of the first occurrence of the matching part in scalar context; in list context, returns a two-element list of the position and the length of the matching part. -B that the length of the matching part may differ from -the length of C<$substring>. - -B that the position and the length are counted on the string -after the process of preprocess, normalization, and rearrangement. -Therefore, in case the specified string is not binary equal to -the preprocessed/normalized/rearranged string, the position and the length -may differ form those on the specified string. But it is guaranteed -that, if matched, it returns a non-negative value as C<$position>. - If C<$substring> does not match any part of C<$string>, returns C<-1> in scalar context and an empty list in list context. @@ -1056,15 +1319,86 @@ an empty list in list context. e.g. you say my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); - my $str = "Ich mu\x{00DF} studieren."; - my $sub = "m\x{00FC}ss"; + # (normalization => undef) is REQUIRED. + my $str = "Ich muß studieren Perl."; + my $sub = "MÜSS"; my $match; if (my($pos,$len) = $Collator->index($str, $sub)) { $match = substr($str, $pos, $len); } -and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<"> -is primary equal to C<"m>E<252>C. +and get C<"muß"> in C<$match> since C<"muß"> +is primary equal to C<"MÜSS">. + +=item C<$match_ref = $Collator-Ematch($string, $substring)> + +=item C<($match) = $Collator-Ematch($string, $substring)> + +If C<$substring> matches a part of C<$string>, in scalar context, returns +B the first occurrence of the matching part +(C<$match_ref> is always true if matches, +since every reference is B); +in list context, returns the first occurrence of the matching part. + +If C<$substring> does not match any part of C<$string>, +returns C in scalar context and +an empty list in list context. + +e.g. + + if ($match_ref = $Collator->match($str, $sub)) { # scalar context + print "matches [$$match_ref].\n"; + } else { + print "doesn't match.\n"; + } + + or + + if (($match) = $Collator->match($str, $sub)) { # list context + print "matches [$match].\n"; + } else { + print "doesn't match.\n"; + } + +=item C<@match = $Collator-Egmatch($string, $substring)> + +If C<$substring> matches a part of C<$string>, returns +all the matching parts (or matching count in scalar context). + +If C<$substring> does not match any part of C<$string>, +returns an empty list. + +=item C<$count = $Collator-Esubst($string, $substring, $replacement)> + +If C<$substring> matches a part of C<$string>, +the first occurrence of the matching part is replaced by C<$replacement> +(C<$string> is modified) and return C<$count> (always equals to C<1>). + +C<$replacement> can be a C, +taking the matching part as an argument, +and returning a string to replace the matching part +(a bit similar to C($1)/e>). + +=item C<$count = $Collator-Egsubst($string, $substring, $replacement)> + +If C<$substring> matches a part of C<$string>, +all the occurrences of the matching part is replaced by C<$replacement> +(C<$string> is modified) and return C<$count>. + +C<$replacement> can be a C, +taking the matching part as an argument, +and returning a string to replace the matching part +(a bit similar to C($1)/eg>). + +e.g. + + my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); + # (normalization => undef) is REQUIRED. + my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."; + $Collator->gsubst($str, "camel", sub { "$_[0]" }); + + # now $str is "Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."; + # i.e., all the camels are made bold-faced. =back @@ -1102,8 +1436,7 @@ In the scalar context, returns the modified collator =item UCA_Version -Returns the version number of Unicode Technical Standard 10 -this module consults. +Returns the version number of UTS #10 this module consults. =item Base_Unicode_Version @@ -1118,14 +1451,10 @@ None by default. =head2 TODO -Unicode::Collate has not been ported to EBCDIC. The code mostly would -work just fine but a decision needs to be made: how the module should -work in EBCDIC? Should the low 256 characters be understood as -Unicode or as EBCDIC code points? Should one be chosen or should -there be a way to do either? Or should such translation be left -outside the module for the user to do, for example by using -Encode::from_to()? -(or utf8::unicode_to_native()/utf8::native_to_unicode()?) +Unicode::Collate has not been ported to EBCDIC. +IMHO, use of utf8::unicode_to_native()/utf8::native_to_unicode() +at the proper postions should allow +this module to work on EBCDIC platform... =head2 CAVEAT @@ -1136,7 +1465,7 @@ If you need not it (say, in the case when you need not handle any combining characters), assign C undef> explicitly. --- see 6.5 Avoiding Normalization, UTR #10. +-- see 6.5 Avoiding Normalization, UTS #10. =head2 Conformance Test @@ -1149,17 +1478,7 @@ a collator via Cnew( )> should be used; for F, a collator via Cnew(alternate =E "non-ignorable", level =E 3)>. -B - -=head2 BUGS - -C is an experimental method and -its return value may be unreliable. -The correct implementation for C must be based -on Locale-Sensitive Support: Level 3 in UTR #18, -F. - -See also 4.2 Locale-Dependent Graphemes in UTR #18. +B =head1 AUTHOR @@ -1178,7 +1497,7 @@ SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE =item http://www.unicode.org/reports/tr10/ -Unicode Collation Algorithm - UTR #10 +Unicode Collation Algorithm - UTS #10 =item http://www.unicode.org/reports/tr10/allkeys.txt @@ -1193,10 +1512,6 @@ The latest versions of the conformance test for the UCA Unicode Normalization Forms - UAX #15 -=item http://www.unicode.org/reports/tr18 - -Unicode Regular Expression Guidelines - UTR #18 - =item L =back diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index adf2c59891..3e60f0bf79 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,5 +1,19 @@ Revision history for Perl extension Unicode::Collate. +0.23 Wed Sep 04 19:25:20 2002 + - fix: scalar match() no longer returns an lvalue substr ref. + - fix: "Ignorable after variable" should be made level 3 ignorable + even if alternate => 'blanked'. + - Now a grapheme may contain trailing level 2, level 3, + and completely ignorable characters. + +0.22 Mon Sep 02 23:15:14 2002 + - New File: index.t. + (The new test.t excludes tests for index.) + - tweak on index(). POSITION is supported. + - add match, gmatch, subst, gsubst methods. + - fix: ignorable after variable in 'shift'-variable weight. + 0.21 Sat Aug 03 10:24:00 2002 - upgrade keys.txt and t/test.t for UCA Version 9. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 957fc95ba3..d829c77490 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.21 +Unicode/Collate version 0.23 =============================== NAME @@ -22,7 +22,7 @@ SYNOPSIS INSTALLATION -Perl 5.6.1 or better +Perl 5.6.1 or later To install this module type the following: @@ -50,7 +50,7 @@ DEPENDENCIES COPYRIGHT AND LICENCE -SADAHIRO Tomoyuki +SADAHIRO Tomoyuki http://homepage1.nifty.com/nomenclator/perl/ diff --git a/lib/Unicode/Collate/t/index.t b/lib/Unicode/Collate/t/index.t new file mode 100644 index 0000000000..75fcccf29f --- /dev/null +++ b/lib/Unicode/Collate/t/index.t @@ -0,0 +1,399 @@ + +BEGIN { + if (ord("A") == 193) { + print "1..0 # Unicode::Collate not ported to EBCDIC\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 65 }; +use Unicode::Collate; + +our $IsEBCDIC = ord("A") != 0x41; + +######################### + +ok(1); # If we made it this far, we're ok. + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +############## + +my %old_level = $Collator->change(level => 2); + +my $str; + +my $orig = "This is a Perl book."; +my $sub = "PERL"; +my $rep = "camel"; +my $ret = "This is a camel book."; + +$str = $orig; +if (my($pos,$len) = $Collator->index($str, $sub)) { + substr($str, $pos, $len, $rep); +} + +ok($str, $ret); + +$Collator->change(%old_level); + +$str = $orig; +if (my($pos,$len) = $Collator->index($str, $sub)) { + substr($str, $pos, $len, $rep); +} + +ok($str, $orig); + +############## + +my $match; + +$Collator->change(level => 1); + +$str = "Pe\x{300}rl"; +$sub = "pe"; +$ret = "Pe\x{300}"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$str = "P\x{300}e\x{300}\x{301}\x{303}rl"; +$sub = "pE"; +$ret = "P\x{300}e\x{300}\x{301}\x{303}"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$Collator->change(level => 2); + +$str = "Pe\x{300}rl"; +$sub = "pe"; +$ret = undef; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$str = "P\x{300}e\x{300}\x{301}\x{303}rl"; +$sub = "pE"; +$ret = undef; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$str = "Pe\x{300}rl"; +$sub = "pe\x{300}"; +$ret = "Pe\x{300}"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$str = "P\x{300}e\x{300}\x{301}\x{303}rl"; +$sub = "p\x{300}E\x{300}\x{301}\x{303}"; +$ret = "P\x{300}e\x{300}\x{301}\x{303}"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +############## + +$Collator->change(level => 1); + +$str = $IsEBCDIC + ? "Ich mu\x{0059} studieren Perl." + : "Ich mu\x{00DF} studieren Perl."; +$sub = $IsEBCDIC + ? "m\x{00DC}ss" + : "m\x{00FC}ss"; +$ret = $IsEBCDIC + ? "mu\x{0059}" + : "mu\x{00DF}"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$Collator->change(%old_level); + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, undef); + +$match = undef; +if (my($pos,$len) = $Collator->index("", "")) { + $match = substr("", $pos, $len); +} +ok($match, ""); + +$match = undef; +if (my($pos,$len) = $Collator->index("", "abc")) { + $match = substr("", $pos, $len); +} +ok($match, undef); + +############## + +$Collator->change(level => 1); + +$str = "\0\cA\0\cAe\0\x{300}\cA\x{301}\cB\x{302}\0 \0\cA"; +$sub = "e"; +$ret = "e\0\x{300}\cA\x{301}\cB\x{302}\0"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +$Collator->change(level => 1); + +$str = "\0\cA\0\cAe\0\cA\x{300}\0\cAe"; +$sub = "e"; +$ret = "e\0\cA\x{300}\0\cA"; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + + +$Collator->change(%old_level); + +$str = "e\x{300}"; +$sub = "e"; +$ret = undef; +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub)) { + $match = substr($str, $pos, $len); +} +ok($match, $ret); + +############## + +$Collator->change(level => 1); + +$str = "The Perl is a language, and the perl is an interpreter."; +$sub = "PERL"; + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub, -40)) { + $match = substr($str, $pos, $len); +} +ok($match, "Perl"); + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub, 4)) { + $match = substr($str, $pos, $len); +} +ok($match, "Perl"); + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub, 5)) { + $match = substr($str, $pos, $len); +} +ok($match, "perl"); + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub, 32)) { + $match = substr($str, $pos, $len); +} +ok($match, "perl"); + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub, 33)) { + $match = substr($str, $pos, $len); +} +ok($match, undef); + +$match = undef; +if (my($pos, $len) = $Collator->index($str, $sub, 100)) { + $match = substr($str, $pos, $len); +} +ok($match, undef); + +$Collator->change(%old_level); + +############## + +my @ret; + +$Collator->change(level => 1); + +$ret = $Collator->match("P\cBe\x{300}\cBrl and PERL", "pe"); +ok($ret); +ok($$ret eq "P\cBe\x{300}\cB"); + +@ret = $Collator->match("P\cBe\x{300}\cBrl and PERL", "pe"); +ok($ret[0], "P\cBe\x{300}\cB"); + +$str = $IsEBCDIC ? "mu\x{0059}" : "mu\x{00DF}"; +$sub = $IsEBCDIC ? "m\x{00DC}ss" : "m\x{00FC}ss"; + +($ret) = $Collator->match($str, $sub); +ok($ret, $str); + +$str = $IsEBCDIC ? "mu\x{0059}" : "mu\x{00DF}"; +$sub = $IsEBCDIC ? "m\x{00DC}s" : "m\x{00FC}s"; + +($ret) = $Collator->match($str, $sub); +ok($ret, undef); + +$ret = join ':', $Collator->gmatch("P\cBe\x{300}\cBrl, perl, and PERL", "pe"); +ok($ret eq "P\cBe\x{300}\cB:pe:PE"); + +$ret = $Collator->gmatch("P\cBe\x{300}\cBrl, perl, and PERL", "pe"); +ok($ret == 3); + +$str = "ABCDEF"; +$sub = "cde"; +$ret = $Collator->match($str, $sub); +$str = "01234567"; +ok($ret && $$ret, "CDE"); + +$str = "ABCDEF"; +$sub = "cde"; +($ret) = $Collator->match($str, $sub); +$str = "01234567"; +ok($ret, "CDE"); + + +$Collator->change(level => 3); + +$ret = $Collator->match("P\cBe\x{300}\cBrl and PERL", "pe"); +ok($ret, undef); + +@ret = $Collator->match("P\cBe\x{300}\cBrl and PERL", "pe"); +ok(@ret == 0); + +$ret = join ':', $Collator->gmatch("P\cBe\x{300}\cBrl and PERL", "pe"); +ok($ret eq ""); + +$ret = $Collator->gmatch("P\cBe\x{300}\cBrl and PERL", "pe"); +ok($ret == 0); + +$ret = join ':', $Collator->gmatch("P\cBe\x{300}\cBrl, perl, and PERL", "pe"); +ok($ret eq "pe"); + +$ret = $Collator->gmatch("P\cBe\x{300}\cBrl, perl, and PERL", "pe"); +ok($ret == 1); + +$str = $IsEBCDIC ? "mu\x{0059}" : "mu\x{00DF}"; +$sub = $IsEBCDIC ? "m\x{00DC}ss" : "m\x{00FC}ss"; + +($ret) = $Collator->match($str, $sub); +ok($ret, undef); + +$Collator->change(%old_level); + +############## + +$Collator->change(level => 1); + +sub strreverse { scalar reverse shift } + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->subst($str, "perl", 'Camel'); +ok($ret, 1); +ok($str, "Camel and PERL."); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->subst($str, "perl", \&strreverse); +ok($ret, 1); +ok($str, "lr\cB\x{300}e\cBP and PERL."); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->gsubst($str, "perl", 'Camel'); +ok($ret, 2); +ok($str, "Camel and Camel."); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->gsubst($str, "perl", \&strreverse); +ok($ret, 2); +ok($str, "lr\cB\x{300}e\cBP and LREP."); + +$str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."; +$Collator->gsubst($str, "camel", sub { "$_[0]" }); +ok($str, +"Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."); + +$Collator->change(level => 3); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->subst($str, "perl", "Camel"); +ok(! $ret); +ok($str, "P\cBe\x{300}\cBrl and PERL."); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->subst($str, "perl", \&strreverse); +ok(! $ret); +ok($str, "P\cBe\x{300}\cBrl and PERL."); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->gsubst($str, "perl", "Camel"); +ok($ret, 0); +ok($str, "P\cBe\x{300}\cBrl and PERL."); + +$str = "P\cBe\x{300}\cBrl and PERL."; +$ret = $Collator->gsubst($str, "perl", \&strreverse); +ok($ret, 0); +ok($str, "P\cBe\x{300}\cBrl and PERL."); + +$Collator->change(%old_level); + +############## + +$str = "Perl and Camel"; +$ret = $Collator->gsubst($str, "\cA\cA\0", "AB"); +ok($ret, 15); +ok($str, "ABPABeABrABlAB ABaABnABdAB ABCABaABmABeABlAB"); + +$str = ''; +$ret = $Collator->subst($str, "", "ABC"); +ok($ret, 1); +ok($str, "ABC"); + +$str = ''; +$ret = $Collator->gsubst($str, "", "ABC"); +ok($ret, 1); +ok($str, "ABC"); + +$str = 'PPPPP'; +$ret = $Collator->gsubst($str, 'PP', "ABC"); +ok($ret, 2); +ok($str, "ABCABCP"); + +############## + +# Shifted; ignorable after variable + +($ret) = $Collator->match("A?\x{300}!\x{301}\x{344}B\x{315}", "?!"); +ok($ret, "?\x{300}!\x{301}\x{344}"); + +$Collator->change(alternate => 'Non-ignorable'); + +($ret) = $Collator->match("A?\x{300}!\x{301}B\x{315}", "?!"); +ok($ret, undef); diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index 28aba79a0d..971ad6768c 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -14,9 +14,11 @@ BEGIN { } use Test; -BEGIN { plan tests => 184 }; +BEGIN { plan tests => 194 }; use Unicode::Collate; +our $IsEBCDIC = ord("A") != 0x41; + ######################### ok(1); # If we made it this far, we're ok. @@ -51,13 +53,14 @@ ok($Collator->cmp("", "perl"), -1); ############## -my $A_acute = pack('U', 0x00C1); -my $a_acute = pack('U', 0x00E1); +# Use pack('U'), not chr(), for Perl 5.6.1. +my $A_acute = pack('U', $IsEBCDIC ? 0x65 : 0xC1); +my $a_acute = pack('U', $IsEBCDIC ? 0x45 : 0xE1); my $acute = pack('U', 0x0301); ok($Collator->cmp("A$acute", $A_acute), 0); # @version 3.1.1 (prev: -1) ok($Collator->cmp($a_acute, $A_acute), -1); -ok($Collator->eq("A\cA$acute", $A_acute)); # UCA v9 +ok($Collator->eq("A\cA$acute", $A_acute)); # UCA v9. \cA is invariant. my %old_level = $Collator->change(level => 1); ok($Collator->eq("A$acute", $A_acute)); @@ -75,7 +78,7 @@ ok($Collator->lt($a_acute, $A_acute)); eval { require Unicode::Normalize }; -if (!$@) { +if (!$@ && !$IsEBCDIC) { my $NFD = Unicode::Collate->new( table => undef, entry => <<'ENTRIES', @@ -112,12 +115,11 @@ my $trad = Unicode::Collate->new( entry => << 'ENTRIES', 0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish 0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish - 00DF ; [.0BA7.0020.0004.00DF][.0000.0153.0004.00DF][.0BA7.0020.001F.00DF] # sz ENTRIES ); # 0063 ; [.0A3D.0020.0002.0063] # LATIN SMALL LETTER C # 0064 ; [.0A49.0020.0002.0064] # LATIN SMALL LETTER D -# 0073 ; [.0BA7.0020.0002.0073] # LATIN SMALL LETTER S +# Deutsch sz is included in 'keys.txt'; ok( join(':', $trad->sort( qw/ acha aca ada acia acka / ) ), @@ -129,7 +131,7 @@ ok( join(':', qw/ aca acha acia acka ada / ), ); ok($trad->eq("ocho", "oc\cAho")); # UCA v9 -ok($trad->eq("ocho", "oc\000\cA\000\x7Fho")); # UCA v9 +ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9 my $hiragana = "\x{3042}\x{3044}"; my $katakana = "\x{30A2}\x{30A4}"; @@ -241,92 +243,6 @@ ok($Collator->lt("lake","like")); ############## -$Collator->change(level => 2); - -my $str; - -my $orig = "This is a Perl book."; -my $sub = "PERL"; -my $rep = "camel"; -my $ret = "This is a camel book."; - -$str = $orig; -if (my($pos,$len) = $Collator->index($str, $sub)) { - substr($str, $pos, $len, $rep); -} - -ok($str, $ret); - -$Collator->change(%old_level); - -$str = $orig; -if (my($pos,$len) = $Collator->index($str, $sub)) { - substr($str, $pos, $len, $rep); -} - -ok($str, $orig); - -############## - -my $match; - -$Collator->change(level => 1); - -$str = "Pe\x{300}rl"; -$sub = "pe"; -$match = undef; -if (my($pos, $len) = $Collator->index($str, $sub)) { - $match = substr($str, $pos, $len); -} -ok($match, "Pe\x{300}"); - -$str = "P\x{300}e\x{300}\x{301}\x{303}rl"; -$sub = "pE"; -$match = undef; -if (my($pos, $len) = $Collator->index($str, $sub)) { - $match = substr($str, $pos, $len); -} -ok($match, "P\x{300}e\x{300}\x{301}\x{303}"); - -$Collator->change(%old_level); - -############## - -%old_level = $trad->change(level => 1); - -$str = "Ich mu\x{00DF} studieren."; -$sub = "m\x{00FC}ss"; -$match = undef; -if (my($pos, $len) = $trad->index($str, $sub)) { - $match = substr($str, $pos, $len); -} -ok($match, "mu\x{00DF}"); - -$trad->change(%old_level); - -$str = "Ich mu\x{00DF} studieren."; -$sub = "m\x{00FC}ss"; -$match = undef; - -if (my($pos, $len) = $trad->index($str, $sub)) { - $match = substr($str, $pos, $len); -} -ok($match, undef); - -$match = undef; -if (my($pos,$len) = $Collator->index("", "")) { - $match = substr("", $pos, $len); -} -ok($match, ""); - -$match = undef; -if (my($pos,$len) = $Collator->index("", "abc")) { - $match = substr("", $pos, $len); -} -ok($match, undef); - -############## - # Table is undefined, then no entry is defined. my $undef_table = Unicode::Collate->new( @@ -535,7 +451,8 @@ my %old_rearrange = $Collator->change(rearrange => undef); ok($Collator->gt("\x{0E41}A", "\x{0E40}B")); ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B")); -$Collator->change(rearrange => [ 0x61 ]); # 'a' +$Collator->change(rearrange => [ 0x61 ]); + # U+0061, 'a': This is a Unicode value, never a native value. ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB' @@ -625,7 +542,6 @@ ok($backLevel2->gt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}")); ok($Collator ->lt("Ca\x{300}ca\x{302}", "ca\x{302}ca\x{300}")); ok($Collator ->lt("ca\x{300}ca\x{302}", "Ca\x{302}ca\x{300}")); - # HIRAGANA and KATAKANA are made undefined via undefName. # So they are after CJK Unified Ideographs. @@ -636,14 +552,64 @@ ok($Collator ->gt("\x{4E03}", $katakana)); ############## -# Shifted; ignorable after variable +# ignorable after variable +# Shifted; ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); -ok($Collator->eq("?\x{300}A\x{300}", "?A\x{300}")); +ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); ok($Collator->eq("?\x{300}", "?")); +ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. -$Collator->change(alternate => 'Non-ignorable'); +$Collator->change(level => 3); +ok($Collator->eq("\cA", "?")); + +$Collator->change(alternate => 'blanked', level => 4); +ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); +ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); +ok($Collator->eq("?\x{300}", "?")); +ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. + +$Collator->change(level => 3); +ok($Collator->eq("\cA", "?")); +$Collator->change(alternate => 'Non-ignorable', level => 4); + +ok($Collator->lt("?\x{300}", "?!")); +ok($Collator->gt("?\x{300}A$acute", "?$A_acute")); ok($Collator->gt("?\x{300}", "?")); +ok($Collator->gt("?\x{344}", "?")); -$Collator->change(alternate => 'Shifted'); +$Collator->change(level => 3); +ok($Collator->lt("\cA", "?")); + +$Collator->change(alternate => 'Shifted', level => 4); + +############## + +# According to Conformance Test, +# a L3-ignorable is treated as a completely ignorable. + +my $L3ignorable = Unicode::Collate->new( + alternate => 'Non-ignorable', + table => undef, + normalization => undef, + entry => <<'ENTRIES', +0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) +0001 ; [.0000.0000.0000.0000] # [0001] START OF HEADING (in 6429) +0591 ; [.0000.0000.0000.0591] # HEBREW ACCENT ETNAHTA +1D165 ; [.0000.0000.0000.1D165] # MUSICAL SYMBOL COMBINING STEM +0021 ; [*024B.0020.0002.0021] # EXCLAMATION MARK +09BE ; [.114E.0020.0002.09BE] # BENGALI VOWEL SIGN AA +09C7 ; [.1157.0020.0002.09C7] # BENGALI VOWEL SIGN E +09CB ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O +09C7 09BE ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O +ENTRIES +); + +ok($L3ignorable->lt("\cA", "!")); +ok($L3ignorable->lt("\x{591}", "!")); +ok($L3ignorable->eq("\cA", "\x{591}")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\cA\x{09BE}A")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{0591}\x{09BE}A")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{1D165}\x{09BE}A")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09CB}A")); -- cgit v1.2.1