summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANIFEST3
-rwxr-xr-xPorting/Maintainers.pl2
-rw-r--r--cpan/Unicode-Collate/.gitignore1
-rw-r--r--cpan/Unicode-Collate/Changes25
-rw-r--r--cpan/Unicode-Collate/Collate.pm360
-rw-r--r--cpan/Unicode-Collate/Collate.xs691
-rw-r--r--cpan/Unicode-Collate/Collate/Locale.pm8
-rw-r--r--cpan/Unicode-Collate/Makefile.PL28
-rw-r--r--cpan/Unicode-Collate/README17
-rw-r--r--cpan/Unicode-Collate/mkheader196
-rw-r--r--cpan/Unicode-Collate/t/loc_test.t13
-rw-r--r--pod/perldelta.pod7
12 files changed, 1020 insertions, 331 deletions
diff --git a/MANIFEST b/MANIFEST
index 05ec676ef3..9842238328 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -2554,6 +2554,9 @@ cpan/Unicode-Collate/Collate/Locale/zh_pin.pl Unicode::Collate
cpan/Unicode-Collate/Collate/Locale/zh.pl Unicode::Collate
cpan/Unicode-Collate/Collate/Locale/zh_strk.pl Unicode::Collate
cpan/Unicode-Collate/Collate.pm Unicode::Collate
+cpan/Unicode-Collate/Collate.xs Unicode::Collate
+cpan/Unicode-Collate/Makefile.PL Unicode::Collate
+cpan/Unicode-Collate/mkheader Unicode::Collate
cpan/Unicode-Collate/README Unicode::Collate
cpan/Unicode-Collate/t/altern.t Unicode::Collate
cpan/Unicode-Collate/t/backwds.t Unicode::Collate
diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl
index ad040d545d..f41a2677e9 100755
--- a/Porting/Maintainers.pl
+++ b/Porting/Maintainers.pl
@@ -1577,7 +1577,7 @@ use File::Glob qw(:case);
'Unicode::Collate' =>
{
'MAINTAINER' => 'sadahiro',
- 'DISTRIBUTION' => 'SADAHIRO/Unicode-Collate-0.70-withoutworldwriteables.tar.gz',
+ 'DISTRIBUTION' => 'SADAHIRO/Unicode-Collate-0.72-withoutworldwriteables.tar.gz',
'FILES' => q[cpan/Unicode-Collate],
'EXCLUDED' => [ qr{N$},
qr{^data/},
diff --git a/cpan/Unicode-Collate/.gitignore b/cpan/Unicode-Collate/.gitignore
new file mode 100644
index 0000000000..424c745c12
--- /dev/null
+++ b/cpan/Unicode-Collate/.gitignore
@@ -0,0 +1 @@
+*.h
diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes
index ca9be54809..816d43a4d3 100644
--- a/cpan/Unicode-Collate/Changes
+++ b/cpan/Unicode-Collate/Changes
@@ -1,5 +1,20 @@
Revision history for Perl module Unicode::Collate.
+0.72 Sat Jan 22 17:28:32 2011
+ - xs: fix mixing char* and U8*.
+
+0.71 Tue Jan 18 22:29:44 2011
+ - t/loc_test.t should not fail without Unicode::Normalize.
+
+0.70 Sun Jan 16 20:31:07 2011
+ - Now U::C::Locale->new will use the compiled DUCET via XS if available.
+ added some tests in t/loc_test.t.
+
+0.69 Sat Jan 15 19:41:11 2011
+ - clarified about XSUB. revised INSTALL in README.
+ - xs: flag passed to utf8n_to_uvuni().
+ - doc and comments: [perl #81876] Fix typos by Peter J. Acklam.
+
0.68 Tue Nov 23 20:17:22 2010
- doc: clarified about (backwards => [ ]) and (backwards => undef).
- separated t/backwds.t from t/test.t.
@@ -24,7 +39,7 @@ Revision history for Perl module Unicode::Collate.
- 12 compat. ideographs (e.g. U+FA0E) are treated as unified ideographs.
(though DUCET also does it, now Unicode::Collate does it without DUCET.)
- added t/compatui.t.
- ! Ideographs Ext.B (U+20000..U+2A6D6) can be overrided with UCA_Version 8.
+ ! Ideographs Ext.B (U+20000..U+2A6D6) can be overridden with UCA_Version 8.
This is a long-standing behavior from Unicode::Collate 0.11 to 0.63.
A wrong fix at 0.64 should be abandoned.
@@ -121,6 +136,8 @@ Revision history for Perl module Unicode::Collate.
- U+9FC4..U+9FCB and U+2A700..U+2B734 are new CJK unified ideographs.
- Many hangul jamo are assigned (affecting hangul_terminator).
+ ! Now XSUB will be built by default. (XSUB needs a C compiler.)
+ To build pure perl, run disableXS before Makefile.PL.
! DUCET will be compiled when XS is used. Explicit saying
<table => 'allkeys.txt'> (or using another table) will prevent
this module from using the compiled DUCET.
@@ -174,11 +191,11 @@ Revision history for Perl module Unicode::Collate.
(Perl 5.7.3 or before)). If perl 5.6.X is used, XSUB may help it
in place of broken CORE::unpack('U*') in older perl.
- added illegal.t and illegalp.t in t.
- - added XSUB (EXPERIMENTAL!) where some functions are implemented
- in XSUB. Pure Perl is also supported.
+ - added XSUB where some functions are implemented in XSUB.
+ Pure Perl is also supported.
0.30 Mon Oct 13 21:26:37 2003
- - fix: Completely ignorable in table should be able to be overrided
+ - fix: Completely ignorable in table should be able to be overridden
by non-ignorable in entry.
- fix: Maximum length for contraction must not be shortened
by a shorter contraction following in table and/or entry.
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index b337b6f24b..c3ed1c7356 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,9 +14,13 @@ use File::Spec;
no warnings 'utf8';
-our $VERSION = '0.6801';
+our $VERSION = '0.72';
our $PACKAGE = __PACKAGE__;
+require DynaLoader;
+our @ISA = qw(DynaLoader);
+bootstrap Unicode::Collate $VERSION;
+
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";
@@ -71,49 +75,8 @@ use constant NON_VAR => 0; # Non-Variable character
use constant VAR => 1; # Variable character
# specific code points
-use constant Hangul_SBase => 0xAC00;
use constant Hangul_SIni => 0xAC00;
use constant Hangul_SFin => 0xD7A3;
-use constant Hangul_NCount => 588;
-use constant Hangul_TCount => 28;
-use constant Hangul_LBase => 0x1100;
-use constant Hangul_LIni => 0x1100;
-use constant Hangul_LFin => 0x1159;
-use constant Hangul_LFill => 0x115F;
-use constant Hangul_LEnd => 0x115F; # Unicode 5.2
-use constant Hangul_VBase => 0x1161;
-use constant Hangul_VIni => 0x1160; # from Vowel Filler
-use constant Hangul_VFin => 0x11A2;
-use constant Hangul_VEnd => 0x11A7; # Unicode 5.2
-use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
-use constant Hangul_TIni => 0x11A8;
-use constant Hangul_TFin => 0x11F9;
-use constant Hangul_TEnd => 0x11FF; # Unicode 5.2
-use constant HangulL2Ini => 0xA960; # Unicode 5.2
-use constant HangulL2Fin => 0xA97C; # Unicode 5.2
-use constant HangulV2Ini => 0xD7B0; # Unicode 5.2
-use constant HangulV2Fin => 0xD7C6; # Unicode 5.2
-use constant HangulT2Ini => 0xD7CB; # Unicode 5.2
-use constant HangulT2Fin => 0xD7FB; # Unicode 5.2
-
-use constant CJK_UidIni => 0x4E00;
-use constant CJK_UidFin => 0x9FA5;
-use constant CJK_UidF41 => 0x9FBB;
-use constant CJK_UidF51 => 0x9FC3;
-use constant CJK_UidF52 => 0x9FCB;
-use constant CJK_ExtAIni => 0x3400; # Unicode 3.0
-use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0
-use constant CJK_ExtBIni => 0x20000; # Unicode 3.1
-use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1
-use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2
-use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2
-use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0
-use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0
-
-my %CompatUI = map +($_ => 1), (
- 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
- 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
-);
# Logical_Order_Exception in PropList.txt
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
@@ -128,10 +91,6 @@ sub pack_U {
return pack('U*', @_);
}
-sub unpack_U {
- return unpack('U*', shift(@_).pack('U*'));
-}
-
######
my (%VariableOK);
@@ -152,6 +111,7 @@ our @ChangeNG = qw/
versionTable alternateTable backwardsTable forwardsTable rearrangeTable
derivCode normCode rearrangeHash backwardsFlag
suppress suppressHash
+ __useXS
/;
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
@@ -285,6 +245,12 @@ sub new
my $class = shift;
my $self = bless { @_ }, $class;
+ if (! exists $self->{table} &&
+ !defined $self->{undefName} && !defined $self->{ignoreName} &&
+ !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
+ $self->{__useXS} = \&_fetch_simple;
+ } # XS only
+
# keys of $self->{suppressHash} are $self->{suppress}.
if ($self->{suppress} && @{ $self->{suppress} }) {
@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
@@ -347,6 +313,20 @@ sub parseAtmark {
sub read_table {
my $self = shift;
+ if ($self->{__useXS}) {
+ my @rest = _fetch_rest(); # complex matter need to parse
+ for my $line (@rest) {
+ next if $line =~ /^\s*#/;
+
+ if ($line =~ s/^\s*\@//) {
+ $self->parseAtmark($line);
+ } else {
+ $self->parseEntry($line);
+ }
+ }
+ return;
+ }
+
my($f, $fh);
foreach my $d (@INC) {
$f = File::Spec->catfile($d, @Path, $self->{table});
@@ -445,50 +425,12 @@ sub parseEntry
}
-##
-## VCE = _varCE(variable, VCE)
-##
-sub _varCE
-{
- my $vbl = shift;
- my $vce = shift;
- if ($vbl eq 'non-ignorable') {
- return $vce;
- }
- my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
-
- if ($var) {
- return pack(VCE_TEMPLATE, $var, 0, 0, 0,
- $vbl eq 'blanked' ? $wt[3] : $wt[0]);
- }
- elsif ($vbl eq 'blanked') {
- return $vce;
- }
- else {
- return pack(VCE_TEMPLATE, $var, @wt[0..2],
- $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
- }
-}
-
sub viewSortKey
{
my $self = shift;
$self->visualizeSortKey($self->getSortKey(@_));
}
-sub visualizeSortKey
-{
- my $self = shift;
- my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
-
- if ($self->{UCA_Version} <= 8) {
- $view =~ s/ ?0000 ?/|/g;
- } else {
- $view =~ s/\b0000\b/|/g;
- }
- return "[$view]";
-}
-
##
## arrayref of JCPS = splitEnt(string to be collated)
@@ -506,6 +448,7 @@ sub splitEnt
my $reH = $self->{rearrangeHash};
my $vers = $self->{UCA_Version};
my $ver9 = $vers >= 9 && $vers <= 11;
+ my $uXS = $self->{__useXS};
my ($str, @buf);
@@ -544,6 +487,9 @@ sub splitEnt
} elsif ($ver9) {
$src[$i] = undef if $map->{ $src[$i] } &&
@{ $map->{ $src[$i] } } == 0;
+ if ($uXS) {
+ $src[$i] = undef if _ignorable_simple($src[$i]);
+ }
}
}
@@ -623,7 +569,8 @@ sub splitEnt
}
# skip completely ignorable
- if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
+ if ($uXS && $jcps =~ /^[0-9]+\z/ && _ignorable_simple($jcps) ||
+ $map->{$jcps} && @{ $map->{$jcps} } == 0) {
if ($wLen && @buf) {
$buf[-1][2] = $i + 1;
}
@@ -662,10 +609,13 @@ sub getWt
my $vbl = $self->{variable};
my $map = $self->{mapping};
my $der = $self->{derivCode};
+ my $uXS = $self->{__useXS};
return if !defined $u;
return map(_varCE($vbl, $_), @{ $map->{$u} })
if $map->{$u};
+ return map(_varCE($vbl, $_), _fetch_simple($u))
+ if $uXS && _exists_simple($u);
# JCPS must not be a contraction, then it's a code point.
if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
@@ -692,7 +642,7 @@ sub getWt
$map->{$contract} and @decH = ($contract, $decH[2]);
}
# even if V's ignorable, LT contraction is not supported.
- # If such a situatution were required, NFD should be used.
+ # If such a situation were required, NFD should be used.
}
if (@decH == 3 && $max->{$decH[1]}) {
my $contract = join(CODE_SEP, @decH[1,2]);
@@ -701,7 +651,9 @@ sub getWt
}
@hangulCE = map({
- $map->{$_} ? @{ $map->{$_} } : $der->($_);
+ $map->{$_} ? @{ $map->{$_} } :
+ $uXS && _exists_simple($_) ? _fetch_simple($_) :
+ $der->($_);
} @decH);
}
return map _varCE($vbl, $_), @hangulCE;
@@ -726,12 +678,10 @@ sub getWt
sub getSortKey
{
my $self = shift;
- my $lev = $self->{level};
my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
my $vers = $self->{UCA_Version};
my $vbl = $self->{variable};
my $term = $self->{hangul_terminator};
- my $v2i = $vers >= 9 && $vbl ne 'non-ignorable';
my @buf; # weight arrays
if ($term) {
@@ -756,53 +706,7 @@ sub getSortKey
}
}
- # make sort key
- my @ret = ([],[],[],[]);
- my $last_is_variable;
-
- foreach my $vwt (@buf) {
- my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
-
- # "Ignorable (L1, L2) after Variable" since track. v. 9
- if ($v2i) {
- if ($var) {
- $last_is_variable = TRUE;
- } elsif (!$wt[0]) { # ignorable
- next if $last_is_variable;
- } else {
- $last_is_variable = FALSE;
- }
- }
- foreach my $v (0..$lev-1) {
- 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
- }
- }
-
- # modification of tertiary weights
- if ($self->{upper_before_lower}) {
- foreach my $w (@{ $ret[2] }) {
- if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
- elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
- elsif ($w == 0x1C) { $w += 1 } # square upper
- elsif ($w == 0x1D) { $w -= 1 } # square lower
- }
- }
- if ($self->{katakana_before_hiragana}) {
- foreach my $w (@{ $ret[2] }) {
- if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
- elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
- }
- }
-
- if ($self->{backwardsFlag}) {
- for (my $v = MinLevel; $v <= MaxLevel; $v++) {
- if ($self->{backwardsFlag} & (1 << $v)) {
- @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
- }
- }
- }
-
- join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
+ return $self->mk_SortKey(\@buf);
}
@@ -829,174 +733,6 @@ sub sort {
}
-sub _derivCE_22 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
- CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
- CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_20 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
- CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_18 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_14 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_9 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_8 {
- my $code = shift;
- my $aaaa = 0xFF80 + ($code >> 15);
- my $bbbb = ($code & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
-}
-
-sub _uideoCE_8 {
- my $u = shift;
- return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
-}
-
-sub _isUIdeo {
- # $uca_vers = 0 for _uideoCE_8()
- my ($u, $uca_vers) = @_;
- return((CJK_UidIni <= $u && (
- $uca_vers >= 20 ? ($u <= CJK_UidF52) :
- $uca_vers >= 18 ? ($u <= CJK_UidF51) :
- $uca_vers >= 14 ? ($u <= CJK_UidF41) :
- ($u <= CJK_UidFin))) || $CompatUI{$u}
- ||
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
- ||
- ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ||
- ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
- ||
- ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
- );
-}
-
-
-##
-## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
-##
-sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
-
-#
-# $code *must* be in Hangul syllable.
-# Check it before you enter here.
-#
-sub _decompHangul {
- my $code = shift;
- my $si = $code - Hangul_SBase;
- my $li = int( $si / Hangul_NCount);
- my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
- my $ti = $si % Hangul_TCount;
- return (
- Hangul_LBase + $li,
- Hangul_VBase + $vi,
- $ti ? (Hangul_TBase + $ti) : (),
- );
-}
-
-sub _isIllegal {
- my $code = shift;
- return((! defined $code) # removed
- || ($code < 0 || 0x10FFFF < $code) # out of range
- );
-}
-
-sub _isNonchar {
- my $code = shift;
- return((($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
- || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
- || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
- );
-}
-
-# Hangul Syllable Type
-sub getHST {
- my $u = shift;
- my $vers = shift || 0;
-
- if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
- return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
- }
-
- if ($vers < 20) {
- return Hangul_LIni <= $u && $u <= Hangul_LFin ||
- $u == Hangul_LFill ? "L" :
- Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
- Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : "";
- } else {
- return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
- HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" :
- Hangul_VIni <= $u && $u <= Hangul_VEnd ||
- HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" :
- Hangul_TIni <= $u && $u <= Hangul_TEnd ||
- HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : "";
- }
-}
-
-
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
@@ -1023,7 +759,7 @@ sub _eqArray($$$)
my $lev = shift;
for my $g (0..@$substr-1){
- # Do the $g'th graphemes have the same number of AV weigths?
+ # Do the $g'th graphemes have the same number of AV weights?
return if @{ $source->[$g] } != @{ $substr->[$g] };
for my $w (0..@{ $substr->[$g] }-1) {
@@ -1321,7 +1057,7 @@ The following tracking versions are supported. The default is 20.
Note: Recent UTS #10 renames "Tracking Version" to "Revision."
-* Noncharacters (e.g. U+FFFF) are not ignored, and can be overrided
+* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
since C<UCA_Version> 22.
* Fully ignorable characters were ignored, and would not interrupt
@@ -1359,7 +1095,7 @@ forwards at all the levels.
If the same character (or a sequence of characters) exists
in the collation element table through C<table>,
-mapping to collation elements is overrided.
+mapping to collation elements is overridden.
If it does not exist, the mapping is defined additionally.
entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
@@ -1536,7 +1272,7 @@ those in the CJK Unified Ideographs Extension A etc.
Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
Through C<overrideCJK>, ordering of CJK unified ideographs (including
-extensions) can be overrided.
+extensions) can be overridden.
ex. CJK unified ideographs in the JIS code point order.
@@ -1579,7 +1315,7 @@ in C<table> or C<entry> is still valid.
B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
-ideographs. But they can't be overrided via C<overrideCJK> when you use
+ideographs. But they can't be overridden via C<overrideCJK> when you use
DUCET, as the table includes weights for them. C<table> or C<entry> has
priority over C<overrideCJK>.
@@ -1589,7 +1325,7 @@ priority over C<overrideCJK>.
By default, Hangul syllables are decomposed into Hangul Jamo,
even if C<(normalization =E<gt> undef)>.
-But the mapping of Hangul syllables may be overrided.
+But the mapping of Hangul syllables may be overridden.
This parameter works like C<overrideCJK>, so see there for examples.
@@ -1750,7 +1486,7 @@ this parameter doesn't work validly.
This key allows to variable weighting for variable collation elements,
which are marked with an ASTERISK in the table
-(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
+(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
@@ -2058,7 +1794,7 @@ B<Unicode::Normalize is required to try The Conformance Test.>
=head1 AUTHOR, COPYRIGHT AND LICENSE
The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
-<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
+<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2011,
SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs
new file mode 100644
index 0000000000..d96912bf6d
--- /dev/null
+++ b/cpan/Unicode-Collate/Collate.xs
@@ -0,0 +1,691 @@
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+/* This file is prepared by mkheader */
+#include "ucatbl.h"
+
+/* Perl 5.6.1 ? */
+#ifndef utf8n_to_uvuni
+#define utf8n_to_uvuni utf8_to_uv
+#endif /* utf8n_to_uvuni */
+
+/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
+#ifndef UTF8_ALLOW_BOM
+#define UTF8_ALLOW_BOM (0)
+#endif /* UTF8_ALLOW_BOM */
+
+#ifndef UTF8_ALLOW_SURROGATE
+#define UTF8_ALLOW_SURROGATE (0)
+#endif /* UTF8_ALLOW_SURROGATE */
+
+#ifndef UTF8_ALLOW_FE_FF
+#define UTF8_ALLOW_FE_FF (0)
+#endif /* UTF8_ALLOW_FE_FF */
+
+#ifndef UTF8_ALLOW_FFFF
+#define UTF8_ALLOW_FFFF (0)
+#endif /* UTF8_ALLOW_FFFF */
+
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
+
+/* if utf8n_to_uvuni() sets retlen to 0 (?) */
+#define ErrRetlenIsZero "panic (Unicode::Collate): zero-length character"
+
+/* At present, char > 0x10ffff are unaffected without complaint, right? */
+#define VALID_UTF_MAX (0x10ffff)
+#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
+
+static const UV max_div_16 = UV_MAX / 16;
+
+/* Supported Levels */
+#define MinLevel (1)
+#define MaxLevel (4)
+
+/* Shifted weight at 4th level */
+#define Shift4Wt (0xFFFF)
+
+#define VCE_Length (9)
+
+#define Hangul_SBase (0xAC00)
+#define Hangul_SIni (0xAC00)
+#define Hangul_SFin (0xD7A3)
+#define Hangul_NCount (588)
+#define Hangul_TCount (28)
+#define Hangul_LBase (0x1100)
+#define Hangul_LIni (0x1100)
+#define Hangul_LFin (0x1159)
+#define Hangul_LFill (0x115F)
+#define Hangul_LEnd (0x115F) /* Unicode 5.2 */
+#define Hangul_VBase (0x1161)
+#define Hangul_VIni (0x1160) /* from Vowel Filler */
+#define Hangul_VFin (0x11A2)
+#define Hangul_VEnd (0x11A7) /* Unicode 5.2 */
+#define Hangul_TBase (0x11A7) /* from "no-final" codepoint */
+#define Hangul_TIni (0x11A8)
+#define Hangul_TFin (0x11F9)
+#define Hangul_TEnd (0x11FF) /* Unicode 5.2 */
+#define HangulL2Ini (0xA960) /* Unicode 5.2 */
+#define HangulL2Fin (0xA97C) /* Unicode 5.2 */
+#define HangulV2Ini (0xD7B0) /* Unicode 5.2 */
+#define HangulV2Fin (0xD7C6) /* Unicode 5.2 */
+#define HangulT2Ini (0xD7CB) /* Unicode 5.2 */
+#define HangulT2Fin (0xD7FB) /* Unicode 5.2 */
+
+#define CJK_UidIni (0x4E00)
+#define CJK_UidFin (0x9FA5)
+#define CJK_UidF41 (0x9FBB)
+#define CJK_UidF51 (0x9FC3)
+#define CJK_UidF52 (0x9FCB)
+#define CJK_ExtAIni (0x3400) /* Unicode 3.0 */
+#define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */
+#define CJK_ExtBIni (0x20000) /* Unicode 3.1 */
+#define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */
+#define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */
+#define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */
+#define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */
+#define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */
+
+static STDCHAR UnifiedCompat[] = {
+ 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
+}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
+
+#define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode))
+
+MODULE = Unicode::Collate PACKAGE = Unicode::Collate
+
+PROTOTYPES: DISABLE
+
+void
+_fetch_rest ()
+ PREINIT:
+ char ** rest;
+ PPCODE:
+ for (rest = UCA_rest; *rest; ++rest) {
+ XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
+ }
+
+
+void
+_fetch_simple (uv)
+ UV uv
+ PREINIT:
+ U8 ***plane, **row;
+ U8* result = NULL;
+ PPCODE:
+ if (!OVER_UTF_MAX(uv)){
+ plane = (U8***)UCA_simple[uv >> 16];
+ if (plane) {
+ row = plane[(uv >> 8) & 0xff];
+ result = row ? row[uv & 0xff] : NULL;
+ }
+ }
+ if (result) {
+ int i;
+ int num = (int)*result;
+ ++result;
+ for (i = 0; i < num; ++i) {
+ XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
+ result += VCE_Length;
+ }
+ } else {
+ XPUSHs(sv_2mortal(newSViv(0)));
+ }
+
+SV*
+_ignorable_simple (uv)
+ UV uv
+ ALIAS:
+ _exists_simple = 1
+ PREINIT:
+ U8 ***plane, **row;
+ int num = -1;
+ U8* result = NULL;
+ CODE:
+ if (!OVER_UTF_MAX(uv)){
+ plane = (U8***)UCA_simple[uv >> 16];
+ if (plane) {
+ row = plane[(uv >> 8) & 0xff];
+ result = row ? row[uv & 0xff] : NULL;
+ }
+ if (result)
+ num = (int)*result; /* assuming 0 <= num < 128 */
+ }
+
+ if (ix)
+ RETVAL = boolSV(num >0);
+ else
+ RETVAL = boolSV(num==0);
+ OUTPUT:
+ RETVAL
+
+
+void
+_getHexArray (src)
+ SV* src
+ PREINIT:
+ char *s, *e;
+ STRLEN byte;
+ UV value;
+ bool overflowed = FALSE;
+ const char *hexdigit;
+ PPCODE:
+ s = SvPV(src,byte);
+ for (e = s + byte; s < e;) {
+ hexdigit = strchr((char *) PL_hexdigit, *s++);
+ if (! hexdigit)
+ continue;
+ value = (hexdigit - PL_hexdigit) & 0xF;
+ while (*s) {
+ hexdigit = strchr((char *) PL_hexdigit, *s++);
+ if (! hexdigit)
+ break;
+ if (overflowed)
+ continue;
+ if (value > max_div_16) {
+ overflowed = TRUE;
+ continue;
+ }
+ value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
+ }
+ XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
+ }
+
+
+SV*
+_isIllegal (sv)
+ SV* sv
+ PREINIT:
+ UV uv;
+ CODE:
+ if (!sv || !SvIOK(sv))
+ XSRETURN_YES;
+ uv = SvUVX(sv);
+ RETVAL = boolSV(
+ 0x10FFFF < uv /* out of range */
+ );
+OUTPUT:
+ RETVAL
+
+
+SV*
+_isNonchar (sv)
+ SV* sv
+ PREINIT:
+ UV uv;
+ CODE:
+ /* should be called only if ! _isIllegal(sv). */
+ uv = SvUVX(sv);
+ RETVAL = boolSV(
+ ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */
+ || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
+ || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
+ );
+OUTPUT:
+ RETVAL
+
+
+void
+_decompHangul (code)
+ UV code
+ PREINIT:
+ UV sindex, lindex, vindex, tindex;
+ PPCODE:
+ /* code *must* be in Hangul syllable.
+ * Check it before you enter here. */
+ sindex = code - Hangul_SBase;
+ lindex = sindex / Hangul_NCount;
+ vindex = (sindex % Hangul_NCount) / Hangul_TCount;
+ tindex = sindex % Hangul_TCount;
+
+ XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
+ XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
+ if (tindex)
+ XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
+
+
+SV*
+getHST (code, uca_vers = 0)
+ UV code;
+ IV uca_vers;
+ PREINIT:
+ char * hangtype;
+ STRLEN typelen;
+ CODE:
+ if (codeRange(Hangul_SIni, Hangul_SFin)) {
+ if ((code - Hangul_SBase) % Hangul_TCount) {
+ hangtype = "LVT"; typelen = 3;
+ } else {
+ hangtype = "LV"; typelen = 2;
+ }
+ } else if (uca_vers < 20) {
+ if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
+ hangtype = "L"; typelen = 1;
+ } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
+ hangtype = "V"; typelen = 1;
+ } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
+ hangtype = "T"; typelen = 1;
+ } else {
+ hangtype = ""; typelen = 0;
+ }
+ } else {
+ if (codeRange(Hangul_LIni, Hangul_LEnd) ||
+ codeRange(HangulL2Ini, HangulL2Fin)) {
+ hangtype = "L"; typelen = 1;
+ } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
+ codeRange(HangulV2Ini, HangulV2Fin)) {
+ hangtype = "V"; typelen = 1;
+ } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
+ codeRange(HangulT2Ini, HangulT2Fin)) {
+ hangtype = "T"; typelen = 1;
+ } else {
+ hangtype = ""; typelen = 0;
+ }
+ }
+
+ RETVAL = newSVpvn(hangtype, typelen);
+OUTPUT:
+ RETVAL
+
+
+void
+_derivCE_9 (code)
+ UV code
+ ALIAS:
+ _derivCE_14 = 1
+ _derivCE_18 = 2
+ _derivCE_20 = 3
+ _derivCE_22 = 4
+ PREINIT:
+ UV base, aaaa, bbbb;
+ U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
+ U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
+ bool basic_unified = 0;
+ PPCODE:
+ if (CJK_UidIni <= code) {
+ if (codeRange(0xFA0E, 0xFA29))
+ basic_unified = (bool)UnifiedCompat[code - 0xFA0E];
+ else
+ basic_unified = (ix >= 3 ? (code <= CJK_UidF52) :
+ ix == 2 ? (code <= CJK_UidF51) :
+ ix == 1 ? (code <= CJK_UidF41) :
+ (code <= CJK_UidFin));
+ }
+ base = (basic_unified)
+ ? 0xFB40 : /* CJK */
+ ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
+ ||
+ (codeRange(CJK_ExtBIni, CJK_ExtBFin))
+ ||
+ (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
+ ||
+ (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
+ ? 0xFB80 /* CJK ext. */
+ : 0xFBC0; /* others */
+ aaaa = base + (code >> 15);
+ bbbb = (code & 0x7FFF) | 0x8000;
+ a[1] = (U8)(aaaa >> 8);
+ a[2] = (U8)(aaaa & 0xFF);
+ b[1] = (U8)(bbbb >> 8);
+ b[2] = (U8)(bbbb & 0xFF);
+ a[7] = b[7] = (U8)(code >> 8);
+ a[8] = b[8] = (U8)(code & 0xFF);
+ XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
+ XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
+
+
+void
+_derivCE_8 (code)
+ UV code
+ PREINIT:
+ UV aaaa, bbbb;
+ U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF";
+ U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
+ PPCODE:
+ aaaa = 0xFF80 + (code >> 15);
+ bbbb = (code & 0x7FFF) | 0x8000;
+ a[1] = (U8)(aaaa >> 8);
+ a[2] = (U8)(aaaa & 0xFF);
+ b[1] = (U8)(bbbb >> 8);
+ b[2] = (U8)(bbbb & 0xFF);
+ a[7] = b[7] = (U8)(code >> 8);
+ a[8] = b[8] = (U8)(code & 0xFF);
+ XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
+ XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
+
+
+void
+_uideoCE_8 (code)
+ UV code
+ PREINIT:
+ U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
+ PPCODE:
+ uice[1] = uice[7] = (U8)(code >> 8);
+ uice[2] = uice[8] = (U8)(code & 0xFF);
+ XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
+
+
+SV*
+_isUIdeo (code, uca_vers)
+ UV code;
+ IV uca_vers;
+ bool basic_unified = 0;
+ CODE:
+ /* uca_vers = 0 for _uideoCE_8() */
+ if (CJK_UidIni <= code) {
+ if (codeRange(0xFA0E, 0xFA29))
+ basic_unified = (bool)UnifiedCompat[code - 0xFA0E];
+ else
+ basic_unified = (uca_vers >= 20 ? (code <= CJK_UidF52) :
+ uca_vers >= 18 ? (code <= CJK_UidF51) :
+ uca_vers >= 14 ? (code <= CJK_UidF41) :
+ (code <= CJK_UidFin));
+ }
+ RETVAL = boolSV(
+ (basic_unified)
+ ||
+ (codeRange(CJK_ExtAIni, CJK_ExtAFin))
+ ||
+ (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
+ ||
+ (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
+ ||
+ (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
+ );
+OUTPUT:
+ RETVAL
+
+
+SV*
+mk_SortKey (self, buf)
+ SV* self;
+ SV* buf;
+ PREINIT:
+ SV *dst, **svp;
+ STRLEN dlen, vlen;
+ U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
+ AV *bufAV;
+ HV *selfHV;
+ UV back_flag;
+ I32 i, buf_len;
+ IV lv, level, uca_vers;
+ bool upper_lower, kata_hira, v2i, last_is_var;
+ CODE:
+ if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
+ selfHV = (HV*)SvRV(self);
+ else
+ croak("$self is not a HASHREF.");
+
+ svp = hv_fetch(selfHV, "level", 5, FALSE);
+ level = svp ? SvIV(*svp) : MaxLevel;
+
+ if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
+ bufAV = (AV*)SvRV(buf);
+ else
+ croak("XSUB, not an ARRAYREF.");
+
+ buf_len = av_len(bufAV);
+
+ if (buf_len < 0) { /* empty: -1 */
+ dlen = 2 * (MaxLevel - 1);
+ dst = newSV(dlen);
+ (void)SvPOK_only(dst);
+ d = (U8*)SvPVX(dst);
+ while (dlen--)
+ *d++ = '\0';
+ }
+ else {
+ for (lv = 0; lv < level; lv++) {
+ New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
+ s[lv] = eachlevel[lv];
+ }
+
+ svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
+ upper_lower = svp ? SvTRUE(*svp) : FALSE;
+ svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
+ kata_hira = svp ? SvTRUE(*svp) : FALSE;
+ svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
+ uca_vers = SvIV(*svp);
+ svp = hv_fetch(selfHV, "variable", 8, FALSE);
+ v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
+ ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
+ : FALSE;
+
+ last_is_var = FALSE;
+ for (i = 0; i <= buf_len; i++) {
+ svp = av_fetch(bufAV, i, FALSE);
+
+ if (svp && SvPOK(*svp))
+ v = (U8*)SvPV(*svp, vlen);
+ else
+ croak("not a vwt.");
+
+ if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
+ continue;
+
+ /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
+ if (v2i) {
+ if (*v)
+ last_is_var = TRUE;
+ else if (v[1] || v[2]) /* non zero primary weight */
+ last_is_var = FALSE;
+ else if (last_is_var) /* zero primary weight; skipped */
+ continue;
+ }
+
+ if (v[5] == 0) { /* tert wt < 256 */
+ if (upper_lower) {
+ if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
+ v[6] -= 6;
+ else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
+ v[6] += 6;
+ else if (v[6] == 0x1C) /* square upper */
+ v[6]++;
+ else if (v[6] == 0x1D) /* square lower */
+ v[6]--;
+ }
+ if (kata_hira) {
+ if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
+ v[6] -= 2;
+ else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
+ v[6] += 5;
+ }
+ }
+
+ for (lv = 0; lv < level; lv++) {
+ if (v[2 * lv + 1] || v[2 * lv + 2]) {
+ *s[lv]++ = v[2 * lv + 1];
+ *s[lv]++ = v[2 * lv + 2];
+ }
+ }
+ }
+
+ dlen = 2 * (MaxLevel - 1);
+ for (lv = 0; lv < level; lv++)
+ dlen += s[lv] - eachlevel[lv];
+
+ dst = newSV(dlen);
+ (void)SvPOK_only(dst);
+ d = (U8*)SvPVX(dst);
+
+ svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
+ back_flag = svp ? SvUV(*svp) : (UV)0;
+
+ for (lv = 0; lv < level; lv++) {
+ if (back_flag & (1 << (lv + 1))) {
+ p = s[lv];
+ e = eachlevel[lv];
+ for ( ; e < p; p -= 2) {
+ *d++ = p[-2];
+ *d++ = p[-1];
+ }
+ }
+ else {
+ p = eachlevel[lv];
+ e = s[lv];
+ while (p < e)
+ *d++ = *p++;
+ }
+ if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
+ *d++ = '\0';
+ *d++ = '\0';
+ }
+ }
+
+ for (lv = level; lv < MaxLevel; lv++) {
+ if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
+ *d++ = '\0';
+ *d++ = '\0';
+ }
+ }
+
+ for (lv = 0; lv < level; lv++) {
+ Safefree(eachlevel[lv]);
+ }
+ }
+ *d = '\0';
+ SvCUR_set(dst, d - (U8*)SvPVX(dst));
+ RETVAL = dst;
+OUTPUT:
+ RETVAL
+
+
+SV*
+_varCE (vbl, vce)
+ SV* vbl
+ SV* vce
+ PREINIT:
+ SV *dst;
+ U8 *a, *v, *d;
+ STRLEN alen, vlen;
+ CODE:
+ a = (U8*)SvPV(vbl, alen);
+ v = (U8*)SvPV(vce, vlen);
+
+ dst = newSV(vlen);
+ d = (U8*)SvPVX(dst);
+ (void)SvPOK_only(dst);
+ Copy(v, d, vlen, U8);
+ SvCUR_set(dst, vlen);
+ d[vlen] = '\0';
+
+ /* variable: checked only the first char and the length,
+ trusting checkCollator() and %VariableOK in Perl ... */
+
+ if (vlen < VCE_Length /* ignore short VCE (unexpected) */
+ ||
+ *a == 'n') /* 'non-ignorable' */
+ 1;
+ else if (*v) {
+ if (*a == 's') { /* shifted or shift-trimmed */
+ d[7] = d[1]; /* wt level 1 to 4 */
+ d[8] = d[2];
+ }
+ d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
+ }
+ else if (*a == 'b') /* blanked */
+ 1;
+ else if (*a == 's') { /* shifted or shift-trimmed */
+ if (alen == 7 && (d[1] + d[2] + d[3] + d[4] + d[5] + d[6])) {
+ d[7] = (U8)(Shift4Wt >> 8);
+ d[8] = (U8)(Shift4Wt & 0xFF);
+ }
+ else {
+ d[7] = d[8] = 0;
+ }
+ }
+ else
+ croak("unknown variable value '%s'", a);
+ RETVAL = dst;
+OUTPUT:
+ RETVAL
+
+
+
+SV*
+visualizeSortKey (self, key)
+ SV * self
+ SV * key
+ PREINIT:
+ HV *selfHV;
+ SV **svp, *dst;
+ U8 *s, *e, *d;
+ STRLEN klen, dlen;
+ UV uv;
+ IV uca_vers;
+ static char *upperhex = "0123456789ABCDEF";
+ CODE:
+ if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
+ selfHV = (HV*)SvRV(self);
+ else
+ croak("$self is not a HASHREF.");
+
+ svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
+ if (!svp)
+ croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
+ uca_vers = SvIV(*svp);
+
+ s = (U8*)SvPV(key, klen);
+
+ /* slightly *longer* than the need, but I'm afraid of miscounting;
+ exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0')
+ = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units
+ + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries
+ + 2 # '[' and ']'
+ */
+ dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
+ dst = newSV(dlen);
+ (void)SvPOK_only(dst);
+ d = (U8*)SvPVX(dst);
+
+ *d++ = '[';
+ for (e = s + klen; s < e; s += 2) {
+ uv = (U16)(*s << 8 | s[1]);
+ if (uv) {
+ if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
+ *d++ = ' ';
+ *d++ = upperhex[ (s[0] >> 4) & 0xF ];
+ *d++ = upperhex[ s[0] & 0xF ];
+ *d++ = upperhex[ (s[1] >> 4) & 0xF ];
+ *d++ = upperhex[ s[1] & 0xF ];
+ }
+ else {
+ if ((9 <= uca_vers) && (d[-1] != '['))
+ *d++ = ' ';
+ *d++ = '|';
+ }
+ }
+ *d++ = ']';
+ *d = '\0';
+ SvCUR_set(dst, d - (U8*)SvPVX(dst));
+ RETVAL = dst;
+OUTPUT:
+ RETVAL
+
+
+
+void
+unpack_U (src)
+ SV* src
+ PREINIT:
+ STRLEN srclen, retlen;
+ U8 *s, *p, *e;
+ UV uv;
+ PPCODE:
+ s = (U8*)SvPV(src,srclen);
+ if (!SvUTF8(src)) {
+ SV* tmpsv = sv_mortalcopy(src);
+ if (!SvPOK(tmpsv))
+ (void)sv_pvn_force(tmpsv,&srclen);
+ sv_utf8_upgrade(tmpsv);
+ s = (U8*)SvPV(tmpsv,srclen);
+ }
+ e = s + srclen;
+
+ for (p = s; p < e; p += retlen) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
+ XPUSHs(sv_2mortal(newSVuv(uv)));
+ }
+
diff --git a/cpan/Unicode-Collate/Collate/Locale.pm b/cpan/Unicode-Collate/Collate/Locale.pm
index 5dddfb82a7..c5891444b9 100644
--- a/cpan/Unicode-Collate/Collate/Locale.pm
+++ b/cpan/Unicode-Collate/Collate/Locale.pm
@@ -4,12 +4,11 @@ use strict;
use Carp;
use base qw(Unicode::Collate);
-our $VERSION = '0.68';
+our $VERSION = '0.71';
use File::Spec;
(my $ModPath = $INC{'Unicode/Collate/Locale.pm'}) =~ s/\.pm$//;
-my $KeyPath = File::Spec->catfile('allkeys.txt');
my $PL_EXT = '.pl';
my %LocaleFile = map { ($_, $_) } qw(
@@ -71,7 +70,6 @@ sub new {
if (exists $hash{table}) {
croak "your table can't be used with Unicode::Collate::Locale";
}
- $hash{table} = $KeyPath;
my $href = _fetchpl($hash{accepted_locale});
while (my($k,$v) = each %$href) {
@@ -297,7 +295,7 @@ tailored as well as it. For example, even though W is tailored,
fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
tailored. The result may depend on whether source strings are
normalized or not, and whether decomposed or composed.
-Thus C<(normalization =E<gt> undef> is less preferred.
+Thus C<(normalization =E<gt> undef)> is less preferred.
=back
@@ -305,7 +303,7 @@ Thus C<(normalization =E<gt> undef> is less preferred.
The Unicode::Collate::Locale module for perl was written
by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
-This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan.
+This module is Copyright(C) 2004-2011, SADAHIRO Tomoyuki. Japan.
All rights reserved.
This module is free software; you can redistribute it and/or
diff --git a/cpan/Unicode-Collate/Makefile.PL b/cpan/Unicode-Collate/Makefile.PL
new file mode 100644
index 0000000000..30d6fc0aee
--- /dev/null
+++ b/cpan/Unicode-Collate/Makefile.PL
@@ -0,0 +1,28 @@
+require 5.006001;
+use ExtUtils::MakeMaker;
+
+my $clean = {};
+
+if (-f "Collate.xs") {
+ print STDERR "Making header files for XS...\n";
+
+ do 'mkheader' or die $@ || "mkheader: $!";
+
+ $clean = { FILES => 'ucatbl.h' };
+}
+
+WriteMakefile(
+ 'INSTALLDIRS' => $] >= 5.007002 ? 'perl' : 'site',
+ 'NAME' => 'Unicode::Collate',
+ 'VERSION_FROM' => 'Collate.pm', # finds $VERSION
+ 'clean' => $clean,
+ 'PREREQ_PM' => {
+ Carp => 0,
+ constant => 0,
+ DynaLoader => 0,
+ File::Spec => 0,
+ strict => 0,
+ Test => 0,
+ warnings => 0,
+ },
+);
diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README
index 16bf8c4aa7..743d713043 100644
--- a/cpan/Unicode-Collate/README
+++ b/cpan/Unicode-Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.68
+Unicode/Collate version 0.72
===============================
NAME
@@ -40,6 +40,7 @@ INSTALL
gendata/*, and mklocale.
Tests for Unicode::Collate::Locale are named t/loc_*.t.
+Since 0.54, XSUB that requires a C compiler will be built by default.
To install this module type the following:
perl Makefile.PL
@@ -47,20 +48,20 @@ To install this module type the following:
make test
make install
-If you have a C compiler and want to use XSUB edition,
-type the following (!! "enableXS" must run before "Makefile.PL" !!):
+Even if a C compiler is not available, pure Perl (i.e. non-XS) edition
+is available; type the following:
- perl enableXS
+ perl disableXS
perl Makefile.PL
make
make test
make install
-If you decide to install pure Perl (i.e. non-XS) edition after trying
-to build XSUB, type the following:
+If you decide to install XSUB edition after trying to build pure Perl,
+type the following:
make clean
- perl disableXS
+ perl enableXS
perl Makefile.PL
make
make test
@@ -107,7 +108,7 @@ HOW TO CHANGE DUCET (NOT WARRANTED)
AUTHOR, COPYRIGHT AND LICENSE
The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
-<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
+<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2011,
SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
diff --git a/cpan/Unicode-Collate/mkheader b/cpan/Unicode-Collate/mkheader
new file mode 100644
index 0000000000..dde4ee110c
--- /dev/null
+++ b/cpan/Unicode-Collate/mkheader
@@ -0,0 +1,196 @@
+#!perl
+#
+# This auxiliary script makes five header files
+# used for building XSUB of Unicode::Collate.
+#
+# Usage:
+# <do 'mkheader'> in perl, or <perl mkheader> in command line
+#
+# Input file:
+# Collate/allkeys.txt
+#
+# Output file:
+# ucatbl.h
+#
+use 5.006;
+use strict;
+use warnings;
+use Carp;
+use File::Spec;
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ die "Unicode::Collate cannot stringify a Unicode code point\n";
+ }
+}
+
+use constant TRUE => 1;
+use constant FALSE => "";
+use constant VCE_TEMPLATE => 'Cn4';
+
+sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
+
+our $PACKAGE = 'Unicode::Collate, mkheader';
+our $prefix = "UCA_";
+
+our %SimpleEntries; # $codepoint => $keys
+our @Rest;
+
+{
+ my($f, $fh);
+ foreach my $d ('.') {
+ $f = File::Spec->catfile($d, "Collate", "allkeys.txt");
+ last if open($fh, $f);
+ $f = undef;
+ }
+ croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f;
+
+ while (my $line = <$fh>) {
+ next if $line =~ /^\s*#/;
+ if ($line =~ /^\s*\@/) {
+ push @Rest, $line;
+ next;
+ }
+
+ next if $line !~ /^\s*[0-9A-Fa-f]/;
+
+ $line =~ s/[#%]\s*(.*)//; # removing comment (not getting the name)
+
+ # gets element
+ my($e, $k) = split /;/, $line;
+
+ croak "Wrong Entry: <charList> must be separated by ';' ".
+ "from <collElement>" if ! $k;
+
+ my @uv = _getHexArray($e);
+ next if !@uv;
+
+ if (@uv != 1) {
+ push @Rest, $line;
+ next;
+ }
+
+ my $is_L3_ignorable = TRUE;
+
+ my @key;
+ foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
+ my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
+ my @wt = _getHexArray($arr);
+ push @key, pack(VCE_TEMPLATE, $var, @wt);
+ $is_L3_ignorable = FALSE
+ if $wt[0] || $wt[1] || $wt[2];
+ # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
+ # is completely ignorable.
+ # For expansion, an entry $is_L3_ignorable
+ # if and only if "all" CEs are [.0000.0000.0000].
+ }
+ my $mapping = $is_L3_ignorable ? [] : \@key;
+ my $num = @$mapping;
+ my $str = chr($num).join('', @$mapping);
+ $SimpleEntries{$uv[0]} = stringify($str);
+ }
+}
+
+sub stringify {
+ my $str = shift;
+ return sprintf '"%s"', join '',
+ map sprintf("\\x%02x", ord $_), split //, $str;
+
+}
+
+########## writing header files ##########
+
+my $init = '';
+{
+ my $type = "char*";
+ my $head = $prefix."rest";
+
+ $init .= "static $type $head [] = {\n";
+ for my $line (@Rest) {
+ $line =~ s/\s*\z//;
+ next if $line eq '';
+ $init .= "/*$line*/\n" if $line =~ /^[A-Za-z0-9_.:;@\ \[\]]+\z/;
+ $init .= "($type)".stringify($line).",\n";
+ }
+ $init .= "NULL\n"; # sentinel
+ $init .= "};\n\n";
+}
+
+my @tripletable = (
+ {
+ file => "ucatbl",
+ name => "simple",
+ type => "char*",
+ hash => \%SimpleEntries,
+ null => "NULL",
+ init => $init,
+ },
+);
+
+foreach my $tbl (@tripletable) {
+ my $file = "$tbl->{file}.h";
+ my $head = "${prefix}$tbl->{name}";
+ my $type = $tbl->{type};
+ my $hash = $tbl->{hash};
+ my $null = $tbl->{null};
+ my $init = $tbl->{init};
+
+ open FH, ">$file" or croak "$PACKAGE: $file can't be made";
+ binmode FH; select FH;
+ my %val;
+
+ print FH << 'EOF';
+/*
+ * This file is auto-generated by mkheader.
+ * Any changes here will be lost!
+ */
+EOF
+
+ print $init if defined $init;
+
+ foreach my $uv (keys %$hash) {
+ croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
+ unless $uv <= 0x10FFFF;
+ my @c = unpack 'CCCC', pack 'N', $uv;
+ $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
+ }
+
+ foreach my $p (sort { $a <=> $b } keys %val) {
+ next if ! $val{ $p };
+ for (my $r = 0; $r < 256; $r++) {
+ next if ! $val{ $p }{ $r };
+ printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
+ for (my $c = 0; $c < 256; $c++) {
+ print "\t", defined $val{$p}{$r}{$c}
+ ? "($type)".$val{$p}{$r}{$c}
+ : $null;
+ print ',' if $c != 255;
+ print "\n" if $c % 8 == 7;
+ }
+ print "};\n\n";
+ }
+ }
+ foreach my $p (sort { $a <=> $b } keys %val) {
+ next if ! $val{ $p };
+ printf "static $type* ${head}_%02x [256] = {\n", $p;
+ for (my $r = 0; $r < 256; $r++) {
+ print $val{ $p }{ $r }
+ ? sprintf("${head}_%02x_%02x", $p, $r)
+ : "NULL";
+ print ',' if $r != 255;
+ print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
+ }
+ print "};\n\n";
+ }
+ print "static $type** $head [] = {\n";
+ for (my $p = 0; $p <= 0x10; $p++) {
+ print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
+ print ',' if $p != 0x10;
+ print "\n";
+ }
+ print "};\n\n";
+ close FH;
+}
+
+1;
+__END__
diff --git a/cpan/Unicode-Collate/t/loc_test.t b/cpan/Unicode-Collate/t/loc_test.t
index d1b5b4a1e4..8d7d74a816 100644
--- a/cpan/Unicode-Collate/t/loc_test.t
+++ b/cpan/Unicode-Collate/t/loc_test.t
@@ -12,7 +12,7 @@ BEGIN {
}
use Test;
-BEGIN { plan tests => 116 };
+BEGIN { plan tests => 120 };
use strict;
use warnings;
@@ -127,3 +127,14 @@ our @sortFr = $objFr->sort(@randFr);
ok("@sortFr" eq "@listFr");
# 116
+
+{
+ my $keyXS = '__useXS'; # see Unicode::Collate internal
+ my $noLoc = Unicode::Collate->new(normalization => undef);
+ my $UseXS = ref($noLoc->{$keyXS});
+ ok(ref($Collator->{$keyXS}), $UseXS);
+ ok(ref($objFr ->{$keyXS}), $UseXS);
+ ok(ref($objEs ->{$keyXS}), $UseXS);
+ ok(ref($objEsT ->{$keyXS}), $UseXS);
+}
+# 120
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index ce7efdf95b..20359219b0 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -96,6 +96,13 @@ XXX
=item *
+C<Unicode::Collate> has been upgraded from version 0.68 to 0.72
+
+This also sees the switch from using the pure-perl version of this
+module to the XS version.`
+
+=item *
+
XXX
=back