summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-01-30 18:17:11 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-02-04 16:29:32 -0700
commit: 34132297113975a3522f23d745e0ccf336803994 (patch)
tree: 46168c9561d1fe5fc4ebe13d5077e514a14dea98 /lib
parent: bea2c146352c47a938243d84d1a4fa99f7a328bb (diff)
download: perl-34132297113975a3522f23d745e0ccf336803994.tar.gz
Unicode::UCD::prop_invmap(): Make the NFKCCF property return deltas
The file for this property is stored in the old-style format for backward compatibility with any applications that might be reading it directly. But the values should be returned through the Unicode::UCD API as deltas for consistency with other, similar properties.
Diffstat (limited to 'lib')
-rw-r--r-- lib/Unicode/UCD.pm | 46
-rw-r--r-- lib/Unicode/UCD.t | 27
2 files changed, 48 insertions, 25 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index de62e5035a..3473ecbe1a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -2398,20 +2398,18 @@ that are lists, and the addition is extra work.
=item B<C<cle>>
-is like C<cl> except that, for the time being, as an interim measure, the map
-returned for simple scalars is the correct value and the code point should NOT
-be added to it. Also, some of the map array elements have the forms given by C<cl>, and
+means that some of the map array elements have the forms given by C<cl>, and
the rest are the empty string. The property C<NFKC_Casefold> has this form.
An example slice is:
@$ranges_ref @$maps_ref Note
...
- 0x00AA 0x0061 FEMININE ORDINAL INDICATOR => 'a'
- 0x00AB <code point>
+ 0x00AA -73 FEMININE ORDINAL INDICATOR => 'a'
+ 0x00AB 0
0x00AD SOFT HYPHEN => ""
- 0x00AE <code point>
+ 0x00AE 0
0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON
- 0x00B0 <code point>
+ 0x00B0 0
...
=item B<C<n>>
@@ -2576,8 +2574,9 @@ RETRY:
# new-style, and this routine is supposed to return old-style block names.
# The Name table is valid, but we need to execute the special code below
# to add in the algorithmic-defined name entries.
+ # And NFKCCF needs conversion, so handle that here too.
if (ref $swash eq ""
- || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na) $ /x)
+ || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
{
# Get the short name of the input property, in standard form
@@ -2798,6 +2797,35 @@ RETRY:
}
$swash = \%decomps;
}
+ elsif ($second_try eq 'nfkccf') {
+
+ # This property is stored in the old format for backwards
+ # compatibility for any applications that read its file directly.
+ # So here we convert it to delta format for compatibility with the
+ # other properties similar to it.
+ my %nfkccf;
+
+ # Create a new LIST with deltas instead of code points.
+ my $list = "";
+ foreach my $range (split "\n", $swash->{'LIST'}) {
+ my ($hex_begin, $hex_end, $map) = split "\t", $range;
+ my $begin = hex $hex_begin;
+ my $end = (defined $hex_end && $hex_end ne "")
+ ? hex $hex_end
+ : $begin;
+ my $decimal_map = hex $map;
+ foreach my $code_point ($begin .. $end) {
+ $list .= sprintf("%04X\t\t%d\n", $code_point, $decimal_map - $code_point);
+ }
+ }
+
+ $nfkccf{'LIST'} = $list;
+ $nfkccf{'TYPE'} = "ToNFKCCF";
+ $nfkccf{'SPECIALS'} = $swash->{'SPECIALS'};
+ $swash = \%nfkccf;
+ $utf8::SwashInfo{'ToNFKCCF'}{'missing'} = 0;
+ $utf8::SwashInfo{'ToNFKCCF'}{'format'} = 'i';
+ }
else { # Don't know this property. Fail.
return;
}
@@ -2809,7 +2837,7 @@ RETRY:
}
# Here, have a valid swash return. Examine it.
- my $returned_prop = $swash->{TYPE};
+ my $returned_prop = $swash->{'TYPE'};
# All properties but binary ones should have 'missing' and 'format'
# entries
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 45573de07c..530c548694 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1259,13 +1259,6 @@ foreach my $prop (keys %props) {
next PROPERTY;
}
}
- elsif ($name eq 'nfkccf') { # This one has an atypical $missing
- if ($missing ne "<code point>") {
- fail("prop_invmap('$mod_prop')");
- diag("The missings should be \"\"; got '$missing'");
- next PROPERTY;
- }
- }
elsif ($format =~ /^ c /x) {
if ($missing ne "0") {
fail("prop_invmap('$mod_prop')");
@@ -1619,13 +1612,15 @@ foreach my $prop (keys %props) {
next PROPERTY;
}
}
- elsif ($format eq 'd') {
-
- # The numerics in the map are stored as deltas. The defaults
- # are 0, and don't appear in $official, and are excluded
- # later, but the elements must be converted back to their real
- # code point values before comparing with $official, as that
- # file, for backwards compatibility, is not stored as deltas
+ elsif ($format eq 'd' || $format eq 'cle') {
+
+ # The numerics in the returned map are stored as deltas. The
+ # defaults are 0, and don't appear in $official, and are
+ # excluded later, but the elements must be converted back to
+ # their real code point values before comparing with
+ # $official, as these files, for backwards compatibility, are
+ # not stored as deltas. (There currently is only one cle
+ # property, nfkccf. If that changed this would also have to.)
if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
&& $invmap_ref->[$i] != 0)
{
@@ -1644,8 +1639,7 @@ foreach my $prop (keys %props) {
splice @$invmap_ref, $i+1, 0, $delta;
}
}
- }
- elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") {
+ if ($format eq 'cle' && $invmap_ref->[$i] eq "") {
# cle properties have maps to the empty string that also
# should be in the specials hash, with the key the packed code
@@ -1673,6 +1667,7 @@ foreach my $prop (keys %props) {
next PROPERTY;
}
next;
+ }
}
elsif ($is_binary) { # These binary files don't have an explicit Y
$invmap_ref->[$i] =~ s/Y//;