ext/Encode/t/encoding.t


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

BEGIN {
    require Config; import Config;
    if ($Config{'extensions'} !~ /\bEncode\b/) {
      print "1..0 # Skip: Encode was not built\n";
      exit 0;
    }
    unless (find PerlIO::Layer 'perlio') {
    print "1..0 # Skip: PerlIO was not built\n";
    exit 0;
    }
    if (ord("A") == 193) {
    print "1..0 # encoding pragma does not support EBCDIC platforms\n";
    exit(0);
    }
}

print "1..31\n";

use encoding "latin1"; # ignored (overwritten by the next line)
use encoding "greek";  # iso 8859-7 (no "latin" alias, surprise...)

# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)

$a = "\xDF";
$b = "\x{100}";

print "not " unless ord($a) == 0x3af;
print "ok 1\n";

print "not " unless ord($b) == 0x100;
print "ok 2\n";

my $c;

$c = $a . $b;

print "not " unless ord($c) == 0x3af;
print "ok 3\n";

print "not " unless length($c) == 2;
print "ok 4\n";

print "not " unless ord(substr($c, 1, 1)) == 0x100;
print "ok 5\n";

print "not " unless ord(chr(0xdf)) == 0x3af; # spooky
print "ok 6\n";

print "not " unless ord(pack("C", 0xdf)) == 0x3af;
print "ok 7\n";

# we didn't break pack/unpack, I hope

print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
print "ok 8\n";

# the first octet of UTF-8 encoded 0x3af 
print "not " unless unpack("U0 C", chr(0xdf)) == 0xce;
print "ok 9\n";

print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
print "ok 10\n";

print "not " unless unpack("U", chr(0xdf)) == 0x3af;
print "ok 11\n";

# charnames must still work
use charnames ':full';
print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf;
print "ok 12\n";

# combine

$c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf);

print "not " unless ord($c) == 0x3af;
print "ok 13\n";

print "not " unless ord(substr($c, 1, 1)) == 0xdf;
print "ok 14\n";

print "not " unless ord(substr($c, 2, 1)) == 0x3af;
print "ok 15\n";

# regex literals

print "not " unless "\xDF"    =~ /\x{3AF}/;
print "ok 16\n";

print "not " unless "\x{3AF}" =~ /\xDF/;
print "ok 17\n";

print "not " unless "\xDF"    =~ /\xDF/;
print "ok 18\n";

print "not " unless "\x{3AF}" =~ /\x{3AF}/;
print "ok 19\n";

# eq, cmp

my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = ( 
    pack("C*", 0xDF ),       # byte
    pack("C*", 0xDF, 0x20),  # ($bytes2 cmp $U) > 0
    pack("U*", 0x3AF),       # $U eq $byte
    pack("U*", 0xDF ),       # $Ub would eq $bytev w/o use encoding
    pack("U*", 0x3B1),       # ($g1 cmp $byte) > 0; === chr(0xe1)
    pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0;
    pack("U*", 0x3AB),       # ($l  cmp $byte) < 0; === chr(0xdb)
);

# all the tests in this section that compare a byte encoded string 
# ato UTF-8 encoded are run in all possible vairants 
# all of the eq, ne, cmp operations tested,
# $v z $u tested as well as $u z $v

sub alleq($$){
    my ($a,$b)    =    (shift, shift);
     $a  eq  $b        &&     $b  eq  $a         && 
  !( $a  ne  $b )      &&  !( $b  ne  $a )       &&
   ( $a  cmp $b ) == 0 &&   ( $b  cmp $a ) == 0;
}
   
sub anyeq($$){
    my ($a,$b)    =    (shift, shift);
     $a  eq  $b        ||     $b  eq  $a         ||
  !( $a  ne  $b )      ||  !( $b  ne  $a )       ||
   ( $a  cmp $b ) == 0 ||   ( $b  cmp $a ) == 0;
}

sub allgt($$){
    my ($a,$b)    =    (shift, shift);
    ( $a cmp $b ) == 1 && ( $b cmp $a ) == -1;
}
#match the correct UTF-8 string
print "not " unless  alleq($byte, $U);
print "ok 20\n";

#do not match a wrong UTF-8 string
print "not " if anyeq($byte, $Ub);
print "ok 21\n";

#string ordering
print "not " unless allgt ( $g1,    $byte  )  &&
                    allgt ( $g2,    $byte  )  &&
                    allgt ( $byte,  $l     )  &&
                    allgt ( $bytes, $U     );
print "ok 22\n";

# upgrade, downgrade

my ($u,$v,$v2);
$u = $v = $v2 = pack("C*", 0xDF);
utf8::upgrade($v);                   #explicit upgrade
$v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade

# implicit upgrade === explicit upgrade
print "not "  if do{{use bytes; $v ne $v2}} || $v ne $v2;
print "ok 23\n";

# utf8::upgrade is transparent and does not break equality
print "not " unless alleq( $u, $v );
print "ok 24\n";

$u = $v = pack("C*", 0xDF);
utf8::upgrade($v);
#test for a roundtrip, we should get back from where we left
eval {utf8::downgrade( $v )};
print "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v;
print "ok 25\n";

# some more eq, cmp

$byte=pack("C*", 0xDF);

print "not " unless pack("U*", 0x3AF) eq $byte;
print "ok 26\n";

print "not " if chr(0xDF) cmp $byte;
print "ok 27\n";

print "not " unless ((pack("U*", 0x3B0)       cmp $byte) ==  1) &&
                    ((pack("U*", 0x3AE)       cmp $byte) == -1) &&
                    ((pack("U*", 0x3AF, 0x20) cmp $byte) ==  1) &&
                ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
print "ok 28\n";


{
    # Used to core dump in 5.7.3
    no warnings; # so test goes noiselessly
    print ord(undef) == 0 ? "ok 29\n" : "not ok 29\n";
}

{
    my %h1;
    my %h2;
    $h1{"\xdf"}    = 41;
    $h2{"\x{3af}"} = 42;
    print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n";
    print $h2{"\xdf"}    == 42 ? "ok 31\n" : "not ok 31\n";
}