summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/Encode/t/encoding.t2
-rw-r--r--ext/Storable/t/utf8hash.t9
-rw-r--r--ext/Unicode/Normalize/t/short.t2
-rw-r--r--lib/CGI/Util.pm4
-rw-r--r--pod/perlfunc.pod7
-rw-r--r--pp_pack.c44
-rw-r--r--t/op/chr.t2
-rwxr-xr-xt/op/pack.t11
-rw-r--r--t/op/utftaint.t5
9 files changed, 42 insertions, 44 deletions
diff --git a/ext/Encode/t/encoding.t b/ext/Encode/t/encoding.t
index 67ea068087..b17b11fc44 100644
--- a/ext/Encode/t/encoding.t
+++ b/ext/Encode/t/encoding.t
@@ -57,7 +57,7 @@ print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
print "ok 8\n";
# the first octet of UTF-8 encoded 0x3af
-print "not " unless unpack("C", chr(0xdf)) == 0xce;
+print "not " unless unpack("U0 C", chr(0xdf)) == 0xce;
print "ok 9\n";
print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
diff --git a/ext/Storable/t/utf8hash.t b/ext/Storable/t/utf8hash.t
index eeb80eb86e..a529ea5626 100644
--- a/ext/Storable/t/utf8hash.t
+++ b/ext/Storable/t/utf8hash.t
@@ -34,7 +34,7 @@ use Storable qw(store nstore retrieve thaw freeze);
}
# Better than no plan, because I was getting out of memory errors, at which
# point Test::More tidily prints up 1..79 as if I meant to finish there.
-use Test::More tests=>148;
+use Test::More tests=>144;
use bytes ();
my %utf8hash;
@@ -57,13 +57,10 @@ my @ords = (
foreach my $i (@ords){
my $u = chr($i); utf8::upgrade($u);
# warn sprintf "%d,%d", bytes::length($u), is_utf8($u);
- my $b = pack("C*", unpack("C*", $u));
+ my $b = chr($i); utf8::encode($b);
# warn sprintf "%d,%d" ,bytes::length($b), is_utf8($b);
- isnt($u, $b,
- "equivalence - with utf8flag");
- is (pack("C*", unpack("C*", $u)), pack("C*", unpack("C*", $b)),
- "equivalence - without utf8flag");
+ isnt($u, $b, "equivalence - with utf8flag");
$utf8hash{$u} = $utf8hash{$b} = $i;
}
diff --git a/ext/Unicode/Normalize/t/short.t b/ext/Unicode/Normalize/t/short.t
index d799f4a096..a9e444f98a 100644
--- a/ext/Unicode/Normalize/t/short.t
+++ b/ext/Unicode/Normalize/t/short.t
@@ -35,7 +35,7 @@ print "ok 1\n";
no warnings qw(utf8);
# U+3042 is 3-byte length (in UTF-8/UTF-EBCDIC)
-our $a = pack 'U0C', unpack 'C', "\x{3042}";
+our $a = pack 'U0C', unpack 'U0C', "\x{3042}";
print NFD($a) eq "\0"
? "ok" : "not ok", " 2\n";
diff --git a/lib/CGI/Util.pm b/lib/CGI/Util.pm
index 9cef416b96..0cb6e51a61 100644
--- a/lib/CGI/Util.pm
+++ b/lib/CGI/Util.pm
@@ -200,8 +200,8 @@ sub escape {
shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass));
my $toencode = shift;
return undef unless defined($toencode);
- # force bytes while preserving backward compatibility -- dankogai
- $toencode = pack("C*", unpack("C*", $toencode));
+ # we enforce UTF-8 encoding for URLs for no good reason except UTF-8 being the future
+ utf8::encode $toencode;
if ($EBCDIC) {
$toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg;
} else {
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index 1755bcefaf..101d10e9fb 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -3505,8 +3505,7 @@ of values, as follows:
H A hex string (high nybble first).
c A signed char (8-bit) value.
- C An unsigned C char (octet) even under Unicode. Should normally not
- be used. See U and W instead.
+ C An unsigned char (octet) value.
W An unsigned char value (can be greater than 255).
s A signed short (16-bit) value.
@@ -3547,8 +3546,8 @@ of values, as follows:
P A pointer to a structure (fixed-length string).
u A uuencoded string.
- U A Unicode character number. Encodes to UTF-8 internally
- (or UTF-EBCDIC in EBCDIC platforms).
+ U A Unicode character number. Encodes to a character in character mode
+ and UTF-8 (or UTF-EBCDIC in EBCDIC platforms) in byte mode.
w A BER compressed integer (not an ASN.1 BER, see perlpacktut for
details). Its bytes represent an unsigned integer in base 128,
diff --git a/pp_pack.c b/pp_pack.c
index 7aa95a9747..76e631511c 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -26,7 +26,6 @@
* other pp*.c files for the rest of the pp_ functions.
*/
-
#include "EXTERN.h"
#define PERL_IN_PP_PACK_C
#include "perl.h"
@@ -381,7 +380,7 @@ STATIC const packprops_t packprops[512] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0,
- /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE,
+ /* C */ sizeof(unsigned char),
#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
/* D */ LONG_DOUBLESIZE,
#else
@@ -532,7 +531,7 @@ STATIC const packprops_t packprops[512] = {
/* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE,
+ /* C */ sizeof(unsigned char),
#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
/* D */ LONG_DOUBLESIZE,
#else
@@ -1562,10 +1561,29 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
XPUSHs(sv);
break;
}
+ case 'C':
+ if (len == 0) {
+ if (explicit_length)
+ /* Switch to "character" mode */
+ utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
+ break;
+ }
+ /* FALL THROUGH */
case 'c':
- while (len-- > 0) {
- int aint = SHIFT_BYTE(utf8, s, strend, datumtype);
- if (aint >= 128) /* fake up signed chars */
+ while (len-- > 0 && s < strend) {
+ int aint;
+ if (utf8)
+ {
+ STRLEN retlen;
+ aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ if (retlen == (STRLEN) -1 || retlen == 0)
+ Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
+ s += retlen;
+ }
+ else
+ aint = *(U8 *)(s)++;
+ if (aint >= 128 && datumtype != 'C') /* fake up signed chars */
aint -= 256;
if (!checksum)
PUSHs(sv_2mortal(newSViv((IV)aint)));
@@ -1575,18 +1593,9 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
cuv += aint;
}
break;
- case 'C':
case 'W':
W_checksum:
- if (len == 0) {
- if (explicit_length && datumtype == 'C')
- /* Switch to "character" mode */
- utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
- break;
- }
- if (datumtype == 'C' ?
- (symptr->flags & FLAG_DO_UTF8) &&
- !(symptr->flags & FLAG_WAS_UTF8) : utf8) {
+ if (utf8) {
while (len-- > 0 && s < strend) {
STRLEN retlen;
const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
@@ -2930,7 +2939,6 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
break;
}
- GROWING(0, cat, start, cur, len);
while (len-- > 0) {
IV aiv;
fromstr = NEXTFROM;
@@ -2939,7 +2947,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
ckWARN(WARN_PACK))
Perl_warner(aTHX_ packWARN(WARN_PACK),
"Character in 'C' format wrapped in pack");
- *cur++ = (char)(aiv & 0xff);
+ PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
}
break;
case 'W': {
diff --git a/t/op/chr.t b/t/op/chr.t
index 056f11ab92..5ac453f427 100644
--- a/t/op/chr.t
+++ b/t/op/chr.t
@@ -37,7 +37,7 @@ SKIP: {
sub hexes {
no warnings 'utf8'; # avoid surrogate and beyond Unicode warnings
- join(" ",map{sprintf"%02x",$_}unpack("C*",chr($_[0])));
+ join(" ",unpack "U0 (H2)*", chr $_[0]);
}
# The following code points are some interesting steps in UTF-8.
diff --git a/t/op/pack.t b/t/op/pack.t
index f37c73f2e8..ef88540106 100755
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -12,7 +12,7 @@ my $no_endianness = $] > 5.009 ? '' :
my $no_signedness = $] > 5.009 ? '' :
"Signed/unsigned pack modifiers not available on this perl";
-plan tests => 14697;
+plan tests => 14696;
use strict;
use warnings qw(FATAL all);
@@ -918,7 +918,7 @@ SKIP: {
isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000));
my $rslt = $Is_EBCDIC ? "156 67" : "199 162";
-is(join(" ", unpack("C*", chr(0x1e2))), $rslt);
+is(join(" ", unpack("U0 C*", chr(0x1e2))), $rslt);
# does pack U create Unicode?
is(ord(pack('U', 300)), 300);
@@ -936,9 +936,6 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200");
SKIP: {
skip "Not for EBCDIC", 4 if $Is_EBCDIC;
- # does unpack C unravel pack U?
- is("@{[unpack('C*', pack('U*', 100, 200))]}", "100 195 136");
-
# does pack U0C create Unicode?
is("@{[pack('U0C*', 100, 195, 136)]}", v100.v200);
@@ -1648,7 +1645,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_
}
{
- # C is *not* neutral
+ # C *is* neutral
my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06";
my $up = $down;
utf8::upgrade($up);
@@ -1658,7 +1655,7 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_
is(pack("C*", @down), $down, "byte join");
my @up = unpack("C*", $up);
- my @expect_up = (0xc3, 0xb8, 0xc3, 0xb9, 0xc3, 0xba, 0xc3, 0xbb, 0xc3, 0xbc, 0xc3, 0xbd, 0xc3, 0xbe, 0xc3, 0xbf, 0x05, 0x06);
+ my @expect_up = (0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x05, 0x06);
is("@up", "@expect_up", "UTF-8 expand");
is(pack("U0C0C*", @up), $up, "UTF-8 join");
}
diff --git a/t/op/utftaint.t b/t/op/utftaint.t
index d6e900df0f..df99c8dc43 100644
--- a/t/op/utftaint.t
+++ b/t/op/utftaint.t
@@ -23,10 +23,7 @@ plan(tests => 3*10 + 3*8 + 2*16 + 2);
my $arg = $ENV{PATH}; # a tainted value
use constant UTF8 => "\x{1234}";
-sub is_utf8 {
- my $s = shift;
- return 0xB6 != unpack('C', chr(0xB6).$s);
-}
+*is_utf8 = \&utf8::is_utf8;
for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) {
my $encode = $ary->[0];