From b1a770688af71aab31f77b935b0a5019726caa75 Mon Sep 17 00:00:00 2001 From: Gisle Aas Date: Sat, 7 Nov 2009 18:44:16 +0100 Subject: Make as_iri leave escapes not forming valid UTF-8 sequences --- URI.pm | 18 +++++++++++++++++- t/iri.t | 5 ++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/URI.pm b/URI.pm index 200104e..9eb5395 100644 --- a/URI.pm +++ b/URI.pm @@ -270,8 +270,24 @@ sub as_iri } } if ($str =~ s/%([89A-F][0-9A-F])/chr(hex($1))/eg) { + # All this crap because the more obvious: + # + # Encode::decode("UTF-8", $str, sub { sprintf "%%%02X", shift }) + # + # doesn't work. Apparently passing a sub as CHECK only works + # for 'ascii' and similar direct encodings. + require Encode; - return Encode::decode("UTF-8", $str); + my $enc = Encode::find_encoding("UTF-8"); + my $u = ""; + while (length $str) { + $u .= $enc->decode($str, Encode::FB_QUIET()); + if (length $str) { + # escape next char + $u .= URI::Escape::escape_char(substr($str, 0, 1, "")); + } + } + $str = $u; } return $str; } diff --git a/t/iri.t b/t/iri.t index a42317e..876b4a4 100644 --- a/t/iri.t +++ b/t/iri.t @@ -2,7 +2,7 @@ use utf8; use strict; -use Test::More tests => 10; +use Test::More tests => 11; use URI; @@ -18,6 +18,9 @@ $u = URI->new("http://example.com/Bücher"); is $u, "http://example.com/B%C3%BCcher"; is $u->as_iri, "http://example.com/Bücher"; +$u = URI->new("http://example.com/B%FCcher"); # latin1 encoded stuff +is $u->as_iri, "http://example.com/B%FCcher"; # ...should not be decoded + $u = URI->new("http://➡.ws/"); is $u, "http://xn--hgi.ws/"; is $u->host, "xn--hgi.ws"; -- cgit v1.2.1