diff options
author | Zoltan Arvai <zarvai@inf.u-szeged.hu> | 2014-03-27 17:27:22 +0100 |
---|---|---|
committer | Zoltan Arvai <zarvai@inf.u-szeged.hu> | 2014-03-28 18:46:12 +0100 |
commit | a6014652040e76de08e643b49b69fc97cb5bfd62 (patch) | |
tree | 756e51a1a5fc717e2a15a84aca686eb7fd43ff7d /chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode | |
parent | d12a5818c08a6e4ca207a0bb1688cb4d82c20460 (diff) | |
download | qtwebengine-chromium-a6014652040e76de08e643b49b69fc97cb5bfd62.tar.gz |
Add perl to cygwin
On Windows third_party/WebKit build depends on cygwin's perl version.
Change-Id: Icf6393906c0f977fca9ff652a8abca9dacb60765
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode')
23 files changed, 3893 insertions, 0 deletions
diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Alias.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Alias.pm new file mode 100644 index 00000000000..e7db0ca719b --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Alias.pm @@ -0,0 +1,375 @@ +package Encode::Alias; +use strict; +use warnings; +no warnings 'redefine'; +our $VERSION = do { my @r = ( q$Revision: 2.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +sub DEBUG () { 0 } + +use base qw(Exporter); + +# Public, encouraged API is exported by default + +our @EXPORT = + qw ( + define_alias + find_alias +); + +our @Alias; # ordered matching list +our %Alias; # cached known aliases + +sub find_alias { + require Encode; + my $class = shift; + my $find = shift; + unless ( exists $Alias{$find} ) { + $Alias{$find} = undef; # Recursion guard + for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { + my $alias = $Alias[$i]; + my $val = $Alias[ $i + 1 ]; + my $new; + if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { + DEBUG and warn "eval $val"; + $new = eval $val; + DEBUG and $@ and warn "$val, $@"; + } + elsif ( ref($alias) eq 'CODE' ) { + DEBUG and warn "$alias", "->", "($find)"; + $new = $alias->($find); + } + elsif ( lc($find) eq lc($alias) ) { + $new = $val; + } + if ( defined($new) ) { + next if $new eq $find; # avoid (direct) recursion on bugs + DEBUG and warn "$alias, $new"; + my $enc = + ( ref($new) ) ? $new : Encode::find_encoding($new); + if ($enc) { + $Alias{$find} = $enc; + last; + } + } + } + + # case insensitive search when canonical is not in all lowercase + # RT ticket #7835 + unless ( $Alias{$find} ) { + my $lcfind = lc($find); + for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) + { + $lcfind eq lc($name) or next; + $Alias{$find} = Encode::find_encoding($name); + DEBUG and warn "$find => $name"; + } + } + } + if (DEBUG) { + my $name; + if ( my $e = $Alias{$find} ) { + $name = $e->name; + } + else { + $name = ""; + } + warn "find_alias($class, $find)->name = $name"; + } + return $Alias{$find}; +} + +sub define_alias { + while (@_) { + my ( $alias, $name ) = splice( @_, 0, 2 ); + unshift( @Alias, $alias => $name ); # newer one has precedence + if ( ref($alias) ) { + + # clear %Alias cache to allow overrides + my @a = keys %Alias; + for my $k (@a) { + if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { + DEBUG and warn "delete \$Alias\{$k\}"; + delete $Alias{$k}; + } + elsif ( ref($alias) eq 'CODE' ) { + DEBUG and warn "delete \$Alias\{$k\}"; + delete $Alias{ $alias->($name) }; + } + } + } + else { + DEBUG and warn "delete \$Alias\{$alias\}"; + delete $Alias{$alias}; + } + } +} + +# Allow latin-1 style names as well +# 0 1 2 3 4 5 6 7 8 9 10 +our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); + +# Allow winlatin1 style names as well +our %Winlatin2cp = ( + 'latin1' => 1252, + 'latin2' => 1250, + 'cyrillic' => 1251, + 'greek' => 1253, + 'turkish' => 1254, + 'hebrew' => 1255, + 'arabic' => 1256, + 'baltic' => 1257, + 'vietnamese' => 1258, +); + +init_aliases(); + +sub undef_aliases { + @Alias = (); + %Alias = (); +} + +sub init_aliases { + require Encode; + undef_aliases(); + + # Try all-lower-case version should all else fails + define_alias( qr/^(.*)$/ => '"\L$1"' ); + + # UTF/UCS stuff + define_alias( qr/^UTF-?7$/i => '"UTF-7"' ); + define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); + define_alias( + qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', + qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', + qr/^iso-10646-1$/i => '"UCS-2BE"' + ); + define_alias( + qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', + qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', + qr/^UTF-?(16|32)$/i => '"UTF-$1"', + ); + + # ASCII + define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); + define_alias( 'C' => 'ascii' ); + define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' ); + + # Allow variants of iso-8859-1 etc. + define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); + + # At least HP-UX has these. + define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); + + # More HP stuff. + define_alias( + qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => + '"${1}8"' ); + + # The Official name of ASCII. + define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); + + # This is a font issue, not an encoding issue. + # (The currency symbol of the Latin 1 upper half + # has been redefined as the euro symbol.) + define_alias( qr/^(.+)\@euro$/i => '"$1"' ); + + define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => +'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' + ); + + define_alias( + qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| + hebrew|arabic|baltic|vietnamese)$/ix => + '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' + ); + + # Common names for non-latin preferred MIME names + define_alias( + 'ascii' => 'US-ascii', + 'cyrillic' => 'iso-8859-5', + 'arabic' => 'iso-8859-6', + 'greek' => 'iso-8859-7', + 'hebrew' => 'iso-8859-8', + 'thai' => 'iso-8859-11', + ); + # RT #20781 + define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); + + # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. + # And Microsoft has their own naming (again, surprisingly). + # And windows-* is registered in IANA! + define_alias( + qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); + + # Sometimes seen with a leading zero. + # define_alias( qr/\bcp037\b/i => '"cp37"'); + + # Mac Mappings + # predefined in *.ucm; unneeded + # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); + define_alias( qr/^mac_(.*)$/i => '"mac$1"' ); + + # Ououououou. gone. They are differente! + # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); + + # Standardize on the dashed versions. + define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); + + unless ($Encode::ON_EBCDIC) { + + # for Encode::CN + define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); + define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); + + # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) + # CP936 doesn't have vendor-addon for GBK, so they're identical. + define_alias( qr/^gbk$/i => '"cp936"' ); + + # This fixes gb2312 vs. euc-cn confusion, practically + define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); + + # for Encode::JP + define_alias( qr/\bjis$/i => '"7bit-jis"' ); + define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); + define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); + define_alias( qr/\bujis$/i => '"euc-jp"' ); + define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); + define_alias( qr/\bsjis$/i => '"shiftjis"' ); + define_alias( qr/\bwindows-31j$/i => '"cp932"' ); + + # for Encode::KR + define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); + define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); + + # This fixes ksc5601 vs. euc-kr confusion, practically + define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); + define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); + define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); + + # for Encode::TW + define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); + define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); + define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); + define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); + define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); + } + + # utf8 is blessed :) + define_alias( qr/^UTF-8$/i => '"utf-8-strict"' ); + + # At last, Map white space and _ to '-' + define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); +} + +1; +__END__ + +# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 +# TODO: HP-UX '15' encodings japanese15 korean15 roi15 +# TODO: Cyrillic encoding ISO-IR-111 (useful?) +# TODO: Armenian encoding ARMSCII-8 +# TODO: Hebrew encoding ISO-8859-8-1 +# TODO: Thai encoding TCVN +# TODO: Vietnamese encodings VPS +# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese +# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic +# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese +# Kannada Khmer Korean Laotian Malayalam Mongolian +# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese + +=head1 NAME + +Encode::Alias - alias definitions to encodings + +=head1 SYNOPSIS + + use Encode; + use Encode::Alias; + define_alias( newName => ENCODING); + +=head1 DESCRIPTION + +Allows newName to be used as an alias for ENCODING. ENCODING may be +either the name of an encoding or an encoding object (as described +in L<Encode>). + +Currently I<newName> can be specified in the following ways: + +=over 4 + +=item As a simple string. + +=item As a qr// compiled regular expression, e.g.: + + define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); + +In this case, if I<ENCODING> is not a reference, it is C<eval>-ed +in order to allow C<$1> etc. to be substituted. The example is one +way to alias names as used in X11 fonts to the MIME names for the +iso-8859-* family. Note the double quotes inside the single quotes. + +(or, you don't have to do this yourself because this example is predefined) + +If you are using a regex here, you have to use the quotes as shown or +it won't work. Also note that regex handling is tricky even for the +experienced. Use this feature with caution. + +=item As a code reference, e.g.: + + define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); + +The same effect as the example above in a different way. The coderef +takes the alias name as an argument and returns a canonical name on +success or undef if not. Note the second argument is not required. +Use this with even more caution than the regex version. + +=back + +=head3 Changes in code reference aliasing + +As of Encode 1.87, the older form + + define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); + +no longer works. + +Encode up to 1.86 internally used "local $_" to implement ths older +form. But consider the code below; + + use Encode; + $_ = "eeeee" ; + while (/(e)/g) { + my $utf = decode('aliased-encoding-name', $1); + print "position:",pos,"\n"; + } + +Prior to Encode 1.86 this fails because of "local $_". + +=head2 Alias overloading + +You can override predefined aliases by simply applying define_alias(). +The new alias is always evaluated first, and when necessary, +define_alias() flushes the internal cache to make the new definition +available. + + # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a + # superset of SHIFT_JIS + + define_alias( qr/shift.*jis$/i => '"cp932"' ); + define_alias( qr/sjis$/i => '"cp932"' ); + +If you want to zap all predefined aliases, you can use + + Encode::Alias->undef_aliases; + +to do so. And + + Encode::Alias->init_aliases; + +gets the factory settings back. + +=head1 SEE ALSO + +L<Encode>, L<Encode::Supported> + +=cut + diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Byte.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Byte.pm new file mode 100644 index 00000000000..3ea9035b7bd --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Byte.pm @@ -0,0 +1,120 @@ +package Encode::Byte; +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +1; +__END__ + +=head1 NAME + +Encode::Byte - Single Byte Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $greek = encode("iso-8859-7", $utf8); # loads Encode::Byte implicitly + $utf8 = decode("iso-8859-7", $greek); # ditto + +=head1 ABSTRACT + +This module implements various single byte encodings. For most cases it uses +\x80-\xff (upper half) to map non-ASCII characters. Encodings +supported are as follows. + + Canonical Alias Description + -------------------------------------------------------------------- + # ISO 8859 series + (iso-8859-1 is in built-in) + iso-8859-2 latin2 [ISO] + iso-8859-3 latin3 [ISO] + iso-8859-4 latin4 [ISO] + iso-8859-5 [ISO] + iso-8859-6 [ISO] + iso-8859-7 [ISO] + iso-8859-8 [ISO] + iso-8859-9 latin5 [ISO] + iso-8859-10 latin6 [ISO] + iso-8859-11 + (iso-8859-12 is nonexistent) + iso-8859-13 latin7 [ISO] + iso-8859-14 latin8 [ISO] + iso-8859-15 latin9 [ISO] + iso-8859-16 latin10 [ISO] + + # Cyrillic + koi8-f + koi8-r cp878 [RFC1489] + koi8-u [RFC2319] + + # Vietnamese + viscii + + # all cp* are also available as ibm-*, ms-*, and windows-* + # also see L<http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp> + + cp424 + cp437 + cp737 + cp775 + cp850 + cp852 + cp855 + cp856 + cp857 + cp860 + cp861 + cp862 + cp863 + cp864 + cp865 + cp866 + cp869 + cp874 + cp1006 + cp1250 WinLatin2 + cp1251 WinCyrillic + cp1252 WinLatin1 + cp1253 WinGreek + cp1254 WinTurkish + cp1255 WinHebrew + cp1256 WinArabic + cp1257 WinBaltic + cp1258 WinVietnamese + + # Macintosh + # Also see L<http://developer.apple.com/technotes/tn/tn1150.html> + MacArabic + MacCentralEurRoman + MacCroatian + MacCyrillic + MacFarsi + MacGreek + MacHebrew + MacIcelandic + MacRoman + MacRomanian + MacRumanian + MacSami + MacThai + MacTurkish + MacUkrainian + + # More vendor encodings + AdobeStandardEncoding + nextstep + hp-roman8 + +=head1 DESCRIPTION + +To find how to use this module in detail, see L<Encode>. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CJKConstants.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CJKConstants.pm new file mode 100644 index 00000000000..43d20337544 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CJKConstants.pm @@ -0,0 +1,66 @@ +# +# $Id: CJKConstants.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $ +# + +package Encode::CJKConstants; + +use strict; +use warnings; +our $RCSID = q$Id: CJKConstants.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Carp; + +require Exporter; +our @ISA = qw(Exporter); +our @EXPORT = qw(); +our @EXPORT_OK = qw(%CHARCODE %ESC %RE); +our %EXPORT_TAGS = ( 'all' => [ @EXPORT_OK, @EXPORT ] ); + +my %_0208 = ( + 1978 => '\e\$\@', + 1983 => '\e\$B', + 1990 => '\e&\@\e\$B', +); + +our %CHARCODE = ( + UNDEF_EUC => "\xa2\xae", # ¢® in EUC + UNDEF_SJIS => "\x81\xac", # ¢® in SJIS + UNDEF_JIS => "\xa2\xf7", # ¢÷ -- used in unicode + UNDEF_UNICODE => "\x20\x20", # ¢÷ -- used in unicode +); + +our %ESC = ( + GB_2312 => "\e\$A", + JIS_0208 => "\e\$B", + JIS_0212 => "\e\$(D", + KSC_5601 => "\e\$(C", + ASC => "\e\(B", + KANA => "\e\(I", + '2022_KR' => "\e\$)C", +); + +our %RE = ( + ASCII => '[\x00-\x7f]', + BIN => '[\x00-\x06\x7f\xff]', + EUC_0212 => '\x8f[\xa1-\xfe][\xa1-\xfe]', + EUC_C => '[\xa1-\xfe][\xa1-\xfe]', + EUC_KANA => '\x8e[\xa1-\xdf]', + JIS_0208 => "$_0208{1978}|$_0208{1983}|$_0208{1990}", + JIS_0212 => "\e" . '\$\(D', + ISO_ASC => "\e" . '\([BJ]', + JIS_KANA => "\e" . '\(I', + '2022_KR' => "\e" . '\$\)C', + SJIS_C => '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]', + SJIS_KANA => '[\xa1-\xdf]', + UTF8 => '[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]' +); + +1; + +=head1 NAME + +Encode::CJKConstants.pm -- Internally used by Encode::??::ISO_2022_* + +=cut + diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CN.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CN.pm new file mode 100644 index 00000000000..9f120fb7f15 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CN.pm @@ -0,0 +1,78 @@ +package Encode::CN; +BEGIN { + if ( ord("A") == 193 ) { + die "Encode::CN not supported on EBCDIC\n"; + } +} +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +# Relocated from Encode.pm + +use Encode::CN::HZ; + +# use Encode::CN::2022_CN; + +1; +__END__ + +=head1 NAME + +Encode::CN - China-based Chinese Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $euc_cn = encode("euc-cn", $utf8); # loads Encode::CN implicitly + $utf8 = decode("euc-cn", $euc_cn); # ditto + +=head1 DESCRIPTION + +This module implements China-based Chinese charset encodings. +Encodings supported are as follows. + + Canonical Alias Description + -------------------------------------------------------------------- + euc-cn /\beuc.*cn$/i EUC (Extended Unix Character) + /\bcn.*euc$/i + /\bGB[-_ ]?2312(?:\D.*$|$)/i (see below) + gb2312-raw The raw (low-bit) GB2312 character map + gb12345-raw Traditional chinese counterpart to + GB2312 (raw) + iso-ir-165 GB2312 + GB6345 + GB8565 + additions + MacChineseSimp GB2312 + Apple Additions + cp936 Code Page 936, also known as GBK + (Extended GuoBiao) + hz 7-bit escaped GB2312 encoding + -------------------------------------------------------------------- + +To find how to use this module in detail, see L<Encode>. + +=head1 NOTES + +Due to size concerns, C<GB 18030> (an extension to C<GBK>) is distributed +separately on CPAN, under the name L<Encode::HanExtra>. That module +also contains extra Taiwan-based encodings. + +=head1 BUGS + +When you see C<charset=gb2312> on mails and web pages, they really +mean C<euc-cn> encodings. To fix that, C<gb2312> is aliased to C<euc-cn>. +Use C<gb2312-raw> when you really mean it. + +The ASCII region (0x00-0x7f) is preserved for all encodings, even though +this conflicts with mappings by the Unicode Consortium. See + +L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en> + +to find out why it is implemented that way. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CN/HZ.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CN/HZ.pm new file mode 100644 index 00000000000..98c16a9053a --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/CN/HZ.pm @@ -0,0 +1,198 @@ +package Encode::CN::HZ; + +use strict; +use warnings; +use utf8 (); + +use vars qw($VERSION); +$VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Encode qw(:fallbacks); + +use base qw(Encode::Encoding); +__PACKAGE__->Define('hz'); + +# HZ is a combination of ASCII and escaped GB, so we implement it +# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. + +# not ported for EBCDIC. Which should be used, "~" or "\x7E"? + +sub needs_lines { 1 } + +sub decode ($$;$) { + my ( $obj, $str, $chk ) = @_; + + my $GB = Encode::find_encoding('gb2312-raw'); + my $ret = ''; + my $in_ascii = 1; # default mode is ASCII. + + while ( length $str ) { + if ($in_ascii) { # ASCII mode + if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII + $ret .= $1; + + # EBCDIC should need ascii2native, but not ported. + } + elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde + $ret .= '~'; + } + elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII + 1; # no-op + } + elsif ( $str =~ s/^\x7E\x7B// ) { # '~{' + $in_ascii = 0; # to GB + } + else { # encounters an invalid escape, \x80 or greater + last; + } + } + else { # GB mode; the byte ranges are as in RFC 1843. + no warnings 'uninitialized'; + if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) { + $ret .= $GB->decode( $1, $chk ); + } + elsif ( $str =~ s/^\x7E\x7D// ) { # '~}' + $in_ascii = 1; + } + else { # invalid + last; + } + } + } + $_[1] = '' if $chk; # needs_lines guarantees no partial character + return $ret; +} + +sub cat_decode { + my ( $obj, undef, $src, $pos, $trm, $chk ) = @_; + my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ]; + + my $GB = Encode::find_encoding('gb2312-raw'); + my $ret = ''; + my $in_ascii = 1; # default mode is ASCII. + + my $ini_pos = pos($$rsrc); + + substr( $src, 0, $pos ) = ''; + + my $ini_len = bytes::length($src); + + # $trm is the first of the pair '~~', then 2nd tilde is to be removed. + # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? + $src =~ s/^\x7E// if $trm eq "\x7E"; + + while ( length $src ) { + my $now; + if ($in_ascii) { # ASCII mode + if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII + $now = $1; + } + elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde + $now = '~'; + } + elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII + next; + } + elsif ( $src =~ s/^\x7E\x7B// ) { # '~{' + $in_ascii = 0; # to GB + next; + } + else { # encounters an invalid escape, \x80 or greater + last; + } + } + else { # GB mode; the byte ranges are as in RFC 1843. + if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) { + $now = $GB->decode( $1, $chk ); + } + elsif ( $src =~ s/^\x7E\x7D// ) { # '~}' + $in_ascii = 1; + next; + } + else { # invalid + last; + } + } + + next if !defined $now; + + $ret .= $now; + + if ( $now eq $trm ) { + $$rdst .= $ret; + $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); + pos($$rsrc) = $ini_pos; + return 1; + } + } + + $$rdst .= $ret; + $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); + pos($$rsrc) = $ini_pos; + return ''; # terminator not found +} + +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; + + my $GB = Encode::find_encoding('gb2312-raw'); + my $ret = ''; + my $in_ascii = 1; # default mode is ASCII. + + no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. + + while ( length $str ) { + if ( $str =~ s/^([[:ascii:]]+)// ) { + my $tmp = $1; + $tmp =~ s/~/~~/g; # escapes tildes + if ( !$in_ascii ) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + $ret .= pack 'a*', $tmp; # remove UTF8 flag. + } + elsif ( $str =~ s/(.)// ) { + my $s = $1; + my $tmp = $GB->encode( $s, $chk ); + last if !defined $tmp; + if ( length $tmp == 2 ) { # maybe a valid GB char (XXX) + if ($in_ascii) { + $ret .= "\x7E\x7B"; # '~{' + $in_ascii = 0; + } + $ret .= $tmp; + } + elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX) + if ( !$in_ascii ) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + $ret .= $tmp; + } + } + else { # if $str is malformed UTF8 *and* if length $str != 0. + last; + } + } + $_[1] = $str if $chk; + + # The state at the end of the chunk is discarded, even if in GB mode. + # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". + # Parhaps it is harmless, but further investigations may be required... + + if ( !$in_ascii ) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120 + return $ret; +} + +1; +__END__ + +=head1 NAME + +Encode::CN::HZ -- internally used by Encode::CN + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Config.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Config.pm new file mode 100644 index 00000000000..1286a479998 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Config.pm @@ -0,0 +1,170 @@ +# +# Demand-load module list +# +package Encode::Config; +our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use strict; +use warnings; + +our %ExtModule = ( + + # Encode::Byte + #iso-8859-1 is in Encode.pm itself + 'iso-8859-2' => 'Encode::Byte', + 'iso-8859-3' => 'Encode::Byte', + 'iso-8859-4' => 'Encode::Byte', + 'iso-8859-5' => 'Encode::Byte', + 'iso-8859-6' => 'Encode::Byte', + 'iso-8859-7' => 'Encode::Byte', + 'iso-8859-8' => 'Encode::Byte', + 'iso-8859-9' => 'Encode::Byte', + 'iso-8859-10' => 'Encode::Byte', + 'iso-8859-11' => 'Encode::Byte', + 'iso-8859-13' => 'Encode::Byte', + 'iso-8859-14' => 'Encode::Byte', + 'iso-8859-15' => 'Encode::Byte', + 'iso-8859-16' => 'Encode::Byte', + 'koi8-f' => 'Encode::Byte', + 'koi8-r' => 'Encode::Byte', + 'koi8-u' => 'Encode::Byte', + 'viscii' => 'Encode::Byte', + 'cp424' => 'Encode::Byte', + 'cp437' => 'Encode::Byte', + 'cp737' => 'Encode::Byte', + 'cp775' => 'Encode::Byte', + 'cp850' => 'Encode::Byte', + 'cp852' => 'Encode::Byte', + 'cp855' => 'Encode::Byte', + 'cp856' => 'Encode::Byte', + 'cp857' => 'Encode::Byte', + 'cp858' => 'Encode::Byte', + 'cp860' => 'Encode::Byte', + 'cp861' => 'Encode::Byte', + 'cp862' => 'Encode::Byte', + 'cp863' => 'Encode::Byte', + 'cp864' => 'Encode::Byte', + 'cp865' => 'Encode::Byte', + 'cp866' => 'Encode::Byte', + 'cp869' => 'Encode::Byte', + 'cp874' => 'Encode::Byte', + 'cp1006' => 'Encode::Byte', + 'cp1250' => 'Encode::Byte', + 'cp1251' => 'Encode::Byte', + 'cp1252' => 'Encode::Byte', + 'cp1253' => 'Encode::Byte', + 'cp1254' => 'Encode::Byte', + 'cp1255' => 'Encode::Byte', + 'cp1256' => 'Encode::Byte', + 'cp1257' => 'Encode::Byte', + 'cp1258' => 'Encode::Byte', + 'AdobeStandardEncoding' => 'Encode::Byte', + 'MacArabic' => 'Encode::Byte', + 'MacCentralEurRoman' => 'Encode::Byte', + 'MacCroatian' => 'Encode::Byte', + 'MacCyrillic' => 'Encode::Byte', + 'MacFarsi' => 'Encode::Byte', + 'MacGreek' => 'Encode::Byte', + 'MacHebrew' => 'Encode::Byte', + 'MacIcelandic' => 'Encode::Byte', + 'MacRoman' => 'Encode::Byte', + 'MacRomanian' => 'Encode::Byte', + 'MacRumanian' => 'Encode::Byte', + 'MacSami' => 'Encode::Byte', + 'MacThai' => 'Encode::Byte', + 'MacTurkish' => 'Encode::Byte', + 'MacUkrainian' => 'Encode::Byte', + 'nextstep' => 'Encode::Byte', + 'hp-roman8' => 'Encode::Byte', + #'gsm0338' => 'Encode::Byte', + 'gsm0338' => 'Encode::GSM0338', + + # Encode::EBCDIC + 'cp37' => 'Encode::EBCDIC', + 'cp500' => 'Encode::EBCDIC', + 'cp875' => 'Encode::EBCDIC', + 'cp1026' => 'Encode::EBCDIC', + 'cp1047' => 'Encode::EBCDIC', + 'posix-bc' => 'Encode::EBCDIC', + + # Encode::Symbol + 'dingbats' => 'Encode::Symbol', + 'symbol' => 'Encode::Symbol', + 'AdobeSymbol' => 'Encode::Symbol', + 'AdobeZdingbat' => 'Encode::Symbol', + 'MacDingbats' => 'Encode::Symbol', + 'MacSymbol' => 'Encode::Symbol', + + # Encode::Unicode + 'UCS-2BE' => 'Encode::Unicode', + 'UCS-2LE' => 'Encode::Unicode', + 'UTF-16' => 'Encode::Unicode', + 'UTF-16BE' => 'Encode::Unicode', + 'UTF-16LE' => 'Encode::Unicode', + 'UTF-32' => 'Encode::Unicode', + 'UTF-32BE' => 'Encode::Unicode', + 'UTF-32LE' => 'Encode::Unicode', + 'UTF-7' => 'Encode::Unicode::UTF7', +); + +unless ( ord("A") == 193 ) { + %ExtModule = ( + %ExtModule, + 'euc-cn' => 'Encode::CN', + 'gb12345-raw' => 'Encode::CN', + 'gb2312-raw' => 'Encode::CN', + 'hz' => 'Encode::CN', + 'iso-ir-165' => 'Encode::CN', + 'cp936' => 'Encode::CN', + 'MacChineseSimp' => 'Encode::CN', + + '7bit-jis' => 'Encode::JP', + 'euc-jp' => 'Encode::JP', + 'iso-2022-jp' => 'Encode::JP', + 'iso-2022-jp-1' => 'Encode::JP', + 'jis0201-raw' => 'Encode::JP', + 'jis0208-raw' => 'Encode::JP', + 'jis0212-raw' => 'Encode::JP', + 'cp932' => 'Encode::JP', + 'MacJapanese' => 'Encode::JP', + 'shiftjis' => 'Encode::JP', + + 'euc-kr' => 'Encode::KR', + 'iso-2022-kr' => 'Encode::KR', + 'johab' => 'Encode::KR', + 'ksc5601-raw' => 'Encode::KR', + 'cp949' => 'Encode::KR', + 'MacKorean' => 'Encode::KR', + + 'big5-eten' => 'Encode::TW', + 'big5-hkscs' => 'Encode::TW', + 'cp950' => 'Encode::TW', + 'MacChineseTrad' => 'Encode::TW', + + #'big5plus' => 'Encode::HanExtra', + #'euc-tw' => 'Encode::HanExtra', + #'gb18030' => 'Encode::HanExtra', + + 'MIME-Header' => 'Encode::MIME::Header', + 'MIME-B' => 'Encode::MIME::Header', + 'MIME-Q' => 'Encode::MIME::Header', + + 'MIME-Header-ISO_2022_JP' => 'Encode::MIME::Header::ISO_2022_JP', + ); +} + +# +# Why not export ? to keep ConfigLocal Happy! +# +while ( my ( $enc, $mod ) = each %ExtModule ) { + $Encode::ExtModule{$enc} = $mod; +} + +1; +__END__ + +=head1 NAME + +Encode::Config -- internally used by Encode + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/EBCDIC.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/EBCDIC.pm new file mode 100644 index 00000000000..8024c130da1 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/EBCDIC.pm @@ -0,0 +1,45 @@ +package Encode::EBCDIC; +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +1; +__END__ + +=head1 NAME + +Encode::EBCDIC - EBCDIC Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $posix_bc = encode("posix-bc", $utf8); # loads Encode::EBCDIC implicitly + $utf8 = decode("", $posix_bc); # ditto + +=head1 ABSTRACT + +This module implements various EBCDIC-Based encodings. Encodings +supported are as follows. + + Canonical Alias Description + -------------------------------------------------------------------- + cp37 + cp500 + cp875 + cp1026 + cp1047 + posix-bc + +=head1 DESCRIPTION + +To find how to use this module in detail, see L<Encode>. + +=head1 SEE ALSO + +L<Encode>, L<perlebcdic> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Encoder.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Encoder.pm new file mode 100644 index 00000000000..f7194f81ac4 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Encoder.pm @@ -0,0 +1,253 @@ +# +# $Id: Encoder.pm,v 2.1 2006/05/03 18:24:10 dankogai Exp $ +# +package Encode::Encoder; +use strict; +use warnings; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +require Exporter; +our @ISA = qw(Exporter); +our @EXPORT_OK = qw ( encoder ); + +our $AUTOLOAD; +sub DEBUG () { 0 } +use Encode qw(encode decode find_encoding from_to); +use Carp; + +sub new { + my ( $class, $data, $encname ) = @_; + unless ($encname) { + $encname = Encode::is_utf8($data) ? 'utf8' : ''; + } + else { + my $obj = find_encoding($encname) + or croak __PACKAGE__, ": unknown encoding: $encname"; + $encname = $obj->name; + } + my $self = { + data => $data, + encoding => $encname, + }; + bless $self => $class; +} + +sub encoder { __PACKAGE__->new(@_) } + +sub data { + my ( $self, $data ) = @_; + if ( defined $data ) { + $self->{data} = $data; + return $data; + } + else { + return $self->{data}; + } +} + +sub encoding { + my ( $self, $encname ) = @_; + if ($encname) { + my $obj = find_encoding($encname) + or confess __PACKAGE__, ": unknown encoding: $encname"; + $self->{encoding} = $obj->name; + return $self; + } + else { + return $self->{encoding}; + } +} + +sub bytes { + my ( $self, $encname ) = @_; + $encname ||= $self->{encoding}; + my $obj = find_encoding($encname) + or confess __PACKAGE__, ": unknown encoding: $encname"; + $self->{data} = $obj->decode( $self->{data}, 1 ); + $self->{encoding} = ''; + return $self; +} + +sub DESTROY { # defined so it won't autoload. + DEBUG and warn shift; +} + +sub AUTOLOAD { + my $self = shift; + my $type = ref($self) + or confess "$self is not an object"; + my $myname = $AUTOLOAD; + $myname =~ s/.*://; # strip fully-qualified portion + my $obj = find_encoding($myname) + or confess __PACKAGE__, ": unknown encoding: $myname"; + DEBUG and warn $self->{encoding}, " => ", $obj->name; + if ( $self->{encoding} ) { + from_to( $self->{data}, $self->{encoding}, $obj->name, 1 ); + } + else { + $self->{data} = $obj->encode( $self->{data}, 1 ); + } + $self->{encoding} = $obj->name; + return $self; +} + +use overload + q("") => sub { $_[0]->{data} }, + q(0+) => sub { use bytes(); bytes::length( $_[0]->{data} ) }, + fallback => 1, + ; + +1; +__END__ + +=head1 NAME + +Encode::Encoder -- Object Oriented Encoder + +=head1 SYNOPSIS + + use Encode::Encoder; + # Encode::encode("ISO-8859-1", $data); + Encode::Encoder->new($data)->iso_8859_1; # OOP way + # shortcut + use Encode::Encoder qw(encoder); + encoder($data)->iso_8859_1; + # you can stack them! + encoder($data)->iso_8859_1->base64; # provided base64() is defined + # you can use it as a decoder as well + encoder($base64)->bytes('base64')->latin1; + # stringified + print encoder($data)->utf8->latin1; # prints the string in latin1 + # numified + encoder("\x{abcd}\x{ef}g")->utf8 == 6; # true. bytes::length($data) + +=head1 ABSTRACT + +B<Encode::Encoder> allows you to use Encode in an object-oriented +style. This is not only more intuitive than a functional approach, +but also handier when you want to stack encodings. Suppose you want +your UTF-8 string converted to Latin1 then Base64: you can simply say + + my $base64 = encoder($utf8)->latin1->base64; + +instead of + + my $latin1 = encode("latin1", $utf8); + my $base64 = encode_base64($utf8); + +or the lazier and more convoluted + + my $base64 = encode_base64(encode("latin1", $utf8)); + +=head1 Description + +Here is how to use this module. + +=over 4 + +=item * + +There are at least two instance variables stored in a hash reference, +{data} and {encoding}. + +=item * + +When there is no method, it takes the method name as the name of the +encoding and encodes the instance I<data> with I<encoding>. If successful, +the instance I<encoding> is set accordingly. + +=item * + +You can retrieve the result via -E<gt>data but usually you don't have to +because the stringify operator ("") is overridden to do exactly that. + +=back + +=head2 Predefined Methods + +This module predefines the methods below: + +=over 4 + +=item $e = Encode::Encoder-E<gt>new([$data, $encoding]); + +returns an encoder object. Its data is initialized with $data if +present, and its encoding is set to $encoding if present. + +When $encoding is omitted, it defaults to utf8 if $data is already in +utf8 or "" (empty string) otherwise. + +=item encoder() + +is an alias of Encode::Encoder-E<gt>new(). This one is exported on demand. + +=item $e-E<gt>data([$data]) + +When $data is present, sets the instance data to $data and returns the +object itself. Otherwise, the current instance data is returned. + +=item $e-E<gt>encoding([$encoding]) + +When $encoding is present, sets the instance encoding to $encoding and +returns the object itself. Otherwise, the current instance encoding is +returned. + +=item $e-E<gt>bytes([$encoding]) + +decodes instance data from $encoding, or the instance encoding if +omitted. If the conversion is successful, the instance encoding +will be set to "". + +The name I<bytes> was deliberately picked to avoid namespace tainting +-- this module may be used as a base class so method names that appear +in Encode::Encoding are avoided. + +=back + +=head2 Example: base64 transcoder + +This module is designed to work with L<Encode::Encoding>. +To make the Base64 transcoder example above really work, you could +write a module like this: + + package Encode::Base64; + use base 'Encode::Encoding'; + __PACKAGE__->Define('base64'); + use MIME::Base64; + sub encode{ + my ($obj, $data) = @_; + return encode_base64($data); + } + sub decode{ + my ($obj, $data) = @_; + return decode_base64($data); + } + 1; + __END__ + +And your caller module would be something like this: + + use Encode::Encoder; + use Encode::Base64; + + # now you can really do the following + + encoder($data)->iso_8859_1->base64; + encoder($base64)->bytes('base64')->latin1; + +=head2 Operator Overloading + +This module overloads two operators, stringify ("") and numify (0+). + +Stringify dumps the data inside the object. + +Numify returns the number of bytes in the instance data. + +They come in handy when you want to print or find the size of data. + +=head1 SEE ALSO + +L<Encode>, +L<Encode::Encoding> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Encoding.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Encoding.pm new file mode 100644 index 00000000000..768d6d126ac --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Encoding.pm @@ -0,0 +1,360 @@ +package Encode::Encoding; + +# Base class for classes which implement encodings +use strict; +use warnings; +our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +require Encode; + +sub DEBUG { 0 } + +sub Define { + my $obj = shift; + my $canonical = shift; + $obj = bless { Name => $canonical }, $obj unless ref $obj; + + # warn "$canonical => $obj\n"; + Encode::define_encoding( $obj, $canonical, @_ ); +} + +sub name { return shift->{'Name'} } + +sub mime_name{ + require Encode::MIME::Name; + return Encode::MIME::Name::get_mime_name(shift->name); +} + +# sub renew { return $_[0] } + +sub renew { + my $self = shift; + my $clone = bless {%$self} => ref($self); + $clone->{renewed}++; # so the caller can see it + DEBUG and warn $clone->{renewed}; + return $clone; +} + +sub renewed { return $_[0]->{renewed} || 0 } + +*new_sequence = \&renew; + +sub needs_lines { 0 } + +sub perlio_ok { + eval { require PerlIO::encoding }; + return $@ ? 0 : 1; +} + +# (Temporary|legacy) methods + +sub toUnicode { shift->decode(@_) } +sub fromUnicode { shift->encode(@_) } + +# +# Needs to be overloaded or just croak +# + +sub encode { + require Carp; + my $obj = shift; + my $class = ref($obj) ? ref($obj) : $obj; + Carp::croak( $class . "->encode() not defined!" ); +} + +sub decode { + require Carp; + my $obj = shift; + my $class = ref($obj) ? ref($obj) : $obj; + Carp::croak( $class . "->encode() not defined!" ); +} + +sub DESTROY { } + +1; +__END__ + +=head1 NAME + +Encode::Encoding - Encode Implementation Base Class + +=head1 SYNOPSIS + + package Encode::MyEncoding; + use base qw(Encode::Encoding); + + __PACKAGE__->Define(qw(myCanonical myAlias)); + +=head1 DESCRIPTION + +As mentioned in L<Encode>, encodings are (in the current +implementation at least) defined as objects. The mapping of encoding +name to object is via the C<%Encode::Encoding> hash. Though you can +directly manipulate this hash, it is strongly encouraged to use this +base class module and add encode() and decode() methods. + +=head2 Methods you should implement + +You are strongly encouraged to implement methods below, at least +either encode() or decode(). + +=over 4 + +=item -E<gt>encode($string [,$check]) + +MUST return the octet sequence representing I<$string>. + +=over 2 + +=item * + +If I<$check> is true, it SHOULD modify I<$string> in place to remove +the converted part (i.e. the whole string unless there is an error). +If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If an error occurs, it SHOULD return the octet sequence for the +fragment of string that has been converted and modify $string in-place +to remove the converted part leaving it starting with the problem +fragment. If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If I<$check> is is false then C<encode> MUST make a "best effort" to +convert the string - for example, by using a replacement character. + +=back + +=item -E<gt>decode($octets [,$check]) + +MUST return the string that I<$octets> represents. + +=over 2 + +=item * + +If I<$check> is true, it SHOULD modify I<$octets> in place to remove +the converted part (i.e. the whole sequence unless there is an +error). If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If an error occurs, it SHOULD return the fragment of string that has +been converted and modify $octets in-place to remove the converted +part leaving it starting with the problem fragment. If perlio_ok() is +true, SHOULD becomes MUST. + +=item * + +If I<$check> is false then C<decode> should make a "best effort" to +convert the string - for example by using Unicode's "\x{FFFD}" as a +replacement character. + +=back + +=back + +If you want your encoding to work with L<encoding> pragma, you should +also implement the method below. + +=over 4 + +=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check]) + +MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>. +Decoding will terminate when $terminator (a string) appears in output. +I<$offset> will be modified to the last $octets position at end of decode. +Returns true if $terminator appears output, else returns false. + +=back + +=head2 Other methods defined in Encode::Encodings + +You do not have to override methods shown below unless you have to. + +=over 4 + +=item -E<gt>name + +Predefined As: + + sub name { return shift->{'Name'} } + +MUST return the string representing the canonical name of the encoding. + +=item -E<gt>mime_name + +Predefined As: + + sub mime_name{ + require Encode::MIME::Name; + return Encode::MIME::Name::get_mime_name(shift->name); + } + +MUST return the string representing the IANA charset name of the encoding. + +=item -E<gt>renew + +Predefined As: + + sub renew { + my $self = shift; + my $clone = bless { %$self } => ref($self); + $clone->{renewed}++; + return $clone; + } + +This method reconstructs the encoding object if necessary. If you need +to store the state during encoding, this is where you clone your object. + +PerlIO ALWAYS calls this method to make sure it has its own private +encoding object. + +=item -E<gt>renewed + +Predefined As: + + sub renewed { $_[0]->{renewed} || 0 } + +Tells whether the object is renewed (and how many times). Some +modules emit C<Use of uninitialized value in null operation> warning +unless the value is numeric so return 0 for false. + +=item -E<gt>perlio_ok() + +Predefined As: + + sub perlio_ok { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + } + +If your encoding does not support PerlIO for some reasons, just; + + sub perlio_ok { 0 } + +=item -E<gt>needs_lines() + +Predefined As: + + sub needs_lines { 0 }; + +If your encoding can work with PerlIO but needs line buffering, you +MUST define this method so it returns true. 7bit ISO-2022 encodings +are one example that needs this. When this method is missing, false +is assumed. + +=back + +=head2 Example: Encode::ROT13 + + package Encode::ROT13; + use strict; + use base qw(Encode::Encoding); + + __PACKAGE__->Define('rot13'); + + sub encode($$;$){ + my ($obj, $str, $chk) = @_; + $str =~ tr/A-Za-z/N-ZA-Mn-za-m/; + $_[1] = '' if $chk; # this is what in-place edit means + return $str; + } + + # Jr pna or ynml yvxr guvf; + *decode = \&encode; + + 1; + +=head1 Why the heck Encode API is different? + +It should be noted that the I<$check> behaviour is different from the +outer public API. The logic is that the "unchecked" case is useful +when the encoding is part of a stream which may be reporting errors +(e.g. STDERR). In such cases, it is desirable to get everything +through somehow without causing additional errors which obscure the +original one. Also, the encoding is best placed to know what the +correct replacement character is, so if that is the desired behaviour +then letting low level code do it is the most efficient. + +By contrast, if I<$check> is true, the scheme above allows the +encoding to do as much as it can and tell the layer above how much +that was. What is lacking at present is a mechanism to report what +went wrong. The most likely interface will be an additional method +call to the object, or perhaps (to avoid forcing per-stream objects +on otherwise stateless encodings) an additional parameter. + +It is also highly desirable that encoding classes inherit from +C<Encode::Encoding> as a base class. This allows that class to define +additional behaviour for all encoding objects. + + package Encode::MyEncoding; + use base qw(Encode::Encoding); + + __PACKAGE__->Define(qw(myCanonical myAlias)); + +to create an object with C<< bless {Name => ...}, $class >>, and call +define_encoding. They inherit their C<name> method from +C<Encode::Encoding>. + +=head2 Compiled Encodings + +For the sake of speed and efficiency, most of the encodings are now +supported via a I<compiled form>: XS modules generated from UCM +files. Encode provides the enc2xs tool to achieve that. Please see +L<enc2xs> for more details. + +=head1 SEE ALSO + +L<perlmod>, L<enc2xs> + +=begin future + +=over 4 + +=item Scheme 1 + +The fixup routine gets passed the remaining fragment of string being +processed. It modifies it in place to remove bytes/characters it can +understand and returns a string used to represent them. For example: + + sub fixup { + my $ch = substr($_[0],0,1,''); + return sprintf("\x{%02X}",ord($ch); + } + +This scheme is close to how the underlying C code for Encode works, +but gives the fixup routine very little context. + +=item Scheme 2 + +The fixup routine gets passed the original string, an index into +it of the problem area, and the output string so far. It appends +what it wants to the output string and returns a new index into the +original string. For example: + + sub fixup { + # my ($s,$i,$d) = @_; + my $ch = substr($_[0],$_[1],1); + $_[2] .= sprintf("\x{%02X}",ord($ch); + return $_[1]+1; + } + +This scheme gives maximal control to the fixup routine but is more +complicated to code, and may require that the internals of Encode be tweaked to +keep the original string intact. + +=item Other Schemes + +Hybrids of the above. + +Multiple return values rather than in-place modifications. + +Index into the string could be C<pos($str)> allowing C<s/\G...//>. + +=back + +=end future + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/GSM0338.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/GSM0338.pm new file mode 100644 index 00000000000..5e57056e399 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/GSM0338.pm @@ -0,0 +1,292 @@ +# +# $Id: GSM0338.pm,v 2.1 2008/05/07 20:56:05 dankogai Exp dankogai $ +# +package Encode::GSM0338; + +use strict; +use warnings; +use Carp; + +use vars qw($VERSION); +$VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Encode qw(:fallbacks); + +use base qw(Encode::Encoding); +__PACKAGE__->Define('gsm0338'); + +sub needs_lines { 1 } +sub perlio_ok { 0 } + +use utf8; +our %UNI2GSM = ( + "\x{0040}" => "\x00", # COMMERCIAL AT + "\x{000A}" => "\x0A", # LINE FEED + "\x{000C}" => "\x1B\x0A", # FORM FEED + "\x{000D}" => "\x0D", # CARRIAGE RETURN + "\x{0020}" => "\x20", # SPACE + "\x{0021}" => "\x21", # EXCLAMATION MARK + "\x{0022}" => "\x22", # QUOTATION MARK + "\x{0023}" => "\x23", # NUMBER SIGN + "\x{0024}" => "\x02", # DOLLAR SIGN + "\x{0025}" => "\x25", # PERCENT SIGN + "\x{0026}" => "\x26", # AMPERSAND + "\x{0027}" => "\x27", # APOSTROPHE + "\x{0028}" => "\x28", # LEFT PARENTHESIS + "\x{0029}" => "\x29", # RIGHT PARENTHESIS + "\x{002A}" => "\x2A", # ASTERISK + "\x{002B}" => "\x2B", # PLUS SIGN + "\x{002C}" => "\x2C", # COMMA + "\x{002D}" => "\x2D", # HYPHEN-MINUS + "\x{002E}" => "\x2E", # FULL STOP + "\x{002F}" => "\x2F", # SOLIDUS + "\x{0030}" => "\x30", # DIGIT ZERO + "\x{0031}" => "\x31", # DIGIT ONE + "\x{0032}" => "\x32", # DIGIT TWO + "\x{0033}" => "\x33", # DIGIT THREE + "\x{0034}" => "\x34", # DIGIT FOUR + "\x{0035}" => "\x35", # DIGIT FIVE + "\x{0036}" => "\x36", # DIGIT SIX + "\x{0037}" => "\x37", # DIGIT SEVEN + "\x{0038}" => "\x38", # DIGIT EIGHT + "\x{0039}" => "\x39", # DIGIT NINE + "\x{003A}" => "\x3A", # COLON + "\x{003B}" => "\x3B", # SEMICOLON + "\x{003C}" => "\x3C", # LESS-THAN SIGN + "\x{003D}" => "\x3D", # EQUALS SIGN + "\x{003E}" => "\x3E", # GREATER-THAN SIGN + "\x{003F}" => "\x3F", # QUESTION MARK + "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A + "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B + "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C + "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D + "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E + "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F + "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G + "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H + "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I + "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J + "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K + "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L + "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M + "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N + "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O + "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P + "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q + "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R + "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S + "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T + "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U + "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V + "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W + "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X + "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y + "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z + "\x{005F}" => "\x11", # LOW LINE + "\x{0061}" => "\x61", # LATIN SMALL LETTER A + "\x{0062}" => "\x62", # LATIN SMALL LETTER B + "\x{0063}" => "\x63", # LATIN SMALL LETTER C + "\x{0064}" => "\x64", # LATIN SMALL LETTER D + "\x{0065}" => "\x65", # LATIN SMALL LETTER E + "\x{0066}" => "\x66", # LATIN SMALL LETTER F + "\x{0067}" => "\x67", # LATIN SMALL LETTER G + "\x{0068}" => "\x68", # LATIN SMALL LETTER H + "\x{0069}" => "\x69", # LATIN SMALL LETTER I + "\x{006A}" => "\x6A", # LATIN SMALL LETTER J + "\x{006B}" => "\x6B", # LATIN SMALL LETTER K + "\x{006C}" => "\x6C", # LATIN SMALL LETTER L + "\x{006D}" => "\x6D", # LATIN SMALL LETTER M + "\x{006E}" => "\x6E", # LATIN SMALL LETTER N + "\x{006F}" => "\x6F", # LATIN SMALL LETTER O + "\x{0070}" => "\x70", # LATIN SMALL LETTER P + "\x{0071}" => "\x71", # LATIN SMALL LETTER Q + "\x{0072}" => "\x72", # LATIN SMALL LETTER R + "\x{0073}" => "\x73", # LATIN SMALL LETTER S + "\x{0074}" => "\x74", # LATIN SMALL LETTER T + "\x{0075}" => "\x75", # LATIN SMALL LETTER U + "\x{0076}" => "\x76", # LATIN SMALL LETTER V + "\x{0077}" => "\x77", # LATIN SMALL LETTER W + "\x{0078}" => "\x78", # LATIN SMALL LETTER X + "\x{0079}" => "\x79", # LATIN SMALL LETTER Y + "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z + "\x{000C}" => "\x1B\x0A", # FORM FEED + "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET + "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS + "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET + "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT + "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET + "\x{007C}" => "\x1B\x40", # VERTICAL LINE + "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET + "\x{007E}" => "\x1B\x3D", # TILDE + "\x{00A0}" => "\x1B", # NO-BREAK SPACE + "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK + "\x{00A3}" => "\x01", # POUND SIGN + "\x{00A4}" => "\x24", # CURRENCY SIGN + "\x{00A5}" => "\x03", # YEN SIGN + "\x{00A7}" => "\x5F", # SECTION SIGN + "\x{00BF}" => "\x60", # INVERTED QUESTION MARK + "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS + "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE + "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE + "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE + "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE + "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS + "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE + "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS + "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S + "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE + "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS + "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE + "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE + "\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA + "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE + "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE + "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE + "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE + "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE + "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS + "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE + "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE + "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS + "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA + "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA + "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA + "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA + "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI + "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI + "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA + "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI + "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI + "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA + "\x{20AC}" => "\x1B\x65", # EURO SIGN +); +our %GSM2UNI = reverse %UNI2GSM; +our $ESC = "\x1b"; +our $ATMARK = "\x40"; +our $FBCHAR = "\x3F"; +our $NBSP = "\x{00A0}"; + +#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" + +sub decode ($$;$) { + my ( $obj, $bytes, $chk ) = @_; + my $str; + while ( length $bytes ) { + my $c = substr( $bytes, 0, 1, '' ); + my $u; + if ( $c eq "\x00" ) { + my $c2 = substr( $bytes, 0, 1, '' ); + $u = + !length $c2 ? $ATMARK + : $c2 eq "\x00" ? "\x{0000}" + : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2} + : $chk + ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", + ord($c), ord($c2) ) + : $ATMARK . $FBCHAR; + + } + elsif ( $c eq $ESC ) { + my $c2 = substr( $bytes, 0, 1, '' ); + $u = + exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 } + : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2} + : $chk + ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", + ord($c), ord($c2) ) + : $NBSP . $FBCHAR; + } + else { + $u = + exists $GSM2UNI{$c} + ? $GSM2UNI{$c} + : $chk ? ref $chk eq 'CODE' + ? $chk->( ord $c ) + : croak sprintf( "\\x%02X does not map to Unicode", ord($c) ) + : $FBCHAR; + } + $str .= $u; + } + $_[1] = $bytes if $chk; + return $str; +} + +#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" + +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; + my $bytes; + while ( length $str ) { + my $u = substr( $str, 0, 1, '' ); + my $c; + $bytes .= + exists $UNI2GSM{$u} + ? $UNI2GSM{$u} + : $chk ? ref $chk eq 'CODE' + ? $chk->( ord($u) ) + : croak sprintf( "\\x{%04x} does not map to %s", + ord($u), $obj->name ) + : $FBCHAR; + } + $_[1] = $str if $chk; + return $bytes; +} + +1; +__END__ + +=head1 NAME + +Encode::GSM0338 -- ESTI GSM 03.38 Encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly + $utf8 = decode("gsm0338", $gsm0338); # ditto + +=head1 DESCRIPTION + +GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, +control character ranges and other parts are mapped very differently, +mainly to store Greek characters. There are also escape sequences +(starting with 0x1B) to cover e.g. the Euro sign. + +This was once handled by L<Encode::Bytes> but because of all those +unusual specifications, Encode 2.20 has relocated the support to +this module. + +=head1 NOTES + +Unlike most other encodings, the following aways croaks on error +for any $chk that evaluates to true. + + $gsm0338 = encode("gsm0338", $utf8 $chk); + $utf8 = decode("gsm0338", $gsm0338, $chk); + +So if you want to check the validity of the encoding, surround the +expression with C<eval {}> block as follows; + + eval { + $utf8 = decode("gsm0338", $gsm0338, $chk); + }; + if ($@){ + # handle exception here + } + +=head1 BUGS + +ESTI GSM 03.38 Encoding itself. + +Mapping \x00 to '@' causes too much pain everywhere. + +Its use of \x1b (escape) is also very questionable. + +Because of those two, the code paging approach used use in ucm-based +Encoding SOMETIMES fails so this module was written. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Guess.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Guess.pm new file mode 100644 index 00000000000..1bc4df777d8 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Guess.pm @@ -0,0 +1,355 @@ +package Encode::Guess; +use strict; +use warnings; +use Encode qw(:fallbacks find_encoding); +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +my $Canon = 'Guess'; +sub DEBUG () { 0 } +our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); +$Encode::Encoding{$Canon} = bless { + Name => $Canon, + Suspects => {%DEF_SUSPECTS}, +} => __PACKAGE__; + +use base qw(Encode::Encoding); +sub needs_lines { 1 } +sub perlio_ok { 0 } + +our @EXPORT = qw(guess_encoding); +our $NoUTFAutoGuess = 0; +our $UTF8_BOM = pack( "C3", 0xef, 0xbb, 0xbf ); + +sub import { # Exporter not used so we do it on our own + my $callpkg = caller; + for my $item (@EXPORT) { + no strict 'refs'; + *{"$callpkg\::$item"} = \&{"$item"}; + } + set_suspects(@_); +} + +sub set_suspects { + my $class = shift; + my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; + $self->{Suspects} = {%DEF_SUSPECTS}; + $self->add_suspects(@_); +} + +sub add_suspects { + my $class = shift; + my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; + for my $c (@_) { + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $self->{Suspects}{ $e->name } = $e; + DEBUG and warn "Added: ", $e->name; + } +} + +sub decode($$;$) { + my ( $obj, $octet, $chk ) = @_; + my $guessed = guess( $obj, $octet ); + unless ( ref($guessed) ) { + require Carp; + Carp::croak($guessed); + } + my $utf8 = $guessed->decode( $octet, $chk ); + $_[1] = $octet if $chk; + return $utf8; +} + +sub guess_encoding { + guess( $Encode::Encoding{$Canon}, @_ ); +} + +sub guess { + my $class = shift; + my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; + my $octet = shift; + + # sanity check + return unless defined $octet and length $octet; + + # cheat 0: utf8 flag; + if ( Encode::is_utf8($octet) ) { + return find_encoding('utf8') unless $NoUTFAutoGuess; + Encode::_utf8_off($octet); + } + + # cheat 1: BOM + use Encode::Unicode; + unless ($NoUTFAutoGuess) { + my $BOM = pack( 'C3', unpack( "C3", $octet ) ); + return find_encoding('utf8') + if ( defined $BOM and $BOM eq $UTF8_BOM ); + $BOM = unpack( 'N', $octet ); + return find_encoding('UTF-32') + if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) ); + $BOM = unpack( 'n', $octet ); + return find_encoding('UTF-16') + if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) ); + if ( $octet =~ /\x00/o ) + { # if \x00 found, we assume UTF-(16|32)(BE|LE) + my $utf; + my ( $be, $le ) = ( 0, 0 ); + if ( $octet =~ /\x00\x00/o ) { # UTF-32(BE|LE) assumed + $utf = "UTF-32"; + for my $char ( unpack( 'N*', $octet ) ) { + $char & 0x0000ffff and $be++; + $char & 0xffff0000 and $le++; + } + } + else { # UTF-16(BE|LE) assumed + $utf = "UTF-16"; + for my $char ( unpack( 'n*', $octet ) ) { + $char & 0x00ff and $be++; + $char & 0xff00 and $le++; + } + } + DEBUG and warn "$utf, be == $be, le == $le"; + $be == $le + and return + "Encodings ambiguous between $utf BE and LE ($be, $le)"; + $utf .= ( $be > $le ) ? 'BE' : 'LE'; + return find_encoding($utf); + } + } + my %try = %{ $obj->{Suspects} }; + for my $c (@_) { + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $try{ $e->name } = $e; + DEBUG and warn "Added: ", $e->name; + } + my $nline = 1; + for my $line ( split /\r\n?|\n/, $octet ) { + + # cheat 2 -- \e in the string + if ( $line =~ /\e/o ) { + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys) { + ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k}; + } + } + my %ok = %try; + + # warn join(",", keys %try); + for my $k ( keys %try ) { + my $scratch = $line; + $try{$k}->decode( $scratch, FB_QUIET ); + if ( $scratch eq '' ) { + DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k ); + } + else { + use bytes (); + DEBUG + and warn sprintf( "%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch) ); + delete $ok{$k}; + } + } + %ok or return "No appropriate encodings found!"; + if ( scalar( keys(%ok) ) == 1 ) { + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; + $nline++; + } + $try{ascii} + or return "Encodings too ambiguous: ", join( " or ", keys %try ); + return $try{ascii}; +} + +1; +__END__ + +=head1 NAME + +Encode::Guess -- Guesses encoding from data + +=head1 SYNOPSIS + + # if you are sure $data won't contain anything bogus + + use Encode; + use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; + my $utf8 = decode("Guess", $data); + my $data = encode("Guess", $utf8); # this doesn't work! + + # more elaborate way + use Encode::Guess; + my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); + ref($enc) or die "Can't guess: $enc"; # trap error this way + $utf8 = $enc->decode($data); + # or + $utf8 = decode($enc->name, $data) + +=head1 ABSTRACT + +Encode::Guess enables you to guess in what encoding a given data is +encoded, or at least tries to. + +=head1 DESCRIPTION + +By default, it checks only ascii, utf8 and UTF-16/32 with BOM. + + use Encode::Guess; # ascii/utf8/BOMed UTF + +To use it more practically, you have to give the names of encodings to +check (I<suspects> as follows). The name of suspects can either be +canonical names or aliases. + +CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED. + + # tries all major Japanese Encodings as well + use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; + +If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true +value, no heuristics will be applied to UTF8/16/32, and the result +will be limited to the suspects and C<ascii>. + +=over 4 + +=item Encode::Guess->set_suspects + +You can also change the internal suspects list via C<set_suspects> +method. + + use Encode::Guess; + Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); + +=item Encode::Guess->add_suspects + +Or you can use C<add_suspects> method. The difference is that +C<set_suspects> flushes the current suspects list while +C<add_suspects> adds. + + use Encode::Guess; + Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); + # now the suspects are euc-jp,shiftjis,7bit-jis, AND + # euc-kr,euc-cn, and big5-eten + Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); + +=item Encode::decode("Guess" ...) + +When you are content with suspects list, you can now + + my $utf8 = Encode::decode("Guess", $data); + +=item Encode::Guess->guess($data) + +But it will croak if: + +=over + +=item * + +Two or more suspects remain + +=item * + +No suspects left + +=back + +So you should instead try this; + + my $decoder = Encode::Guess->guess($data); + +On success, $decoder is an object that is documented in +L<Encode::Encoding>. So you can now do this; + + my $utf8 = $decoder->decode($data); + +On failure, $decoder now contains an error message so the whole thing +would be as follows; + + my $decoder = Encode::Guess->guess($data); + die $decoder unless ref($decoder); + my $utf8 = $decoder->decode($data); + +=item guess_encoding($data, [, I<list of suspects>]) + +You can also try C<guess_encoding> function which is exported by +default. It takes $data to check and it also takes the list of +suspects by option. The optional suspect list is I<not reflected> to +the internal suspects list. + + my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); + die $decoder unless ref($decoder); + my $utf8 = $decoder->decode($data); + # check only ascii and utf8 + my $decoder = guess_encoding($data); + +=back + +=head1 CAVEATS + +=over 4 + +=item * + +Because of the algorithm used, ISO-8859 series and other single-byte +encodings do not work well unless either one of ISO-8859 is the only +one suspect (besides ascii and utf8). + + use Encode::Guess; + # perhaps ok + my $decoder = guess_encoding($data, 'latin1'); + # definitely NOT ok + my $decoder = guess_encoding($data, qw/latin1 greek/); + +The reason is that Encode::Guess guesses encoding by trial and error. +It first splits $data into lines and tries to decode the line for each +suspect. It keeps it going until all but one encoding is eliminated +out of suspects list. ISO-8859 series is just too successful for most +cases (because it fills almost all code points in \x00-\xff). + +=item * + +Do not mix national standard encodings and the corresponding vendor +encodings. + + # a very bad idea + my $decoder + = guess_encoding($data, qw/shiftjis MacJapanese cp932/); + +The reason is that vendor encoding is usually a superset of national +standard so it becomes too ambiguous for most cases. + +=item * + +On the other hand, mixing various national standard encodings +automagically works unless $data is too short to allow for guessing. + + # This is ok if $data is long enough + my $decoder = + guess_encoding($data, qw/euc-cn + euc-jp shiftjis 7bit-jis + euc-kr + big5-eten/); + +=item * + +DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! + + my $decoder = guess_encoding($data, + Encode->encodings(":all")); + +=back + +It is, after all, just a guess. You should alway be explicit when it +comes to encodings. But there are some, especially Japanese, +environment that guess-coding is a must. Use this module with care. + +=head1 TO DO + +Encode::Guess does not work on EBCDIC platforms. + +=head1 SEE ALSO + +L<Encode>, L<Encode::Encoding> + +=cut + diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP.pm new file mode 100644 index 00000000000..e78e54d0522 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP.pm @@ -0,0 +1,99 @@ +package Encode::JP; +BEGIN { + if ( ord("A") == 193 ) { + die "Encode::JP not supported on EBCDIC\n"; + } +} +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +use Encode::JP::JIS7; + +1; +__END__ + +=head1 NAME + +Encode::JP - Japanese Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $euc_jp = encode("euc-jp", $utf8); # loads Encode::JP implicitly + $utf8 = decode("euc-jp", $euc_jp); # ditto + +=head1 ABSTRACT + +This module implements Japanese charset encodings. Encodings +supported are as follows. + + Canonical Alias Description + -------------------------------------------------------------------- + euc-jp /\beuc.*jp$/i EUC (Extended Unix Character) + /\bjp.*euc/i + /\bujis$/i + shiftjis /\bshift.*jis$/i Shift JIS (aka MS Kanji) + /\bsjis$/i + 7bit-jis /\bjis$/i 7bit JIS + iso-2022-jp ISO-2022-JP [RFC1468] + = 7bit JIS with all Halfwidth Kana + converted to Fullwidth + iso-2022-jp-1 ISO-2022-JP-1 [RFC2237] + = ISO-2022-JP with JIS X 0212-1990 + support. See below + MacJapanese Shift JIS + Apple vendor mappings + cp932 /\bwindows-31j$/i Code Page 932 + = Shift JIS + MS/IBM vendor mappings + jis0201-raw JIS0201, raw format + jis0208-raw JIS0201, raw format + jis0212-raw JIS0201, raw format + -------------------------------------------------------------------- + +=head1 DESCRIPTION + +To find out how to use this module in detail, see L<Encode>. + +=head1 Note on ISO-2022-JP(-1)? + +ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which +adds support for JIS X 0212-1990. That means you can use the same +code to decode to utf8 but not vice versa. + + $utf8 = decode('iso-2022-jp-1', $stream); + +and + + $utf8 = decode('iso-2022-jp', $stream); + +yield the same result but + + $with_0212 = encode('iso-2022-jp-1', $utf8); + +is now different from + + $without_0212 = encode('iso-2022-jp', $utf8 ); + +In the latter case, characters that map to 0212 are first converted +to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu' or +'geta mark') then fed to the decoding engine. U+FFFD is not used, +in order to preserve text layout as much as possible. + +=head1 BUGS + +The ASCII region (0x00-0x7f) is preserved for all encodings, even +though this conflicts with mappings by the Unicode Consortium. See + +L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en> + +to find out why it is implemented that way. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP/H2Z.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP/H2Z.pm new file mode 100644 index 00000000000..f8e2230da11 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP/H2Z.pm @@ -0,0 +1,176 @@ +# +# $Id: H2Z.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $ +# + +package Encode::JP::H2Z; + +use strict; +use warnings; + +our $RCSID = q$Id: H2Z.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Encode::CJKConstants qw(:all); + +use vars qw(%_D2Z $_PAT_D2Z + %_Z2D $_PAT_Z2D + %_H2Z $_PAT_H2Z + %_Z2H $_PAT_Z2H); + +%_H2Z = ( + "\x8e\xa1" => "\xa1\xa3", #¡£ + "\x8e\xa2" => "\xa1\xd6", #¡Ö + "\x8e\xa3" => "\xa1\xd7", #¡× + "\x8e\xa4" => "\xa1\xa2", #¡¢ + "\x8e\xa5" => "\xa1\xa6", #¡¦ + "\x8e\xa6" => "\xa5\xf2", #¥ò + "\x8e\xa7" => "\xa5\xa1", #¥¡ + "\x8e\xa8" => "\xa5\xa3", #¥£ + "\x8e\xa9" => "\xa5\xa5", #¥¥ + "\x8e\xaa" => "\xa5\xa7", #¥§ + "\x8e\xab" => "\xa5\xa9", #¥© + "\x8e\xac" => "\xa5\xe3", #¥ã + "\x8e\xad" => "\xa5\xe5", #¥å + "\x8e\xae" => "\xa5\xe7", #¥ç + "\x8e\xaf" => "\xa5\xc3", #¥Ã + "\x8e\xb0" => "\xa1\xbc", #¡¼ + "\x8e\xb1" => "\xa5\xa2", #¥¢ + "\x8e\xb2" => "\xa5\xa4", #¥¤ + "\x8e\xb3" => "\xa5\xa6", #¥¦ + "\x8e\xb4" => "\xa5\xa8", #¥¨ + "\x8e\xb5" => "\xa5\xaa", #¥ª + "\x8e\xb6" => "\xa5\xab", #¥« + "\x8e\xb7" => "\xa5\xad", #¥ + "\x8e\xb8" => "\xa5\xaf", #¥¯ + "\x8e\xb9" => "\xa5\xb1", #¥± + "\x8e\xba" => "\xa5\xb3", #¥³ + "\x8e\xbb" => "\xa5\xb5", #¥µ + "\x8e\xbc" => "\xa5\xb7", #¥· + "\x8e\xbd" => "\xa5\xb9", #¥¹ + "\x8e\xbe" => "\xa5\xbb", #¥» + "\x8e\xbf" => "\xa5\xbd", #¥½ + "\x8e\xc0" => "\xa5\xbf", #¥¿ + "\x8e\xc1" => "\xa5\xc1", #¥Á + "\x8e\xc2" => "\xa5\xc4", #¥Ä + "\x8e\xc3" => "\xa5\xc6", #¥Æ + "\x8e\xc4" => "\xa5\xc8", #¥È + "\x8e\xc5" => "\xa5\xca", #¥Ê + "\x8e\xc6" => "\xa5\xcb", #¥Ë + "\x8e\xc7" => "\xa5\xcc", #¥Ì + "\x8e\xc8" => "\xa5\xcd", #¥Í + "\x8e\xc9" => "\xa5\xce", #¥Î + "\x8e\xca" => "\xa5\xcf", #¥Ï + "\x8e\xcb" => "\xa5\xd2", #¥Ò + "\x8e\xcc" => "\xa5\xd5", #¥Õ + "\x8e\xcd" => "\xa5\xd8", #¥Ø + "\x8e\xce" => "\xa5\xdb", #¥Û + "\x8e\xcf" => "\xa5\xde", #¥Þ + "\x8e\xd0" => "\xa5\xdf", #¥ß + "\x8e\xd1" => "\xa5\xe0", #¥à + "\x8e\xd2" => "\xa5\xe1", #¥á + "\x8e\xd3" => "\xa5\xe2", #¥â + "\x8e\xd4" => "\xa5\xe4", #¥ä + "\x8e\xd5" => "\xa5\xe6", #¥æ + "\x8e\xd6" => "\xa5\xe8", #¥è + "\x8e\xd7" => "\xa5\xe9", #¥é + "\x8e\xd8" => "\xa5\xea", #¥ê + "\x8e\xd9" => "\xa5\xeb", #¥ë + "\x8e\xda" => "\xa5\xec", #¥ì + "\x8e\xdb" => "\xa5\xed", #¥í + "\x8e\xdc" => "\xa5\xef", #¥ï + "\x8e\xdd" => "\xa5\xf3", #¥ó + "\x8e\xde" => "\xa1\xab", #¡« + "\x8e\xdf" => "\xa1\xac", #¡¬ +); + +%_D2Z = ( + "\x8e\xb6\x8e\xde" => "\xa5\xac", #¥¬ + "\x8e\xb7\x8e\xde" => "\xa5\xae", #¥® + "\x8e\xb8\x8e\xde" => "\xa5\xb0", #¥° + "\x8e\xb9\x8e\xde" => "\xa5\xb2", #¥² + "\x8e\xba\x8e\xde" => "\xa5\xb4", #¥´ + "\x8e\xbb\x8e\xde" => "\xa5\xb6", #¥¶ + "\x8e\xbc\x8e\xde" => "\xa5\xb8", #¥¸ + "\x8e\xbd\x8e\xde" => "\xa5\xba", #¥º + "\x8e\xbe\x8e\xde" => "\xa5\xbc", #¥¼ + "\x8e\xbf\x8e\xde" => "\xa5\xbe", #¥¾ + "\x8e\xc0\x8e\xde" => "\xa5\xc0", #¥À + "\x8e\xc1\x8e\xde" => "\xa5\xc2", #¥Â + "\x8e\xc2\x8e\xde" => "\xa5\xc5", #¥Å + "\x8e\xc3\x8e\xde" => "\xa5\xc7", #¥Ç + "\x8e\xc4\x8e\xde" => "\xa5\xc9", #¥É + "\x8e\xca\x8e\xde" => "\xa5\xd0", #¥Ð + "\x8e\xcb\x8e\xde" => "\xa5\xd3", #¥Ó + "\x8e\xcc\x8e\xde" => "\xa5\xd6", #¥Ö + "\x8e\xcd\x8e\xde" => "\xa5\xd9", #¥Ù + "\x8e\xce\x8e\xde" => "\xa5\xdc", #¥Ü + "\x8e\xca\x8e\xdf" => "\xa5\xd1", #¥Ñ + "\x8e\xcb\x8e\xdf" => "\xa5\xd4", #¥Ô + "\x8e\xcc\x8e\xdf" => "\xa5\xd7", #¥× + "\x8e\xcd\x8e\xdf" => "\xa5\xda", #¥Ú + "\x8e\xce\x8e\xdf" => "\xa5\xdd", #¥Ý + "\x8e\xb3\x8e\xde" => "\xa5\xf4", #¥ô +); + +# init only once; + +#$_PAT_D2Z = join("|", keys %_D2Z); +#$_PAT_H2Z = join("|", keys %_H2Z); + +%_Z2H = reverse %_H2Z; +%_Z2D = reverse %_D2Z; + +#$_PAT_Z2H = join("|", keys %_Z2H); +#$_PAT_Z2D = join("|", keys %_Z2D); + +sub h2z { + no warnings qw(uninitialized); + my $r_str = shift; + my ($keep_dakuten) = @_; + my $n = 0; + unless ($keep_dakuten) { + $n = ( + $$r_str =~ s( + ($RE{EUC_KANA} + (?:\x8e[\xde\xdf])?) + ){ + my $str = $1; + $_D2Z{$str} || $_H2Z{$str} || + # in case dakuten and handakuten are side-by-side! + $_H2Z{substr($str,0,2)} . $_H2Z{substr($str,2,2)}; + }eogx + ); + } + else { + $n = ( + $$r_str =~ s( + ($RE{EUC_KANA}) + ){ + $_H2Z{$1}; + }eogx + ); + } + $n; +} + +sub z2h { + my $r_str = shift; + my $n = ( + $$r_str =~ s( + ($RE{EUC_C}|$RE{EUC_0212}|$RE{EUC_KANA}) + ){ + $_Z2D{$1} || $_Z2H{$1} || $1; + }eogx + ); + $n; +} + +1; +__END__ + + +=head1 NAME + +Encode::JP::H2Z -- internally used by Encode::JP::2022_JP* + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP/JIS7.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP/JIS7.pm new file mode 100644 index 00000000000..bb048fdf4ac --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/JP/JIS7.pm @@ -0,0 +1,164 @@ +package Encode::JP::JIS7; +use strict; +use warnings; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Encode qw(:fallbacks); + +for my $name ( '7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1' ) { + my $h2z = ( $name eq '7bit-jis' ) ? 0 : 1; + my $jis0212 = ( $name eq 'iso-2022-jp' ) ? 0 : 1; + + $Encode::Encoding{$name} = bless { + Name => $name, + h2z => $h2z, + jis0212 => $jis0212, + } => __PACKAGE__; +} + +use base qw(Encode::Encoding); + +# we override this to 1 so PerlIO works +sub needs_lines { 1 } + +use Encode::CJKConstants qw(:all); + +# +# decode is identical for all 2022 variants +# + +sub decode($$;$) { + my ( $obj, $str, $chk ) = @_; + my $residue = ''; + if ($chk) { + $str =~ s/([^\x00-\x7f].*)$//so and $residue = $1; + } + $residue .= jis_euc( \$str ); + $_[1] = $residue if $chk; + return Encode::decode( 'euc-jp', $str, FB_PERLQQ ); +} + +# +# encode is different +# + +sub encode($$;$) { + require Encode::JP::H2Z; + my ( $obj, $utf8, $chk ) = @_; + + # empty the input string in the stack so perlio is ok + $_[1] = '' if $chk; + my ( $h2z, $jis0212 ) = @$obj{qw(h2z jis0212)}; + my $octet = Encode::encode( 'euc-jp', $utf8, $chk ); + $h2z and &Encode::JP::H2Z::h2z( \$octet ); + euc_jis( \$octet, $jis0212 ); + return $octet; +} + +# +# cat_decode +# +my $re_scan_jis_g = qr{ + \G ( ($RE{JIS_0212}) | $RE{JIS_0208} | + ($RE{ISO_ASC}) | ($RE{JIS_KANA}) | ) + ([^\e]*) +}x; + +sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk) + my ( $obj, undef, undef, $pos, $trm ) = @_; # currently ignores $chk + my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; + local ${^ENCODING}; + use bytes; + my $opos = pos($$rsrc); + pos($$rsrc) = $pos; + while ( $$rsrc =~ /$re_scan_jis_g/gc ) { + my ( $esc, $esc_0212, $esc_asc, $esc_kana, $chunk ) = + ( $1, $2, $3, $4, $5 ); + + unless ($chunk) { $esc or last; next; } + + if ( $esc && !$esc_asc ) { + $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; + if ($esc_kana) { + $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; + } + elsif ($esc_0212) { + $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; + } + $chunk = Encode::decode( 'euc-jp', $chunk, 0 ); + } + elsif ( ( my $npos = index( $chunk, $trm ) ) >= 0 ) { + $$rdst .= substr( $chunk, 0, $npos + length($trm) ); + $$rpos += length($esc) + $npos + length($trm); + pos($$rsrc) = $opos; + return 1; + } + $$rdst .= $chunk; + $$rpos = pos($$rsrc); + } + $$rpos = pos($$rsrc); + pos($$rsrc) = $opos; + return ''; +} + +# JIS<->EUC +my $re_scan_jis = qr{ + (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*) +}x; + +sub jis_euc { + local ${^ENCODING}; + my $r_str = shift; + $$r_str =~ s($re_scan_jis) + { + my ($esc_0212, $esc_asc, $esc_kana, $chunk) = + ($1, $2, $3, $4); + if (!$esc_asc) { + $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; + if ($esc_kana) { + $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; + } + elsif ($esc_0212) { + $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; + } + } + $chunk; + }geox; + my ($residue) = ( $$r_str =~ s/(\e.*)$//so ); + return $residue; +} + +sub euc_jis { + no warnings qw(uninitialized); + my $r_str = shift; + my $jis0212 = shift; + $$r_str =~ s{ + ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) + }{ + my $chunk = $1; + my $esc = + ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} : + ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} : + $ESC{JIS_0208}; + if ($esc eq $ESC{JIS_0212} && !$jis0212){ + # fallback to '?' + $chunk =~ tr/\xA1-\xFE/\x3F/; + }else{ + $chunk =~ tr/\xA1-\xFE/\x21-\x7E/; + } + $esc . $chunk . $ESC{ASC}; + }geox; + $$r_str =~ s/\Q$ESC{ASC}\E + (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; + $$r_str; +} + +1; +__END__ + + +=head1 NAME + +Encode::JP::JIS7 -- internally used by Encode::JP + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/KR.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/KR.pm new file mode 100644 index 00000000000..8cb2c63b162 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/KR.pm @@ -0,0 +1,73 @@ +package Encode::KR; +BEGIN { + if ( ord("A") == 193 ) { + die "Encode::KR not supported on EBCDIC\n"; + } +} +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +use Encode::KR::2022_KR; + +1; +__END__ + +=head1 NAME + +Encode::KR - Korean Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $euc_kr = encode("euc-kr", $utf8); # loads Encode::KR implicitly + $utf8 = decode("euc-kr", $euc_kr); # ditto + +=head1 DESCRIPTION + +This module implements Korean charset encodings. Encodings supported +are as follows. + + + Canonical Alias Description + -------------------------------------------------------------------- + euc-kr /\beuc.*kr$/i EUC (Extended Unix Character) + /\bkr.*euc$/i + ksc5601-raw Korean standard code set (as is) + cp949 /(?:x-)?uhc$/i + /(?:x-)?windows-949$/i + /\bks_c_5601-1987$/i + Code Page 949 (EUC-KR + 8,822 + (additional Hangul syllables) + MacKorean EUC-KR + Apple Vendor Mappings + johab JOHAB A supplementary encoding defined in + Annex 3 of KS X 1001:1998 + iso-2022-kr iso-2022-kr [RFC1557] + -------------------------------------------------------------------- + +To find how to use this module in detail, see L<Encode>. + +=head1 BUGS + +When you see C<charset=ks_c_5601-1987> on mails and web pages, they really +mean "cp949" encodings. To fix that, the following aliases are set; + + qr/(?:x-)?uhc$/i => '"cp949"' + qr/(?:x-)?windows-949$/i => '"cp949"' + qr/ks_c_5601-1987$/i => '"cp949"' + +The ASCII region (0x00-0x7f) is preserved for all encodings, even +though this conflicts with mappings by the Unicode Consortium. See + +L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en> + +to find out why it is implemented that way. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/KR/2022_KR.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/KR/2022_KR.pm new file mode 100644 index 00000000000..0218d971ee2 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/KR/2022_KR.pm @@ -0,0 +1,81 @@ +package Encode::KR::2022_KR; +use strict; +use warnings; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Encode qw(:fallbacks); + +use base qw(Encode::Encoding); +__PACKAGE__->Define('iso-2022-kr'); + +sub needs_lines { 1 } + +sub perlio_ok { + return 0; # for the time being +} + +sub decode { + my ( $obj, $str, $chk ) = @_; + my $res = $str; + my $residue = iso_euc( \$res ); + + # This is for PerlIO + $_[1] = $residue if $chk; + return Encode::decode( 'euc-kr', $res, FB_PERLQQ ); +} + +sub encode { + my ( $obj, $utf8, $chk ) = @_; + + # empty the input string in the stack so perlio is ok + $_[1] = '' if $chk; + my $octet = Encode::encode( 'euc-kr', $utf8, FB_PERLQQ ); + euc_iso( \$octet ); + return $octet; +} + +use Encode::CJKConstants qw(:all); + +# ISO<->EUC + +sub iso_euc { + my $r_str = shift; + $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator + $$r_str =~ s{ # replace characters in GL + \x0e # between SO(\x0e) and SI(\x0f) + ([^\x0f]*) # with characters in GR + \x0f + } + { + my $out= $1; + $out =~ tr/\x21-\x7e/\xa1-\xfe/; + $out; + }geox; + my ($residue) = ( $$r_str =~ s/(\e.*)$//so ); + return $residue; +} + +sub euc_iso { + no warnings qw(uninitialized); + my $r_str = shift; + substr( $$r_str, 0, 0 ) = + $ESC{'2022_KR'}; # put the designator at the beg. + $$r_str =~ + s{ # move KS X 1001 characters in GR to GL + ($RE{EUC_C}+) # and enclose them with SO and SI + }{ + my $str = $1; + $str =~ tr/\xA1-\xFE/\x21-\x7E/; + "\x0e" . $str . "\x0f"; + }geox; + $$r_str; +} + +1; +__END__ + +=head1 NAME + +Encode::KR::2022_KR -- internally used by Encode::KR + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Header.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Header.pm new file mode 100644 index 00000000000..b664d888b3f --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Header.pm @@ -0,0 +1,242 @@ +package Encode::MIME::Header; +use strict; +use warnings; +no warnings 'redefine'; + +our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +use Encode qw(find_encoding encode_utf8 decode_utf8); +use MIME::Base64; +use Carp; + +my %seed = ( + decode_b => '1', # decodes 'B' encoding ? + decode_q => '1', # decodes 'Q' encoding ? + encode => 'B', # encode with 'B' or 'Q' ? + bpl => 75, # bytes per line +); + +$Encode::Encoding{'MIME-Header'} = + bless { %seed, Name => 'MIME-Header', } => __PACKAGE__; + +$Encode::Encoding{'MIME-B'} = bless { + %seed, + decode_q => 0, + Name => 'MIME-B', +} => __PACKAGE__; + +$Encode::Encoding{'MIME-Q'} = bless { + %seed, + decode_q => 1, + encode => 'Q', + Name => 'MIME-Q', +} => __PACKAGE__; + +use base qw(Encode::Encoding); + +sub needs_lines { 1 } +sub perlio_ok { 0 } + +sub decode($$;$) { + use utf8; + my ( $obj, $str, $chk ) = @_; + + # zap spaces between encoded words + $str =~ s/\?=\s+=\?/\?==\?/gos; + + # multi-line header to single line + $str =~ s/(:?\r|\n|\r\n)[ \t]//gos; + + 1 while ( $str =~ + s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ ) + ; # Concat consecutive QP encoded mime headers + # Fixes breaking inside multi-byte characters + + $str =~ s{ + =\? # begin encoded word + ([0-9A-Za-z\-_]+) # charset (encoding) + (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) + \?([QqBb])\? # delimiter + (.*?) # Base64-encodede contents + \?= # end encoded word + }{ + if (uc($2) eq 'B'){ + $obj->{decode_b} or croak qq(MIME "B" unsupported); + decode_b($1, $3); + }elsif(uc($2) eq 'Q'){ + $obj->{decode_q} or croak qq(MIME "Q" unsupported); + decode_q($1, $3); + }else{ + croak qq(MIME "$2" encoding is nonexistent!); + } + }egox; + $_[1] = '' if $chk; + return $str; +} + +sub decode_b { + my $enc = shift; + my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); + my $db64 = decode_base64(shift); + return $d->name eq 'utf8' + ? Encode::decode_utf8($db64) + : $d->decode( $db64, Encode::FB_PERLQQ ); +} + +sub decode_q { + my ( $enc, $q ) = @_; + my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); + $q =~ s/_/ /go; + $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; + return $d->name eq 'utf8' + ? Encode::decode_utf8($q) + : $d->decode( $q, Encode::FB_PERLQQ ); +} + +my $especials = + join( '|' => map { quotemeta( chr($_) ) } + unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) ); + +my $re_encoded_word = qr{ + (?: + =\? # begin encoded word + (?:[0-9A-Za-z\-_]+) # charset (encoding) + (?:\*\w+(?:-\w+)*)? # language (RFC 2231) + \?(?:[QqBb])\? # delimiter + (?:.*?) # Base64-encodede contents + \?= # end encoded word + ) + }xo; + +my $re_especials = qr{$re_encoded_word|$especials}xo; + +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; + my @line = (); + for my $line ( split /\r|\n|\r\n/o, $str ) { + my ( @word, @subline ); + for my $word ( split /($re_especials)/o, $line ) { + if ( $word =~ /[^\x00-\x7f]/o + or $word =~ /^$re_encoded_word$/o ) + { + push @word, $obj->_encode($word); + } + else { + push @word, $word; + } + } + my $subline = ''; + for my $word (@word) { + use bytes (); + if ( bytes::length($subline) + bytes::length($word) > + $obj->{bpl} ) + { + push @subline, $subline; + $subline = ''; + } + $subline .= $word; + } + $subline and push @subline, $subline; + push @line, join( "\n " => @subline ); + } + $_[1] = '' if $chk; + return join( "\n", @line ); +} + +use constant HEAD => '=?UTF-8?'; +use constant TAIL => '?='; +use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; + +sub _encode { + my ( $o, $str ) = @_; + my $enc = $o->{encode}; + my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) ); + + # to coerce a floating-point arithmetics, the following contains + # .0 in numbers -- dankogai + $llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0; + my @result = (); + my $chunk = ''; + while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) { + use bytes (); + if ( bytes::length($chunk) + bytes::length($chr) > $llen ) { + push @result, SINGLE->{$enc}($chunk); + $chunk = ''; + } + $chunk .= $chr; + } + $chunk and push @result, SINGLE->{$enc}($chunk); + return @result; +} + +sub _encode_b { + HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL; +} + +sub _encode_q { + my $chunk = shift; + $chunk = encode_utf8($chunk); + $chunk =~ s{ + ([^0-9A-Za-z]) + }{ + join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) + }egox; + return HEAD . 'Q?' . $chunk . TAIL; +} + +1; +__END__ + +=head1 NAME + +Encode::MIME::Header -- MIME 'B' and 'Q' header encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $utf8 = decode('MIME-Header', $header); + $header = encode('MIME-Header', $utf8); + +=head1 ABSTRACT + +This module implements RFC 2047 Mime Header Encoding. There are 3 +variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The +difference is described below + + decode() encode() + ---------------------------------------------- + MIME-Header Both B and Q =?UTF-8?B?....?= + MIME-B B only; Q croaks =?UTF-8?B?....?= + MIME-Q Q only; B croaks =?UTF-8?Q?....?= + +=head1 DESCRIPTION + +When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> +is extracted and decoded for I<X> encoding (B for Base64, Q for +Quoted-Printable). Then the decoded chunk is fed to +decode(I<encoding>). So long as I<encoding> is supported by Encode, +any source encoding is fine. + +When you encode, it just encodes UTF-8 string with I<X> encoding then +quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to +encode are left as is and long lines are folded within 76 bytes per +line. + +=head1 BUGS + +It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? +and =?ISO-8859-1?= but that makes the implementation too complicated. +These days major mail agents all support =?UTF-8? so I think it is +just good enough. + +Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by +Makamaka. Thre are still too many MUAs especially cellular phone +handsets which does not grok UTF-8. + +=head1 SEE ALSO + +L<Encode> + +RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other +locations. + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Header/ISO_2022_JP.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Header/ISO_2022_JP.pm new file mode 100644 index 00000000000..4abfbd05ef8 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Header/ISO_2022_JP.pm @@ -0,0 +1,131 @@ +package Encode::MIME::Header::ISO_2022_JP; + +use strict; +use warnings; + +use base qw(Encode::MIME::Header); + +$Encode::Encoding{'MIME-Header-ISO_2022_JP'} = + bless { encode => 'B', bpl => 76, Name => 'MIME-Header-ISO_2022_JP' } => + __PACKAGE__; + +use constant HEAD => '=?ISO-2022-JP?B?'; +use constant TAIL => '?='; + +use Encode::CJKConstants qw(%RE); + +our $VERSION = do { my @r = ( q$Revision: 1.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +# I owe the below codes totally to +# Jcode by Dan Kogai & http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 + +sub encode { + my $self = shift; + my $str = shift; + + utf8::encode($str) if ( Encode::is_utf8($str) ); + Encode::from_to( $str, 'utf8', 'euc-jp' ); + + my ($trailing_crlf) = ( $str =~ /(\n|\r|\x0d\x0a)$/o ); + + $str = _mime_unstructured_header( $str, $self->{bpl} ); + + not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; + + return $str; +} + +sub _mime_unstructured_header { + my ( $oldheader, $bpl ) = @_; + my $crlf = $oldheader =~ /\n$/; + my ( $header, @words, @wordstmp, $i ) = (''); + + $oldheader =~ s/\s+$//; + + @wordstmp = split /\s+/, $oldheader; + + for ( $i = 0 ; $i < $#wordstmp ; $i++ ) { + if ( $wordstmp[$i] !~ /^[\x21-\x7E]+$/ + and $wordstmp[ $i + 1 ] !~ /^[\x21-\x7E]+$/ ) + { + $wordstmp[ $i + 1 ] = "$wordstmp[$i] $wordstmp[$i + 1]"; + } + else { + push( @words, $wordstmp[$i] ); + } + } + + push( @words, $wordstmp[-1] ); + + for my $word (@words) { + if ( $word =~ /^[\x21-\x7E]+$/ ) { + $header =~ /(?:.*\n)*(.*)/; + if ( length($1) + length($word) > $bpl ) { + $header .= "\n $word"; + } + else { + $header .= $word; + } + } + else { + $header = _add_encoded_word( $word, $header, $bpl ); + } + + $header =~ /(?:.*\n)*(.*)/; + + if ( length($1) == $bpl ) { + $header .= "\n "; + } + else { + $header .= ' '; + } + } + + $header =~ s/\n? $//mg; + + $crlf ? "$header\n" : $header; +} + +sub _add_encoded_word { + my ( $str, $line, $bpl ) = @_; + my $result = ''; + + while ( length($str) ) { + my $target = $str; + $str = ''; + + if ( + length($line) + 22 + + ( $target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o ) * 8 > $bpl ) + { + $line =~ s/[ \t\n\r]*$/\n/; + $result .= $line; + $line = ' '; + } + + while (1) { + my $iso_2022_jp = $target; + Encode::from_to( $iso_2022_jp, 'euc-jp', 'iso-2022-jp' ); + + my $encoded = + HEAD . MIME::Base64::encode_base64( $iso_2022_jp, '' ) . TAIL; + + if ( length($encoded) + length($line) > $bpl ) { + $target =~ + s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; + $str = $1 . $str; + } + else { + $line .= $encoded; + last; + } + } + + } + + $result . $line; +} + +1; +__END__ + diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Name.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Name.pm new file mode 100644 index 00000000000..10d86a746dc --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/MIME/Name.pm @@ -0,0 +1,94 @@ +package Encode::MIME::Name; +use strict; +use warnings; +our $VERSION = do { my @r = ( q$Revision: 1.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +our %MIME_NAME_OF = ( + 'AdobeStandardEncoding' => 'Adobe-Standard-Encoding', + 'AdobeSymbol' => 'Adobe-Symbol-Encoding', + 'ascii' => 'US-ASCII', + 'big5-hkscs' => 'Big5-HKSCS', + 'cp1026' => 'IBM1026', + 'cp1047' => 'IBM1047', + 'cp1250' => 'windows-1250', + 'cp1251' => 'windows-1251', + 'cp1252' => 'windows-1252', + 'cp1253' => 'windows-1253', + 'cp1254' => 'windows-1254', + 'cp1255' => 'windows-1255', + 'cp1256' => 'windows-1256', + 'cp1257' => 'windows-1257', + 'cp1258' => 'windows-1258', + 'cp37' => 'IBM037', + 'cp424' => 'IBM424', + 'cp437' => 'IBM437', + 'cp500' => 'IBM500', + 'cp775' => 'IBM775', + 'cp850' => 'IBM850', + 'cp852' => 'IBM852', + 'cp855' => 'IBM855', + 'cp857' => 'IBM857', + 'cp860' => 'IBM860', + 'cp861' => 'IBM861', + 'cp862' => 'IBM862', + 'cp863' => 'IBM863', + 'cp864' => 'IBM864', + 'cp865' => 'IBM865', + 'cp866' => 'IBM866', + 'cp869' => 'IBM869', + 'cp936' => 'GBK', + 'euc-jp' => 'EUC-JP', + 'euc-kr' => 'EUC-KR', + #'gb2312-raw' => 'GB2312', # no, you're wrong, I18N::Charset + 'hp-roman8' => 'hp-roman8', + 'hz' => 'HZ-GB-2312', + 'iso-2022-jp' => 'ISO-2022-JP', + 'iso-2022-jp-1' => 'ISO-2022-JP', + 'iso-2022-kr' => 'ISO-2022-KR', + 'iso-8859-1' => 'ISO-8859-1', + 'iso-8859-10' => 'ISO-8859-10', + 'iso-8859-13' => 'ISO-8859-13', + 'iso-8859-14' => 'ISO-8859-14', + 'iso-8859-15' => 'ISO-8859-15', + 'iso-8859-16' => 'ISO-8859-16', + 'iso-8859-2' => 'ISO-8859-2', + 'iso-8859-3' => 'ISO-8859-3', + 'iso-8859-4' => 'ISO-8859-4', + 'iso-8859-5' => 'ISO-8859-5', + 'iso-8859-6' => 'ISO-8859-6', + 'iso-8859-7' => 'ISO-8859-7', + 'iso-8859-8' => 'ISO-8859-8', + 'iso-8859-9' => 'ISO-8859-9', + #'jis0201-raw' => 'JIS_X0201', + #'jis0208-raw' => 'JIS_C6226-1983', + #'jis0212-raw' => 'JIS_X0212-1990', + 'koi8-r' => 'KOI8-R', + 'koi8-u' => 'KOI8-U', + #'ksc5601-raw' => 'KS_C_5601-1987', + 'shiftjis' => 'Shift_JIS', + 'UTF-16' => 'UTF-16', + 'UTF-16BE' => 'UTF-16BE', + 'UTF-16LE' => 'UTF-16LE', + 'UTF-32' => 'UTF-32', + 'UTF-32BE' => 'UTF-32BE', + 'UTF-32LE' => 'UTF-32LE', + 'UTF-7' => 'UTF-7', + 'utf8' => 'UTF-8', + 'utf-8-strict' => 'UTF-8', + 'viscii' => 'VISCII', +); + +sub get_mime_name($) { $MIME_NAME_OF{$_[0]} }; + +1; +__END__ + +=head1 NAME + +Encode::MIME::NAME -- internally used by Encode + +=head1 SEE ALSO + +L<I18N::Charset> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Symbol.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Symbol.pm new file mode 100644 index 00000000000..77031aad75c --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Symbol.pm @@ -0,0 +1,44 @@ +package Encode::Symbol; +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +1; +__END__ + +=head1 NAME + +Encode::Symbol - Symbol Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $symbol = encode("symbol", $utf8); # loads Encode::Symbol implicitly + $utf8 = decode("", $symbol); # ditto + +=head1 ABSTRACT + +This module implements symbol and dingbats encodings. Encodings +supported are as follows. + + Canonical Alias Description + -------------------------------------------------------------------- + symbol + dingbats + AdobeZDingbat + AdobeSymbol + MacDingbats + +=head1 DESCRIPTION + +To find out how to use this module in detail, see L<Encode>. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/TW.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/TW.pm new file mode 100644 index 00000000000..ffaa8442040 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/TW.pm @@ -0,0 +1,79 @@ +package Encode::TW; +BEGIN { + if ( ord("A") == 193 ) { + die "Encode::TW not supported on EBCDIC\n"; + } +} +use strict; +use warnings; +use Encode; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +1; +__END__ + +=head1 NAME + +Encode::TW - Taiwan-based Chinese Encodings + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $big5 = encode("big5", $utf8); # loads Encode::TW implicitly + $utf8 = decode("big5", $big5); # ditto + +=head1 DESCRIPTION + +This module implements tradition Chinese charset encodings as used +in Taiwan and Hong Kong. +Encodings supported are as follows. + + Canonical Alias Description + -------------------------------------------------------------------- + big5-eten /\bbig-?5$/i Big5 encoding (with ETen extensions) + /\bbig5-?et(en)?$/i + /\btca-?big5$/i + big5-hkscs /\bbig5-?hk(scs)?$/i + /\bhk(scs)?-?big5$/i + Big5 + Cantonese characters in Hong Kong + MacChineseTrad Big5 + Apple Vendor Mappings + cp950 Code Page 950 + = Big5 + Microsoft vendor mappings + -------------------------------------------------------------------- + +To find out how to use this module in detail, see L<Encode>. + +=head1 NOTES + +Due to size concerns, C<EUC-TW> (Extended Unix Character), C<CCCII> +(Chinese Character Code for Information Interchange), C<BIG5PLUS> +(CMEX's Big5+) and C<BIG5EXT> (CMEX's Big5e) are distributed separately +on CPAN, under the name L<Encode::HanExtra>. That module also contains +extra China-based encodings. + +=head1 BUGS + +Since the original C<big5> encoding (1984) is not supported anywhere +(glibc and DOS-based systems uses C<big5> to mean C<big5-eten>; Microsoft +uses C<big5> to mean C<cp950>), a conscious decision was made to alias +C<big5> to C<big5-eten>, which is the de facto superset of the original +big5. + +The C<CNS11643> encoding files are not complete. For common C<CNS11643> +manipulation, please use C<EUC-TW> in L<Encode::HanExtra>, which contains +planes 1-7. + +The ASCII region (0x00-0x7f) is preserved for all encodings, even +though this conflicts with mappings by the Unicode Consortium. See + +L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en> + +to find out why it is implemented that way. + +=head1 SEE ALSO + +L<Encode> + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Unicode.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Unicode.pm new file mode 100644 index 00000000000..cdfe02d673c --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Unicode.pm @@ -0,0 +1,270 @@ +package Encode::Unicode; + +use strict; +use warnings; +no warnings 'redefine'; + +our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use XSLoader; +XSLoader::load( __PACKAGE__, $VERSION ); + +# +# Object Generator 8 transcoders all at once! +# + +require Encode; + +our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32); + +for my $name ( + qw(UTF-16 UTF-16BE UTF-16LE + UTF-32 UTF-32BE UTF-32LE + UCS-2BE UCS-2LE) + ) +{ + my ( $size, $endian, $ucs2, $mask ); + $name =~ /^(\w+)-(\d+)(\w*)$/o; + if ( $ucs2 = ( $1 eq 'UCS' ) ) { + $size = 2; + } + else { + $size = $2 / 8; + } + $endian = ( $3 eq 'BE' ) ? 'n' : ( $3 eq 'LE' ) ? 'v' : ''; + $size == 4 and $endian = uc($endian); + + $Encode::Encoding{$name} = bless { + Name => $name, + size => $size, + endian => $endian, + ucs2 => $ucs2, + } => __PACKAGE__; +} + +use base qw(Encode::Encoding); + +sub renew { + my $self = shift; + $BOM_Unknown{ $self->name } or return $self; + my $clone = bless {%$self} => ref($self); + $clone->{renewed}++; # so the caller knows it is renewed. + return $clone; +} + +# There used to be a perl implemntation of (en|de)code but with +# XS version is ripe, perl version is zapped for optimal speed + +*decode = \&decode_xs; +*encode = \&encode_xs; + +1; +__END__ + +=head1 NAME + +Encode::Unicode -- Various Unicode Transformation Formats + +=cut + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $ucs2 = encode("UCS-2BE", $utf8); + $utf8 = decode("UCS-2BE", $ucs2); + +=head1 ABSTRACT + +This module implements all Character Encoding Schemes of Unicode that +are officially documented by Unicode Consortium (except, of course, +for UTF-8, which is a native format in perl). + +=over 4 + +=item L<http://www.unicode.org/glossary/> says: + +I<Character Encoding Scheme> A character encoding form plus byte +serialization. There are Seven character encoding schemes in Unicode: +UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and +UTF-32LE (UCS-4LE), and UTF-7. + +Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, It is not part of +Unicode's Character Encoding Scheme. It is separately implemented in +Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>. + +=item Quick Reference + + Decodes from ord(N) Encodes chr(N) to... + octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == + ---------------+-----------------+------------------------------ + UCS-2BE 2 N N is bogus Not Available + UCS-2LE 2 N N bogus Not Available + UTF-16 2/4 Y Y is S.P S.P BE/LE + UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd + UTF-16LE 2/4 N Y S.P S.P 0x2ad8,0xcddf + UTF-32 4 Y - is bogus As is BE/LE + UTF-32BE 4 N - bogus As is 0x0001abcd + UTF-32LE 4 N - bogus As is 0xcdab0100 + UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d + ---------------+-----------------+------------------------------ + +=back + +=head1 Size, Endianness, and BOM + +You can categorize these CES by 3 criteria: size of each character, +endianness, and Byte Order Mark. + +=head2 by size + +UCS-2 is a fixed-length encoding with each character taking 16 bits. +It B<does not> support I<surrogate pairs>. When a surrogate pair +is encountered during decode(), its place is filled with \x{FFFD} +if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a +character whose ord value is larger than 0xFFFF is encountered, +its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine +croaks if I<CHECK> is 1. + +UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>. +When it encounters a high surrogate (0xD800-0xDBFF), it fetches the +following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to +form a character. Bogus surrogates result in death. When \x{10000} +or above is encountered during encode(), it C<ensurrogate>s them and +pushes the surrogate pair to the output stream. + +UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits. +Since it is 32-bit, there is no need for I<surrogate pairs>. + +=head2 by endianness + +The first (and now failed) goal of Unicode was to map all character +repertoires into a fixed-length integer so that programmers are happy. +Since each character is either a I<short> or I<long> in C, you have to +pay attention to the endianness of each platform when you pass data +to one another. + +Anything marked as BE is Big Endian (or network byte order) and LE is +Little Endian (aka VAX byte order). For anything not marked either +BE or LE, a character called Byte Order Mark (BOM) indicating the +endianness is prepended to the string. + +CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless +and as of this writing Encode suite just leave it as is (\x{FeFF}). + +=over 4 + +=item BOM as integer when fetched in network byte order + + 16 32 bits/char + ------------------------- + BE 0xFeFF 0x0000FeFF + LE 0xFFFe 0xFFFe0000 + ------------------------- + +=back + +This modules handles the BOM as follows. + +=over 4 + +=item * + +When BE or LE is explicitly stated as the name of encoding, BOM is +simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE). + +=item * + +When BE or LE is omitted during decode(), it checks if BOM is at the +beginning of the string; if one is found, the endianness is set to +what the BOM says. If no BOM is found, the routine dies. + +=item * + +When BE or LE is omitted during encode(), it returns a BE-encoded +string with BOM prepended. So when you want to encode a whole text +file, make sure you encode() the whole text at once, not line by line +or each line, not file, will have a BOM prepended. + +=item * + +C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE. +UCS-2 is already registered by IANA and others that way. + +=back + +=head1 Surrogate Pairs + +To say the least, surrogate pairs were the biggest mistake of the +Unicode Consortium. But according to the late Douglas Adams in I<The +Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the +Universe was created. This has made a lot of people very angry and +been widely regarded as a bad move>. Their mistake was not of this +magnitude so let's forgive them. + +(I don't dare make any comparison with Unicode Consortium and the +Vogons here ;) Or, comparing Encode to Babel Fish is completely +appropriate -- if you can only stick this into your ear :) + +Surrogate pairs were born when the Unicode Consortium finally +admitted that 16 bits were not big enough to hold all the world's +character repertoires. But they already made UCS-2 16-bit. What +do we do? + +Back then, the range 0xD800-0xDFFF was not allocated. Let's split +that range in half and use the first half to represent the C<upper +half of a character> and the second half to represent the C<lower +half of a character>. That way, you can represent 1024 * 1024 = +1048576 more characters. Now we can store character ranges up to +\x{10ffff} even with 16-bit encodings. This pair of half-character is +now called a I<surrogate pair> and UTF-16 is the name of the encoding +that embraces them. + +Here is a formula to ensurrogate a Unicode character \x{10000} and +above; + + $hi = ($uni - 0x10000) / 0x400 + 0xD800; + $lo = ($uni - 0x10000) % 0x400 + 0xDC00; + +And to desurrogate; + + $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); + +Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but +perl does not prohibit the use of characters within this range. To perl, +every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>. + + (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit + integer support! + +=head1 Error Checking + +Unlike most encodings which accept various ways to handle errors, +Unicode encodings simply croaks. + + % perl -MEncode -e '$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ + -e 'Encode::from_to($_, "utf16","shift_jis", 0); print' + UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184. + % perl -MEncode -e '$a = "BOM missing"' \ + -e ' Encode::from_to($a, "utf16", "shift_jis", 0); print' + UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184. + +Unlike other encodings where mappings are not one-to-one against +Unicode, UTFs are supposed to map 100% against one another. So Encode +is more strict on UTFs. + +Consider that "division by zero" of Encode :) + +=head1 SEE ALSO + +L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>, +L<http://www.unicode.org/unicode/faq/utf_bom.html>, + +RFC 2781 L<http://rfc.net/rfc2781.html>, + +The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html> + +Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)> +by Larry Wall, Tom Christiansen, Jon Orwant; +O'Reilly & Associates; ISBN 0-596-00027-8 + +=cut diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Unicode/UTF7.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Unicode/UTF7.pm new file mode 100644 index 00000000000..6ee46195854 --- /dev/null +++ b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/Encode/Unicode/UTF7.pm @@ -0,0 +1,128 @@ +# +# $Id: UTF7.pm,v 2.4 2006/06/03 20:28:48 dankogai Exp $ +# +package Encode::Unicode::UTF7; +use strict; +use warnings; +no warnings 'redefine'; +use base qw(Encode::Encoding); +__PACKAGE__->Define('UTF-7'); +our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +use MIME::Base64; +use Encode; + +# +# Algorithms taken from Unicode::String by Gisle Aas +# + +our $OPTIONAL_DIRECT_CHARS = 1; +my $specials = quotemeta "\'(),-./:?"; +$OPTIONAL_DIRECT_CHARS + and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}"; + +# \s will not work because it matches U+3000 DEOGRAPHIC SPACE +# We use qr/[\n\r\t\ ] instead +my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/; +my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/; +my $e_utf16 = find_encoding("UTF-16BE"); + +sub needs_lines { 1 } + +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; + my $len = length($str); + pos($str) = 0; + my $bytes = ''; + while ( pos($str) < $len ) { + if ( $str =~ /\G($re_asis+)/ogc ) { + $bytes .= $1; + } + elsif ( $str =~ /\G($re_encoded+)/ogsc ) { + if ( $1 eq "+" ) { + $bytes .= "+-"; + } + else { + my $s = $1; + my $base64 = encode_base64( $e_utf16->encode($s), '' ); + $base64 =~ s/=+$//; + $bytes .= "+$base64-"; + } + } + else { + die "This should not happen! (pos=" . pos($str) . ")"; + } + } + $_[1] = '' if $chk; + return $bytes; +} + +sub decode($$;$) { + my ( $obj, $bytes, $chk ) = @_; + my $len = length($bytes); + my $str = ""; + no warnings 'uninitialized'; + while ( pos($bytes) < $len ) { + if ( $bytes =~ /\G([^+]+)/ogc ) { + $str .= $1; + } + elsif ( $bytes =~ /\G\+-/ogc ) { + $str .= "+"; + } + elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) { + my $base64 = $1; + my $pad = length($base64) % 4; + $base64 .= "=" x ( 4 - $pad ) if $pad; + $str .= $e_utf16->decode( decode_base64($base64) ); + } + elsif ( $bytes =~ /\G\+/ogc ) { + $^W and warn "Bad UTF7 data escape"; + $str .= "+"; + } + else { + die "This should not happen " . pos($bytes); + } + } + $_[1] = '' if $chk; + return $str; +} +1; +__END__ + +=head1 NAME + +Encode::Unicode::UTF7 -- UTF-7 encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $utf7 = encode("UTF-7", $utf8); + $utf8 = decode("UTF-7", $ucs2); + +=head1 ABSTRACT + +This module implements UTF-7 encoding documented in RFC 2152. UTF-7, +as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It +is designed to be MTA-safe and expected to be a standard way to +exchange Unicoded mails via mails. But with the advent of UTF-8 and +8-bit compliant MTAs, UTF-7 is hardly ever used. + +UTF-7 was not supported by Encode until version 1.95 because of that. +But Unicode::String, a module by Gisle Aas which adds Unicode supports +to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added +so Encode can supersede Unicode::String 100%. + +=head1 In Practice + +When you want to encode Unicode for mails and web pages, however, do +not use UTF-7 unless you are sure your recipients and readers can +handle it. Very few MUAs and WWW Browsers support these days (only +Mozilla seems to support one). For general cases, use UTF-8 for +message body and MIME-Header for header instead. + +=head1 SEE ALSO + +L<Encode>, L<Encode::Unicode>, L<Unicode::String> + +RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt> + +=cut |