1 files changed, 575 insertions, 0 deletions
diff --git a/lib/IO/HTML.pm b/lib/IO/HTML.pm
new file mode 100644
index 0000000..5fdad22
--- /dev/null
+++ b/lib/IO/HTML.pm
@@ -0,0 +1,575 @@
+#---------------------------------------------------------------------
+package IO::HTML;
+#
+# Copyright 2014 Christopher J. Madsen
+#
+# Author: Christopher J. Madsen <perl@cjmweb.net>
+# Created: 14 Jan 2012
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See either the
+# GNU General Public License or the Artistic License for more details.
+#
+# ABSTRACT: Open an HTML file with automatic charset detection
+#---------------------------------------------------------------------
+
+use 5.008;
+use strict;
+use warnings;
+
+use Carp 'croak';
+use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
+use Exporter 5.57 'import';
+
+our $VERSION = '1.001';
+# This file is part of IO-HTML 1.001 (June 28, 2014)
+
+our $default_encoding ||= 'cp1252';
+
+our @EXPORT    = qw(html_file);
+our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
+                    sniff_encoding);
+
+our %EXPORT_TAGS = (
+  rw  => [qw( html_file html_file_and_encoding html_outfile )],
+  all => [ @EXPORT, @EXPORT_OK ],
+);
+
+#=====================================================================
+
+
+sub html_file
+{
+  (&html_file_and_encoding)[0]; # return just the filehandle
+} # end html_file
+
+
+# Note: I made html_file and html_file_and_encoding separate functions
+# (instead of making html_file context-sensitive) because I wanted to
+# use html_file in function calls (i.e. list context) without having
+# to write "scalar html_file" all the time.
+
+sub html_file_and_encoding
+{
+  my ($filename, $options) = @_;
+
+  $options ||= {};
+
+  open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
+
+
+  my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
+
+  if (not defined $encoding) {
+    croak "No default encoding specified"
+        unless defined($encoding = $default_encoding);
+    $encoding = find_encoding($encoding) if $options->{encoding};
+  } # end if we didn't find an encoding
+
+  binmode $in, sprintf(":encoding(%s):crlf",
+                       $options->{encoding} ? $encoding->name : $encoding);
+
+  return ($in, $encoding, $bom);
+} # end html_file_and_encoding
+#---------------------------------------------------------------------
+
+
+sub html_outfile
+{
+  my ($filename, $encoding, $bom) = @_;
+
+  if (not defined $encoding) {
+    croak "No default encoding specified"
+        unless defined($encoding = $default_encoding);
+  } # end if we didn't find an encoding
+  elsif (ref $encoding) {
+    $encoding = $encoding->name;
+  }
+
+  open(my $out, ">:encoding($encoding)", $filename)
+      or croak "Failed to open $filename: $!";
+
+  print $out "\x{FeFF}" if $bom;
+
+  return $out;
+} # end html_outfile
+#---------------------------------------------------------------------
+
+
+sub sniff_encoding
+{
+  my ($in, $filename, $options) = @_;
+
+  $filename = 'file' unless defined $filename;
+  $options ||= {};
+
+  my $pos = tell $in;
+  croak "Could not seek $filename: $!" if $pos < 0;
+
+  croak "Could not read $filename: $!" unless defined read $in, my $buf, 1024;
+
+  seek $in, $pos, 0 or croak "Could not seek $filename: $!";
+
+
+  # Check for BOM:
+  my $bom;
+  my $encoding = do {
+    if ($buf =~ /^\xFe\xFF/) {
+      $bom = 2;
+      'UTF-16BE';
+    } elsif ($buf =~ /^\xFF\xFe/) {
+      $bom = 2;
+      'UTF-16LE';
+    } elsif ($buf =~ /^\xEF\xBB\xBF/) {
+      $bom = 3;
+      'utf-8-strict';
+    } else {
+      find_charset_in($buf, $options); # check for <meta charset>
+    }
+  }; # end $encoding
+
+  if ($bom) {
+    seek $in, $bom, 1 or croak "Could not seek $filename: $!";
+    $bom = 1;
+  }
+  elsif (not defined $encoding) { # try decoding as UTF-8
+    my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
+    if ($buf =~ /^(?:                   # nothing left over
+         | [\xC2-\xDF]                  # incomplete 2-byte char
+         | [\xE0-\xEF] [\x80-\xBF]?     # incomplete 3-byte char
+         | [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
+        )\z/x and $test =~ /[^\x00-\x7F]/) {
+      $encoding = 'utf-8-strict';
+    } # end if valid UTF-8 with at least one multi-byte character:
+  } # end if testing for UTF-8
+
+  if (defined $encoding and $options->{encoding} and not ref $encoding) {
+    $encoding = find_encoding($encoding);
+  } # end if $encoding is a string and we want an object
+
+  return wantarray ? ($encoding, $bom) : $encoding;
+} # end sniff_encoding
+
+#=====================================================================
+# Based on HTML5 8.2.2.2 Determining the character encoding:
+
+# Get attribute from current position of $_
+sub _get_attribute
+{
+  m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
+
+  return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
+
+  my ($name, $value) = (lc $1, '');
+
+  if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc
+      and (/\G"([^"]*)"?/gc or
+           /\G'([^']*)'?/gc or
+           /\G([^\x09\x0A\x0C\x0D >]*)/gc)) {
+    $value = lc $1;
+  } # end if attribute has value
+
+  return wantarray ? ($name, $value) : 1;
+} # end _get_attribute
+
+# Examine a meta value for a charset:
+sub _get_charset_from_meta
+{
+  for (shift) {
+    while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) {
+      return $1 if (/\G"([^"]*)"/gc or
+                    /\G'([^']*)'/gc or
+                    /\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
+    }
+  } # end for value
+
+  return undef;
+} # end _get_charset_from_meta
+#---------------------------------------------------------------------
+
+
+sub find_charset_in
+{
+  for (shift) {
+    my $options = shift || {};
+    my $stop = length > 1024 ? 1024 : length; # search first 1024 bytes
+
+    my $expect_pragma = (defined $options->{need_pragma}
+                         ? $options->{need_pragma} : 1);
+
+    pos() = 0;
+    while (pos() < $stop) {
+      if (/\G<!--.*?(?<=--)>/sgc) {
+      } # Skip comment
+      elsif (m!\G<meta(?=[\x09\x0A\x0C\x0D /])!gic) {
+        my ($got_pragma, $need_pragma, $charset);
+
+        while (my ($name, $value) = &_get_attribute) {
+          if ($name eq 'http-equiv' and $value eq 'content-type') {
+            $got_pragma = 1;
+          } elsif ($name eq 'content' and not defined $charset) {
+            $need_pragma = $expect_pragma
+                if defined($charset = _get_charset_from_meta($value));
+          } elsif ($name eq 'charset') {
+            $charset = $value;
+            $need_pragma = 0;
+          }
+        } # end while more attributes in this <meta> tag
+
+        if (defined $need_pragma and (not $need_pragma or $got_pragma)) {
+          $charset = 'UTF-8'  if $charset =~ /^utf-?16/;
+          $charset = 'cp1252' if $charset eq 'iso-8859-1'; # people lie
+          if (my $encoding = find_encoding($charset)) {
+            return $options->{encoding} ? $encoding : $encoding->name;
+          } # end if charset is a recognized encoding
+        } # end if found charset
+      } # end elsif <meta
+      elsif (m!\G</?[a-zA-Z][^\x09\x0A\x0C\x0D >]*!gc) {
+        1 while &_get_attribute;
+      } # end elsif some other tag
+      elsif (m{\G<[!/?][^>]*}gc) {
+      } # skip unwanted things
+      elsif (m/\G</gc) {
+      } # skip < that doesn't open anything we recognize
+
+      # Advance to the next <:
+      m/\G[^<]+/gc;
+    } # end while not at search boundary
+  } # end for string
+
+  return undef;                 # Couldn't find a charset
+} # end find_charset_in
+#---------------------------------------------------------------------
+
+
+# Shortcuts for people who don't like exported functions:
+*file               = \&html_file;
+*file_and_encoding  = \&html_file_and_encoding;
+*outfile            = \&html_outfile;
+
+#=====================================================================
+# Package Return Value:
+
+1;
+
+__END__
+
+=head1 NAME
+
+IO::HTML - Open an HTML file with automatic charset detection
+
+=head1 VERSION
+
+This document describes version 1.001 of
+IO::HTML, released June 28, 2014.
+
+=head1 SYNOPSIS
+
+  use IO::HTML;                 # exports html_file by default
+  use HTML::TreeBuilder;
+
+  my $tree = HTML::TreeBuilder->new_from_file(
+               html_file('foo.html')
+             );
+
+  # Alternative interface:
+  open(my $in, '<:raw', 'bar.html');
+  my $encoding = IO::HTML::sniff_encoding($in, 'bar.html');
+
+=head1 DESCRIPTION
+
+IO::HTML provides an easy way to open a file containing HTML while
+automatically determining its encoding.  It uses the HTML5 encoding
+sniffing algorithm specified in section 8.2.2.2 of the draft standard.
+
+The algorithm as implemented here is:
+
+=over
+
+=item 1.
+
+If the file begins with a byte order mark indicating UTF-16LE,
+UTF-16BE, or UTF-8, then that is the encoding.
+
+=item 2.
+
+If the first 1024 bytes of the file contain a C<< <meta> >> tag that
+indicates the charset, and Encode recognizes the specified charset
+name, then that is the encoding.  (This portion of the algorithm is
+implemented by C<find_charset_in>.)
+
+The C<< <meta> >> tag can be in one of two formats:
+
+  <meta charset="...">
+  <meta http-equiv="Content-Type" content="...charset=...">
+
+The search is case-insensitive, and the order of attributes within the
+tag is irrelevant.  Any additional attributes of the tag are ignored.
+The first matching tag with a recognized encoding ends the search.
+
+=item 3.
+
+If the first 1024 bytes of the file are valid UTF-8 (with at least 1
+non-ASCII character), then the encoding is UTF-8.
+
+=item 4.
+
+If all else fails, use the default character encoding.  The HTML5
+standard suggests the default encoding should be locale dependent, but
+currently it is always C<cp1252> unless you set
+C<$IO::HTML::default_encoding> to a different value.  Note:
+C<sniff_encoding> does not apply this step; only C<html_file> does
+that.
+
+=back
+
+=head1 SUBROUTINES
+
+=head2 html_file
+
+  $filehandle = html_file($filename, \%options);
+
+This function (exported by default) is the primary entry point.  It
+opens the file specified by C<$filename> for reading, uses
+C<sniff_encoding> to find a suitable encoding layer, and applies it.
+It also applies the C<:crlf> layer.  If the file begins with a BOM,
+the filehandle is positioned just after the BOM.
+
+The optional second argument is a hashref containing options.  The
+possible keys are described under C<find_charset_in>.
+
+If C<sniff_encoding> is unable to determine the encoding, it defaults
+to C<$IO::HTML::default_encoding>, which is set to C<cp1252>
+(a.k.a. Windows-1252) by default.  According to the standard, the
+default should be locale dependent, but that is not currently
+implemented.
+
+It dies if the file cannot be opened.
+
+
+=head2 html_file_and_encoding
+
+  ($filehandle, $encoding, $bom)
+    = html_file_and_encoding($filename, \%options);
+
+This function (exported only by request) is just like C<html_file>,
+but returns more information.  In addition to the filehandle, it
+returns the name of the encoding used, and a flag indicating whether a
+byte order mark was found (if C<$bom> is true, the file began with a
+BOM).  This may be useful if you want to write the file out again
+(especially in conjunction with the C<html_outfile> function).
+
+The optional second argument is a hashref containing options.  The
+possible keys are described under C<find_charset_in>.
+
+It dies if the file cannot be opened.  The result of calling it in
+scalar context is undefined.
+
+
+=head2 html_outfile
+
+  $filehandle = html_outfile($filename, $encoding, $bom);
+
+This function (exported only by request) opens C<$filename> for output
+using C<$encoding>, and writes a BOM to it if C<$bom> is true.
+If C<$encoding> is C<undef>, it defaults to C<$IO::HTML::default_encoding>.
+C<$encoding> may be either an encoding name or an Encode::Encoding object.
+
+It dies if the file cannot be opened.
+
+
+=head2 sniff_encoding
+
+  ($encoding, $bom) = sniff_encoding($filehandle, $filename, \%options);
+
+This function (exported only by request) runs the HTML5 encoding
+sniffing algorithm on C<$filehandle> (which must be seekable, and
+should have been opened in C<:raw> mode).  C<$filename> is used only
+for error messages (if there's a problem using the filehandle), and
+defaults to "file" if omitted.  The optional third argument is a
+hashref containing options.  The possible keys are described under
+C<find_charset_in>.
+
+It returns Perl's canonical name for the encoding, which is not
+necessarily the same as the MIME or IANA charset name.  It returns
+C<undef> if the encoding cannot be determined.  C<$bom> is true if the
+file began with a byte order mark.  In scalar context, it returns only
+C<$encoding>.
+
+The filehandle's position is restored to its original position
+(normally the beginning of the file) unless C<$bom> is true.  In that
+case, the position is immediately after the BOM.
+
+Tip: If you want to run C<sniff_encoding> on a file you've already
+loaded into a string, open an in-memory file on the string, and pass
+that handle:
+
+  ($encoding, $bom) = do {
+    open(my $fh, '<', \$string);  sniff_encoding($fh)
+  };
+
+(This only makes sense if C<$string> contains bytes, not characters.)
+
+
+=head2 find_charset_in
+
+  $encoding = find_charset_in($string_containing_HTML, \%options);
+
+This function (exported only by request) looks for charset information
+in a C<< <meta> >> tag in a possibly incomplete HTML document using
+the "two step" algorithm specified by HTML5.  It does not look for a BOM.
+Only the first 1024 bytes of the string are checked.
+
+It returns Perl's canonical name for the encoding, which is not
+necessarily the same as the MIME or IANA charset name.  It returns
+C<undef> if no charset is specified or if the specified charset is not
+recognized by the Encode module.
+
+The optional second argument is a hashref containing options.  The
+following keys are recognized:
+
+=over
+
+=item C<encoding>
+
+If true, return the L<Encode::Encoding> object instead of its name.
+Defaults to false.
+
+=item C<need_pragma>
+
+If true (the default), follow the HTML5 spec and examine the
+C<content> attribute only of C<< <meta http-equiv="Content-Type" >>.
+If set to 0, relax the HTML5 spec, and look for "charset=" in the
+C<content> attribute of I<every> meta tag.
+
+=back
+
+=head1 EXPORTS
+
+By default, only C<html_file> is exported.  Other functions may be
+exported on request.
+
+For people who prefer not to export functions, all functions beginning
+with C<html_> have an alias without that prefix (e.g. you can call
+C<IO::HTML::file(...)> instead of C<IO::HTML::html_file(...)>.  These
+aliases are not exportable.
+
+=for Pod::Coverage
+file
+file_and_encoding
+outfile
+
+The following export tags are available:
+
+=over
+
+=item C<:all>
+
+All exportable functions.
+
+=item C<:rw>
+
+C<html_file>, C<html_file_and_encoding>, C<html_outfile>.
+
+=back
+
+=head1 SEE ALSO
+
+The HTML5 specification, section 8.2.2.2 Determining the character encoding:
+L<http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding>
+
+=head1 DIAGNOSTICS
+
+=over
+
+=item C<< Could not read %s: %s >>
+
+The specified file could not be read from for the reason specified by C<$!>.
+
+
+=item C<< Could not seek %s: %s >>
+
+The specified file could not be rewound for the reason specified by C<$!>.
+
+
+=item C<< Failed to open %s: %s >>
+
+The specified file could not be opened for reading for the reason
+specified by C<$!>.
+
+
+=item C<< No default encoding specified >>
+
+The C<sniff_encoding> algorithm didn't find an encoding to use, and
+you set C<$IO::HTML::default_encoding> to C<undef>.
+
+
+=back
+
+=head1 CONFIGURATION AND ENVIRONMENT
+
+IO::HTML requires no configuration files or environment variables.
+
+=head1 DEPENDENCIES
+
+IO::HTML has no non-core dependencies for Perl 5.8.7+.  With earlier
+versions of Perl 5.8, you need to upgrade L<Encode> to at least
+version 2.10, and
+you may need to upgrade L<Exporter> to at least version
+5.57.
+
+=head1 INCOMPATIBILITIES
+
+None reported.
+
+=head1 BUGS AND LIMITATIONS
+
+No bugs have been reported.
+
+=head1 AUTHOR
+
+Christopher J. Madsen  S<C<< <perl AT cjmweb.net> >>>
+
+Please report any bugs or feature requests
+to S<C<< <bug-IO-HTML AT rt.cpan.org> >>>
+or through the web interface at
+L<< http://rt.cpan.org/Public/Bug/Report.html?Queue=IO-HTML >>.
+
+You can follow or contribute to IO-HTML's development at
+L<< https://github.com/madsen/io-html >>.
+
+=head1 COPYRIGHT AND LICENSE
+
+This software is copyright (c) 2014 by Christopher J. Madsen.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+
+=head1 DISCLAIMER OF WARRANTY
+
+BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
+EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
+YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
+NECESSARY SERVICING, REPAIR, OR CORRECTION.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENSE, BE
+LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
+OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
+THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+=cut