summaryrefslogtreecommitdiff
path: root/lib/IO/HTML.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/IO/HTML.pm')
-rw-r--r--lib/IO/HTML.pm575
1 files changed, 575 insertions, 0 deletions
diff --git a/lib/IO/HTML.pm b/lib/IO/HTML.pm
new file mode 100644
index 0000000..5fdad22
--- /dev/null
+++ b/lib/IO/HTML.pm
@@ -0,0 +1,575 @@
+#---------------------------------------------------------------------
+package IO::HTML;
+#
+# Copyright 2014 Christopher J. Madsen
+#
+# Author: Christopher J. Madsen <perl@cjmweb.net>
+# Created: 14 Jan 2012
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
+# GNU General Public License or the Artistic License for more details.
+#
+# ABSTRACT: Open an HTML file with automatic charset detection
+#---------------------------------------------------------------------
+
+use 5.008;
+use strict;
+use warnings;
+
+use Carp 'croak';
+use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
+use Exporter 5.57 'import';
+
+our $VERSION = '1.001';
+# This file is part of IO-HTML 1.001 (June 28, 2014)
+
+our $default_encoding ||= 'cp1252';
+
+our @EXPORT = qw(html_file);
+our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
+ sniff_encoding);
+
+our %EXPORT_TAGS = (
+ rw => [qw( html_file html_file_and_encoding html_outfile )],
+ all => [ @EXPORT, @EXPORT_OK ],
+);
+
+#=====================================================================
+
+
+sub html_file
+{
+ (&html_file_and_encoding)[0]; # return just the filehandle
+} # end html_file
+
+
+# Note: I made html_file and html_file_and_encoding separate functions
+# (instead of making html_file context-sensitive) because I wanted to
+# use html_file in function calls (i.e. list context) without having
+# to write "scalar html_file" all the time.
+
+sub html_file_and_encoding
+{
+ my ($filename, $options) = @_;
+
+ $options ||= {};
+
+ open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
+
+
+ my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
+
+ if (not defined $encoding) {
+ croak "No default encoding specified"
+ unless defined($encoding = $default_encoding);
+ $encoding = find_encoding($encoding) if $options->{encoding};
+ } # end if we didn't find an encoding
+
+ binmode $in, sprintf(":encoding(%s):crlf",
+ $options->{encoding} ? $encoding->name : $encoding);
+
+ return ($in, $encoding, $bom);
+} # end html_file_and_encoding
+#---------------------------------------------------------------------
+
+
+sub html_outfile
+{
+ my ($filename, $encoding, $bom) = @_;
+
+ if (not defined $encoding) {
+ croak "No default encoding specified"
+ unless defined($encoding = $default_encoding);
+ } # end if we didn't find an encoding
+ elsif (ref $encoding) {
+ $encoding = $encoding->name;
+ }
+
+ open(my $out, ">:encoding($encoding)", $filename)
+ or croak "Failed to open $filename: $!";
+
+ print $out "\x{FeFF}" if $bom;
+
+ return $out;
+} # end html_outfile
+#---------------------------------------------------------------------
+
+
+sub sniff_encoding
+{
+ my ($in, $filename, $options) = @_;
+
+ $filename = 'file' unless defined $filename;
+ $options ||= {};
+
+ my $pos = tell $in;
+ croak "Could not seek $filename: $!" if $pos < 0;
+
+ croak "Could not read $filename: $!" unless defined read $in, my $buf, 1024;
+
+ seek $in, $pos, 0 or croak "Could not seek $filename: $!";
+
+
+ # Check for BOM:
+ my $bom;
+ my $encoding = do {
+ if ($buf =~ /^\xFe\xFF/) {
+ $bom = 2;
+ 'UTF-16BE';
+ } elsif ($buf =~ /^\xFF\xFe/) {
+ $bom = 2;
+ 'UTF-16LE';
+ } elsif ($buf =~ /^\xEF\xBB\xBF/) {
+ $bom = 3;
+ 'utf-8-strict';
+ } else {
+ find_charset_in($buf, $options); # check for <meta charset>
+ }
+ }; # end $encoding
+
+ if ($bom) {
+ seek $in, $bom, 1 or croak "Could not seek $filename: $!";
+ $bom = 1;
+ }
+ elsif (not defined $encoding) { # try decoding as UTF-8
+ my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
+ if ($buf =~ /^(?: # nothing left over
+ | [\xC2-\xDF] # incomplete 2-byte char
+ | [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char
+ | [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
+ )\z/x and $test =~ /[^\x00-\x7F]/) {
+ $encoding = 'utf-8-strict';
+ } # end if valid UTF-8 with at least one multi-byte character:
+ } # end if testing for UTF-8
+
+ if (defined $encoding and $options->{encoding} and not ref $encoding) {
+ $encoding = find_encoding($encoding);
+ } # end if $encoding is a string and we want an object
+
+ return wantarray ? ($encoding, $bom) : $encoding;
+} # end sniff_encoding
+
+#=====================================================================
+# Based on HTML5 8.2.2.2 Determining the character encoding:
+
+# Get attribute from current position of $_
+sub _get_attribute
+{
+ m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
+
+ return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
+
+ my ($name, $value) = (lc $1, '');
+
+ if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc
+ and (/\G"([^"]*)"?/gc or
+ /\G'([^']*)'?/gc or
+ /\G([^\x09\x0A\x0C\x0D >]*)/gc)) {
+ $value = lc $1;
+ } # end if attribute has value
+
+ return wantarray ? ($name, $value) : 1;
+} # end _get_attribute
+
+# Examine a meta value for a charset:
+sub _get_charset_from_meta
+{
+ for (shift) {
+ while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) {
+ return $1 if (/\G"([^"]*)"/gc or
+ /\G'([^']*)'/gc or
+ /\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
+ }
+ } # end for value
+
+ return undef;
+} # end _get_charset_from_meta
+#---------------------------------------------------------------------
+
+
+sub find_charset_in
+{
+ for (shift) {
+ my $options = shift || {};
+ my $stop = length > 1024 ? 1024 : length; # search first 1024 bytes
+
+ my $expect_pragma = (defined $options->{need_pragma}
+ ? $options->{need_pragma} : 1);
+
+ pos() = 0;
+ while (pos() < $stop) {
+ if (/\G<!--.*?(?<=--)>/sgc) {
+ } # Skip comment
+ elsif (m!\G<meta(?=[\x09\x0A\x0C\x0D /])!gic) {
+ my ($got_pragma, $need_pragma, $charset);
+
+ while (my ($name, $value) = &_get_attribute) {
+ if ($name eq 'http-equiv' and $value eq 'content-type') {
+ $got_pragma = 1;
+ } elsif ($name eq 'content' and not defined $charset) {
+ $need_pragma = $expect_pragma
+ if defined($charset = _get_charset_from_meta($value));
+ } elsif ($name eq 'charset') {
+ $charset = $value;
+ $need_pragma = 0;
+ }
+ } # end while more attributes in this <meta> tag
+
+ if (defined $need_pragma and (not $need_pragma or $got_pragma)) {
+ $charset = 'UTF-8' if $charset =~ /^utf-?16/;
+ $charset = 'cp1252' if $charset eq 'iso-8859-1'; # people lie
+ if (my $encoding = find_encoding($charset)) {
+ return $options->{encoding} ? $encoding : $encoding->name;
+ } # end if charset is a recognized encoding
+ } # end if found charset
+ } # end elsif <meta
+ elsif (m!\G</?[a-zA-Z][^\x09\x0A\x0C\x0D >]*!gc) {
+ 1 while &_get_attribute;
+ } # end elsif some other tag
+ elsif (m{\G<[!/?][^>]*}gc) {
+ } # skip unwanted things
+ elsif (m/\G</gc) {
+ } # skip < that doesn't open anything we recognize
+
+ # Advance to the next <:
+ m/\G[^<]+/gc;
+ } # end while not at search boundary
+ } # end for string
+
+ return undef; # Couldn't find a charset
+} # end find_charset_in
+#---------------------------------------------------------------------
+
+
+# Shortcuts for people who don't like exported functions:
+*file = \&html_file;
+*file_and_encoding = \&html_file_and_encoding;
+*outfile = \&html_outfile;
+
+#=====================================================================
+# Package Return Value:
+
+1;
+
+__END__
+
+=head1 NAME
+
+IO::HTML - Open an HTML file with automatic charset detection
+
+=head1 VERSION
+
+This document describes version 1.001 of
+IO::HTML, released June 28, 2014.
+
+=head1 SYNOPSIS
+
+ use IO::HTML; # exports html_file by default
+ use HTML::TreeBuilder;
+
+ my $tree = HTML::TreeBuilder->new_from_file(
+ html_file('foo.html')
+ );
+
+ # Alternative interface:
+ open(my $in, '<:raw', 'bar.html');
+ my $encoding = IO::HTML::sniff_encoding($in, 'bar.html');
+
+=head1 DESCRIPTION
+
+IO::HTML provides an easy way to open a file containing HTML while
+automatically determining its encoding. It uses the HTML5 encoding
+sniffing algorithm specified in section 8.2.2.2 of the draft standard.
+
+The algorithm as implemented here is:
+
+=over
+
+=item 1.
+
+If the file begins with a byte order mark indicating UTF-16LE,
+UTF-16BE, or UTF-8, then that is the encoding.
+
+=item 2.
+
+If the first 1024 bytes of the file contain a C<< <meta> >> tag that
+indicates the charset, and Encode recognizes the specified charset
+name, then that is the encoding. (This portion of the algorithm is
+implemented by C<find_charset_in>.)
+
+The C<< <meta> >> tag can be in one of two formats:
+
+ <meta charset="...">
+ <meta http-equiv="Content-Type" content="...charset=...">
+
+The search is case-insensitive, and the order of attributes within the
+tag is irrelevant. Any additional attributes of the tag are ignored.
+The first matching tag with a recognized encoding ends the search.
+
+=item 3.
+
+If the first 1024 bytes of the file are valid UTF-8 (with at least 1
+non-ASCII character), then the encoding is UTF-8.
+
+=item 4.
+
+If all else fails, use the default character encoding. The HTML5
+standard suggests the default encoding should be locale dependent, but
+currently it is always C<cp1252> unless you set
+C<$IO::HTML::default_encoding> to a different value. Note:
+C<sniff_encoding> does not apply this step; only C<html_file> does
+that.
+
+=back
+
+=head1 SUBROUTINES
+
+=head2 html_file
+
+ $filehandle = html_file($filename, \%options);
+
+This function (exported by default) is the primary entry point. It
+opens the file specified by C<$filename> for reading, uses
+C<sniff_encoding> to find a suitable encoding layer, and applies it.
+It also applies the C<:crlf> layer. If the file begins with a BOM,
+the filehandle is positioned just after the BOM.
+
+The optional second argument is a hashref containing options. The
+possible keys are described under C<find_charset_in>.
+
+If C<sniff_encoding> is unable to determine the encoding, it defaults
+to C<$IO::HTML::default_encoding>, which is set to C<cp1252>
+(a.k.a. Windows-1252) by default. According to the standard, the
+default should be locale dependent, but that is not currently
+implemented.
+
+It dies if the file cannot be opened.
+
+
+=head2 html_file_and_encoding
+
+ ($filehandle, $encoding, $bom)
+ = html_file_and_encoding($filename, \%options);
+
+This function (exported only by request) is just like C<html_file>,
+but returns more information. In addition to the filehandle, it
+returns the name of the encoding used, and a flag indicating whether a
+byte order mark was found (if C<$bom> is true, the file began with a
+BOM). This may be useful if you want to write the file out again
+(especially in conjunction with the C<html_outfile> function).
+
+The optional second argument is a hashref containing options. The
+possible keys are described under C<find_charset_in>.
+
+It dies if the file cannot be opened. The result of calling it in
+scalar context is undefined.
+
+
+=head2 html_outfile
+
+ $filehandle = html_outfile($filename, $encoding, $bom);
+
+This function (exported only by request) opens C<$filename> for output
+using C<$encoding>, and writes a BOM to it if C<$bom> is true.
+If C<$encoding> is C<undef>, it defaults to C<$IO::HTML::default_encoding>.
+C<$encoding> may be either an encoding name or an Encode::Encoding object.
+
+It dies if the file cannot be opened.
+
+
+=head2 sniff_encoding
+
+ ($encoding, $bom) = sniff_encoding($filehandle, $filename, \%options);
+
+This function (exported only by request) runs the HTML5 encoding
+sniffing algorithm on C<$filehandle> (which must be seekable, and
+should have been opened in C<:raw> mode). C<$filename> is used only
+for error messages (if there's a problem using the filehandle), and
+defaults to "file" if omitted. The optional third argument is a
+hashref containing options. The possible keys are described under
+C<find_charset_in>.
+
+It returns Perl's canonical name for the encoding, which is not
+necessarily the same as the MIME or IANA charset name. It returns
+C<undef> if the encoding cannot be determined. C<$bom> is true if the
+file began with a byte order mark. In scalar context, it returns only
+C<$encoding>.
+
+The filehandle's position is restored to its original position
+(normally the beginning of the file) unless C<$bom> is true. In that
+case, the position is immediately after the BOM.
+
+Tip: If you want to run C<sniff_encoding> on a file you've already
+loaded into a string, open an in-memory file on the string, and pass
+that handle:
+
+ ($encoding, $bom) = do {
+ open(my $fh, '<', \$string); sniff_encoding($fh)
+ };
+
+(This only makes sense if C<$string> contains bytes, not characters.)
+
+
+=head2 find_charset_in
+
+ $encoding = find_charset_in($string_containing_HTML, \%options);
+
+This function (exported only by request) looks for charset information
+in a C<< <meta> >> tag in a possibly incomplete HTML document using
+the "two step" algorithm specified by HTML5. It does not look for a BOM.
+Only the first 1024 bytes of the string are checked.
+
+It returns Perl's canonical name for the encoding, which is not
+necessarily the same as the MIME or IANA charset name. It returns
+C<undef> if no charset is specified or if the specified charset is not
+recognized by the Encode module.
+
+The optional second argument is a hashref containing options. The
+following keys are recognized:
+
+=over
+
+=item C<encoding>
+
+If true, return the L<Encode::Encoding> object instead of its name.
+Defaults to false.
+
+=item C<need_pragma>
+
+If true (the default), follow the HTML5 spec and examine the
+C<content> attribute only of C<< <meta http-equiv="Content-Type" >>.
+If set to 0, relax the HTML5 spec, and look for "charset=" in the
+C<content> attribute of I<every> meta tag.
+
+=back
+
+=head1 EXPORTS
+
+By default, only C<html_file> is exported. Other functions may be
+exported on request.
+
+For people who prefer not to export functions, all functions beginning
+with C<html_> have an alias without that prefix (e.g. you can call
+C<IO::HTML::file(...)> instead of C<IO::HTML::html_file(...)>. These
+aliases are not exportable.
+
+=for Pod::Coverage
+file
+file_and_encoding
+outfile
+
+The following export tags are available:
+
+=over
+
+=item C<:all>
+
+All exportable functions.
+
+=item C<:rw>
+
+C<html_file>, C<html_file_and_encoding>, C<html_outfile>.
+
+=back
+
+=head1 SEE ALSO
+
+The HTML5 specification, section 8.2.2.2 Determining the character encoding:
+L<http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding>
+
+=head1 DIAGNOSTICS
+
+=over
+
+=item C<< Could not read %s: %s >>
+
+The specified file could not be read from for the reason specified by C<$!>.
+
+
+=item C<< Could not seek %s: %s >>
+
+The specified file could not be rewound for the reason specified by C<$!>.
+
+
+=item C<< Failed to open %s: %s >>
+
+The specified file could not be opened for reading for the reason
+specified by C<$!>.
+
+
+=item C<< No default encoding specified >>
+
+The C<sniff_encoding> algorithm didn't find an encoding to use, and
+you set C<$IO::HTML::default_encoding> to C<undef>.
+
+
+=back
+
+=head1 CONFIGURATION AND ENVIRONMENT
+
+IO::HTML requires no configuration files or environment variables.
+
+=head1 DEPENDENCIES
+
+IO::HTML has no non-core dependencies for Perl 5.8.7+. With earlier
+versions of Perl 5.8, you need to upgrade L<Encode> to at least
+version 2.10, and
+you may need to upgrade L<Exporter> to at least version
+5.57.
+
+=head1 INCOMPATIBILITIES
+
+None reported.
+
+=head1 BUGS AND LIMITATIONS
+
+No bugs have been reported.
+
+=head1 AUTHOR
+
+Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>>
+
+Please report any bugs or feature requests
+to S<C<< <bug-IO-HTML AT rt.cpan.org> >>>
+or through the web interface at
+L<< http://rt.cpan.org/Public/Bug/Report.html?Queue=IO-HTML >>.
+
+You can follow or contribute to IO-HTML's development at
+L<< https://github.com/madsen/io-html >>.
+
+=head1 COPYRIGHT AND LICENSE
+
+This software is copyright (c) 2014 by Christopher J. Madsen.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+
+=head1 DISCLAIMER OF WARRANTY
+
+BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
+EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
+YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
+NECESSARY SERVICING, REPAIR, OR CORRECTION.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENSE, BE
+LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
+OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
+THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+=cut