diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2014-06-28 17:03:42 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2014-06-28 17:03:42 +0000 |
commit | adc9c8b29ed1144626af564f936811a9d5e319a6 (patch) | |
tree | a420ceb0326d0a3a2d7fa4acbea4fec33de2c528 | |
download | IO-HTML-tarball-master.tar.gz |
IO-HTML-1.001 HEAD IO-HTML-1.001 master
-rw-r--r-- | Changes | 24 | ||||
-rw-r--r-- | LICENSE | 379 | ||||
-rw-r--r-- | MANIFEST | 15 | ||||
-rw-r--r-- | META.json | 308 | ||||
-rw-r--r-- | META.yml | 219 | ||||
-rw-r--r-- | Makefile.PL | 62 | ||||
-rw-r--r-- | README | 46 | ||||
-rw-r--r-- | lib/IO/HTML.pm | 575 | ||||
-rw-r--r-- | t/00-all_prereqs.t | 95 | ||||
-rw-r--r-- | t/00-load.t | 10 | ||||
-rw-r--r-- | t/10-find.t | 133 | ||||
-rw-r--r-- | t/20-open.t | 150 | ||||
-rw-r--r-- | t/30-outfile.t | 61 | ||||
-rw-r--r-- | xt/release/pod-coverage.t | 14 | ||||
-rw-r--r-- | xt/release/pod-syntax.t | 8 |
15 files changed, 2099 insertions, 0 deletions
@@ -0,0 +1,24 @@ +Revision history for IO-HTML + +1.001 2014-06-28 + - No code changes, just documentation improvements + - Update links to the HTML5 draft specification + +1.00 2013-02-23 + - No code changes, just documentation improvements + - Document filehandle position set by sniff_encoding + +0.04 2012-02-04 + - Require Encode 2.10 to get the utf-8-strict encoding. + This is only an issue with Perl 5.8.6 and earlier. + +0.03 2012-01-30 + - Fix 10-find.t to work on Perl 5.8.8 + (mime_name wasn't introduced until Encode 2.21; 5.8.8 has 2.12.) + +0.02 2012-01-29 + - Fix 20-open.t to work on Perl < 5.14.0 + (Using "<:raw" on in-memory files didn't work until then.) + +0.01 2012-01-28 + - Initial release @@ -0,0 +1,379 @@ +This software is copyright (c) 2014 by Christopher J. Madsen. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +Terms of the Perl programming language system itself + +a) the GNU General Public License as published by the Free + Software Foundation; either version 1, or (at your option) any + later version, or +b) the "Artistic License" + +--- The GNU General Public License, Version 1, February 1989 --- + +This software is Copyright (c) 2014 by Christopher J. Madsen. + +This is free software, licensed under: + + The GNU General Public License, Version 1, February 1989 + + GNU GENERAL PUBLIC LICENSE + Version 1, February 1989 + + Copyright (C) 1989 Free Software Foundation, Inc. + 51 Franklin St, Suite 500, Boston, MA 02110-1335 USA + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The license agreements of most software companies try to keep users +at the mercy of those companies. By contrast, our General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. 
The +General Public License applies to the Free Software Foundation's +software and to any other program whose authors commit to using it. +You can use it for your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Specifically, the General Public License is designed to make +sure that you have the freedom to give away or sell copies of free +software, that you receive source code or can get it if you want it, +that you can change the software or use pieces of it in new free +programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of a such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must tell them their rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License Agreement applies to any program or other work which +contains a notice placed by the copyright holder saying it may be +distributed under the terms of this General Public License. The +"Program", below, refers to any such program or work, and a "work based +on the Program" means either the Program or any work containing the +Program or a portion of it, either verbatim or with modifications. Each +licensee is addressed as "you". + + 1. You may copy and distribute verbatim copies of the Program's source +code as you receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice and +disclaimer of warranty; keep intact all the notices that refer to this +General Public License and to the absence of any warranty; and give any +other recipients of the Program a copy of this General Public License +along with the Program. You may charge a fee for the physical act of +transferring a copy. + + 2. You may modify your copy or copies of the Program or any portion of +it, and copy and distribute such modifications under the terms of Paragraph +1 above, provided that you also do the following: + + a) cause the modified files to carry prominent notices stating that + you changed the files and the date of any change; and + + b) cause the whole of any work that you distribute or publish, that + in whole or in part contains the Program or any part thereof, either + with or without modifications, to be licensed at no charge to all + third parties under the terms of this General Public License (except + that you may choose to grant warranty protection to some or all + third parties, at your option). 
+ + c) If the modified program normally reads commands interactively when + run, you must cause it, when started running for such interactive use + in the simplest and most usual way, to print or display an + announcement including an appropriate copyright notice and a notice + that there is no warranty (or else, saying that you provide a + warranty) and that users may redistribute the program under these + conditions, and telling the user how to view a copy of this General + Public License. + + d) You may charge a fee for the physical act of transferring a + copy, and you may at your option offer warranty protection in + exchange for a fee. + +Mere aggregation of another independent work with the Program (or its +derivative) on a volume of a storage or distribution medium does not bring +the other work under the scope of these terms. + + 3. You may copy and distribute the Program (or a portion or derivative of +it, under Paragraph 2) in object code or executable form under the terms of +Paragraphs 1 and 2 above provided that you also do one of the following: + + a) accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of + Paragraphs 1 and 2 above; or, + + b) accompany it with a written offer, valid for at least three + years, to give any third party free (except for a nominal charge + for the cost of distribution) a complete machine-readable copy of the + corresponding source code, to be distributed under the terms of + Paragraphs 1 and 2 above; or, + + c) accompany it with the information you received as to where the + corresponding source code may be obtained. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form alone.) + +Source code for a work means the preferred form of the work for making +modifications to it. 
For an executable file, complete source code means +all the source code for all modules it contains; but, as a special +exception, it need not include source code for modules which are standard +libraries that accompany the operating system on which the executable +file runs, or for standard header files or definitions files that +accompany that operating system. + + 4. You may not copy, modify, sublicense, distribute or transfer the +Program except as expressly provided under this General Public License. +Any attempt otherwise to copy, modify, sublicense, distribute or transfer +the Program is void, and will automatically terminate your rights to use +the Program under this License. However, parties who have received +copies, or rights to use copies, from you under this General Public +License will not have their licenses terminated so long as such parties +remain in full compliance. + + 5. By copying, distributing or modifying the Program (or any work based +on the Program) you indicate your acceptance of this license to do so, +and all its terms and conditions. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the original +licensor to copy, distribute or modify the Program subject to these +terms and conditions. You may not impose any further restrictions on the +recipients' exercise of the rights granted herein. + + 7. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of the license which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. 
If the Program does not specify a version number of +the license, you may choose any version ever published by the Free Software +Foundation. + + 8. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to humanity, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + + To do so, attach the following notices to the program. It is safest to +attach them to the start of each source file to most effectively convey +the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 1, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19xx name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the +appropriate parts of the General Public License. 
Of course, the +commands you use may be called something other than `show w' and `show +c'; they could even be mouse-clicks or menu items--whatever suits your +program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + program `Gnomovision' (a program to direct compilers to make passes + at assemblers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +That's all there is to it! + + +--- The Artistic License 1.0 --- + +This software is Copyright (c) 2014 by Christopher J. Madsen. + +This is free software, licensed under: + + The Artistic License 1.0 + +The Artistic License + +Preamble + +The intent of this document is to state the conditions under which a Package +may be copied, such that the Copyright Holder maintains some semblance of +artistic control over the development of the package, while giving the users of +the package the right to use and distribute the Package in a more-or-less +customary fashion, plus the right to make reasonable modifications. + +Definitions: + + - "Package" refers to the collection of files distributed by the Copyright + Holder, and derivatives of that collection of files created through + textual modification. + - "Standard Version" refers to such a Package if it has not been modified, + or has been modified in accordance with the wishes of the Copyright + Holder. + - "Copyright Holder" is whoever is named in the copyright or copyrights for + the package. + - "You" is you, if you're thinking about copying or distributing this Package. + - "Reasonable copying fee" is whatever you can justify on the basis of media + cost, duplication charges, time of people involved, and so on. 
(You will + not be required to justify it to the Copyright Holder, but only to the + computing community at large as a market that must bear the fee.) + - "Freely Available" means that no fee is charged for the item itself, though + there may be fees involved in handling the item. It also means that + recipients of the item may redistribute it under the same conditions they + received it. + +1. You may make and give away verbatim copies of the source form of the +Standard Version of this Package without restriction, provided that you +duplicate all of the original copyright notices and associated disclaimers. + +2. You may apply bug fixes, portability fixes and other modifications derived +from the Public Domain or from the Copyright Holder. A Package modified in such +a way shall still be considered the Standard Version. + +3. You may otherwise modify your copy of this Package in any way, provided that +you insert a prominent notice in each changed file stating how and when you +changed that file, and provided that you do at least ONE of the following: + + a) place your modifications in the Public Domain or otherwise make them + Freely Available, such as by posting said modifications to Usenet or an + equivalent medium, or placing the modifications on a major archive site + such as ftp.uu.net, or by allowing the Copyright Holder to include your + modifications in the Standard Version of the Package. + + b) use the modified Package only within your corporation or organization. + + c) rename any non-standard executables so the names do not conflict with + standard executables, which must also be provided, and provide a separate + manual page for each non-standard executable that clearly documents how it + differs from the Standard Version. + + d) make other distribution arrangements with the Copyright Holder. + +4. 
You may distribute the programs of this Package in object code or executable +form, provided that you do at least ONE of the following: + + a) distribute a Standard Version of the executables and library files, + together with instructions (in the manual page or equivalent) on where to + get the Standard Version. + + b) accompany the distribution with the machine-readable source of the Package + with your modifications. + + c) accompany any non-standard executables with their corresponding Standard + Version executables, giving the non-standard executables non-standard + names, and clearly documenting the differences in manual pages (or + equivalent), together with instructions on where to get the Standard + Version. + + d) make other distribution arrangements with the Copyright Holder. + +5. You may charge a reasonable copying fee for any distribution of this +Package. You may charge any fee you choose for support of this Package. You +may not charge a fee for this Package itself. However, you may distribute this +Package in aggregate with other (possibly commercial) programs as part of a +larger (possibly commercial) software distribution provided that you do not +advertise this Package as a product of your own. + +6. The scripts and library files supplied as input to or produced as output +from the programs of this Package do not automatically fall under the copyright +of this Package, but belong to whomever generated them, and may be sold +commercially, and may be aggregated with this Package. + +7. C or perl subroutines supplied by you and linked into this Package shall not +be considered part of this Package. + +8. The name of the Copyright Holder may not be used to endorse or promote +products derived from this software without specific prior written permission. + +9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ +The End + diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..65bc556 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,15 @@ +Changes +LICENSE +MANIFEST +META.json +META.yml +Makefile.PL +README +lib/IO/HTML.pm +t/00-all_prereqs.t +t/00-load.t +t/10-find.t +t/20-open.t +t/30-outfile.t +xt/release/pod-coverage.t +xt/release/pod-syntax.t diff --git a/META.json b/META.json new file mode 100644 index 0000000..75a6b37 --- /dev/null +++ b/META.json @@ -0,0 +1,308 @@ +{ + "abstract" : "Open an HTML file with automatic charset detection", + "author" : [ + "Christopher J. Madsen <perl@cjmweb.net>" + ], + "dynamic_config" : 0, + "generated_by" : "Dist::Zilla version 5.009, CPAN::Meta::Converter version 2.120921", + "license" : [ + "perl_5" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + }, + "name" : "IO-HTML", + "prereqs" : { + "configure" : { + "requires" : { + "ExtUtils::MakeMaker" : "6.30" + } + }, + "develop" : { + "requires" : { + "Pod::Coverage::TrustPod" : "0", + "Test::Pod" : "1.41", + "Test::Pod::Coverage" : "1.08" + } + }, + "runtime" : { + "requires" : { + "Carp" : "0", + "Encode" : "2.10", + "Exporter" : "5.57", + "perl" : "5.008" + } + }, + "test" : { + "requires" : { + "File::Temp" : "0", + "Scalar::Util" : "0", + "Test::More" : "0.88" + } + } + }, + "release_status" : "stable", + "resources" : { + "repository" : { + "type" : "git", + "url" : "git://github.com/madsen/io-html.git", + "web" : "https://github.com/madsen/io-html" + } + }, + "version" : "1.001", + "x_Dist_Zilla" : { + "perl" : { + "version" : "5.018002" + }, + "plugins" : [ + { + "class" : "Dist::Zilla::Plugin::VersionFromModule", + "name" : "CJM/VersionFromModule", + "version" : "0.08" + }, + { + "class" : "Dist::Zilla::Plugin::GatherDir", + "name" : "CJM/GatherDir", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::PruneCruft", + "name" : "CJM/PruneCruft", + "version" : "5.009" + }, + { + "class" : 
"Dist::Zilla::Plugin::ManifestSkip", + "name" : "CJM/ManifestSkip", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::MetaJSON", + "name" : "CJM/MetaJSON", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::MetaYAML", + "name" : "CJM/MetaYAML", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::License", + "name" : "CJM/License", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::Test::PrereqsFromMeta", + "name" : "CJM/Test::PrereqsFromMeta", + "version" : "4.21" + }, + { + "class" : "Dist::Zilla::Plugin::PodSyntaxTests", + "name" : "CJM/PodSyntaxTests", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::PodCoverageTests", + "name" : "CJM/PodCoverageTests", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::PodLoom", + "config" : { + "Pod::Loom version" : "0.08" + }, + "name" : "CJM/PodLoom", + "version" : "5.00" + }, + { + "class" : "Dist::Zilla::Plugin::MakeMaker", + "name" : "CJM/MakeMaker", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::RunExtraTests", + "name" : "CJM/RunExtraTests", + "version" : "0.011" + }, + { + "class" : "Dist::Zilla::Plugin::MetaConfig", + "name" : "CJM/MetaConfig", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::MatchManifest", + "name" : "CJM/MatchManifest", + "version" : "4.01" + }, + { + "class" : "Dist::Zilla::Plugin::RecommendedPrereqs", + "name" : "CJM/RecommendedPrereqs", + "version" : "4.21" + }, + { + "class" : "Dist::Zilla::Plugin::CheckPrereqsIndexed", + "name" : "CJM/CheckPrereqsIndexed", + "version" : "0.009" + }, + { + "class" : "Dist::Zilla::Plugin::GitVersionCheckCJM", + "name" : "CJM/GitVersionCheckCJM", + "version" : "4.13" + }, + { + "class" : "Dist::Zilla::Plugin::TemplateCJM", + "name" : "CJM/TemplateCJM", + "version" : "4.22" + }, + { + "class" : "Dist::Zilla::Plugin::Repository", + "name" : "CJM/Repository", + "version" : "0.19" + }, + { + "class" : "Dist::Zilla::Plugin::Git::Check", + "config" : 
{ + "Dist::Zilla::Plugin::Git::Check" : { + "untracked_files" : "die" + }, + "Dist::Zilla::Role::Git::DirtyFiles" : { + "allow_dirty" : [ + "Changes" + ], + "allow_dirty_match" : [], + "changelog" : "Changes" + }, + "Dist::Zilla::Role::Git::Repo" : { + "repo_root" : "." + } + }, + "name" : "CJM/@Git/Check", + "version" : "2.022" + }, + { + "class" : "Dist::Zilla::Plugin::Git::Commit", + "config" : { + "Dist::Zilla::Plugin::Git::Commit" : { + "add_files_in" : [], + "commit_msg" : "Updated Changes for %{MMMM d, yyyy}d%{ trial}t release of %v", + "time_zone" : "local" + }, + "Dist::Zilla::Role::Git::DirtyFiles" : { + "allow_dirty" : [ + "Changes" + ], + "allow_dirty_match" : [], + "changelog" : "Changes" + }, + "Dist::Zilla::Role::Git::Repo" : { + "repo_root" : "." + } + }, + "name" : "CJM/@Git/Commit", + "version" : "2.022" + }, + { + "class" : "Dist::Zilla::Plugin::Git::Tag", + "config" : { + "Dist::Zilla::Plugin::Git::Tag" : { + "branch" : null, + "signed" : 0, + "tag" : "1.001", + "tag_format" : "%v%t", + "tag_message" : "Tagged %N %v%{ (trial release)}t", + "time_zone" : "local" + }, + "Dist::Zilla::Role::Git::Repo" : { + "repo_root" : "." + } + }, + "name" : "CJM/@Git/Tag", + "version" : "2.022" + }, + { + "class" : "Dist::Zilla::Plugin::Git::Push", + "config" : { + "Dist::Zilla::Plugin::Git::Push" : { + "push_to" : [ + "github master" + ], + "remotes_must_exist" : 1 + }, + "Dist::Zilla::Role::Git::Repo" : { + "repo_root" : "." 
+ } + }, + "name" : "CJM/@Git/Push", + "version" : "2.022" + }, + { + "class" : "Dist::Zilla::Plugin::TestRelease", + "name" : "CJM/TestRelease", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::UploadToCPAN", + "name" : "CJM/UploadToCPAN", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::ArchiveRelease", + "name" : "CJM/ArchiveRelease", + "version" : "4.00" + }, + { + "class" : "Dist::Zilla::Plugin::AutoPrereqs", + "name" : "AutoPrereqs", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":InstallModules", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":IncModules", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":TestFiles", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":ExecFiles", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":ShareFiles", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":MainModule", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":AllFiles", + "version" : "5.009" + }, + { + "class" : "Dist::Zilla::Plugin::FinderCode", + "name" : ":NoFiles", + "version" : "5.009" + } + ], + "zilla" : { + "class" : "Dist::Zilla::Dist::Builder", + "config" : { + "is_trial" : "0" + }, + "version" : "5.009" + } + } +} + diff --git a/META.yml b/META.yml new file mode 100644 index 0000000..d691de8 --- /dev/null +++ b/META.yml @@ -0,0 +1,219 @@ +--- +abstract: 'Open an HTML file with automatic charset detection' +author: + - 'Christopher J. 
Madsen <perl@cjmweb.net>' +build_requires: + File::Temp: 0 + Scalar::Util: 0 + Test::More: 0.88 +configure_requires: + ExtUtils::MakeMaker: 6.30 +dynamic_config: 0 +generated_by: 'Dist::Zilla version 5.009, CPAN::Meta::Converter version 2.120921' +license: perl +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: 1.4 +name: IO-HTML +requires: + Carp: 0 + Encode: 2.10 + Exporter: 5.57 + perl: 5.008 +resources: + repository: git://github.com/madsen/io-html.git +version: 1.001 +x_Dist_Zilla: + perl: + version: 5.018002 + plugins: + - + class: Dist::Zilla::Plugin::VersionFromModule + name: CJM/VersionFromModule + version: 0.08 + - + class: Dist::Zilla::Plugin::GatherDir + name: CJM/GatherDir + version: 5.009 + - + class: Dist::Zilla::Plugin::PruneCruft + name: CJM/PruneCruft + version: 5.009 + - + class: Dist::Zilla::Plugin::ManifestSkip + name: CJM/ManifestSkip + version: 5.009 + - + class: Dist::Zilla::Plugin::MetaJSON + name: CJM/MetaJSON + version: 5.009 + - + class: Dist::Zilla::Plugin::MetaYAML + name: CJM/MetaYAML + version: 5.009 + - + class: Dist::Zilla::Plugin::License + name: CJM/License + version: 5.009 + - + class: Dist::Zilla::Plugin::Test::PrereqsFromMeta + name: CJM/Test::PrereqsFromMeta + version: 4.21 + - + class: Dist::Zilla::Plugin::PodSyntaxTests + name: CJM/PodSyntaxTests + version: 5.009 + - + class: Dist::Zilla::Plugin::PodCoverageTests + name: CJM/PodCoverageTests + version: 5.009 + - + class: Dist::Zilla::Plugin::PodLoom + config: + Pod::Loom version: 0.08 + name: CJM/PodLoom + version: 5.00 + - + class: Dist::Zilla::Plugin::MakeMaker + name: CJM/MakeMaker + version: 5.009 + - + class: Dist::Zilla::Plugin::RunExtraTests + name: CJM/RunExtraTests + version: 0.011 + - + class: Dist::Zilla::Plugin::MetaConfig + name: CJM/MetaConfig + version: 5.009 + - + class: Dist::Zilla::Plugin::MatchManifest + name: CJM/MatchManifest + version: 4.01 + - + class: Dist::Zilla::Plugin::RecommendedPrereqs + name: 
CJM/RecommendedPrereqs + version: 4.21 + - + class: Dist::Zilla::Plugin::CheckPrereqsIndexed + name: CJM/CheckPrereqsIndexed + version: 0.009 + - + class: Dist::Zilla::Plugin::GitVersionCheckCJM + name: CJM/GitVersionCheckCJM + version: 4.13 + - + class: Dist::Zilla::Plugin::TemplateCJM + name: CJM/TemplateCJM + version: 4.22 + - + class: Dist::Zilla::Plugin::Repository + name: CJM/Repository + version: 0.19 + - + class: Dist::Zilla::Plugin::Git::Check + config: + Dist::Zilla::Plugin::Git::Check: + untracked_files: die + Dist::Zilla::Role::Git::DirtyFiles: + allow_dirty: + - Changes + allow_dirty_match: [] + changelog: Changes + Dist::Zilla::Role::Git::Repo: + repo_root: '.' + name: CJM/@Git/Check + version: 2.022 + - + class: Dist::Zilla::Plugin::Git::Commit + config: + Dist::Zilla::Plugin::Git::Commit: + add_files_in: [] + commit_msg: 'Updated Changes for %{MMMM d, yyyy}d%{ trial}t release of %v' + time_zone: local + Dist::Zilla::Role::Git::DirtyFiles: + allow_dirty: + - Changes + allow_dirty_match: [] + changelog: Changes + Dist::Zilla::Role::Git::Repo: + repo_root: '.' + name: CJM/@Git/Commit + version: 2.022 + - + class: Dist::Zilla::Plugin::Git::Tag + config: + Dist::Zilla::Plugin::Git::Tag: + branch: ~ + signed: 0 + tag: 1.001 + tag_format: '%v%t' + tag_message: 'Tagged %N %v%{ (trial release)}t' + time_zone: local + Dist::Zilla::Role::Git::Repo: + repo_root: '.' + name: CJM/@Git/Tag + version: 2.022 + - + class: Dist::Zilla::Plugin::Git::Push + config: + Dist::Zilla::Plugin::Git::Push: + push_to: + - 'github master' + remotes_must_exist: 1 + Dist::Zilla::Role::Git::Repo: + repo_root: '.' 
+ name: CJM/@Git/Push + version: 2.022 + - + class: Dist::Zilla::Plugin::TestRelease + name: CJM/TestRelease + version: 5.009 + - + class: Dist::Zilla::Plugin::UploadToCPAN + name: CJM/UploadToCPAN + version: 5.009 + - + class: Dist::Zilla::Plugin::ArchiveRelease + name: CJM/ArchiveRelease + version: 4.00 + - + class: Dist::Zilla::Plugin::AutoPrereqs + name: AutoPrereqs + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':InstallModules' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':IncModules' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':TestFiles' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':ExecFiles' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':ShareFiles' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':MainModule' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':AllFiles' + version: 5.009 + - + class: Dist::Zilla::Plugin::FinderCode + name: ':NoFiles' + version: 5.009 + zilla: + class: Dist::Zilla::Dist::Builder + config: + is_trial: 0 + version: 5.009 diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..c603a95 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,62 @@ + +# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v5.009. +use strict; +use warnings; + +use 5.008; + +use ExtUtils::MakeMaker 6.30; + + + +my %WriteMakefileArgs = ( + "ABSTRACT" => "Open an HTML file with automatic charset detection", + "AUTHOR" => "Christopher J. 
Madsen <perl\@cjmweb.net>", + "BUILD_REQUIRES" => {}, + "CONFIGURE_REQUIRES" => { + "ExtUtils::MakeMaker" => "6.30" + }, + "DISTNAME" => "IO-HTML", + "EXE_FILES" => [], + "LICENSE" => "perl", + "NAME" => "IO::HTML", + "PREREQ_PM" => { + "Carp" => 0, + "Encode" => "2.10", + "Exporter" => "5.57" + }, + "TEST_REQUIRES" => { + "File::Temp" => 0, + "Scalar::Util" => 0, + "Test::More" => "0.88" + }, + "VERSION" => "1.001", + "test" => { + "TESTS" => "t/*.t" + } +); + + +my %FallbackPrereqs = ( + "Carp" => 0, + "Encode" => "2.10", + "Exporter" => "5.57", + "File::Temp" => 0, + "Scalar::Util" => 0, + "Test::More" => "0.88" +); + + +unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) { + delete $WriteMakefileArgs{TEST_REQUIRES}; + delete $WriteMakefileArgs{BUILD_REQUIRES}; + $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs; +} + +delete $WriteMakefileArgs{CONFIGURE_REQUIRES} + unless eval { ExtUtils::MakeMaker->VERSION(6.52) }; + +WriteMakefile(%WriteMakefileArgs); + + + @@ -0,0 +1,46 @@ +IO-HTML version 1.001, released June 28, 2014 + + +This module opens a file and performs automatic charset detection +based on the HTML5 algorithm. You can then pass the filehandle to +HTML::Parser or a related module (or just read it yourself). + + + +INSTALLATION + +To install this module, run the following commands: + + perl Makefile.PL + make + make test + make install + + + +DEPENDENCIES + + Package Minimum Version + --------- --------------- + perl 5.8.0 + Carp + Encode 2.10 + Exporter 5.57 + + + +CHANGES + Here's what's new in version 1.001 of IO-HTML: + (See the file "Changes" for the full revision history.) + + - No code changes, just documentation improvements + - Update links to the HTML5 draft specification + + + +COPYRIGHT AND LICENSE + +This software is copyright (c) 2014 by Christopher J. Madsen. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. 
diff --git a/lib/IO/HTML.pm b/lib/IO/HTML.pm new file mode 100644 index 0000000..5fdad22 --- /dev/null +++ b/lib/IO/HTML.pm @@ -0,0 +1,575 @@ +#--------------------------------------------------------------------- +package IO::HTML; +# +# Copyright 2014 Christopher J. Madsen +# +# Author: Christopher J. Madsen <perl@cjmweb.net> +# Created: 14 Jan 2012 +# +# This program is free software; you can redistribute it and/or modify +# it under the same terms as Perl itself. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the +# GNU General Public License or the Artistic License for more details. +# +# ABSTRACT: Open an HTML file with automatic charset detection +#--------------------------------------------------------------------- + +use 5.008; +use strict; +use warnings; + +use Carp 'croak'; +use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding +use Exporter 5.57 'import'; + +our $VERSION = '1.001'; +# This file is part of IO-HTML 1.001 (June 28, 2014) + +our $default_encoding ||= 'cp1252'; + +our @EXPORT = qw(html_file); +our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile + sniff_encoding); + +our %EXPORT_TAGS = ( + rw => [qw( html_file html_file_and_encoding html_outfile )], + all => [ @EXPORT, @EXPORT_OK ], +); + +#===================================================================== + + +sub html_file +{ + (&html_file_and_encoding)[0]; # return just the filehandle +} # end html_file + + +# Note: I made html_file and html_file_and_encoding separate functions +# (instead of making html_file context-sensitive) because I wanted to +# use html_file in function calls (i.e. list context) without having +# to write "scalar html_file" all the time. 
# html_file_and_encoding($filename, \%options)
#
# Open $filename for reading in :raw mode, determine its encoding with
# sniff_encoding, and then apply the matching :encoding layer followed
# by :crlf.
#
# Returns the list ($filehandle, $encoding, $bom).  $encoding is an
# encoding name, or an Encode::Encoding object when $options->{encoding}
# is true.  $bom is true if sniff_encoding found a byte order mark.
#
# Croaks if the file cannot be opened, or if no encoding was detected
# and $default_encoding is undef.

sub html_file_and_encoding
{
  my ($filename, $options) = @_;

  $options ||= {};

  open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";


  my ($encoding, $bom) = sniff_encoding($in, $filename, $options);

  if (not defined $encoding) {
    croak "No default encoding specified"
        unless defined($encoding = $default_encoding);
    # sniff_encoding already returns an object when asked to; the
    # default is always a name, so convert it here for consistency.
    $encoding = find_encoding($encoding) if $options->{encoding};
  } # end if we didn't find an encoding

  # The layer specification needs a name, not an object:
  binmode $in, sprintf(":encoding(%s):crlf",
                       $options->{encoding} ? $encoding->name : $encoding);

  return ($in, $encoding, $bom);
} # end html_file_and_encoding
#---------------------------------------------------------------------

# html_outfile($filename, $encoding, $bom)
#
# Open $filename for writing through an :encoding layer and return the
# filehandle.  $encoding may be an encoding name or an object with a
# name method; if it is undef, $default_encoding is used.  If $bom is
# true, U+FEFF is written as the first character.
#
# Croaks if the file cannot be opened, or if $encoding and
# $default_encoding are both undef.

sub html_outfile
{
  my ($filename, $encoding, $bom) = @_;

  if (not defined $encoding) {
    croak "No default encoding specified"
        unless defined($encoding = $default_encoding);
  } # end if we didn't find an encoding
  elsif (ref $encoding) {
    $encoding = $encoding->name;
  }

  open(my $out, ">:encoding($encoding)", $filename)
      or croak "Failed to open $filename: $!";

  print $out "\x{FeFF}" if $bom;

  return $out;
} # end html_outfile
#---------------------------------------------------------------------

# sniff_encoding($in, $filename, \%options)
#
# Examine up to 1024 bytes read from the current position of the
# filehandle $in and try to determine the encoding, in this order:
#   1. a UTF-16BE, UTF-16LE, or UTF-8 byte order mark
#   2. a <meta> charset declaration (see find_charset_in)
#   3. the bytes read are valid UTF-8 containing at least one
#      non-ASCII character
#
# $filename is used only in error messages and defaults to 'file'.
#
# Returns ($encoding, $bom) in list context and just $encoding in
# scalar context.  $encoding is undef if nothing matched; it is an
# object instead of a name when $options->{encoding} is true.  $bom is
# 1 if a BOM was found and undef otherwise.
#
# On return the handle is back at its starting position, or just past
# the BOM if there was one.  Croaks if tell, read, or seek fails.

sub sniff_encoding
{
  my ($in, $filename, $options) = @_;

  $filename = 'file' unless defined $filename;
  $options ||= {};

  my $pos = tell $in;
  croak "Could not seek $filename: $!" if $pos < 0;

  croak "Could not read $filename: $!"
      unless defined read $in, my $buf, 1024;

  seek $in, $pos, 0 or croak "Could not seek $filename: $!";


  # Check for BOM ($bom temporarily holds the BOM's length in bytes):
  my $bom;
  my $encoding = do {
    if ($buf =~ /^\xFe\xFF/) {
      $bom = 2;
      'UTF-16BE';
    } elsif ($buf =~ /^\xFF\xFe/) {
      $bom = 2;
      'UTF-16LE';
    } elsif ($buf =~ /^\xEF\xBB\xBF/) {
      $bom = 3;
      'utf-8-strict';
    } else {
      find_charset_in($buf, $options); # check for <meta charset>
    }
  }; # end $encoding

  if ($bom) {
    # Skip over the BOM, then turn the byte count into a simple flag:
    seek $in, $bom, 1 or croak "Could not seek $filename: $!";
    $bom = 1;
  }
  elsif (not defined $encoding) { # try decoding as UTF-8
    # With FB_QUIET, decode removes the bytes it could decode from $buf
    # and leaves the unprocessed remainder, which is inspected below.
    my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
    if ($buf =~ /^(?:                   # nothing left over
         | [\xC2-\xDF]                  # incomplete 2-byte char
         | [\xE0-\xEF] [\x80-\xBF]?     # incomplete 3-byte char
         | [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
        )\z/x and $test =~ /[^\x00-\x7F]/) {
      $encoding = 'utf-8-strict';
    } # end if valid UTF-8 with at least one multi-byte character:
  } # end if testing for UTF-8

  # find_charset_in may already have returned an object, hence the ref check:
  if (defined $encoding and $options->{encoding} and not ref $encoding) {
    $encoding = find_encoding($encoding);
  } # end if $encoding is a string and we want an object

  return wantarray ? ($encoding, $bom) : $encoding;
} # end sniff_encoding

#=====================================================================
# Based on HTML5 8.2.2.2 Determining the character encoding:

# Get attribute from current position of $_
#
# Operates on $_ and its pos().  Returns nothing when the tag ends
# (at ">") or no further attribute can be matched.  Otherwise returns
# (lowercased name, lowercased value) in list context, or 1 in scalar
# context; the value is '' when the attribute has none.
#
# The statements below must stay in this order: the caller's loop
# terminates at end-of-input only because Perl refuses a second
# zero-length //g match at the same position.
sub _get_attribute
{
  m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /

  # The name may start with "=", and then runs up to whitespace, "=",
  # "/", or ">".  Stopping at "/" and ">" keeps a valueless attribute
  # that is directly followed by ">" (as in <a disabled><meta ...>)
  # from swallowing the tag that comes next.
  return if /\G>/gc or not m{\G(=?[^\x09\x0A\x0C\x0D =/>]*)}gc;

  my ($name, $value) = (lc $1, '');

  # The value may be double-quoted, single-quoted, or unquoted.  A
  # quoted value with no closing quote extends to the end of the input.
  if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc
      and (/\G"([^"]*)"?/gc or
           /\G'([^']*)'?/gc or
           /\G([^\x09\x0A\x0C\x0D >]*)/gc)) {
    $value = lc $1;
  } # end if attribute has value

  return wantarray ? ($name, $value) : 1;
} # end _get_attribute

# Examine a meta value for a charset:
#
# Takes the value of a content attribute and returns the text that
# follows the first usable "charset=" in it: a quoted string, or an
# unquoted run up to whitespace or ";".  Returns undef if there is none.
sub _get_charset_from_meta
{
  for (shift) {
    while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) {
      return $1 if (/\G"([^"]*)"/gc or
                    /\G'([^']*)'/gc or
                    /\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
    }
  } # end for value

  return undef;
} # end _get_charset_from_meta
#---------------------------------------------------------------------

# find_charset_in($string, \%options)
#
# Scan the string for a <meta> tag that declares a charset which
# Encode recognizes.  No tag that starts at or after offset 1024 is
# examined.
#
# Options:
#   encoding    - if true, return the encoding object instead of its name
#   need_pragma - if true (the default), a charset found in a content
#                 attribute counts only when the same tag also has
#                 http-equiv="content-type"
#
# Returns the encoding (name or object), or undef if none was found.

sub find_charset_in
{
  for (shift) {
    my $options = shift || {};
    my $stop = length($_) > 1024 ? 1024 : length($_); # search first 1024 bytes

    my $expect_pragma = (defined $options->{need_pragma}
                         ? $options->{need_pragma} : 1);

    pos() = 0;
    while (pos() < $stop) {
      if (/\G<!--.*?(?<=--)>/sgc) {
      } # Skip comment
      elsif (m!\G<meta(?=[\x09\x0A\x0C\x0D /])!gic) {
        my ($got_pragma, $need_pragma, $charset);

        while (my ($name, $value) = _get_attribute()) {
          if ($name eq 'http-equiv' and $value eq 'content-type') {
            $got_pragma = 1;
          } elsif ($name eq 'content' and not defined $charset) {
            $need_pragma = $expect_pragma
                if defined($charset = _get_charset_from_meta($value));
          } elsif ($name eq 'charset') {
            $charset = $value;
            $need_pragma = 0;
          }
        } # end while more attributes in this <meta> tag

        # $need_pragma is defined only once a charset has been seen:
        if (defined $need_pragma and (not $need_pragma or $got_pragma)) {
          # A UTF-16 declaration inside a <meta> tag is treated as UTF-8:
          $charset = 'UTF-8'  if $charset =~ /^utf-?16/;
          $charset = 'cp1252' if $charset eq 'iso-8859-1'; # people lie
          if (my $encoding = find_encoding($charset)) {
            return $options->{encoding} ? $encoding : $encoding->name;
          } # end if charset is a recognized encoding
        } # end if found charset
      } # end elsif <meta
      elsif (m!\G</?[a-zA-Z][^\x09\x0A\x0C\x0D >]*!gc) {
        1 while _get_attribute();
      } # end elsif some other tag
      elsif (m{\G<[!/?][^>]*}gc) {
      } # skip unwanted things
      elsif (m/\G</gc) {
      } # skip < that doesn't open anything we recognize

      # Advance to the next <:
      m/\G[^<]+/gc;
    } # end while not at search boundary
  } # end for string

  return undef;                 # Couldn't find a charset
} # end find_charset_in
#---------------------------------------------------------------------


# Shortcuts for people who don't like exported functions:
*file               = \&html_file;
*file_and_encoding  = \&html_file_and_encoding;
*outfile            = \&html_outfile;

#=====================================================================
# Package Return Value:

1;

__END__

=head1 NAME

IO::HTML - Open an HTML file with automatic charset detection

=head1 VERSION

This document describes version 1.001 of
IO::HTML, released June 28, 2014.

=head1 SYNOPSIS

  use IO::HTML;                 # exports html_file by default
  use HTML::TreeBuilder;

  my $tree = HTML::TreeBuilder->new_from_file(
               html_file('foo.html')
             );

  # Alternative interface:
  open(my $in, '<:raw', 'bar.html');
  my $encoding = IO::HTML::sniff_encoding($in, 'bar.html');

=head1 DESCRIPTION

IO::HTML provides an easy way to open a file containing HTML while
automatically determining its encoding.  It uses the HTML5 encoding
sniffing algorithm specified in section 8.2.2.2 of the draft standard.

The algorithm as implemented here is:

=over

=item 1.

If the file begins with a byte order mark indicating UTF-16LE,
UTF-16BE, or UTF-8, then that is the encoding.

=item 2.

If the first 1024 bytes of the file contain a C<< <meta> >> tag that
indicates the charset, and Encode recognizes the specified charset
name, then that is the encoding.
(This portion of the algorithm is +implemented by C<find_charset_in>.) + +The C<< <meta> >> tag can be in one of two formats: + + <meta charset="..."> + <meta http-equiv="Content-Type" content="...charset=..."> + +The search is case-insensitive, and the order of attributes within the +tag is irrelevant. Any additional attributes of the tag are ignored. +The first matching tag with a recognized encoding ends the search. + +=item 3. + +If the first 1024 bytes of the file are valid UTF-8 (with at least 1 +non-ASCII character), then the encoding is UTF-8. + +=item 4. + +If all else fails, use the default character encoding. The HTML5 +standard suggests the default encoding should be locale dependent, but +currently it is always C<cp1252> unless you set +C<$IO::HTML::default_encoding> to a different value. Note: +C<sniff_encoding> does not apply this step; only C<html_file> does +that. + +=back + +=head1 SUBROUTINES + +=head2 html_file + + $filehandle = html_file($filename, \%options); + +This function (exported by default) is the primary entry point. It +opens the file specified by C<$filename> for reading, uses +C<sniff_encoding> to find a suitable encoding layer, and applies it. +It also applies the C<:crlf> layer. If the file begins with a BOM, +the filehandle is positioned just after the BOM. + +The optional second argument is a hashref containing options. The +possible keys are described under C<find_charset_in>. + +If C<sniff_encoding> is unable to determine the encoding, it defaults +to C<$IO::HTML::default_encoding>, which is set to C<cp1252> +(a.k.a. Windows-1252) by default. According to the standard, the +default should be locale dependent, but that is not currently +implemented. + +It dies if the file cannot be opened. + + +=head2 html_file_and_encoding + + ($filehandle, $encoding, $bom) + = html_file_and_encoding($filename, \%options); + +This function (exported only by request) is just like C<html_file>, +but returns more information. 
In addition to the filehandle, it +returns the name of the encoding used, and a flag indicating whether a +byte order mark was found (if C<$bom> is true, the file began with a +BOM). This may be useful if you want to write the file out again +(especially in conjunction with the C<html_outfile> function). + +The optional second argument is a hashref containing options. The +possible keys are described under C<find_charset_in>. + +It dies if the file cannot be opened. The result of calling it in +scalar context is undefined. + + +=head2 html_outfile + + $filehandle = html_outfile($filename, $encoding, $bom); + +This function (exported only by request) opens C<$filename> for output +using C<$encoding>, and writes a BOM to it if C<$bom> is true. +If C<$encoding> is C<undef>, it defaults to C<$IO::HTML::default_encoding>. +C<$encoding> may be either an encoding name or an Encode::Encoding object. + +It dies if the file cannot be opened. + + +=head2 sniff_encoding + + ($encoding, $bom) = sniff_encoding($filehandle, $filename, \%options); + +This function (exported only by request) runs the HTML5 encoding +sniffing algorithm on C<$filehandle> (which must be seekable, and +should have been opened in C<:raw> mode). C<$filename> is used only +for error messages (if there's a problem using the filehandle), and +defaults to "file" if omitted. The optional third argument is a +hashref containing options. The possible keys are described under +C<find_charset_in>. + +It returns Perl's canonical name for the encoding, which is not +necessarily the same as the MIME or IANA charset name. It returns +C<undef> if the encoding cannot be determined. C<$bom> is true if the +file began with a byte order mark. In scalar context, it returns only +C<$encoding>. + +The filehandle's position is restored to its original position +(normally the beginning of the file) unless C<$bom> is true. In that +case, the position is immediately after the BOM. 
+ +Tip: If you want to run C<sniff_encoding> on a file you've already +loaded into a string, open an in-memory file on the string, and pass +that handle: + + ($encoding, $bom) = do { + open(my $fh, '<', \$string); sniff_encoding($fh) + }; + +(This only makes sense if C<$string> contains bytes, not characters.) + + +=head2 find_charset_in + + $encoding = find_charset_in($string_containing_HTML, \%options); + +This function (exported only by request) looks for charset information +in a C<< <meta> >> tag in a possibly incomplete HTML document using +the "two step" algorithm specified by HTML5. It does not look for a BOM. +Only the first 1024 bytes of the string are checked. + +It returns Perl's canonical name for the encoding, which is not +necessarily the same as the MIME or IANA charset name. It returns +C<undef> if no charset is specified or if the specified charset is not +recognized by the Encode module. + +The optional second argument is a hashref containing options. The +following keys are recognized: + +=over + +=item C<encoding> + +If true, return the L<Encode::Encoding> object instead of its name. +Defaults to false. + +=item C<need_pragma> + +If true (the default), follow the HTML5 spec and examine the +C<content> attribute only of C<< <meta http-equiv="Content-Type" >>. +If set to 0, relax the HTML5 spec, and look for "charset=" in the +C<content> attribute of I<every> meta tag. + +=back + +=head1 EXPORTS + +By default, only C<html_file> is exported. Other functions may be +exported on request. + +For people who prefer not to export functions, all functions beginning +with C<html_> have an alias without that prefix (e.g. you can call +C<IO::HTML::file(...)> instead of C<IO::HTML::html_file(...)>). These +aliases are not exportable. + +=for Pod::Coverage +file +file_and_encoding +outfile + +The following export tags are available: + +=over + +=item C<:all> + +All exportable functions.
+ +=item C<:rw> + +C<html_file>, C<html_file_and_encoding>, C<html_outfile>. + +=back + +=head1 SEE ALSO + +The HTML5 specification, section 8.2.2.2 Determining the character encoding: +L<http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding> + +=head1 DIAGNOSTICS + +=over + +=item C<< Could not read %s: %s >> + +The specified file could not be read from for the reason specified by C<$!>. + + +=item C<< Could not seek %s: %s >> + +The specified file could not be rewound for the reason specified by C<$!>. + + +=item C<< Failed to open %s: %s >> + +The specified file could not be opened for reading for the reason +specified by C<$!>. + + +=item C<< No default encoding specified >> + +The C<sniff_encoding> algorithm didn't find an encoding to use, and +you set C<$IO::HTML::default_encoding> to C<undef>. + + +=back + +=head1 CONFIGURATION AND ENVIRONMENT + +IO::HTML requires no configuration files or environment variables. + +=head1 DEPENDENCIES + +IO::HTML has no non-core dependencies for Perl 5.8.7+. With earlier +versions of Perl 5.8, you need to upgrade L<Encode> to at least +version 2.10, and +you may need to upgrade L<Exporter> to at least version +5.57. + +=head1 INCOMPATIBILITIES + +None reported. + +=head1 BUGS AND LIMITATIONS + +No bugs have been reported. + +=head1 AUTHOR + +Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>> + +Please report any bugs or feature requests +to S<C<< <bug-IO-HTML AT rt.cpan.org> >>> +or through the web interface at +L<< http://rt.cpan.org/Public/Bug/Report.html?Queue=IO-HTML >>. + +You can follow or contribute to IO-HTML's development at +L<< https://github.com/madsen/io-html >>. + +=head1 COPYRIGHT AND LICENSE + +This software is copyright (c) 2014 by Christopher J. Madsen. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. 
+ +=head1 DISCLAIMER OF WARRANTY + +BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER +EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE +ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH +YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL +NECESSARY SERVICING, REPAIR, OR CORRECTION. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENSE, BE +LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, +OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE +THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + +=cut diff --git a/t/00-all_prereqs.t b/t/00-all_prereqs.t new file mode 100644 index 0000000..668f447 --- /dev/null +++ b/t/00-all_prereqs.t @@ -0,0 +1,95 @@ +#!perl + +use strict; +use warnings; + +# This doesn't use Test::More because I don't want to clutter %INC +# with modules that aren't prerequisites. + +my $test = 0; + +sub ok ($$) +{ + my ($ok, $name) = @_; + + printf "%sok %d - %s\n", ($ok ? '' : 'not '), ++$test, $name; + + return $ok; +} # end ok + +END { + ok(0, 'unknown failure') unless $test; + print "1..$test\n"; +} + +sub get_version +{ + my ($package) = @_; + + local $@; + my $version = eval { $package->VERSION }; + + defined $version ? 
$version : 'undef'; +} # end get_version + +TEST: { + ok(open(META, '<META.json'), 'opened META.json') or last TEST; + + while (<META>) { + last if /^\s*"prereqs" : \{\s*\z/; + } # end while <META> + + ok(defined $_, 'found prereqs') or last TEST; + + while (<META>) { + last if /^\s*\},?\s*\z/; + ok(/^\s*"(.+)" : \{\s*\z/, "found phase $1") or last TEST; + my $phase = $1; + + while (<META>) { + last if /^\s*\},?\s*\z/; + next if /^\s*"[^"]+"\s*:\s*\{\s*\},?\s*\z/; + ok(/^\s*"(.+)" : \{\s*\z/, "found relationship $phase $1") or last TEST; + my $rel = $1; + + while (<META>) { + last if /^\s*\},?\s*\z/; + ok(/^\s*"([^"]+)"\s*:\s*(\S+?),?\s*\z/, "found prereq $1") + or last TEST; + my ($prereq, $version) = ($1, $2); + + next if $phase ne 'runtime' or $prereq eq 'perl'; + + # Need a special case for if.pm, because "require if;" is a syntax error. + my $loaded = ($prereq eq 'if') + ? eval "require '$prereq.pm'; '$prereq'->VERSION($version); 1" + : eval "require $prereq; $prereq->VERSION($version); 1"; + if ($rel eq 'requires') { + ok($loaded, "loaded $prereq $version") + or printf STDERR "\n# Got: %s %s\n# Wanted: %s %s\n", + $prereq, get_version($prereq), $prereq, $version; + } else { + ok(1, ($loaded ? 'loaded' : 'failed to load') . " $prereq $version"); + } + } # end while <META> in prerequisites + } # end while <META> in relationship + } # end while <META> in phase + + close META; + + # Print version of all loaded modules: + if ($ENV{AUTOMATED_TESTING}) { + print STDERR "# Listing %INC\n"; + + my @packages = grep { s/\.pm\Z// and do { s![\\/]!::!g; 1 } } sort keys %INC; + + my $len = 0; + for (@packages) { $len = length if length > $len } + $len = 68 if $len > 68; + + for my $package (@packages) { + printf STDERR "# %${len}s %s\n", $package, get_version($package); + } + } # end if AUTOMATED_TESTING +} # end TEST + diff --git a/t/00-load.t b/t/00-load.t new file mode 100644 index 0000000..d72f2a0 --- /dev/null +++ b/t/00-load.t @@ -0,0 +1,10 @@ +#! 
/usr/bin/perl +#--------------------------------------------------------------------- + +use Test::More tests => 1; + +BEGIN { + use_ok('IO::HTML'); +} + +diag("Testing IO::HTML $IO::HTML::VERSION"); diff --git a/t/10-find.t b/t/10-find.t new file mode 100644 index 0000000..ef32b1f --- /dev/null +++ b/t/10-find.t @@ -0,0 +1,133 @@ +#! /usr/bin/perl +#--------------------------------------------------------------------- +# 10-find.t +# Copyright 2012 Christopher J. Madsen +# +# Test the find_charset_in function +#--------------------------------------------------------------------- + +use strict; +use warnings; + +use Test::More 0.88; # done_testing +use Scalar::Util 'blessed'; + +use IO::HTML 'find_charset_in'; + +plan tests => 23; + +sub test +{ + my $charset = shift; + my @data = shift; + push @data, shift if ref $_[0]; # options for find_charset_in + my $name = shift; + + local $Test::Builder::Level = $Test::Builder::Level + 1; + + is(scalar find_charset_in(@data), $charset, $name); +} # end test + +#--------------------------------------------------------------------- +test 'utf-8-strict' => <<''; +<meta charset="UTF-8"> + +test 'utf-8-strict' => <<''; +<!-- UTF-16 is recognized only with a BOM --> +<meta charset="UTF-16BE"> + +test 'iso-8859-15' => <<''; +<meta charset ="ISO-8859-15"> + +test 'iso-8859-15' => <<''; +<meta charset= "ISO-8859-15"> + +test 'iso-8859-15' => <<''; +<meta charset = + "ISO-8859-15"> + +test 'utf-8-strict' => <<''; +<meta foo=bar some=" charset = + "ISO-8859-15"> +<meta charset="UTF-8"> + +test 'cp1252' => <<''; +<meta charset="Windows-1252"> + +test undef, <<'', 'misspelled charset'; +<meta charseat="Windows-1252"> + +test 'utf-8-strict' => <<''; +<meta charset="UTF-8"> +<meta charset="Windows-1252"> +<meta charseat="Windows-1252"> + +test 'cp1252' => <<''; +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" /> +<title>Title</title> + +test 'iso-8859-15' => <<''; +<html> +<head><!-- somebody 
forgot the quotes --> +<meta http-equiv=Content-Type content=text/html; charset=ISO-8859-15 /> +<title>Title</title> + +test 'iso-8859-15' => <<''; +<html> +<head><!-- somebody forgot the quotes --> +<meta http-equiv +=Content-Type content=text/html; charset=ISO-8859-15 /> +<title>Title</title> + +test 'iso-8859-15' => <<''; +<html> +<head><!-- different order --> +<meta content=text/html; charset=ISO-8859-15 http-equiv=Content-Type> +<title>Title</title> + +test 'cp1252' => <<''; +<html> +<head> +<meta content="text/html;charset=ISO-8859-1" http-equiv=Content-Type> +<title>Title</title> + +test undef, <<'', 'incomplete attribute'; +<html> +<foo href="c06. + +test 'iso-8859-15' => <<'', 'short comment'; +<!--><meta charset="ISO-8859-15">--> + +test 'iso-8859-15' => <<'', 'strange comment'; +<!---><meta charset="ISO-8859-15">--> + +test undef, <<'', 'inside comment'; +<!-- ><meta charset="ISO-8859-15">--> + +test undef, <<'', 'wrong pragma'; +<html> +<head> +<meta http-equiv="X-Content-Type" content="text/html; charset=UTF-8" /> +<title>Title</title> + +test 'utf-8-strict', <<'', {need_pragma => 0}, 'need_pragma 0'; +<html> +<head> +<meta http-equiv="X-Content-Type" content="text/html; charset=UTF-8" /> +<title>Title</title> + +test 'iso-8859-15' => <<'', 'bogus encoding'; +<meta charset="Totally-Bogus-Encoding-That-Doesnt-Exist"> +<meta charset=ISO-8859-15> + +{ + my $encoding = find_charset_in('<meta charset="UTF-8">', { encoding => 1 }); + + ok(blessed($encoding), 'encoding is an object'); + + is(eval { $encoding->name }, 'utf-8-strict', 'encoding is UTF-8'); +} + +done_testing; diff --git a/t/20-open.t b/t/20-open.t new file mode 100644 index 0000000..64d6900 --- /dev/null +++ b/t/20-open.t @@ -0,0 +1,150 @@ +#! /usr/bin/perl +#--------------------------------------------------------------------- +# 20-open.t +# Copyright 2012 Christopher J. 
Madsen +# +# Actually open files and check the encoding +#--------------------------------------------------------------------- + +use strict; +use warnings; + +use Test::More 0.88; + +plan tests => 85; + +use IO::HTML; +use File::Temp; +use Scalar::Util 'blessed'; + +#--------------------------------------------------------------------- +sub test +{ + my ($expected, $out, $data, $name, $nextArg) = @_; + + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my $options; + if (ref $name) { + $options = $name; + $name = $nextArg; + } + + unless ($name) { + $name = 'test ' . ($expected || 'cp1252'); + } + + my $tmp = File::Temp->new(UNLINK => 1); + open(my $mem, '>', \(my $buf)) or die; + + if ($out) { + $out = ":encoding($out)" unless $out =~ /^:/; + binmode $tmp, $out; + binmode $mem, $out; + } + + print $mem $data; + print $tmp $data; + close $mem; + $tmp->close; + + my ($fh, $encoding, $bom) = IO::HTML::file_and_encoding("$tmp", $options); + + if ($options and $options->{encoding}) { + ok(blessed($encoding), 'returned an object'); + + $encoding = eval { $encoding->name }; + } + + is($encoding, $expected || 'cp1252', $name); + + my $firstLine = <$fh>; + like($firstLine, qr/^<html/i); + + close $fh; + + $fh = html_file("$tmp", $options); + + is(<$fh>, $firstLine); + + close $fh; + + # Test sniff_encoding: + undef $mem; + open($mem, '<', \$buf) or die "Can't open in-memory file: $!"; + + delete $options->{encoding} if $options; + + ($encoding, $bom) = IO::HTML::sniff_encoding($mem, undef, $options); + + is($encoding, $expected); + + seek $mem, 0, 0; + + $options->{encoding} = 1; + + ($encoding, $bom) = IO::HTML::sniff_encoding($mem, undef, $options); + + if (defined $expected) { + ok(blessed($encoding), 'encoding is an object'); + + is(eval { $encoding->name }, $expected); + } else { + is($encoding, undef); + } +} # end test + +#--------------------------------------------------------------------- +test 'utf-8-strict' => '' => <<''; +<html><meta 
charset="UTF-8"> + +test 'utf-8-strict' => ':utf8' => <<""; +<html><head><title>Foo\xA0Bar</title> + +test undef, latin1 => <<""; +<html><head><title>Foo\xA0Bar</title> + +test 'UTF-16BE' => 'UTF-16BE' => <<""; +\x{FeFF}<html><head><title>Foo\xA0Bar</title> + +test 'utf-8-strict' => ':utf8' => <<""; +\x{FeFF}<html><meta charset="UTF-16"> + +test 'utf-8-strict' => ':utf8' => <<""; +<html><meta charset="UTF-16BE"> + +test 'UTF-16LE' => 'UTF-16LE' => <<""; +\x{FeFF}<html><meta charset="UTF-16"> + +test 'UTF-16LE' => 'UTF-16LE' => <<"", { encoding => 1 }; +\x{FeFF}<html><meta charset="UTF-16"> + +test 'utf-8-strict' => ':utf8' => <<"", { encoding => 1, need_pragma => 0 }; +<html><meta charset="UTF-16BE"> + +test 'utf-8-strict' => ':utf8' => + "<html><title>Foo\xA0Bar" . ("\x{2014}" x 512) . "</title>\n", + 'UTF-8 character crosses boundary'; + +test 'utf-8-strict' => ':utf8' => + "<html><title>Foo Bar" . ("\x{2014}" x 512) . "</title>\n", + 'UTF-8 character crosses boundary 2'; + +test undef, '', <<'', 'wrong pragma'; +<html> +<head> +<meta http-equiv="X-Content-Type" content="text/html; charset=UTF-8" /> +<title>Title</title> + +test 'utf-8-strict', '', <<'', {need_pragma => 0}, 'need_pragma 0'; +<html> +<head> +<meta http-equiv="X-Content-Type" content="text/html; charset=UTF-8" /> +<title>Title</title> + +test 'iso-8859-15', '', <<"", { encoding => 1, need_pragma => 0 }; +<html> +<meta content="text/html; charset=ISO-8859-15"> +<meta charset="UTF-16BE"> + +done_testing; diff --git a/t/30-outfile.t b/t/30-outfile.t new file mode 100644 index 0000000..64e744b --- /dev/null +++ b/t/30-outfile.t @@ -0,0 +1,61 @@ +#! /usr/bin/perl +#--------------------------------------------------------------------- +# 20-open.t +# Copyright 2012 Christopher J. 
Madsen +# +# Test the html_outfile function +#--------------------------------------------------------------------- + +use strict; +use warnings; + +use Test::More 0.88; + +plan tests => 6; + +use IO::HTML ':rw'; +use Encode 'find_encoding'; +use File::Temp; + +#--------------------------------------------------------------------- +sub test +{ + my ($encoding, $bom, $expected) = @_; + + my $name = ref $encoding ? $encoding->name . " object" : $encoding; + $name .= ($bom ? ' with BOM' : ' without BOM') if defined $bom; + + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my $tmp = File::Temp->new(UNLINK => 1); + $tmp->close; + + my $fh = html_outfile("$tmp", $encoding, $bom); + + print $fh "\xA0\x{2014}"; + + close $fh; + + open(my $in, '<:raw', "$tmp") or die $!; + + my $got = do { local $/; <$in> }; + + close $in; + + is(unpack('H*', $got), $expected, $name); +} # end test + +#--------------------------------------------------------------------- +test 'utf-8-strict', 0, 'c2a0e28094'; + +test 'utf-8-strict', 1, 'efbbbfc2a0e28094'; + +test cp1252 => undef, 'a097'; + +test 'UTF-16BE', 1, 'feff00a02014'; + +test 'UTF-16LE', 1, 'fffea0001420'; + +test find_encoding('UTF-8'), 0, 'c2a0e28094'; + +done_testing; diff --git a/xt/release/pod-coverage.t b/xt/release/pod-coverage.t new file mode 100644 index 0000000..9d5f1b4 --- /dev/null +++ b/xt/release/pod-coverage.t @@ -0,0 +1,14 @@ +#!perl +# This file was automatically generated by Dist::Zilla::Plugin::PodCoverageTests. 
+ +use Test::More; + +eval "use Test::Pod::Coverage 1.08"; +plan skip_all => "Test::Pod::Coverage 1.08 required for testing POD coverage" + if $@; + +eval "use Pod::Coverage::TrustPod"; +plan skip_all => "Pod::Coverage::TrustPod required for testing POD coverage" + if $@; + +all_pod_coverage_ok({ coverage_class => 'Pod::Coverage::TrustPod' }); diff --git a/xt/release/pod-syntax.t b/xt/release/pod-syntax.t new file mode 100644 index 0000000..3a69ee9 --- /dev/null +++ b/xt/release/pod-syntax.t @@ -0,0 +1,8 @@ +#!perl +# This file was automatically generated by Dist::Zilla::Plugin::PodSyntaxTests. +use Test::More; + +eval "use Test::Pod 1.41"; +plan skip_all => "Test::Pod 1.41 required for testing POD" if $@; + +all_pod_files_ok(); |