path: root/scripts
authorLukas Larsson <lukas@erlang.org>2020-03-26 12:48:50 +0100
committerLukas Larsson <lukas@erlang.org>2020-04-17 10:28:27 +0200
commitd9c9864b83b47e4695e95692531b7fe98f04e213 (patch)
tree31d466fce830ef6af3af302e2efb9d4af31c11e5 /scripts
parent8e83be4fad802c2c39017cc606d3c34651ca39e1 (diff)
download erlang-d9c9864b83b47e4695e95692531b7fe98f04e213.tar.gz
docgen: Add html link-check script
Diffstat (limited to 'scripts')
-rwxr-xr-x  scripts/otp_html_check  533
1 file changed, 533 insertions, 0 deletions
diff --git a/scripts/otp_html_check b/scripts/otp_html_check
new file mode 100755
index 0000000000..abe6245ad3
--- /dev/null
+++ b/scripts/otp_html_check
@@ -0,0 +1,533 @@
+#!/usr/bin/perl -w
+
+###########################################################################
+#
+# Find broken links and files not referenced.
+#
+# Author: Kent Boortz <kent@erix.ericsson.se>
+#
+###########################################################################
+
+use File::Find;
+use strict;
+
+undef $/; # No record separator; read whole files in one go
+
+###########################################################################
+#
+# When we talk about "a page" we mean the actual page/file.
+# When we talk about "a link" we mean a reference to a page/file.
+# All links/URLs start with a slash, except the top link, which is
+# the empty string.
+#
+# So basically we have a set of links and a set of URLs to pages, and
+# we check that each combination is valid.
+#
+###########################################################################
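+# For illustration (hypothetical paths): with the base directory
+# "/test/r7a", the page file "/test/r7a/doc/index.html" is known by
+# the link/URL "/doc/index.html", and the base directory itself by
+# the empty string "".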
+
+my $debug = 1;
+my $expand_url = 0; # If we are to expand a URL with default
+ # names like "index.html"
+my @indexes = # The order to try URL expansion
+ (
+ "index.shtml",
+ "index.html",
+ "index.htm",
+ );
+
+my $html_ext = 'shtml|html|htm'; # HTML pages end in these
+
+my @links; # Set of [page,link] pairs we want to check
+my @exclude; # Pages/dirs/prefixes to exclude
+my %pages; # Set of all files found in the file system,
+ # limited by the script arguments.
+ # After the spider is done, all members of the
+ # set that have the value 1 were visited.
+
+my %missing; # Pages not found "$page$;$link"
+my %invalid; # After expansion it is invalid
+my %access; # Exists but can't be opened
+
+my %anchor_refs; # Absolute links including anchor part
+my %anchor_defs; # <a name="..."> in the form "$page#$anchor"
+
+###########################################################################
+#
+# Argument processing, see usage() function below
+#
+###########################################################################
+
+@ARGV or usage("No base directory given");
+my $base = shift @ARGV;
+-d $base or usage("Not a directory: $base");
+$base =~ m&^/& or usage("Has to be an absolute path: $base");
+$base =~ s&/+$&&; # Remove ending slash if any
+
+my $link;
+while ($link = shift @ARGV) {
+ last if $link eq '--';
+ $link =~ s&/+$&&; # Remove ending slash if any
+ $link =~ s&^$base/?&&; # Strip the base directory prefix if given
+ $link =~ m&^/& and usage("Invalid start point of HTML tree \"$link\"");
+ $link = "/$link";
+ push(@links,["",$link]);
+}
+
+while ($link = shift @ARGV) {
+ $link =~ s&/+$&&; # Remove ending slash if any
+ $link =~ s&^$base/?&&; # Strip the base directory prefix if given
+ $link =~ m&^/& and usage("Invalid exclude URL \"$link\"");
+ $link = "/$link";
+ push(@exclude,$link);
+}
+
+# OTP specific
+
+push(@links,["","/doc/index.html"]) unless @links;
+
+###########################################################################
+#
+# Traverse all files and directories and put all possible URLs into
+# the set %pages. When we later find a reference to a page, that URL
+# is marked as visited in the set. When we have followed all links,
+# the unmarked entries are the pages that were never visited.
+#
+# We skip files and directories in @exclude.
+#
+###########################################################################
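+# As a hypothetical example: with $base = "/test/r7a", the file
+# /test/r7a/doc/index.html is stored as the key "/doc/index.html"
+# with the value 0 (not yet visited).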
+
+find(\&wanted,$base);
+
+sub wanted {
+ return unless -f;
+ return if /^\.info\./;
+ return if /~$/;
+
+ my $url = $File::Find::name;
+ $url =~ s&$base&&;
+ $pages{$url} = 0 unless grep {$url =~ m&^$_&} @exclude;
+}
+
+
+###########################################################################
+#
+# Spider that follows all links, adding new links to the @links set.
+#
+# @links holds expanded, normalized links.
+#
+# We check that there is a valid URL for each link.
+# @links may contain links that look bad; they are cleaned up here
+# before being checked.
+#
+###########################################################################
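+# Each element of @links is a [page,link] pair, e.g. the seed entry
+# ["","/doc/index.html"], where the first field names the referring
+# page (the top link is the empty string).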
+
+while (@links) {
+ my $page_and_link = shift @links;
+ my ($page,$link) = @$page_and_link;
+
+ # We skip some links directly
+
+ next if $link =~ /^\w{3,10}:/i;
+ next if $link =~ /cgi-bin|cgiwrap|user-cgi/;
+ next if $link =~ /^(?:and|or)$/;
+# next if $link eq "";
+
+# print STDERR "1 link: $link\n";
+
+ $link = expand_link($link,\%pages) if $expand_url;
+
+ unless (exists $pages{$link}) {
+ # No page for this link, record it as missing
+ $missing{"$page$;$link"} = 1;
+ next;
+ }
+
+# print STDERR "2 link: $link\n";
+
+ next if $pages{$link}; # If == 1 it is visited
+ $pages{$link} = 1; # Mark as visited
+
+# print STDERR "3 link: $link\n";
+
+# next unless $link =~ /\.(shtml|html|htm)$/oi;
+ next unless $link =~ /\.($html_ext)$/oi;
+
+ push(@links,get_page_links($base,$link));
+}
+
+
+###########################################################################
+#
+# Read the page and get all its links. We know that the URL for the
+# page is absolute and that the page/file exists.
+#
+###########################################################################
+
+sub get_page_links {
+ my $base = shift;
+ my $page = shift; # Absolute URL
+
+# print STDERR "open: $page\n";
+
+ my $path = "$base$page";
+
+ unless (open(HTML,$path)) {
+ print STDERR "INTERNAL ERROR: Can't open page $page: $!\n";
+ return ();
+ }
+
+ my $html = <HTML>;
+ close HTML;
+
+# my $url_base = $page;
+# $url_base =~ s&/[^/]+$&&;
+
+ # Remove comments
+ $html =~ s/\<\!\-\-\s*(.*?)\s*\-\-\>//gs;
+
+# # Remove comments and expand SSI
+# $html =~ s/\<\!\-\-\s*(.*?)\s*\-\-\>/
+# expand_ssi($url_base,$page,$1)/gsie;
+
+ my @links; # Links in this document
+# push(@links,$html =~ /\/\*URL\*\/\s*\'([^\']+\.[^\']+)\'/gsi);
+# push(@links,$html =~ /=\s*\'([^\']+\.(?:gif|jpg|jpeg))\'/gsi);
+# push(@links,$html =~ /option value=\s*\"(\/[^\"]+)\"/gsi);
+# push(@links,$html =~ /option value=\s*\"([^\"]+\.[^\"]+)\"/gsi);
+# FIXME: This is not working....
+# push(@links,$html =~ /url\s*=\s*([\w-\.\/]+)/gsi);
+# push(@links,$html =~ /\"([^\"]+\.html)\"/gsi);
+
+ # Find real HTML links
+ push(@links,$html =~ /\<\s*\w[^\>]*\sHREF=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+ push(@links,$html =~ /\<\s*\w[^\>]*\sSRC=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+ push(@links,$html =~ /\<\s*\w[^\>]*\sLOWSRC=\s*\"([^\"]*)\"[^\>]*\>/gsi);
+ push(@links,$html =~ /\<\s*\w[^\>]*\sBACKGROUND=\s*\"([^\"]*)\"[^\>]*\>/gsi);
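+ # For illustration, given markup like
+ # <a href="design/intro.html"> or <img src="img/logo.gif">
+ # the patterns above capture "design/intro.html" and
+ # "img/logo.gif" (attribute values must be double quoted).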
+
+ # FIXME: Now we have the raw links, if we want to complain about
+ # spaces etc this is the time.
+
+ # Remove references to the same page FIXME??? Was removed, why...
+# @links = grep {$_ and $_ !~ /^\#/} @links;
+
+ # Find the URL to the current directory
+ my $rpath = $page;
+ $rpath =~ s&/[^/]+$&&; # Remove name
+
+ # Links pointing to the same page
+ # should look the same
+ map {$_ = normalize_link($page,$rpath,$_)} @links;
+
+# print "XXX $page\n" if grep {m&lib/asn1-1.3.2/doc/index\.html&} @links;
+
+ map {$_ = [$page,$_]} @links; # Add which page was referencing it
+
+ # Find the anchors
+
+ my @anchors =
+ ($html =~ m/
+ <
+ \s*
+ A
+ [^>]*
+ \s (?: NAME|ID) \s* = \s*
+ (?: \"([^\"]*)\" | \'([^\']*)\' | ([^>\s]+) )
+ [^>]*
+ >
+ /gsix);
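+ # For illustration, tags like <a name="intro">, <A ID='top'> or
+ # <a name=start> all match above, capturing the anchor name in
+ # one of the three quoting alternatives.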
+
+ foreach my $anchor (@anchors) {
+ # FIXME if already there, duplicate
+ next unless defined $anchor;
+ $anchor =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+ $anchor =~ s/&lt;/</g; #
+ $anchor =~ s/&gt;/>/g; #
+ $anchor_defs{"$page#$anchor"} = 1;
+ }
+
+ return @links;
+}
+
+
+# -------------------------------------------------------------------------
+# -------------------------------------------------------------------------
+
+sub normalize_link {
+ my $page = shift; # Page where we found this link
+ my $rpath = shift; # URL to directory where we found this link
+ my $link = shift; # The link to normalize
+
+# print STDERR "\n";
+# print STDERR "1 normalize_link: $link\n";
+
+ # Handle javascript:erlhref() specially to be able to check those links.
+ if ($link =~ /^javascript:erlhref\(([^\)]*)\);$/) {
+ my($up,$part,$mod) = split(/,\s*/, $1);
+ $up =~ tr/\'//d;
+ $part =~ tr/\'//d;
+ $mod =~ tr/\'//d;
+ my $dir;
+ if ($part =~ m&^[a-z]+/&) {
+ $dir = "$base$rpath/${up}/$part";
+ } else {
+ my $path = "$base$rpath/${up}lib/$part";
+ ($dir) = <$path-*>;
+ return $link unless defined $dir;
+ $dir .= "/doc/html";
+ }
+ $dir =~ s&^$base&&o;
+ $link = "$dir/$mod";
+ }
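+ # E.g. a hypothetical link "javascript:erlhref('../../../',
+ # 'stdlib', 'lists.html');" is resolved via the glob above to
+ # something like ".../lib/stdlib-<vsn>/doc/html/lists.html",
+ # relative to the current page.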
+
+ return $link if $link =~ /^\w{3,10}:/i; # mailto: http: .....
+ return $link if $link =~ /\?/i; # Contains arguments to CGI
+
+ $link =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+
+ if ($link eq "") {
+ # The empty link is a reference to URL directory
+ return $rpath;
+ } elsif ($link =~ /^#(.*)$/) {
+ # Local reference to anchor
+ my $anchor = $1;
+ $anchor =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+ $anchor =~ s/&lt;/</g; #
+ $anchor =~ s/&gt;/>/g; #
+ push(@{$anchor_refs{"$page#$anchor"}}, $page);
+ return $page;
+ }
+
+ my $anchor = "";
+
+ if ($link =~ s&#(.*)$&&) {
+ # Strip off the page ref (anchor)
+ $anchor = $1;
+ $anchor =~ s/%([\da-fA-F]{2})/chr(hex($1))/eg; # Translate hex to char
+ $anchor =~ s/&lt;/</g; #
+ $anchor =~ s/&gt;/>/g; #
+ }
+
+ $link = "" if $link eq "/";
+
+ # Make the link absolute
+ # FIXME: maybe move down.....
+
+ if ($link !~ m&^/&) {
+ if ($link) {
+ $link = "$rpath/$link";
+ } else {
+ $link = $rpath;
+ }
+ }
+
+ my $xlink = $link;
+
+ $link =~ s&//+&/&g; # Replace multiple slashes with one slash
+# $link =~ s&^(\./)+&&g; # Remove starting dot slash "./" (can't be if absolute)
+ $link =~ s&(/\.)+$&&; # Remove ending slash dot "/."
+ $link =~ s&(/\.)+/&/&g; # Remove all slash dot slash "/./"
+ $link =~ s&/+$&&; # Remove ending slashes
+
+ # Remove a real directory part followed by ".."
+
+ while ($link =~ s&/[^/]+/\.\.&&) {}
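+ # Worked example (hypothetical link): "/doc//design/./notes/../x.html"
+ # becomes "/doc/design/./notes/../x.html" (collapse slashes), then
+ # "/doc/design/notes/../x.html" (drop "/./"), and finally
+ # "/doc/design/x.html" (fold "dir/..").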
+
+# print STDERR "4 normalize_link: $link\n";
+
+ $link = "" if $link eq "/"; # We do this again
+
+ # print STDERR "5 normalize_link: $link\n";
+
+ push(@{$anchor_refs{"$link#$anchor"}}, $page) if $anchor;
+
+ return $link;
+}
+
+
+# -------------------------------------------------------------------------
+# We know the link is normalized
+# -------------------------------------------------------------------------
+
+sub expand_link {
+ my $link = shift;
+ my $pages = shift;
+
+ return $link if exists $pages->{$link};
+
+ my $newlink;
+
+ foreach my $index (@indexes) {
+ $newlink = "$link/$index";
+ return $newlink if exists $pages->{$newlink};
+ }
+
+ return $link;
+}
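+# For illustration: with $expand_url set, a hypothetical link "/doc"
+# naming a directory is tried as "/doc/index.shtml", "/doc/index.html"
+# and "/doc/index.htm" in that order; the first one present in %pages
+# wins, otherwise the link is returned untouched.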
+
+###########################################################################
+#
+# Report the result
+#
+###########################################################################
+
+if (keys %missing) {
+ print "\n\n\n**** Broken links\n\n";
+ foreach (sort keys %missing) {
+ my ($page,$link) = split($;);
+ print qq(Broken Link: $page -> "$link"\n);
+ }
+}
+
+
+# Entries in %pages that have the value 0 were not visited
+if (keys %pages) {
+ print "\n\n\n**** Files not used (that I can see)\n\n";
+ foreach my $page (sort keys %pages) {
+ next if $pages{$page}; # If == 1 it is visited
+
+ # OTP specific
+
+ next if $page =~ m&^/(man|pdf|logs|COPYRIGHT|PR.template|README)&;
+ next if $page =~ m&^/.*\.tar.gz$&;
+ next if $page =~ m&(/info|\.kwc)$&;
+
+ print qq("$page"\n);
+ }
+}
+
+
+# Remove all references that have a matching NAME=... or ID=... definition
+map {delete $anchor_refs{$_}} keys %anchor_defs;
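+# E.g. if a hypothetical page "/doc/x.html" defines <a name="top">,
+# the key "/doc/x.html#top" is in %anchor_defs, so references to it
+# are dropped here and not reported below.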
+
+if (keys %anchor_refs) {
+ print "\n\n\n**** References to missing anchors\n\n";
+ foreach my $ref (sort keys %anchor_refs) {
+ foreach my $from (sort @{$anchor_refs{$ref}}) {
+ print qq(Missing Anchor: "$ref" from $from\n);
+ }
+ }
+}
+
+
+###########################################################################
+
+sub usage {
+ print STDERR "ERROR: ",join("\n",@_),"\n" if @_;
+ print <<HERE;
+Usage: $0 BaseDirectory URL [ URLs... ] [ -- ExcludeURLs... ]
+
+This script tries to find out which files of your HTML documents,
+graphic files etc. are used and which are not. It doesn't use HTTP,
+i.e. you work off-line, so it may fail to find a link. JavaScript
+and other extensions also make this very hard. But for many sites
+it works very well.
+
+The base directory has to be given and has to start with a slash.
+
+For URLs and ExcludeURLs, paths can be absolute or relative to the
+base directory.
+
+ExcludeURLs are used as prefixes of directories or files that
+should be excluded from the search.
+
+You call it something like
+
+ % $0 /test/r7a /test/r7a/doc/index.html /test/r7a/lib/*/doc/index.html
+
+or using relative start points
+
+ % $0 /test/r7a doc/index.html
+
+HERE
+ exit 1;
+}
+
+
+__END__
+
+# FIXME: The order below is important
+
+if (%access) {
+ print "\n**** Link exists but can't open\n\n";
+
+ my $file;
+
+ foreach $file (sort keys %access) {
+ print "$file\n";
+ }
+}
+
+
+if (%invalid) {
+ print "\n**** Invalid links (goes up above top directory)\n\n";
+
+ foreach (sort keys %invalid) {
+ my ($page,$link) = split($;,$_);
+ delete $done{$link}; # FIXME: xxxx
+ print "$page\n\t-> $link\n";
+ }
+}
+
+if (%done) {
+ print "\n**** Internal error, should be no files here\n\n";
+
+ foreach (sort keys %done) {
+ print "$_\n";
+ }
+}
+
+
+__END__
+###########################################################################
+
+
+sub expand_ssi {
+ my $url_base = shift;
+ my $page = shift;
+ my $comment = shift; # Text between <!-- and -->
+
+ return "" unless $comment =~ s/^\#//;
+
+ # This is an SSI
+ unless ($comment =~ /([\w-]+)=\"([^\"]+)\"/) {
+# print STDERR "WARNING: Unknown SSI $comment\n\ton $page\n";
+ return "";
+ }
+
+ my $op = lc($1); # Operator
+ my $inc = $2; # Absolute or relative URL ending in anything
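+ # For illustration, a directive like
+ # <!--#include virtual="/inc/head.html" -->
+ # yields $op = "virtual" and $inc = "/inc/head.html".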
+
+ if ($debug) {
+ print STDERR "X: url_base = $url_base\n";
+ print STDERR "X: page = $page\n";
+ print STDERR "X: op = $op\n";
+ print STDERR "X: inc = $inc\n";
+ print STDERR "X: base = $base\n";
+ }
+
+ unless ($op eq 'virtual') {
+# print STDERR "WARNING: Unknown SSI $comment\n\ton $page\n";
+ return "";
+ }
+
+ $inc = make_url_absolute($url_base,$page,$inc);
+
+ my $path = "$base$inc";
+
+ if ($debug) {
+ print STDERR "X: inc = $inc\n";
+ print STDERR "X: path = $path\n\n";
+ }
+
+ unless (open(HTML,$path)) {
+# print STDERR "ERROR: Can't open page $inc: $!\n";
+ $access{$inc} = 1;
+ return "";
+ }
+
+ my $html = <HTML>;
+ close HTML;
+
+ $done{$inc} = 1; # Mark done
+
+ return $html;
+}
+