summary refs log tree commit diff
path: root/sphinx
diff options
context:
space:
mode:
author: Marti Raudsepp <marti@juffo.org> 2012-02-28 20:12:59 +0200
committer: Marti Raudsepp <marti@juffo.org> 2012-02-28 20:12:59 +0200
commit: 9c0c3775263f6ea52f93905d25620f90c4546216 (patch)
tree: 408f93f202699d0e49868771f3e37763f8db2a5b /sphinx
parent: 2244cd80429fd317af1feb93a994d58512848068 (diff)
download: sphinx-9c0c3775263f6ea52f93905d25620f90c4546216.tar.gz
Add #anchor checking to 'linkcheck' builder.
This requires us to download the document and parse its HTML.
Diffstat (limited to 'sphinx')
-rw-r--r--sphinx/builders/linkcheck.py73
-rw-r--r--sphinx/config.py1
2 files changed, 65 insertions, 9 deletions
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index ad15b55d..25d34aca 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -15,7 +15,8 @@ import Queue
import socket
import threading
from os import path
-from urllib2 import build_opener, Request
+from urllib2 import build_opener, unquote, Request
+from HTMLParser import HTMLParser, HTMLParseError
from docutils import nodes
@@ -33,6 +34,42 @@ class HeadRequest(Request):
return 'HEAD'
+class AnchorCheckParser(HTMLParser):
+ def __init__(self, search_anchor):
+ HTMLParser.__init__(self)
+
+ self.search_anchor = search_anchor
+ self.found = False
+
+ def handle_starttag(self, tag, attrs):
+ for key, value in attrs:
+ if key in ('id', 'name') and value == self.search_anchor:
+ self.found = True
+
+def check_anchor(f, hash):
+ """Reads HTML data from a filelike object 'f' searching for anchor 'hash'.
+
+ Returns True if anchor was found, False otherwise"""
+
+ parser = AnchorCheckParser(hash)
+
+ try:
+ # Read file in chunks of 8192 bytes. If we find a matching anchor, we
+ # break the loop early in hopes not to have to download the whole thing
+
+ chunk = f.read(8192)
+ while chunk and not parser.found:
+ parser.feed(chunk)
+ chunk = f.read(8192)
+
+ parser.close()
+ except HTMLParseError:
+ # HTMLParser is usually pretty good with sloppy HTML, but it tends to
+ # choke on EOF. But we're done then anyway.
+ pass
+
+ return parser.found
+
class CheckExternalLinksBuilder(Builder):
"""
Checks for broken external links.
@@ -66,7 +103,7 @@ class CheckExternalLinksBuilder(Builder):
def check():
# check for various conditions without bothering the network
- if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
+ if len(uri) == 0 or uri[0] == '#' or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
return 'unchecked', ''
elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
return 'local', ''
@@ -80,19 +117,39 @@ class CheckExternalLinksBuilder(Builder):
if rex.match(uri):
return 'ignored', ''
+ if '#' in uri:
+ req_url, hash = uri.split('#', 1)
+ else:
+ req_url = uri
+ hash = None
+
# need to actually check the URI
try:
- f = opener.open(HeadRequest(uri), **kwargs)
- f.close()
+ if hash and self.app.config.linkcheck_anchors:
+ # Read the whole document and see if #hash exists
+ f = opener.open(Request(req_url), **kwargs)
+ found = check_anchor(f, unquote(hash))
+ f.close()
+
+ if not found:
+ raise Exception("Anchor '%s' not found" % hash)
+ else:
+ f = opener.open(HeadRequest(req_url), **kwargs)
+ f.close()
+
except Exception, err:
self.broken[uri] = str(err)
return 'broken', str(err)
- if f.url.rstrip('/') == uri.rstrip('/'):
+ if f.url.rstrip('/') == req_url.rstrip('/'):
self.good.add(uri)
return 'working', 'new'
else:
- self.redirected[uri] = f.url
- return 'redirected', f.url
+ new_url = f.url
+ if hash:
+ new_url += '#' + hash
+
+ self.redirected[uri] = new_url
+ return 'redirected', new_url
while True:
uri, docname, lineno = self.wqueue.get()
@@ -142,8 +199,6 @@ class CheckExternalLinksBuilder(Builder):
if 'refuri' not in node:
continue
uri = node['refuri']
- if '#' in uri:
- uri = uri.split('#')[0]
lineno = None
while lineno is None:
node = node.parent
diff --git a/sphinx/config.py b/sphinx/config.py
index 767bf088..17b961ae 100644
--- a/sphinx/config.py
+++ b/sphinx/config.py
@@ -179,6 +179,7 @@ class Config(object):
linkcheck_ignore = ([], None),
linkcheck_timeout = (None, None),
linkcheck_workers = (5, None),
+ linkcheck_anchors = (True, None),
# gettext options
gettext_compact = (True, 'gettext'),