summary refs log tree commit diff
path: root/sphinx
diff options
context:
space:
mode:
author: Marti Raudsepp <marti@juffo.org> 2012-02-28 20:12:59 +0200
committer: Marti Raudsepp <marti@juffo.org> 2012-02-28 20:12:59 +0200
commit: 9c0c3775263f6ea52f93905d25620f90c4546216 (patch)
tree: 408f93f202699d0e49868771f3e37763f8db2a5b /sphinx
parent: 2244cd80429fd317af1feb93a994d58512848068 (diff)
download: sphinx-9c0c3775263f6ea52f93905d25620f90c4546216.tar.gz
Add #anchor checking to 'linkcheck' builder.
This requires us to download the document and parse its HTML.
Diffstat (limited to 'sphinx')
-rw-r--r--sphinx/builders/linkcheck.py73
-rw-r--r--sphinx/config.py1
2 files changed, 65 insertions, 9 deletions
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index ad15b55d..25d34aca 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -15,7 +15,8 @@ import Queue
import socket
import threading
from os import path
-from urllib2 import build_opener, Request
+from urllib2 import build_opener, unquote, Request
+from HTMLParser import HTMLParser, HTMLParseError
from docutils import nodes
@@ -33,6 +34,42 @@ class HeadRequest(Request):
return 'HEAD'
+class AnchorCheckParser(HTMLParser):
+ def __init__(self, search_anchor):
+ HTMLParser.__init__(self)
+
+ self.search_anchor = search_anchor
+ self.found = False
+
+ def handle_starttag(self, tag, attrs):
+ for key, value in attrs:
+ if key in ('id', 'name') and value == self.search_anchor:
+ self.found = True
+
+def check_anchor(f, hash):
+ """Reads HTML data from a filelike object 'f' searching for anchor 'hash'.
+
+ Returns True if anchor was found, False otherwise"""
+
+ parser = AnchorCheckParser(hash)
+
+ try:
+ # Read file in chunks of 8192 bytes. If we find a matching anchor, we
+ # break the loop early in hopes not to have to download the whole thing
+
+ chunk = f.read(8192)
+ while chunk and not parser.found:
+ parser.feed(chunk)
+ chunk = f.read(8192)
+
+ parser.close()
+ except HTMLParseError:
+ # HTMLParser is usually pretty good with sloppy HTML, but it tends to
+ # choke on EOF. But we're done then anyway.
+ pass
+
+ return parser.found
+
class CheckExternalLinksBuilder(Builder):
"""
Checks for broken external links.
@@ -66,7 +103,7 @@ class CheckExternalLinksBuilder(Builder):
def check():
# check for various conditions without bothering the network
- if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
+ if len(uri) == 0 or uri[0] == '#' or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
return 'unchecked', ''
elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
return 'local', ''
@@ -80,19 +117,39 @@ class CheckExternalLinksBuilder(Builder):
if rex.match(uri):
return 'ignored', ''
+ if '#' in uri:
+ req_url, hash = uri.split('#', 1)
+ else:
+ req_url = uri
+ hash = None
+
# need to actually check the URI
try:
- f = opener.open(HeadRequest(uri), **kwargs)
- f.close()
+ if hash and self.app.config.linkcheck_anchors:
+ # Read the whole document and see if #hash exists
+ f = opener.open(Request(req_url), **kwargs)
+ found = check_anchor(f, unquote(hash))
+ f.close()
+
+ if not found:
+ raise Exception("Anchor '%s' not found" % hash)
+ else:
+ f = opener.open(HeadRequest(req_url), **kwargs)
+ f.close()
+
except Exception, err:
self.broken[uri] = str(err)
return 'broken', str(err)
- if f.url.rstrip('/') == uri.rstrip('/'):
+ if f.url.rstrip('/') == req_url.rstrip('/'):
self.good.add(uri)
return 'working', 'new'
else:
- self.redirected[uri] = f.url
- return 'redirected', f.url
+ new_url = f.url
+ if hash:
+ new_url += '#' + hash
+
+ self.redirected[uri] = new_url
+ return 'redirected', new_url
while True:
uri, docname, lineno = self.wqueue.get()
@@ -142,8 +199,6 @@ class CheckExternalLinksBuilder(Builder):
if 'refuri' not in node:
continue
uri = node['refuri']
- if '#' in uri:
- uri = uri.split('#')[0]
lineno = None
while lineno is None:
node = node.parent
diff --git a/sphinx/config.py b/sphinx/config.py
index 767bf088..17b961ae 100644
--- a/sphinx/config.py
+++ b/sphinx/config.py
@@ -179,6 +179,7 @@ class Config(object):
linkcheck_ignore = ([], None),
linkcheck_timeout = (None, None),
linkcheck_workers = (5, None),
+ linkcheck_anchors = (True, None),
# gettext options
gettext_compact = (True, 'gettext'),