diff options
| author | Georg Brandl <georg@python.org> | 2011-01-04 11:27:42 +0100 |
|---|---|---|
| committer | Georg Brandl <georg@python.org> | 2011-01-04 11:27:42 +0100 |
| commit | 8a623b1e53df7c211a2b6989e187d1072377fd5b (patch) | |
| tree | 52eecb2e8dfe9f0186eb64903dc32165d50b9c0c /sphinx | |
| parent | 902f69759d2844def54b6182d875da737e1f8f3a (diff) | |
| download | sphinx-8a623b1e53df7c211a2b6989e187d1072377fd5b.tar.gz | |
#472: linkcheck builder: Check links in parallel, use HTTP HEAD requests and allow configuring the timeout.
New config values: :confval:`linkcheck_timeout` and :confval:`linkcheck_workers`.
Diffstat (limited to 'sphinx')
| -rw-r--r-- | sphinx/builders/linkcheck.py | 181 | ||||
| -rw-r--r-- | sphinx/config.py | 2 |
2 files changed, 108 insertions, 75 deletions
```diff
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index dd87d70d..e4091986 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -10,9 +10,12 @@
 """
 
 import re
+import sys
+import Queue
 import socket
+import threading
 from os import path
-from urllib2 import build_opener, HTTPError
+from urllib2 import build_opener, Request
 
 from docutils import nodes
 
@@ -24,6 +27,12 @@ opener = build_opener()
 opener.addheaders = [('User-agent', 'Mozilla/5.0')]
 
 
+class HeadRequest(Request):
+    """Subclass of urllib2.Request that sends a HEAD request."""
+    def get_method(self):
+        return 'HEAD'
+
+
 class CheckExternalLinksBuilder(Builder):
     """
     Checks for broken external links.
@@ -40,6 +49,83 @@ class CheckExternalLinksBuilder(Builder):
         # create output file
         open(path.join(self.outdir, 'output.txt'), 'w').close()
 
+        # create queues and worker threads
+        self.wqueue = Queue.Queue()
+        self.rqueue = Queue.Queue()
+        self.workers = []
+        for i in range(self.app.config.linkcheck_workers):
+            thread = threading.Thread(target=self.check_thread)
+            thread.setDaemon(True)
+            thread.start()
+            self.workers.append(thread)
+
+    def check_thread(self):
+        kwargs = {}
+        if sys.version_info > (2, 5) and self.app.config.linkcheck_timeout:
+            kwargs['timeout'] = self.app.config.linkcheck_timeout
+
+        def check():
+            # check for various conditions without bothering the network
+            if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
+                return 'unchecked', ''
+            elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
+                return 'local', ''
+            elif uri in self.good:
+                return 'working', ''
+            elif uri in self.broken:
+                return 'broken', self.broken[uri]
+            elif uri in self.redirected:
+                return 'redirected', self.redirected[uri]
+            for rex in self.to_ignore:
+                if rex.match(uri):
+                    return 'ignored', ''
+
+            # need to actually check the URI
+            try:
+                f = opener.open(HeadRequest(uri), **kwargs)
+                f.close()
+            except Exception, err:
+                self.broken[uri] = str(err)
+                return 'broken', str(err)
+            if f.url.rstrip('/') == uri.rstrip('/'):
+                self.good.add(uri)
+                return 'working', 'new'
+            else:
+                self.redirected[uri] = f.url
+                return 'redirected', f.url
+
+        while True:
+            uri, docname, lineno = self.wqueue.get()
+            if uri is None:
+                break
+            status, info = check()
+            self.rqueue.put((uri, docname, lineno, status, info))
+
+    def process_result(self, result):
+        uri, docname, lineno, status, info = result
+        if status == 'unchecked':
+            return
+        if status == 'working' and info != 'new':
+            return
+        if lineno:
+            self.info('(line %3d) ' % lineno, nonl=1)
+        if status == 'ignored':
+            self.info(uri + ' - ' + darkgray('ignored'))
+        elif status == 'local':
+            self.info(uri + ' - ' + darkgray('local'))
+            self.write_entry('local', docname, lineno, uri)
+        elif status == 'working':
+            self.info(uri + ' - ' + darkgreen('working'))
+        elif status == 'broken':
+            self.info(uri + ' - ' + red('broken: ') + info)
+            self.write_entry('broken', docname, lineno, uri + ': ' + info)
+            if self.app.quiet:
+                self.warn('broken link: %s' % uri,
+                          '%s:%s' % (self.env.doc2path(docname), lineno))
+        elif status == 'redirected':
+            self.info(uri + ' - ' + purple('redirected') + ' to ' + info)
+            self.write_entry('redirected', docname, lineno, uri + ' to ' + info)
+
     def get_target_uri(self, docname, typ=None):
         return ''
 
@@ -51,65 +137,25 @@ class CheckExternalLinksBuilder(Builder):
 
     def write_doc(self, docname, doctree):
         self.info()
+        n = 0
         for node in doctree.traverse(nodes.reference):
-            try:
-                self.check(node, docname)
-            except KeyError:
+            if 'refuri' not in node:
                 continue
-
-    def check(self, node, docname):
-        uri = node['refuri']
-
-        if '#' in uri:
-            uri = uri.split('#')[0]
-
-        if uri in self.good:
-            return
-
-        lineno = None
-        while lineno is None:
-            node = node.parent
-            if node is None:
-                break
-            lineno = node.line
-
-        if len(uri) == 0 or uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
-            return
-
-        if lineno:
-            self.info('(line %3d) ' % lineno, nonl=1)
-        for rex in self.to_ignore:
-            if rex.match(uri):
-                self.info(uri + ' - ' + darkgray('ignored'))
-                return
-        if uri[0:5] == 'http:' or uri[0:6] == 'https:':
-            self.info(uri, nonl=1)
-
-            if uri in self.broken:
-                (r, s) = self.broken[uri]
-            elif uri in self.redirected:
-                (r, s) = self.redirected[uri]
-            else:
-                (r, s) = self.resolve(uri)
-
-            if r == 0:
-                self.info(' - ' + darkgreen('working'))
-                self.good.add(uri)
-            elif r == 2:
-                self.info(' - ' + red('broken: ') + s)
-                self.write_entry('broken', docname, lineno, uri + ': ' + s)
-                self.broken[uri] = (r, s)
-                if self.app.quiet:
-                    self.warn('broken link: %s' % uri,
-                              '%s:%s' % (self.env.doc2path(docname), lineno))
-            else:
-                self.info(' - ' + purple('redirected') + ' to ' + s)
-                self.write_entry('redirected', docname,
-                                 lineno, uri + ' to ' + s)
-                self.redirected[uri] = (r, s)
-        else:
-            self.info(uri + ' - ' + darkgray('local'))
-            self.write_entry('local', docname, lineno, uri)
+            uri = node['refuri']
+            if '#' in uri:
+                uri = uri.split('#')[0]
+            lineno = None
+            while lineno is None:
+                node = node.parent
+                if node is None:
+                    break
+                lineno = node.line
+            self.wqueue.put((uri, docname, lineno), False)
+            n += 1
+        done = 0
+        while done < n:
+            self.process_result(self.rqueue.get())
+            done += 1
 
         if self.broken:
             self.app.statuscode = 1
@@ -120,21 +166,6 @@ class CheckExternalLinksBuilder(Builder):
                                            line, what, uri))
         output.close()
 
-    def resolve(self, uri):
-        try:
-            f = opener.open(uri)
-            f.close()
-        except HTTPError, err:
-            #if err.code == 403 and uri.startswith('http://en.wikipedia.org/'):
-            #    # Wikipedia blocks requests from urllib User-Agent
-            #    return (0, 0)
-            return (2, str(err))
-        except Exception, err:
-            return (2, str(err))
-        if f.url.rstrip('/') == uri.rstrip('/'):
-            return (0, 0)
-        else:
-            return (1, f.url)
-
     def finish(self):
-        return
+        for worker in self.workers:
+            self.wqueue.put((None, None, None), False)
diff --git a/sphinx/config.py b/sphinx/config.py
index 922af803..b461c94b 100644
--- a/sphinx/config.py
+++ b/sphinx/config.py
@@ -168,6 +168,8 @@ class Config(object):
 
         # linkcheck options
         linkcheck_ignore = ([], None),
+        linkcheck_timeout = (None, None),
+        linkcheck_workers = (5, None),
     )
 
     def __init__(self, dirname, filename, overrides, tags):
```
