# -*- coding: utf-8 -*- """ sphinx.pycode ~~~~~~~~~~~~~ Utilities parsing and analyzing Python code. :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS. :license: BSD, see LICENSE for details. """ import re import sys from os import path from cStringIO import StringIO from sphinx.errors import PycodeError from sphinx.pycode import nodes from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals from sphinx.util import get_module_source from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc # load the Python grammar _grammarfile = path.join(path.dirname(__file__), 'Grammar.txt') pygrammar = driver.load_grammar(_grammarfile) pydriver = driver.Driver(pygrammar, convert=nodes.convert) # an object with attributes corresponding to token and symbol names class sym: pass for k, v in pygrammar.symbol2number.iteritems(): setattr(sym, k, v) for k, v in token.tok_name.iteritems(): setattr(sym, v, k) # a dict mapping terminal and nonterminal numbers to their names number2name = pygrammar.number2symbol.copy() number2name.update(token.tok_name) # a regex to recognize coding cookies _coding_re = re.compile(r'coding[:=]\s*([-\w.]+)') _eq = nodes.Leaf(token.EQUAL, '=') class AttrDocVisitor(nodes.NodeVisitor): """ Visitor that collects docstrings for attribute assignments on toplevel and in classes (class attributes and attributes set in __init__). The docstrings can either be in special '#:' comments before the assignment or in a docstring after it. """ def init(self, scope, encoding): self.scope = scope self.in_init = 0 self.encoding = encoding self.namespace = [] self.collected = {} self.tagnumber = 0 self.tagorder = {} def add_tag(self, name): name = '.'.join(self.namespace + [name]) self.tagorder[name] = self.tagnumber self.tagnumber += 1 def visit_classdef(self, node): """Visit a class.""" self.add_tag(node[1].value) self.namespace.append(node[1].value) self.generic_visit(node) self.namespace.pop() def visit_funcdef(self, node): """Visit a function (or method).""" # usually, don't descend into functions -- nothing interesting there self.add_tag(node[1].value) if node[1].value == '__init__': # however, collect attributes set in __init__ methods self.in_init += 1 self.generic_visit(node) self.in_init -= 1 def visit_expr_stmt(self, node): """Visit an assignment which may have a special comment before it.""" if _eq not in node.children: # not an assignment (we don't care for augmented assignments) return pnode = node[0] prefix = pnode.get_prefix() # if the assignment is the first statement on a new indentation # level, its preceding whitespace and comments are not assigned # to that token, but the first INDENT or DEDENT token while not prefix: pnode = pnode.get_prev_leaf() if not pnode or pnode.type not in (token.INDENT, token.DEDENT): break prefix = pnode.get_prefix() if not isinstance(prefix, unicode): prefix = prefix.decode(self.encoding) docstring = prepare_commentdoc(prefix) self.add_docstring(node, docstring) def visit_simple_stmt(self, node): """Visit a docstring statement which may have an assignment before.""" if node[0].type != token.STRING: # not a docstring; but still need to visit children return self.generic_visit(node) prev = node.get_prev_sibling() if not prev: return if prev.type == sym.simple_stmt and \ prev[0].type == sym.expr_stmt and _eq in prev[0].children: # need to "eval" the string because it's returned in its # original form docstring = literals.evalString(node[0].value, self.encoding) docstring = prepare_docstring(docstring) self.add_docstring(prev[0], docstring) def add_docstring(self, node, docstring): # add an item for each assignment target for i in range(0, len(node) - 1, 2): target = node[i] if self.in_init and self.number2name[target.type] == 'power': # maybe an attribute assignment -- check necessary conditions if (# node must have two children len(target) != 2 or # first child must be "self" target[0].type != token.NAME or target[0].value != 'self' or # second child must be a "trailer" with two children self.number2name[target[1].type] != 'trailer' or len(target[1]) != 2 or # first child must be a dot, second child a name target[1][0].type != token.DOT or target[1][1].type != token.NAME): continue name = target[1][1].value elif target.type != token.NAME: # don't care about other complex targets continue else: name = target.value self.add_tag(name) if docstring: namespace = '.'.join(self.namespace) if namespace.startswith(self.scope): self.collected[namespace, name] = docstring class ModuleAnalyzer(object): # cache for analyzer objects -- caches both by module and file name cache = {} @classmethod def for_string(cls, string, modname, srcname=''): return cls(StringIO(string), modname, srcname) @classmethod def for_file(cls, filename, modname): if ('file', filename) in cls.cache: return cls.cache['file', filename] try: fileobj = open(filename, 'r') except Exception, err: raise PycodeError('error opening %r' % filename, err) obj = cls(fileobj, modname, filename) cls.cache['file', filename] = obj return obj @classmethod def for_module(cls, modname): if ('module', modname) in cls.cache: entry = cls.cache['module', modname] if isinstance(entry, PycodeError): raise entry return entry try: type, source = get_module_source(modname) if type == 'string': obj = cls.for_string(source, modname) else: obj = cls.for_file(source, modname) except PycodeError, err: cls.cache['module', modname] = err raise cls.cache['module', modname] = obj return obj def __init__(self, source, modname, srcname): # name of the module self.modname = modname # name of the source file self.srcname = srcname # file-like object yielding source lines self.source = source # will be changed when found by parse() self.encoding = sys.getdefaultencoding() # cache the source code as well pos = self.source.tell() self.code = self.source.read() self.source.seek(pos) # will be filled by tokenize() self.tokens = None # will be filled by parse() self.parsetree = None # will be filled by find_attr_docs() self.attr_docs = None self.tagorder = None # will be filled by find_tags() self.tags = None def tokenize(self): """Generate tokens from the source.""" if self.tokens is not None: return self.tokens = list(tokenize.generate_tokens(self.source.readline)) self.source.close() def parse(self): """Parse the generated source tokens.""" if self.parsetree is not None: return self.tokenize() try: self.parsetree = pydriver.parse_tokens(self.tokens) except parse.ParseError, err: raise PycodeError('parsing failed', err) # find the source code encoding, if present comments = self.parsetree.get_prefix() for line in comments.splitlines()[:2]: match = _coding_re.search(line) if match is not None: self.encoding = match.group(1) break def find_attr_docs(self, scope=''): """Find class and module-level attributes and their documentation.""" if self.attr_docs is not None: return self.attr_docs self.parse() attr_visitor = AttrDocVisitor(number2name, scope, self.encoding) attr_visitor.visit(self.parsetree) self.attr_docs = attr_visitor.collected self.tagorder = attr_visitor.tagorder # now that we found everything we could in the tree, throw it away # (it takes quite a bit of memory for large modules) self.parsetree = None return attr_visitor.collected def find_tags(self): """Find class, function and method definitions and their location.""" if self.tags is not None: return self.tags self.tokenize() result = {} namespace = [] stack = [] indent = 0 defline = False expect_indent = False def tokeniter(ignore = (token.COMMENT, token.NL)): for tokentup in self.tokens: if tokentup[0] not in ignore: yield tokentup tokeniter = tokeniter() for type, tok, spos, epos, line in tokeniter: if expect_indent: if type != token.INDENT: # no suite -- one-line definition assert stack dtype, fullname, startline, _ = stack.pop() endline = epos[0] namespace.pop() result[fullname] = (dtype, startline, endline) expect_indent = False if tok in ('def', 'class'): name = tokeniter.next()[1] namespace.append(name) fullname = '.'.join(namespace) stack.append((tok, fullname, spos[0], indent)) defline = True elif type == token.INDENT: expect_indent = False indent += 1 elif type == token.DEDENT: indent -= 1 # if the stacklevel is the same as it was before the last # def/class block, this dedent closes that block if stack and indent == stack[-1][3]: dtype, fullname, startline, _ = stack.pop() endline = spos[0] namespace.pop() result[fullname] = (dtype, startline, endline) elif type == token.NEWLINE: # if this line contained a definition, expect an INDENT # to start the suite; if there is no such INDENT # it's a one-line definition if defline: defline = False expect_indent = True self.tags = result return result if __name__ == '__main__': import time, pprint x0 = time.time() #ma = ModuleAnalyzer.for_file(__file__.rstrip('c'), 'sphinx.builders.html') ma = ModuleAnalyzer.for_file('sphinx/environment.py', 'sphinx.environment') ma.tokenize() x1 = time.time() ma.parse() x2 = time.time() #for (ns, name), doc in ma.find_attr_docs().iteritems(): # print '>>', ns, name # print '\n'.join(doc) pprint.pprint(ma.find_tags()) x3 = time.time() #print nodes.nice_repr(ma.parsetree, number2name) print "tokenizing %.4f, parsing %.4f, finding %.4f" % (x1-x0, x2-x1, x3-x2)