1 files changed, 310 insertions, 0 deletions
diff --git a/sphinx/pycode/__init__.py b/sphinx/pycode/__init__.py
new file mode 100644
index 00000000..de1c3dbe
--- /dev/null
+++ b/sphinx/pycode/__init__.py
@@ -0,0 +1,310 @@
+# -*- coding: utf-8 -*-
+"""
+    sphinx.pycode
+    ~~~~~~~~~~~~~
+
+    Utilities parsing and analyzing Python code.
+
+    :copyright: Copyright 2007-2009 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+import sys
+from os import path
+from cStringIO import StringIO
+
+from sphinx.pycode import nodes
+from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
+from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
+
+
+# load the Python grammar
+_grammarfile = path.join(path.dirname(__file__), 'Grammar.txt')
+pygrammar = driver.load_grammar(_grammarfile)
+pydriver = driver.Driver(pygrammar, convert=nodes.convert)
+
+# an object with attributes corresponding to token and symbol names
+class sym: pass
+for k, v in pygrammar.symbol2number.iteritems():
+    setattr(sym, k, v)
+for k, v in token.tok_name.iteritems():
+    setattr(sym, v, k)
+
+# a dict mapping terminal and nonterminal numbers to their names
+number2name = pygrammar.number2symbol.copy()
+number2name.update(token.tok_name)
+
+
+# a regex to recognize coding cookies
+_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
+
+_eq = nodes.Leaf(token.EQUAL, '=')
+
+
+class AttrDocVisitor(nodes.NodeVisitor):
+    """
+    Visitor that collects docstrings for attribute assignments on toplevel and
+    in classes.
+
+    The docstrings can either be in special '#:' comments before the assignment
+    or in a docstring after it.
+    """
+    def init(self, scope, encoding):
+        self.scope = scope
+        self.encoding = encoding
+        self.namespace = []
+        self.collected = {}
+
+    def visit_classdef(self, node):
+        self.namespace.append(node[1].value)
+        self.generic_visit(node)
+        self.namespace.pop()
+
+    def visit_expr_stmt(self, node):
+        """Visit an assignment which may have a special comment before it."""
+        if _eq not in node.children:
+            # not an assignment (we don't care for augmented assignments)
+            return
+        pnode = node[0]
+        prefix = pnode.get_prefix()
+        # if the assignment is the first statement on a new indentation
+        # level, its preceding whitespace and comments are not assigned
+        # to that token, but the first INDENT or DEDENT token
+        while not prefix:
+            pnode = pnode.get_prev_leaf()
+            if not pnode or pnode.type not in (token.INDENT, token.DEDENT):
+                break
+            prefix = pnode.get_prefix()
+        prefix = prefix.decode(self.encoding)
+        docstring = prepare_commentdoc(prefix)
+        if docstring:
+            self.add_docstring(node, docstring)
+
+    def visit_simple_stmt(self, node):
+        """Visit a docstring statement which may have an assignment before."""
+        if node[0].type != token.STRING:
+            # not a docstring; but still need to visit children
+            return self.generic_visit(node)
+        prev = node.get_prev_sibling()
+        if not prev:
+            return
+        if prev.type == sym.simple_stmt and \
+               prev[0].type == sym.expr_stmt and _eq in prev[0].children:
+            # need to "eval" the string because it's returned in its
+            # original form
+            docstring = literals.evalString(node[0].value, self.encoding)
+            docstring = prepare_docstring(docstring)
+            self.add_docstring(prev[0], docstring)
+
+    def visit_funcdef(self, node):
+        # don't descend into functions -- nothing interesting there
+        return
+
+    def add_docstring(self, node, docstring):
+        # add an item for each assignment target
+        for i in range(0, len(node) - 1, 2):
+            target = node[i]
+            if target.type != token.NAME:
+                # don't care about complex targets
+                continue
+            namespace = '.'.join(self.namespace)
+            if namespace.startswith(self.scope):
+                self.collected[namespace, target.value] = docstring
+
+
+class PycodeError(Exception):
+    def __str__(self):
+        res = self.args[0]
+        if len(self.args) > 1:
+            res += ' (exception was: %r)' % self.args[1]
+        return res
+
+
+class ModuleAnalyzer(object):
+    # cache for analyzer objects -- caches both by module and file name
+    cache = {}
+
+    @classmethod
+    def for_string(cls, string, modname, srcname='<string>'):
+        return cls(StringIO(string), modname, srcname)
+
+    @classmethod
+    def for_file(cls, filename, modname):
+        if ('file', filename) in cls.cache:
+            return cls.cache['file', filename]
+        try:
+            fileobj = open(filename, 'r')
+        except Exception, err:
+            raise PycodeError('error opening %r' % filename, err)
+        obj = cls(fileobj, modname, filename)
+        cls.cache['file', filename] = obj
+        return obj
+
+    @classmethod
+    def for_module(cls, modname):
+        if ('module', modname) in cls.cache:
+            entry = cls.cache['module', modname]
+            if isinstance(entry, PycodeError):
+                raise entry
+            return entry
+
+        try:
+            if modname not in sys.modules:
+                try:
+                    __import__(modname)
+                except ImportError, err:
+                    raise PycodeError('error importing %r' % modname, err)
+            mod = sys.modules[modname]
+            if hasattr(mod, '__loader__'):
+                try:
+                    source = mod.__loader__.get_source(modname)
+                except Exception, err:
+                    raise PycodeError('error getting source for %r' % modname,
+                                      err)
+                obj = cls.for_string(source, modname)
+                cls.cache['module', modname] = obj
+                return obj
+            filename = getattr(mod, '__file__', None)
+            if filename is None:
+                raise PycodeError('no source found for module %r' % modname)
+            filename = path.normpath(path.abspath(filename))
+            lfilename = filename.lower()
+            if lfilename.endswith('.pyo') or lfilename.endswith('.pyc'):
+                filename = filename[:-1]
+            elif not lfilename.endswith('.py'):
+                raise PycodeError('source is not a .py file: %r' % filename)
+            if not path.isfile(filename):
+                raise PycodeError('source file is not present: %r' % filename)
+            obj = cls.for_file(filename, modname)
+        except PycodeError, err:
+            cls.cache['module', modname] = err
+            raise
+        cls.cache['module', modname] = obj
+        return obj
+
+    def __init__(self, source, modname, srcname):
+        # name of the module
+        self.modname = modname
+        # name of the source file
+        self.srcname = srcname
+        # file-like object yielding source lines
+        self.source = source
+
+        # will be filled by tokenize()
+        self.tokens = None
+        # will be filled by parse()
+        self.parsetree = None
+        # will be filled by find_attr_docs()
+        self.attr_docs = None
+        # will be filled by find_tags()
+        self.tags = None
+
+    def tokenize(self):
+        """Generate tokens from the source."""
+        if self.tokens is not None:
+            return
+        self.tokens = list(tokenize.generate_tokens(self.source.readline))
+        self.source.close()
+
+    def parse(self):
+        """Parse the generated source tokens."""
+        if self.parsetree is not None:
+            return
+        self.tokenize()
+        try:
+            self.parsetree = pydriver.parse_tokens(self.tokens)
+        except parse.ParseError, err:
+            raise PycodeError('parsing failed', err)
+        # find the source code encoding
+        encoding = sys.getdefaultencoding()
+        comments = self.parsetree.get_prefix()
+        for line in comments.splitlines()[:2]:
+            match = _coding_re.search(line)
+            if match is not None:
+                encoding = match.group(1)
+                break
+        self.encoding = encoding
+
+    def find_attr_docs(self, scope=''):
+        """Find class and module-level attributes and their documentation."""
+        if self.attr_docs is not None:
+            return self.attr_docs
+        self.parse()
+        attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
+        attr_visitor.visit(self.parsetree)
+        self.attr_docs = attr_visitor.collected
+        return attr_visitor.collected
+
+    def find_tags(self):
+        """Find class, function and method definitions and their location."""
+        if self.tags is not None:
+            return self.tags
+        self.tokenize()
+        result = {}
+        namespace = []
+        stack = []
+        indent = 0
+        defline = False
+        expect_indent = False
+        def tokeniter(ignore = (token.COMMENT, token.NL)):
+            for tokentup in self.tokens:
+                if tokentup[0] not in ignore:
+                    yield tokentup
+        tokeniter = tokeniter()
+        for type, tok, spos, epos, line in tokeniter:
+            if expect_indent:
+                if type != token.INDENT:
+                    # no suite -- one-line definition
+                    assert stack
+                    dtype, fullname, startline, _ = stack.pop()
+                    endline = epos[0]
+                    namespace.pop()
+                    result[fullname] = (dtype, startline, endline)
+                expect_indent = False
+            if tok in ('def', 'class'):
+                name = tokeniter.next()[1]
+                namespace.append(name)
+                fullname = '.'.join(namespace)
+                stack.append((tok, fullname, spos[0], indent))
+                defline = True
+            elif type == token.INDENT:
+                expect_indent = False
+                indent += 1
+            elif type == token.DEDENT:
+                indent -= 1
+                # if the stacklevel is the same as it was before the last
+                # def/class block, this dedent closes that block
+                if stack and indent == stack[-1][3]:
+                    dtype, fullname, startline, _ = stack.pop()
+                    endline = spos[0]
+                    namespace.pop()
+                    result[fullname] = (dtype, startline, endline)
+            elif type == token.NEWLINE:
+                # if this line contained a definition, expect an INDENT
+                # to start the suite; if there is no such INDENT
+                # it's a one-line definition
+                if defline:
+                    defline = False
+                    expect_indent = True
+        self.tags = result
+        return result
+
+
+if __name__ == '__main__':
+    import time, pprint
+    x0 = time.time()
+    #ma = ModuleAnalyzer.for_file(__file__.rstrip('c'), 'sphinx.builders.html')
+    ma = ModuleAnalyzer.for_file('sphinx/builders/html.py',
+                                 'sphinx.builders.html')
+    ma.tokenize()
+    x1 = time.time()
+    ma.parse()
+    x2 = time.time()
+    #for (ns, name), doc in ma.find_attr_docs().iteritems():
+    #    print '>>', ns, name
+    #    print '\n'.join(doc)
+    pprint.pprint(ma.find_tags())
+    x3 = time.time()
+    #print nodes.nice_repr(ma.parsetree, number2name)
+    print "tokenizing %.4f, parsing %.4f, finding %.4f" % (x1-x0, x2-x1, x3-x2)