* Add a tag-finding method based on tokens.

* Don't parse immediately if tokenizing suffices. * Also cache by file name.
author: Georg Brandl <georg@python.org> 2008-12-30 12:42:26 +0100
committer: Georg Brandl <georg@python.org> 2008-12-30 12:42:26 +0100
commit: 998660e41e99b171d029a0e6aa7b85fc0ba62373 (patch)
tree: 452521e7761a7fa32465d1486ece58c3aea6d5a0 /sphinx/pycode
parent: 6bec10bbcf957d4a26dc5b3db2f4a099382abf56 (diff)
download: sphinx-998660e41e99b171d029a0e6aa7b85fc0ba62373.tar.gz
2 files changed, 118 insertions, 39 deletions
diff --git a/sphinx/pycode/__init__.py b/sphinx/pycode/__init__.py
index e52a231d..f456cfe6 100644
--- a/sphinx/pycode/__init__.py
+++ b/sphinx/pycode/__init__.py
@@ -11,9 +11,10 @@
 
 import sys
 from os import path
+from cStringIO import StringIO
 
 from sphinx.pycode import pytree
-from sphinx.pycode.pgen2 import driver, token, parse, literals
+from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
 from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
 
 
@@ -22,9 +23,12 @@ _grammarfile = path.join(path.dirname(__file__), 'Grammar.txt')
 pygrammar = driver.load_grammar(_grammarfile)
 pydriver = driver.Driver(pygrammar, convert=pytree.convert)
 
+# an object with attributes corresponding to token and symbol names
 class sym: pass
 for k, v in pygrammar.symbol2number.iteritems():
     setattr(sym, k, v)
+for k, v in token.tok_name.iteritems():
+    setattr(sym, v, k)
 
 # a dict mapping terminal and nonterminal numbers to their names
 number2name = pygrammar.number2symbol.copy()
@@ -110,36 +114,29 @@ class PycodeError(Exception):
 
 
 class ModuleAnalyzer(object):
-    # cache for analyzer objects
+    # cache for analyzer objects -- caches both by module and file name
     cache = {}
 
-    def __init__(self, tree, modname, srcname):
-        self.tree = tree
-        self.modname = modname
-        self.srcname = srcname
-
     @classmethod
     def for_string(cls, string, modname, srcname='<string>'):
-        return cls(pydriver.parse_string(string), modname, srcname)
+        return cls(StringIO(string), modname, srcname)
 
     @classmethod
     def for_file(cls, filename, modname):
+        if ('file', filename) in cls.cache:
+            return cls.cache['file', filename]
         try:
             fileobj = open(filename, 'r')
         except Exception, err:
             raise PycodeError('error opening %r' % filename, err)
-        try:
-            try:
-                return cls(pydriver.parse_stream(fileobj), modname, filename)
-            except parse.ParseError, err:
-                raise PycodeError('error parsing %r' % filename, err)
-        finally:
-            fileobj.close()
+        obj = cls(fileobj, modname, filename)
+        cls.cache['file', filename] = obj
+        return obj
 
     @classmethod
     def for_module(cls, modname):
-        if modname in cls.cache:
-            return cls.cache[modname]
+        if ('module', modname) in cls.cache:
+            return cls.cache['module', modname]
         if modname not in sys.modules:
             try:
                 __import__(modname)
@@ -152,37 +149,119 @@ class ModuleAnalyzer(object):
             except Exception, err:
                 raise PycodeError('error getting source for %r' % modname, err)
             obj = cls.for_string(source, modname)
-            cls.cache[modname] = obj
+            cls.cache['module', modname] = obj
             return obj
         filename = getattr(mod, '__file__', None)
         if filename is None:
             raise PycodeError('no source found for module %r' % modname)
-        if filename.lower().endswith('.pyo') or \
-           filename.lower().endswith('.pyc'):
+        filename = path.normpath(filename)
+        lfilename = filename.lower()
+        if lfilename.endswith('.pyo') or lfilename.endswith('.pyc'):
             filename = filename[:-1]
-        elif not filename.lower().endswith('.py'):
+        elif not lfilename.endswith('.py'):
             raise PycodeError('source is not a .py file: %r' % filename)
         if not path.isfile(filename):
             raise PycodeError('source file is not present: %r' % filename)
         obj = cls.for_file(filename, modname)
-        cls.cache[modname] = obj
+        cls.cache['module', modname] = obj
         return obj
 
+    def __init__(self, source, modname, srcname):
+        self.modname = modname
+        self.srcname = srcname
+        # file-like object yielding source lines
+        self.source = source
+
+        # will be filled by tokenize()
+        self.tokens = None
+        # will be filled by parse()
+        self.parsetree = None
+
+    def tokenize(self):
+        """Generate tokens from the source."""
+        if self.tokens is not None:
+            return
+        self.tokens = list(tokenize.generate_tokens(self.source.readline))
+        self.source.close()
+
+    def parse(self):
+        """Parse the generated source tokens."""
+        if self.parsetree is not None:
+            return
+        self.tokenize()
+        self.parsetree = pydriver.parse_tokens(self.tokens)
+
     def find_attr_docs(self, scope=''):
+        """Find class and module-level attributes and their documentation."""
+        self.parse()
         attr_visitor = AttrDocVisitor(number2name, scope)
-        attr_visitor.visit(self.tree)
+        attr_visitor.visit(self.parsetree)
         return attr_visitor.collected
 
+    def find_tags(self):
+        """Find class, function and method definitions and their location."""
+        self.tokenize()
+        result = {}
+        namespace = []
+        stack = []
+        indent = 0
+        defline = False
+        expect_indent = False
+        def tokeniter(ignore = (token.COMMENT, token.NL)):
+            for tokentup in self.tokens:
+                if tokentup[0] not in ignore:
+                    yield tokentup
+        tokeniter = tokeniter()
+        for type, tok, spos, epos, line in tokeniter:
+            if expect_indent:
+                if type != token.INDENT:
+                    # no suite -- one-line definition
+                    assert stack
+                    dtype, fullname, startline, _ = stack.pop()
+                    endline = epos[0]
+                    namespace.pop()
+                    result[dtype, fullname] = (startline, endline)
+                expect_indent = False
+            if tok in ('def', 'class'):
+                name = tokeniter.next()[1]
+                namespace.append(name)
+                fullname = '.'.join(namespace)
+                stack.append((tok, fullname, spos[0], indent))
+                defline = True
+            elif type == token.INDENT:
+                expect_indent = False
+                indent += 1
+            elif type == token.DEDENT:
+                indent -= 1
+                # if the stacklevel is the same as it was before the last def/class block,
+                # this dedent closes that block
+                if stack and indent == stack[-1][3]:
+                    dtype, fullname, startline, _ = stack.pop()
+                    endline = spos[0]
+                    namespace.pop()
+                    result[dtype, fullname] = (startline, endline)
+            elif type == token.NEWLINE:
+                # if this line contained a definition, expect an INDENT to start the
+                # suite; if there is no such INDENT it's a one-line definition
+                if defline:
+                    defline = False
+                    expect_indent = True
+        return result
+
 
 if __name__ == '__main__':
-    import time
+    import time, pprint
     x0 = time.time()
-    ma = ModuleAnalyzer.for_file('sphinx/builders/html.py', 'sphinx.builders.html')
     #ma = ModuleAnalyzer.for_file(__file__.rstrip('c'), 'sphinx.builders.html')
+    ma = ModuleAnalyzer.for_file('sphinx/builders/html.py', 'sphinx.builders.html')
+    ma.tokenize()
     x1 = time.time()
-    for (ns, name), doc in ma.find_attr_docs().iteritems():
-        print '>>', ns, name
-        print '\n'.join(doc)
+    ma.parse()
     x2 = time.time()
-    #print pytree.nice_repr(ma.tree, number2name)
-    print "parsing %.4f, finding %.4f" % (x1-x0, x2-x1)
+    #for (ns, name), doc in ma.find_attr_docs().iteritems():
+    #    print '>>', ns, name
+    #    print '\n'.join(doc)
+    pprint.pprint(ma.find_tags())
+    x3 = time.time()
+    #print pytree.nice_repr(ma.parsetree, number2name)
+    print "tokenizing %.4f, parsing %.4f, finding %.4f" % (x1-x0, x2-x1, x3-x2)
diff --git a/sphinx/pycode/pgen2/driver.py b/sphinx/pycode/pgen2/driver.py
index 3e9e1043..edc882fa 100644
--- a/sphinx/pycode/pgen2/driver.py
+++ b/sphinx/pycode/pgen2/driver.py
@@ -42,8 +42,8 @@ class Driver(object):
         column = 0
         type = value = start = end = line_text = None
         prefix = ""
-        for quintuple in tokens:
-            type, value, start, end, line_text = quintuple
+        opmap = grammar.opmap
+        for type, value, start, end, line_text in tokens:
             if start != (lineno, column):
                 assert (lineno, column) <= start, ((lineno, column), start)
                 s_lineno, s_column = start
@@ -62,13 +62,13 @@ class Driver(object):
                     column = 0
                 continue
             if type == token.OP:
-                type = grammar.opmap[value]
-            if debug:
-                self.logger.debug("%s %r (prefix=%r)",
-                                  token.tok_name[type], value, prefix)
+                type = opmap[value]
+            # if debug:
+            #     self.logger.debug("%s %r (prefix=%r)",
+            #                       token.tok_name[type], value, prefix)
             if p.addtoken(type, value, (prefix, start)):
-                if debug:
-                    self.logger.debug("Stop.")
+                # if debug:
+                #     self.logger.debug("Stop.")
                 break
             prefix = ""
             lineno, column = end
@@ -77,7 +77,7 @@ class Driver(object):
                 column = 0
         else:
             # We never broke out -- EOF is too soon (how can this happen???)
-            raise parse.ParseError("incomplete input", t, v, x)
+            raise parse.ParseError("incomplete input", type, value, line_text)
         return p.rootnode
 
     def parse_stream_raw(self, stream, debug=False):
author	Georg Brandl <georg@python.org>	2008-12-30 12:42:26 +0100
committer	Georg Brandl <georg@python.org>	2008-12-30 12:42:26 +0100
commit	998660e41e99b171d029a0e6aa7b85fc0ba62373 (patch)
tree	452521e7761a7fa32465d1486ece58c3aea6d5a0 /sphinx/pycode
parent	6bec10bbcf957d4a26dc5b3db2f4a099382abf56 (diff)
download	sphinx-998660e41e99b171d029a0e6aa7b85fc0ba62373.tar.gz