diff options
| author | goodger <goodger@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2002-12-14 01:38:31 +0000 |
|---|---|---|
| committer | goodger <goodger@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2002-12-14 01:38:31 +0000 |
| commit | 47ff214eaec8f59ac08c78614811e12eb9b06fde (patch) | |
| tree | e5aa4cc02cc2a69f42321a9976cc7c746f6f2431 /docutils/readers | |
| parent | b75e46e8a06dd8170c9fbabaee4c4aaa22050e67 (diff) | |
| download | docutils-47ff214eaec8f59ac08c78614811e12eb9b06fde.tar.gz | |
making good progress
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@1020 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils/readers')
| -rw-r--r-- | docutils/readers/python/moduleparser.py | 293 |
1 file changed, 199 insertions, 94 deletions
diff --git a/docutils/readers/python/moduleparser.py b/docutils/readers/python/moduleparser.py index 9ab3eea79..cbca876a7 100644 --- a/docutils/readers/python/moduleparser.py +++ b/docutils/readers/python/moduleparser.py @@ -12,7 +12,7 @@ Ideas: * Tokenize the module in parallel to extract initial values, comments, etc. * Merge the compiler & tokenize output such that the raw text hangs off of - nodes? Especially assignment expressions (RHS). + nodes. Useful for assignment expressions (RHS). What I'd like to do is to take a module, read in the text, run it through the module parser (using compiler.py and tokenize.py) and produce a high-level AST @@ -79,7 +79,12 @@ The module parser should produce a high-level AST, something like this:: 1 <Docstring> class_attribute's docstring - <Method name="__init__" argnames=['self', ('text', 'None')]> + <Method name="__init__"> + <Parameters> + <Parameter name="self"> + <Parameter name="text"> + <Expression> + None <Docstring> __init__'s docstring <Attribute name="instance_attribute" instance=True> @@ -109,10 +114,10 @@ The module parser should produce a high-level AST, something like this:: <Docstring> f.function_attribute's docstring -compiler.parse() provides most of what's needed for this AST. I think that -"tokenize" can be used to get the rest, and all that's left is to hunker down -and figure out how. We can determine the line number from the -compiler.parse() AST, and a get_rhs(lineno) method would provide the rest. +compiler.parse() provides most of what's needed for this AST, and "tokenize" +can be used to get the rest. We can determine the line number from the +compiler.parse() AST, and the TokenParser.rhs(lineno) method provides the +rest. 
The Docutils Python reader component will transform this AST into a Python-specific doctree, and then a `stylist transform`_ would further @@ -174,17 +179,17 @@ from types import StringType, UnicodeType def parse_module(module_text, filename): ast = compiler.parse(module_text) - visitor = ModuleVisitor(filename) + token_parser = TokenParser(module_text) + visitor = ModuleVisitor(filename, token_parser) compiler.walk(ast, visitor, walker=visitor) return visitor.module -class ModuleVisitor(ASTVisitor): +class BaseVisitor(ASTVisitor): - def __init__(self, filename): + def __init__(self, token_parser): ASTVisitor.__init__(self) - self.filename = filename - self.module = None + self.token_parser = token_parser self.context = [] self.documentable = None @@ -193,22 +198,12 @@ class ModuleVisitor(ASTVisitor): #print 'in default (%s)' % node.__class__.__name__ #ASTVisitor.default(self, node, *args) - def default_ignore(self, node, *args): - #print 'in default_ignore (%s)' % node.__class__.__name__ + def default_visit(self, node, *args): + #print 'in default_visit (%s)' % node.__class__.__name__ ASTVisitor.default(self, node, *args) - def visitModule(self, node): - #print dir(node) - self.module = module = Module(node, self.filename) - if node.doc is not None: - module.append(Docstring(node, node.doc)) - self.context.append(module) - self.documentable = module - self.visit(node.node) - self.context.pop() - def visitStmt(self, node): - self.default_ignore(node) +class DocstringVisitor(BaseVisitor): def visitDiscard(self, node): if self.documentable: @@ -221,17 +216,14 @@ class ModuleVisitor(ASTVisitor): else: self.documentable = None - def visitImport(self, node): - self.context[-1].append(Import(node, node.names)) - self.documentable = None + def visitStmt(self, node): + self.default_visit(node) - def visitFrom(self, node): - self.context[-1].append( - Import(node, node.names, from_name=node.modname)) - self.documentable = None + +class AssignmentVisitor(DocstringVisitor): 
def visitAssign(self, node): - visitor = AssignmentVisitor() + visitor = AttributeVisitor(self.token_parser) compiler.walk(node, visitor, walker=visitor) if visitor.attributes: self.context[-1].extend(visitor.attributes) @@ -241,66 +233,111 @@ class ModuleVisitor(ASTVisitor): self.documentable = None -class AssignmentVisitor(ASTVisitor): +class ModuleVisitor(AssignmentVisitor): - """ - Tried reconstructing expressions (the RHS of assignments) by - visiting the compiler.parse() tree, but a lot of information is - missing, like parenthesis-grouping of expressions. + def __init__(self, filename, token_parser): + AssignmentVisitor.__init__(self, token_parser) + self.filename = filename + self.module = None - Gotta do it by parsing tokens. - """ + def visitModule(self, node): + self.module = module = Module(node, self.filename) + if node.doc is not None: + module.append(Docstring(node, node.doc)) + self.context.append(module) + self.documentable = module + self.visit(node.node) + self.context.pop() - def __init__(self): - ASTVisitor.__init__(self) - self.attributes = [] - self.parts = [] + def visitImport(self, node): + self.context[-1].append(Import(node, node.names)) + self.documentable = None - def default(self, node, *args): - print >>sys.stderr, '%s not visited!' 
% node.__class__.__name__ - ASTVisitor.default(self, node) + def visitFrom(self, node): + self.context[-1].append( + Import(node, node.names, from_name=node.modname)) + self.documentable = None + + def visitFunction(self, node): + visitor = FunctionVisitor(self.token_parser) + compiler.walk(node, visitor, walker=visitor) + self.context[-1].append(visitor.function) + + +class AttributeVisitor(BaseVisitor): + + def __init__(self, token_parser): + BaseVisitor.__init__(self, token_parser) + self.attributes = [] def visitAssign(self, node): - ASTVisitor.default(self, node) - self.attributes[-1].append(Expression(node, ''.join(self.parts))) + # Don't visit the expression itself, just the attribute nodes: + for child in node.nodes: + self.dispatch(child) + expression_text = self.token_parser.rhs(node.lineno) + expression = Expression(node, expression_text) + for attribute in self.attributes: + attribute.append(expression) def visitAssName(self, node): self.attributes.append(Attribute(node, node.name)) - def visitAdd(self, node): - ASTVisitor.default(self, node) - self.parts[-2:] = ' + '.join(self.parts[-2:]) - - def visitAnd(self, node): - ASTVisitor.default(self, node) - self.parts.insert(len(self.parts) - 1, ' and ') + def visitAssTuple(self, node): + attributes = self.attributes + self.attributes = [] + self.default_visit(node) + names = [attribute.name for attribute in self.attributes] + att_tuple = AttributeTuple(node, names) + att_tuple.lineno = self.attributes[0].lineno + self.attributes = attributes + self.attributes.append(att_tuple) - def visitBackquote(self, node): - self.parts.append('`') - ASTVisitor.default(self, node) - self.parts.append('`') + def visitAssAttr(self, node): + self.default_visit(node, node.attrname) - def visitBitand(self, node): - ASTVisitor.default(self, node) - self.parts.insert(len(self.parts) - 1, ' & ') + def visitGetattr(self, node, suffix): + self.default_visit(node, node.attrname + '.' 
+ suffix) - def visitBitor(self, node): - ASTVisitor.default(self, node) - self.parts.insert(len(self.parts) - 1, ' | ') + def visitName(self, node, suffix): + self.attributes.append(Attribute(node, node.name + '.' + suffix)) - def visitBitxor(self, node): - ASTVisitor.default(self, node) - self.parts.insert(len(self.parts) - 1, ' ^ ') - def visitConst(self, node): - self.parts.append(repr(node.value)) +class FunctionVisitor(DocstringVisitor): - def visitConst(self, node): - self.parts.append(repr(node.value)) + def visitFunction(self, node): + self.function = function = Function(node, node.name) + if node.doc is not None: + function.append(Docstring(node, node.doc)) + self.context.append(function) + self.documentable = function + self.parse_parameter_list(node) + self.visit(node.code) + self.context.pop() - def visitInvert(self, node): - self.parts.append('~ ') - ASTVisitor.default(self, node) + def parse_parameter_list(self, node): + parameters = [] + special = [] + argnames = list(node.argnames) + if node.kwargs: + special.append(ExcessKeywordArguments(node, argnames[-1])) + argnames.pop() + if node.varargs: + special.append(ExcessPositionalArguments(node, argnames[-1])) + argnames.pop() + defaults = list(node.defaults) + defaults = [None] * (len(argnames) - len(defaults)) + defaults + for argname, default in zip(argnames, defaults): + parameter = Parameter(node, argname) + if default: + default_text = self.token_parser.default(node.lineno) + parameter.append(Default(node, default_text)) + parameters.append(parameter) + if parameters or special: + special.reverse() + parameters.extend(special) + parameter_list = ParameterList(node) + parameter_list.extend(parameters) + self.function.append(parameter_list) class Node: @@ -395,6 +432,16 @@ class Attribute(Node): return Node.attlist(self, name=self.name) +class AttributeTuple(Node): + + def __init__(self, node, names): + Node.__init__(self, node) + self.names = names + + def attlist(self): + return 
Node.attlist(self, names=' '.join(self.names)) + + class Expression(Node): def __init__(self, node, text): @@ -404,40 +451,98 @@ class Expression(Node): def __str__(self, indent=' ', level=0): prefix = indent * (level + 1) return '%s%s%s\n' % (Node.__str__(self, indent, level), - prefix, self.text) + prefix, self.text.encode('unicode-escape')) -class TokenReader: +class Function(Attribute): pass + + +class ParameterList(Node): pass + + +class Parameter(Attribute): pass + + +class ExcessPositionalArguments(Parameter): pass + + +class ExcessKeywordArguments(Parameter): pass + + +class Default(Expression): pass + + +class TokenParser: def __init__(self, text): - self.text = text - self.lines = text.splitlines(1) + self.text = text + '\n\n' + self.lines = self.text.splitlines(1) self.generator = tokenize.generate_tokens(iter(self.lines).next) + self.next() def __iter__(self): return self def next(self): - token = self.generator.next() - self.type, self.string, self.start, self.end, self.line = token - return token + self.token = self.generator.next() + self.type, self.string, self.start, self.end, self.line = self.token + return self.token def goto_line(self, lineno): - for token in self: - if self.start[0] >= lineno: - return token - else: - raise IndexError + while self.start[0] < lineno: + self.next() + return token - def rhs(self, name, lineno): + def rhs(self, lineno): + """ + Return a whitespace-normalized expression string from the right-hand + side of an assignment at line `lineno`. 
+ """ self.goto_line(lineno) - while self.start[0] == lineno: - if self.type == token.OP and self.string == '=': - break + while self.string != '=': self.next() - else: - raise IndexError - + while self.type != token.NEWLINE and self.string != ';': + append = 1 + append_ws = 1 + del_ws = 0 + if self.string == '=': + start_row, start_col = self.end + tokens = [] + last_type = None + last_string = None + backquote = 0 + append = 0 + elif self.string == '.': + del_ws = 1 + append_ws = 0 + elif self.string in ('(', '[', '{'): + append_ws = 0 + if self.string in '([' and (last_type == token.NAME or + last_string in (')', ']', '}')): + del_ws = 1 + elif self.string in (')', ']', '}', ':', ','): + del_ws = 1 + elif self.string == '`': + if backquote: + del_ws = 1 + else: + append_ws = 0 + backquote = not backquote + elif self.type == tokenize.NL: + append = 0 + if append: + if del_ws and tokens and tokens[-1] == ' ': + del tokens[-1] + tokens.append(self.string) + last_type = self.type + last_string = self.string + if append_ws: + tokens.append(' ') + self.next() + self.next() + text = ''.join(tokens) + return text.strip() + def trim_docstring(text): """ |
