bpo-33416: Add end positions to Python AST (GH-11605)

The majority of this PR is tediously passing `end_lineno` and `end_col_offset` everywhere. Here are the non-trivial points:

* It is not possible to reconstruct end positions in the AST "on the fly": some information is lost after an AST node is constructed, so we need two more attributes for every AST node, `end_lineno` and `end_col_offset`.
* I add end position information to both the CST and the AST. Although it may be technically possible to avoid adding end positions to the CST, the code becomes more cumbersome and less efficient.
* Since the end position of a non-leaf CST node is not known until the next token is added, this requires a bit of extra care (see `_PyNode_FinalizeEndPos`). Unless I made some mistake, the algorithm should be linear.
* For statements, I "trim" the end position of suites to not include the terminal newlines and dedent (this seems to be what people would expect). For example, in

  ```python
  class C:
      pass

  pass
  ```

  the end line and end column for the class definition are (2, 8).
* For `end_col_offset` I use the common Python convention for indexing. For example, for `pass` the `end_col_offset` is 4 (not 3), so that `[0:4]` gives one the source code that corresponds to the node.
* I added a helper function, `ast.get_source_segment()`, to get the source text segment corresponding to a given AST node. It is also useful for testing.

An (inevitable) downside of this PR is that the AST now takes almost 25% more memory. I think, however, that this is probably justified by the benefits.
author: Ivan Levkivskyi <levkivskyi@gmail.com> 2019-01-22 11:18:22 +0000
committer: GitHub <noreply@github.com> 2019-01-22 11:18:22 +0000
commit: 9932a22897ef9905161dac7476e6976370e13515 (patch)
tree: 5cfbec44c7ecb01f4817274280881a74ec15c605 /Parser
parent: 7a2368063f25746d4008a74aca0dc0b82f86ff7b (diff)
download: cpython-git-9932a22897ef9905161dac7476e6976370e13515.tar.gz
6 files changed, 64 insertions, 18 deletions
diff --git a/Parser/Python.asdl b/Parser/Python.asdl
index eee982be1c..cedf37a2d9 100644
--- a/Parser/Python.asdl
+++ b/Parser/Python.asdl
@@ -50,7 +50,7 @@ module Python
 
           -- XXX Jython will be different
           -- col_offset is the byte offset in the utf8 string the parser uses
-          attributes (int lineno, int col_offset)
+          attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
 
           -- BoolOp() can use left & right?
     expr = BoolOp(boolop op, expr* values)
@@ -85,7 +85,7 @@ module Python
          | Tuple(expr* elts, expr_context ctx)
 
           -- col_offset is the byte offset in the utf8 string the parser uses
-          attributes (int lineno, int col_offset)
+          attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
 
     expr_context = Load | Store | Del | AugLoad | AugStore | Param
 
@@ -105,13 +105,13 @@ module Python
     comprehension = (expr target, expr iter, expr* ifs, int is_async)
 
     excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body)
-                    attributes (int lineno, int col_offset)
+                    attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
 
     arguments = (arg* args, arg? vararg, arg* kwonlyargs, expr* kw_defaults,
                  arg? kwarg, expr* defaults)
 
     arg = (identifier arg, expr? annotation)
-           attributes (int lineno, int col_offset)
+           attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
 
     -- keyword arguments supplied to call (NULL identifier for **kwargs)
     keyword = (identifier? arg, expr value)
diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py
index 75fb78b9c9..8640b29b8f 100644
--- a/Parser/asdl_c.py
+++ b/Parser/asdl_c.py
@@ -1250,10 +1250,12 @@ def main(srcfile, dump_module=False):
             f.write('#undef Yield   /* undefine macro conflicting with <winbase.h> */\n')
             f.write('\n')
             c = ChainOfVisitors(TypeDefVisitor(f),
-                                StructVisitor(f),
-                                PrototypeVisitor(f),
-                                )
+                                StructVisitor(f))
+
             c.visit(mod)
+            f.write("// Note: these macros affect function definitions, not only call sites.\n")
+            PrototypeVisitor(f).visit(mod)
+            f.write("\n")
             f.write("PyObject* PyAST_mod2obj(mod_ty t);\n")
             f.write("mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);\n")
             f.write("int PyAST_Check(PyObject* obj);\n")
diff --git a/Parser/node.c b/Parser/node.c
index 240d29057c..f1b70e0f68 100644
--- a/Parser/node.c
+++ b/Parser/node.c
@@ -13,6 +13,8 @@ PyNode_New(int type)
     n->n_type = type;
     n->n_str = NULL;
     n->n_lineno = 0;
+    n->n_end_lineno = 0;
+    n->n_end_col_offset = -1;
     n->n_nchildren = 0;
     n->n_child = NULL;
     return n;
@@ -75,14 +77,34 @@ fancy_roundup(int n)
                fancy_roundup(n))
 
 
+void
+_PyNode_FinalizeEndPos(node *n)
+{
+    int nch = NCH(n);
+    node *last;
+    if (nch == 0) {
+        return;
+    }
+    last = CHILD(n, nch - 1);
+    _PyNode_FinalizeEndPos(last);
+    n->n_end_lineno = last->n_end_lineno;
+    n->n_end_col_offset = last->n_end_col_offset;
+}
+
 int
-PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset)
+PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset,
+                int end_lineno, int end_col_offset)
 {
     const int nch = n1->n_nchildren;
     int current_capacity;
     int required_capacity;
     node *n;
 
+    // finalize end position of previous node (if any)
+    if (nch > 0) {
+        _PyNode_FinalizeEndPos(CHILD(n1, nch - 1));
+    }
+
     if (nch == INT_MAX || nch < 0)
         return E_OVERFLOW;
 
@@ -107,6 +129,8 @@ PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset)
     n->n_str = str;
     n->n_lineno = lineno;
     n->n_col_offset = col_offset;
+    n->n_end_lineno = end_lineno;  // this and below will be updated after all children are added.
+    n->n_end_col_offset = end_col_offset;
     n->n_nchildren = 0;
     n->n_child = NULL;
     return 0;
diff --git a/Parser/parser.c b/Parser/parser.c
index 41072c478c..a9916d392a 100644
--- a/Parser/parser.c
+++ b/Parser/parser.c
@@ -105,11 +105,13 @@ PyParser_Delete(parser_state *ps)
 /* PARSER STACK OPERATIONS */
 
 static int
-shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset)
+shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
+      int end_lineno, int end_col_offset)
 {
     int err;
     assert(!s_empty(s));
-    err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
+    err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
+                          end_lineno, end_col_offset);
     if (err)
         return err;
     s->s_top->s_state = newstate;
@@ -117,13 +119,15 @@ shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset)
 }
 
 static int
-push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
+push(stack *s, int type, dfa *d, int newstate, int lineno, int col_offset,
+     int end_lineno, int end_col_offset)
 {
     int err;
     node *n;
     n = s->s_top->s_parent;
     assert(!s_empty(s));
-    err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
+    err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
+                          end_lineno, end_col_offset);
     if (err)
         return err;
     s->s_top->s_state = newstate;
@@ -225,7 +229,9 @@ future_hack(parser_state *ps)
 
 int
 PyParser_AddToken(parser_state *ps, int type, char *str,
-                  int lineno, int col_offset, int *expected_ret)
+                  int lineno, int col_offset,
+                  int end_lineno, int end_col_offset,
+                  int *expected_ret)
 {
     int ilabel;
     int err;
@@ -257,7 +263,8 @@ PyParser_AddToken(parser_state *ps, int type, char *str,
                     dfa *d1 = PyGrammar_FindDFA(
                         ps->p_grammar, nt);
                     if ((err = push(&ps->p_stack, nt, d1,
-                        arrow, lineno, col_offset)) > 0) {
+                        arrow, lineno, col_offset,
+                        end_lineno, end_col_offset)) > 0) {
                         D(printf(" MemError: push\n"));
                         return err;
                     }
@@ -267,7 +274,8 @@ PyParser_AddToken(parser_state *ps, int type, char *str,
 
                 /* Shift the token */
                 if ((err = shift(&ps->p_stack, type, str,
-                                x, lineno, col_offset)) > 0) {
+                                x, lineno, col_offset,
+                                end_lineno, end_col_offset)) > 0) {
                     D(printf(" MemError: shift.\n"));
                     return err;
                 }
diff --git a/Parser/parser.h b/Parser/parser.h
index 39df948728..95cd39d209 100644
--- a/Parser/parser.h
+++ b/Parser/parser.h
@@ -32,7 +32,9 @@ typedef struct {
 
 parser_state *PyParser_New(grammar *g, int start);
 void PyParser_Delete(parser_state *ps);
-int PyParser_AddToken(parser_state *ps, int type, char *str, int lineno, int col_offset,
+int PyParser_AddToken(parser_state *ps, int type, char *str,
+                      int lineno, int col_offset,
+                      int end_lineno, int end_col_offset,
                       int *expected_ret);
 void PyGrammar_AddAccelerators(grammar *g);
 
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index d37e28a0a3..2b5254a8be 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -187,7 +187,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
     parser_state *ps;
     node *n;
     int started = 0;
-    int col_offset;
+    int col_offset, end_col_offset;
 
     if ((ps = PyParser_New(g, start)) == NULL) {
         err_ret->error = E_NOMEM;
@@ -270,9 +270,16 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
             col_offset = -1;
         }
 
+        if (b != NULL && b >= tok->line_start) {
+            end_col_offset = Py_SAFE_DOWNCAST(b - tok->line_start,
+                                              intptr_t, int);
+        }
+        else {
+            end_col_offset = -1;
+        }
         if ((err_ret->error =
              PyParser_AddToken(ps, (int)type, str,
-                               lineno, col_offset,
+                               lineno, col_offset, tok->lineno, end_col_offset,
                                &(err_ret->expected))) != E_OK) {
             if (err_ret->error != E_DONE) {
                 PyObject_FREE(str);
@@ -368,6 +375,9 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 done:
     PyTokenizer_Free(tok);
 
+    if (n != NULL) {
+        _PyNode_FinalizeEndPos(n);
+    }
     return n;
 }
author	Ivan Levkivskyi <levkivskyi@gmail.com>	2019-01-22 11:18:22 +0000
committer	GitHub <noreply@github.com>	2019-01-22 11:18:22 +0000
commit	9932a22897ef9905161dac7476e6976370e13515 (patch)
tree	5cfbec44c7ecb01f4817274280881a74ec15c605 /Parser
parent	7a2368063f25746d4008a74aca0dc0b82f86ff7b (diff)
download	cpython-git-9932a22897ef9905161dac7476e6976370e13515.tar.gz