From 7829bba45d0e2446f3a0ca240bfe46959f01071e Mon Sep 17 00:00:00 2001 From: Ammar Askar Date: Fri, 6 Jul 2018 06:23:13 -0400 Subject: [2.7] bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (GH-7891) (#8133) Most of the change involves fixing up the test suite, which previously made the assumption that there wouldn't be a new line if the input didn't end in one. Contributed by Ammar Askar. (cherry picked from commit c4ef4896eac86a6759901c8546e26de4695a1389) --- Lib/test/test_tokenize.py | 46 ++++++++++++++++++++++++++++++++++------------ Lib/tokenize.py | 10 ++++++++++ 2 files changed, 44 insertions(+), 12 deletions(-) (limited to 'Lib') diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index fd9486bdd7..a4625971d3 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,32 +1,54 @@ from test import test_support -from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, +from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, NEWLINE, STRING, ENDMARKER, tok_name, Untokenizer, tokenize) from StringIO import StringIO import os from unittest import TestCase +# Converts a source string into a list of textual representation +# of the tokens such as: +# ` NAME 'if' (1, 0) (1, 2)` +# to make writing tests easier. +def stringify_tokens_from_source(token_generator, source_string): + result = [] + num_lines = len(source_string.splitlines()) + missing_trailing_nl = source_string[-1] not in '\r\n' + + for type, token, start, end, line in token_generator: + if type == ENDMARKER: + break + # Ignore the new line on the last line if the input lacks one + if missing_trailing_nl and type == NEWLINE and end[0] == num_lines: + continue + type = tok_name[type] + result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" % + locals()) + + return result + class TokenizeTest(TestCase): # Tests for the tokenize module. # The tests can be really simple. Given a small fragment of source - # code, print out a table with tokens. The ENDMARKER is omitted for - # brevity. + # code, print out a table with tokens. The ENDMARKER, ENCODING and + # final NEWLINE are omitted for brevity. def check_tokenize(self, s, expected): # Format the tokens in s in a table format. - # The ENDMARKER is omitted. - result = [] f = StringIO(s) - for type, token, start, end, line in generate_tokens(f.readline): - if type == ENDMARKER: - break - type = tok_name[type] - result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" % - locals()) + result = stringify_tokens_from_source(generate_tokens(f.readline), s) + self.assertEqual(result, expected.rstrip().splitlines()) + def test_implicit_newline(self): + # Make sure that the tokenizer puts in an implicit NEWLINE + # when the input lacks a trailing new line. + f = StringIO("x") + tokens = list(generate_tokens(f.readline)) + self.assertEqual(tokens[-2][0], NEWLINE) + self.assertEqual(tokens[-1][0], ENDMARKER) def test_basic(self): self.check_tokenize("1 + 1", """\ @@ -616,7 +638,7 @@ class TestRoundtrip(TestCase): self.check_roundtrip("if x == 1:\n" " print x\n") self.check_roundtrip("# This is a comment\n" - "# This also") + "# This also\n") # Some people use different formatting conventions, which makes # untokenize a little trickier. Note that this test involves trailing diff --git a/Lib/tokenize.py b/Lib/tokenize.py index d426cd2df5..6c857f8547 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -306,8 +306,15 @@ def generate_tokens(readline): contline = None indents = [0] + last_line = b'' + line = b'' while 1: # loop over lines in stream try: + # We capture the value of the line variable here because + # readline uses the empty string '' to signal end of input, + # hence `line` itself will always be overwritten at the end + # of this loop. + last_line = line line = readline() except StopIteration: line = '' @@ -437,6 +444,9 @@ def generate_tokens(readline): (lnum, pos), (lnum, pos+1), line) pos += 1 + # Add an implicit NEWLINE if the input doesn't end in one + if last_line and last_line[-1] not in '\r\n': + yield (NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') for indent in indents[1:]: # pop remaining indent levels yield (DEDENT, '', (lnum, 0), (lnum, 0), '') yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') -- cgit v1.2.1