merge 3.4 (#24022)

author: Benjamin Peterson <benjamin@python.org> 2015-04-21 12:07:06 -0400
committer: Benjamin Peterson <benjamin@python.org> 2015-04-21 12:07:06 -0400
commit: 273a720f876e754013e17fab0ab9b599284239f7 (patch)
tree: 3467634eaf7246dd125f8aa8ebad732e001a6915
parent: 8714cfdc4aeedce96fb40c73f7226535fbe7833f (diff)
parent: d73aca769f1f6eebb46faa9161cbebe806db3659 (diff)
download: cpython-git-273a720f876e754013e17fab0ab9b599284239f7.tar.gz
3 files changed, 18 insertions, 5 deletions
diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py
index 3d33bb50f7..41a92ffe25 100644
--- a/Lib/test/test_compile.py
+++ b/Lib/test/test_compile.py
@@ -1,9 +1,11 @@
 import math
+import os
 import unittest
 import sys
 import _ast
+import tempfile
 import types
-from test import support
+from test import support, script_helper
 
 class TestSpecifics(unittest.TestCase):
 
@@ -492,6 +494,16 @@ if 1:
         self.assertInvalidSingle('f()\nxy # blah\nblah()')
         self.assertInvalidSingle('x = 5 # comment\nx = 6\n')
 
+    def test_particularly_evil_undecodable(self):
+        # Issue 24022: undecodable source must give an error, not a crash.
+        src = b'0000\x00\n00000000000\n\x00\n\x9e\n'
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, "bad.py")
+            with open(fn, "wb") as fp:
+                fp.write(src)
+            res = script_helper.run_python_until_end(fn)[0]
+        self.assertIn(b"Non-UTF-8", res.err)
+
     @support.cpython_only
     def test_compiler_recursion_limit(self):
         # Expected limit is sys.getrecursionlimit() * the scaling factor
diff --git a/Misc/NEWS b/Misc/NEWS
index 7277617b64..515a6ef0ad 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ Release date: 2015-04-24
 Core and Builtins
 -----------------
 
+- Issue #24022: Fix tokenizer crash when processing undecodable source code.
+
 Library
 -------
 
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index ef7b19fb42..ac413a8455 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1307,6 +1307,8 @@ verify_identifier(struct tok_state *tok)
 {
     PyObject *s;
     int result;
+    if (tok->decoding_erred)
+        return 0;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
     if (s == NULL || PyUnicode_READY(s) == -1) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
@@ -1475,11 +1477,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
             c = tok_nextc(tok);
         }
         tok_backup(tok, c);
-        if (nonascii &&
-            !verify_identifier(tok)) {
-            tok->done = E_IDENTIFIER;
+        if (nonascii && !verify_identifier(tok))
             return ERRORTOKEN;
-        }
         *p_start = tok->start;
         *p_end = tok->cur;
         return NAME;
author	Benjamin Peterson <benjamin@python.org>	2015-04-21 12:07:06 -0400
committer	Benjamin Peterson <benjamin@python.org>	2015-04-21 12:07:06 -0400
commit	273a720f876e754013e17fab0ab9b599284239f7 (patch)
tree	3467634eaf7246dd125f8aa8ebad732e001a6915
parent	8714cfdc4aeedce96fb40c73f7226535fbe7833f (diff)
parent	d73aca769f1f6eebb46faa9161cbebe806db3659 (diff)
download	cpython-git-273a720f876e754013e17fab0ab9b599284239f7.tar.gz