summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Lib/test/test_exceptions.py 12
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst 2
-rw-r--r-- Parser/tokenizer.c 13
3 files changed, 19 insertions, 8 deletions
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index c861d8fe9e..102102669d 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -2387,6 +2387,18 @@ class SyntaxErrorTests(unittest.TestCase):
finally:
unlink(TESTFN)
+ def test_non_utf8(self):
+ # Check non utf-8 characters
+ try:
+ with open(TESTFN, 'bw') as testfile:
+ testfile.write(b'\x7fELF\x02\x01\x01\x00\x00\x00')
+ rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
+ err = err.decode('utf-8').splitlines()
+
+ self.assertEqual(err[-1], "SyntaxError: invalid non-printable character U+007F")
+ finally:
+ unlink(TESTFN)
+
def test_attributes_new_constructor(self):
args = ("bad.py", 1, 2, "abcdefg", 1, 100)
the_exception = SyntaxError("bad bad", args)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst
new file mode 100644
index 0000000000..6ca91f0344
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst
@@ -0,0 +1,2 @@
+Fix parser error when parsing non-utf8 characters in source files. Patch by
+Pablo Galindo.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 6358cdf654..a560572ac6 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -819,10 +819,10 @@ tok_readline_raw(struct tok_state *tok)
tok_concatenate_interactive_new_line(tok, line) == -1) {
return 0;
}
- if (*tok->inp == '\0') {
+ tok->inp = strchr(tok->inp, '\0');
+ if (tok->inp == tok->buf) {
return 0;
}
- tok->inp = strchr(tok->inp, '\0');
} while (tok->inp[-1] != '\n');
return 1;
}
@@ -984,12 +984,9 @@ tok_underflow_file(struct tok_state *tok) {
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
- if (!tok->encoding
- && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
- if (!ensure_utf8(tok->cur, tok)) {
- error_ret(tok);
- return 0;
- }
+ if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+ error_ret(tok);
+ return 0;
}
assert(tok->done == E_OK);
return tok->done == E_OK;