From 684860280687561f6312e206c4ccfbe4baa17e89 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 14 Oct 2021 21:23:52 +0300 Subject: bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) (GH-28953) They support now splitting escape sequences between input chunks. Add the third parameter "final" in codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior. (cherry picked from commit 39aa98346d5dd8ac591a7cafb467af21c53f1e5d) --- Doc/data/python3.9.abi | 640 +++++++++++---------- Include/cpython/unicodeobject.h | 8 + Lib/encodings/raw_unicode_escape.py | 9 +- Lib/test/test_codecs.py | 35 +- .../2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst | 2 + Modules/_codecsmodule.c | 13 +- Modules/clinic/_codecsmodule.c.h | 23 +- Objects/unicodeobject.c | 64 ++- 8 files changed, 443 insertions(+), 351 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst diff --git a/Doc/data/python3.9.abi b/Doc/data/python3.9.abi index 3b202fa39b..e2037436bd 100644 --- a/Doc/data/python3.9.abi +++ b/Doc/data/python3.9.abi @@ -1290,6 +1290,7 @@ + @@ -4199,10 +4200,10 @@ - + - + @@ -4808,7 +4809,7 @@ - + @@ -5430,7 +5431,7 @@ - + @@ -7213,7 +7214,7 @@ - + @@ -7588,13 +7589,13 @@ - + - + @@ -7650,7 +7651,7 @@ - + @@ -8701,7 +8702,7 @@ - + @@ -9664,7 +9665,7 @@ - + @@ -10044,7 +10045,7 @@ - + @@ -10463,389 +10464,396 @@ - + - + - - + + - - - + + + - - - + + + - - - - + + + + - - - + + + - - - + + + - - - - + + + + - - - + + + - - + + - - + + - - + + - - + + - - - + + + - - - - - + + + + + - - + + - - - - + + + + - - - - + + + + - - - + + + - - - + + + - - - + + + - - - - + + + + - - + + - - - - + + + + - - - - + + + + - - - - - + + + + + - - - - + + + + - - - - + + + + - - + + - - + + - + - - - + + + - - - + + + - + - - - + + + - - - + + + - - - + + + - - - + + + - - - - - + + + + + - - - - - + + + + + - - - + + + - - - - - - + + + + + + - - - - - - + + + + + + - - - - - - + + + + + + - - - - - + + + + + - - - - - - - - - - + + + + + + + + + + - - - - - + + + + + - - - + + + - - + + - - - - + + + + - - - - - + + + + + - - - + + + - - - - - + + + + + - - - - + + + + - - + + - - - - - + + + + + - + - - - - + + + + - - - - + + + + - + - - - - + + + + - - - - + + + + - - - + + + - - + + - - - - + + + + - - - + + + + + - - + + + - - - - + + - - - - - + + + + + + + + + + + @@ -10925,9 +10933,9 @@ - - - + + + @@ -10968,9 +10976,9 @@ - - - + + + @@ -10996,9 +11004,9 @@ - - - + + + @@ -11223,15 +11231,15 @@ - - - - + + + + - - - + + + @@ -11258,14 +11266,14 @@ - - - + + + - - - + + + @@ -11274,36 +11282,36 @@ - - - - + + + + - - - + + + - - - + + + - - - + + + - - - - - + + + + + - - + + @@ -11340,25 +11348,25 @@ - + - + - + - + - + - + @@ -11442,37 +11450,37 @@ - + - + - + - + - + - + - + - + - + - + - + @@ -12271,7 +12279,7 @@ - + @@ -16829,7 +16837,7 @@ - + @@ -18033,10 +18041,10 @@ - + - + @@ -18047,7 +18055,7 @@ - + @@ -18056,10 +18064,10 @@ - + - + @@ -18080,7 +18088,7 @@ - + diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index d3c906aa92..1b460c9f18 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -888,6 +888,14 @@ Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( Py_ssize_t length /* Number of Py_UNICODE chars to encode */ ); +/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ +); + /* --- Latin-1 Codecs ----------------------------------------------------- */ PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py index 2b919b40d3..46c8e070dd 100644 --- a/Lib/encodings/raw_unicode_escape.py +++ b/Lib/encodings/raw_unicode_escape.py @@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return codecs.raw_unicode_escape_encode(input, self.errors)[0] -class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return codecs.raw_unicode_escape_decode(input, self.errors)[0] +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return codecs.raw_unicode_escape_decode(input, errors, final) class StreamWriter(Codec,codecs.StreamWriter): pass class StreamReader(Codec,codecs.StreamReader): - pass + def decode(self, input, errors='strict'): + return codecs.raw_unicode_escape_decode(input, errors, False) ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 09ab852b39..2f7f93a29f 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -2457,7 +2457,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase): ] ) -class RawUnicodeEscapeTest(unittest.TestCase): +class RawUnicodeEscapeTest(ReadTest, unittest.TestCase): + encoding = "raw-unicode-escape" + + test_lone_surrogates = None + def test_empty(self): self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) @@ -2506,6 +2510,35 @@ class RawUnicodeEscapeTest(unittest.TestCase): self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) + def test_partial(self): + self.check_partial( + "\x00\t\n\r\\\xff\uffff\U00010000", + [ + '\x00', + '\x00\t', + '\x00\t\n', + '\x00\t\n\r', + '\x00\t\n\r', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff\U00010000', + ] + ) + class EscapeEncodeTest(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst new file mode 100644 index 0000000000..f2c0ae4ae5 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst @@ -0,0 +1,2 @@ +Fix incremental decoder and stream reader in the "raw-unicode-escape" codec. +Previously they failed if the escape sequence was split. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index f22d4daca0..cbe5cc50f1 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -507,17 +507,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, _codecs.raw_unicode_escape_decode data: Py_buffer(accept={str, buffer}) errors: str(accept={str, NoneType}) = None + final: bool(accept={int}) = True / [clinic start generated code]*/ static PyObject * _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, - const char *errors) -/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/ + const char *errors, int final) +/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/ { - PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len, - errors); - return codec_tuple(decoded, data->len); + Py_ssize_t consumed = data->len; + PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len, + errors, + final ? NULL : &consumed); + return codec_tuple(decoded, consumed); } /*[clinic input] diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index 4e2c057007..7ddc36de8a 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -1234,7 +1234,7 @@ exit: } PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__, -"raw_unicode_escape_decode($module, data, errors=None, /)\n" +"raw_unicode_escape_decode($module, data, errors=None, final=True, /)\n" "--\n" "\n"); @@ -1243,7 +1243,7 @@ PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__, static PyObject * _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, - const char *errors); + const char *errors, int final); static PyObject * _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs) @@ -1251,8 +1251,9 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss PyObject *return_value = NULL; Py_buffer data = {NULL, NULL}; const char *errors = NULL; + int final = 1; - if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 2)) { + if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 3)) { goto exit; } if (PyUnicode_Check(args[0])) { @@ -1293,8 +1294,20 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss _PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]); goto exit; } + if (nargs < 3) { + goto skip_optional; + } + if (PyFloat_Check(args[2])) { + PyErr_SetString(PyExc_TypeError, + "integer argument expected, got float" ); + goto exit; + } + final = _PyLong_AsInt(args[2]); + if (final == -1 && PyErr_Occurred()) { + goto exit; + } skip_optional: - return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors); + return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors, final); exit: /* Cleanup for data */ @@ -2935,4 +2948,4 @@ exit: #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=d4b696fe54cfee8f input=a9049054013a1b77]*/ +/*[clinic end generated code: output=eed7dc9312baf252 input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d6fc03e1ae..7767d140e6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6308,8 +6308,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_ASCII_CHAR(ch) \ @@ -6336,7 +6334,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, continue; } - startinpos = s - starts - 1; + Py_ssize_t startinpos = s - starts - 1; /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; @@ -6483,8 +6481,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, *consumed = startinpos; break; } - error: - endinpos = s-starts; + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6679,9 +6677,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, /* --- Raw Unicode Escape Codec ------------------------------------------- */ PyObject * -PyUnicode_DecodeRawUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { const char *starts = s; _PyUnicodeWriter writer; @@ -6690,6 +6689,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject *exc = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -6708,8 +6710,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_CHAR(ch) \ @@ -6724,11 +6724,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (c != '\\' || s >= end) { + if (c != '\\' || (s >= end && !consumed)) { WRITE_CHAR(c); continue; } + Py_ssize_t startinpos = s - starts - 1; + /* \ - Escapes */ + if (s >= end) { + assert(consumed); + // Set message to silent compiler warning. + // Actually it is never used. + message = "\\ at end of string"; + goto incomplete; + } + c = (unsigned char) *s++; if (c == 'u') { count = 4; @@ -6744,10 +6754,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, WRITE_CHAR(c); continue; } - startinpos = s - starts - 2; /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6760,18 +6772,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (!count) { - if (ch <= MAX_UNICODE) { - WRITE_CHAR(ch); - continue; - } + if (ch > MAX_UNICODE) { message = "\\Uxxxxxxxx out of range"; + goto error; } + WRITE_CHAR(ch); + continue; - endinpos = s-starts; + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6793,7 +6810,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); return NULL; +} +PyObject * +PyUnicode_DecodeRawUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL); } -- cgit v1.2.1