From 35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2 Mon Sep 17 00:00:00 2001 From: Bob Ippolito Date: Tue, 7 May 2013 16:38:34 -0700 Subject: pass-through in decoder for lone surrogates #62 --- CHANGES.txt | 6 +++ conf.py | 4 +- setup.py | 2 +- simplejson/__init__.py | 2 +- simplejson/_speedups.c | 98 ++++++++++++++++--------------------- simplejson/decoder.py | 38 +++++++------- simplejson/tests/test_scanstring.py | 61 ++++++++++++++++++++--- simplejson/tests/test_unicode.py | 13 +---- 8 files changed, 123 insertions(+), 101 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 59aa614..6e056a4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,9 @@ +Version 3.3.0 released 2013-05-07 + +* Unpaired surrogates once again pass through the decoder, to match older + behavior and the RFC-4627 spec. + https://github.com/simplejson/simplejson/issues/62 + Version 3.2.0 released 2013-05-01 * New ignore_nan kwarg in encoder that serializes out diff --git a/conf.py b/conf.py index ec7cfca..2fb5544 100644 --- a/conf.py +++ b/conf.py @@ -42,9 +42,9 @@ copyright = '2013, Bob Ippolito' # other places throughout the built documents. # # The short X.Y version. -version = '3.2' +version = '3.3' # The full version, including alpha/beta/rc tags. -release = '3.2.0' +release = '3.3.0' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/setup.py b/setup.py index 7527632..0cb9e6a 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from distutils.errors import CCompilerError, DistutilsExecError, \ DistutilsPlatformError IS_PYPY = hasattr(sys, 'pypy_translation_info') -VERSION = '3.2.0' +VERSION = '3.3.0' DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python" with open('README.rst', 'r') as f: diff --git a/simplejson/__init__.py b/simplejson/__init__.py index 37a9e52..7fc8153 100644 --- a/simplejson/__init__.py +++ b/simplejson/__init__.py @@ -98,7 +98,7 @@ Using simplejson.tool from the shell to validate and pretty-print:: Expecting property name: line 1 column 3 (char 2) """ from __future__ import absolute_import -__version__ = '3.2.0' +__version__ = '3.3.0' __all__ = [ 'dump', 'dumps', 'load', 'loads', 'JSONDecoder', 'JSONDecodeError', 'JSONEncoder', diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c index 93f136c..e888873 100644 --- a/simplejson/_speedups.c +++ b/simplejson/_speedups.c @@ -124,9 +124,6 @@ JSON_Accu_Destroy(JSON_Accu *acc); #define ERR_STRING_CONTROL "Invalid control character %r at" #define ERR_STRING_ESC1 "Invalid \\X escape sequence %r" #define ERR_STRING_ESC4 "Invalid \\uXXXX escape sequence" -#define ERR_STRING_SURROGATE "Invalid \\uXXXX\\uXXXX surrogate pair" -#define ERR_STRING_HIGH_SURROGATE "Unpaired high surrogate" -#define ERR_STRING_LOW_SURROGATE "Unpaired low surrogate" typedef struct _PyScannerObject { PyObject_HEAD @@ -1025,21 +1022,14 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s #if (PY_MAJOR_VERSION >= 3 || defined(Py_UNICODE_WIDE)) /* Surrogate pair */ if ((c & 0xfc00) == 0xd800) { - JSON_UNICHR c2 = 0; - if (end + 6 >= len) { - raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5); - goto bail; - } - if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5); - goto bail; - } - end += 6; - /* Decode 4 hex digits */ - for (; next < end; next++) { - c2 <<= 4; - JSON_UNICHR digit = buf[next]; - switch (digit) { + if (end + 6 < len && buf[next] == '\\' && buf[next+1] == 'u') { + JSON_UNICHR c2 = 0; + end += 6; + /* Decode 4 hex digits */ + for (next += 2; next < end; next++) { + c2 <<= 4; + JSON_UNICHR digit = buf[next]; + switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c2 |= (digit - '0'); break; @@ -1052,18 +1042,18 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s default: raise_errmsg(ERR_STRING_ESC4, pystr, end - 5); goto bail; - } - } - if ((c2 & 0xfc00) != 0xdc00) { - raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5); - goto bail; - } - c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); - } - else if ((c & 0xfc00) == 0xdc00) { - raise_errmsg(ERR_STRING_LOW_SURROGATE, pystr, end - 5); - goto bail; - } + } + } + if ((c2 & 0xfc00) != 0xdc00) { + /* not a low surrogate, rewind */ + end -= 6; + next = end; + } + else { + c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); + } + } + } #endif /* PY_MAJOR_VERSION >= 3 || Py_UNICODE_WIDE */ } if (c > 0x7f) { @@ -1234,21 +1224,15 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next /* Surrogate pair */ if ((c & 0xfc00) == 0xd800) { JSON_UNICHR c2 = 0; - if (end + 6 >= len) { - raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5); - goto bail; - } - if (PyUnicode_READ(kind, buf, next++) != '\\' || - PyUnicode_READ(kind, buf, next++) != 'u') { - raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5); - goto bail; - } - end += 6; - /* Decode 4 hex digits */ - for (; next < end; next++) { - JSON_UNICHR digit = PyUnicode_READ(kind, buf, next); - c2 <<= 4; - switch (digit) { + if (end + 6 < len && + PyUnicode_READ(kind, buf, next) == '\\' && + PyUnicode_READ(kind, buf, next + 1) == 'u') { + end += 6; + /* Decode 4 hex digits */ + for (next += 2; next < end; next++) { + JSON_UNICHR digit = PyUnicode_READ(kind, buf, next); + c2 <<= 4; + switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c2 |= (digit - '0'); break; @@ -1261,18 +1245,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next default: raise_errmsg(ERR_STRING_ESC4, pystr, end - 5); goto bail; - } - } - if ((c2 & 0xfc00) != 0xdc00) { - raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5); - goto bail; - } - c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); - } - else if ((c & 0xfc00) == 0xdc00) { - raise_errmsg(ERR_STRING_LOW_SURROGATE, pystr, end - 5); - goto bail; - } + } + } + if ((c2 & 0xfc00) != 0xdc00) { + /* not a low surrogate, rewind */ + end -= 6; + next = end; + } + else { + c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); + } + } + } #endif } APPEND_OLD_CHUNK diff --git a/simplejson/decoder.py b/simplejson/decoder.py index 54ced0a..5ccb450 100644 --- a/simplejson/decoder.py +++ b/simplejson/decoder.py @@ -102,36 +102,32 @@ def py_scanstring(s, end, encoding=None, strict=True, # Unicode escape sequence msg = "Invalid \\uXXXX escape sequence" esc = s[end + 1:end + 5] - next_end = end + 5 - if len(esc) != 4: - raise JSONDecodeError(msg, s, end) + escX = esc[1:2] + if len(esc) != 4 or escX == 'x' or escX == 'X': + raise JSONDecodeError(msg, s, end - 1) try: uni = int(esc, 16) except ValueError: - raise JSONDecodeError(msg, s, end) + raise JSONDecodeError(msg, s, end - 1) + end += 5 # Check for surrogate pair on UCS-4 systems - if _maxunicode > 65535: - unimask = uni & 0xfc00 - if unimask == 0xd800: - msg = "Unpaired high surrogate" - if not s[end + 5:end + 7] == '\\u': - raise JSONDecodeError(msg, s, end) - esc2 = s[end + 7:end + 11] - if len(esc2) != 4: - raise JSONDecodeError(msg, s, end) + # Note that this will join high/low surrogate pairs + # but will also pass unpaired surrogates through + if (_maxunicode > 65535 and + uni & 0xfc00 == 0xd800 and + s[end:end + 2] == '\\u'): + esc2 = s[end + 2:end + 6] + escX = esc2[1:2] + if len(esc2) == 4 and not (escX == 'x' or escX == 'X'): try: uni2 = int(esc2, 16) except ValueError: raise JSONDecodeError(msg, s, end) - if uni2 & 0xfc00 != 0xdc00: - raise JSONDecodeError(msg, s, end) - uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) - next_end += 6 - elif unimask == 0xdc00: - msg = "Unpaired low surrogate" - raise JSONDecodeError(msg, s, end) + if uni2 & 0xfc00 == 0xdc00: + uni = 0x10000 + (((uni - 0xd800) << 10) | + (uni2 - 0xdc00)) + end += 6 char = unichr(uni) - end = next_end # Append the unescaped character _append(char) return _join(chunks), end diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py index 3b63d6b..3d98f0d 100644 --- a/simplejson/tests/test_scanstring.py +++ b/simplejson/tests/test_scanstring.py @@ -23,10 +23,6 @@ class TestScanString(TestCase): self._test_scanstring(simplejson.decoder.c_scanstring) def _test_scanstring(self, scanstring): - self.assertEqual( - scanstring('"z\\ud834\\udd20x"', 1, None, True), - (u'z\U0001d120x', 16)) - if sys.maxunicode == 65535: self.assertEqual( scanstring(u'"z\U0001d120x"', 1, None, True), @@ -129,9 +125,10 @@ class TestScanString(TestCase): self.assertRaises(ValueError, scanstring, '\\u012', 0, None, True) self.assertRaises(ValueError, scanstring, '\\u0123', 0, None, True) if sys.maxunicode > 65535: - self.assertRaises(ValueError, scanstring, '\\ud834"', 0, None, True), - self.assertRaises(ValueError, scanstring, '\\ud834\\u"', 0, None, True), - self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True), + self.assertRaises(ValueError, + scanstring, '\\ud834\\u"', 0, None, True) + self.assertRaises(ValueError, + scanstring, '\\ud834\\x0123"', 0, None, True) def test_issue3623(self): self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1, @@ -145,3 +142,53 @@ class TestScanString(TestCase): assert maxsize is not None self.assertRaises(OverflowError, json.decoder.scanstring, "xxx", maxsize + 1) + + def test_surrogates(self): + scanstring = json.decoder.scanstring + + def assertScan(given, expect, test_utf8=True): + givens = [given] + if not PY3 and test_utf8: + givens.append(given.encode('utf8')) + for given in givens: + (res, count) = scanstring(given, 1, None, True) + self.assertEqual(len(given), count) + self.assertEqual(res, expect) + + assertScan( + u'"z\\ud834\\u0079x"', + u'z\ud834yx') + assertScan( + u'"z\\ud834\\udd20x"', + u'z\U0001d120x') + assertScan( + u'"z\\ud834\\ud834\\udd20x"', + u'z\ud834\U0001d120x') + assertScan( + u'"z\\ud834x"', + u'z\ud834x') + assertScan( + u'"z\\udd20x"', + u'z\udd20x') + assertScan( + u'"z\ud834x"', + u'z\ud834x') + # It may look strange to join strings together, but Python is drunk. + # https://gist.github.com/etrepum/5538443 + assertScan( + u'"z\\ud834\udd20x12345"', + u''.join([u'z\ud834', u'\udd20x12345'])) + assertScan( + u'"z\ud834\\udd20x"', + u''.join([u'z\ud834', u'\udd20x'])) + # these have different behavior given UTF8 input, because the surrogate + # pair may be joined (in maxunicode > 65535 builds) + assertScan( + u''.join([u'"z\ud834', u'\udd20x"']), + u''.join([u'z\ud834', u'\udd20x']), + test_utf8=False) + + self.assertRaises(ValueError, + scanstring, u'"z\\ud83x"', 1, None, True) + self.assertRaises(ValueError, + scanstring, u'"z\\ud834\\udd2x"', 1, None, True) diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py index f240176..f04cc5c 100644 --- a/simplejson/tests/test_unicode.py +++ b/simplejson/tests/test_unicode.py @@ -123,26 +123,15 @@ class TestUnicode(TestCase): self.assertRaises(json.JSONDecodeError, json.loads, '"\\u1x34"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ux234"') if sys.maxunicode > 65535: - # unpaired low surrogate - self.assertRaises(json.JSONDecodeError, json.loads, '"\\udc00"') - self.assertRaises(json.JSONDecodeError, json.loads, '"\\udcff"') - # unpaired high surrogate - self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800"') - self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800x"') - self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800xx"') - self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800xxxxxx"') + # invalid escape sequence for low surrogate self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000"') - # invalid escape sequence for low surrogate self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000x"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00x0"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0x00"') self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ux000"') - # invalid value for low surrogate - self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0000"') - self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ufc00"') def test_ensure_ascii_still_works(self): # in the ascii range, ensure that everything is the same -- cgit v1.2.1