summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Bob Ippolito <bob@redivi.com> 2013-05-07 16:38:34 -0700
committer: Bob Ippolito <bob@redivi.com> 2013-05-07 23:02:33 -0700
commit: 35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2 (patch)
tree: fb0698c12a52678392b8ad1a0393f5186138899a
parent: fc7b04d6bb1bfc0ffdddbd7d53ffd56f4142ea34 (diff)
download: simplejson-35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2.tar.gz
pass-through in decoder for lone surrogates #62 [refs: v3.3.0, surrogate-62, baserock/morph]
-rw-r--r-- CHANGES.txt 6
-rw-r--r-- conf.py 4
-rw-r--r-- setup.py 2
-rw-r--r-- simplejson/__init__.py 2
-rw-r--r-- simplejson/_speedups.c 98
-rw-r--r-- simplejson/decoder.py 38
-rw-r--r-- simplejson/tests/test_scanstring.py 61
-rw-r--r-- simplejson/tests/test_unicode.py 13
8 files changed, 123 insertions, 101 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 59aa614..6e056a4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Version 3.3.0 released 2013-05-07
+
+* Unpaired surrogates once again pass through the decoder, to match older
+ behavior and the RFC-4627 spec.
+ https://github.com/simplejson/simplejson/issues/62
+
Version 3.2.0 released 2013-05-01
* New ignore_nan kwarg in encoder that serializes out
diff --git a/conf.py b/conf.py
index ec7cfca..2fb5544 100644
--- a/conf.py
+++ b/conf.py
@@ -42,9 +42,9 @@ copyright = '2013, Bob Ippolito'
# other places throughout the built documents.
#
# The short X.Y version.
-version = '3.2'
+version = '3.3'
# The full version, including alpha/beta/rc tags.
-release = '3.2.0'
+release = '3.3.0'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
diff --git a/setup.py b/setup.py
index 7527632..0cb9e6a 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ from distutils.errors import CCompilerError, DistutilsExecError, \
DistutilsPlatformError
IS_PYPY = hasattr(sys, 'pypy_translation_info')
-VERSION = '3.2.0'
+VERSION = '3.3.0'
DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python"
with open('README.rst', 'r') as f:
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index 37a9e52..7fc8153 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -98,7 +98,7 @@ Using simplejson.tool from the shell to validate and pretty-print::
Expecting property name: line 1 column 3 (char 2)
"""
from __future__ import absolute_import
-__version__ = '3.2.0'
+__version__ = '3.3.0'
__all__ = [
'dump', 'dumps', 'load', 'loads',
'JSONDecoder', 'JSONDecodeError', 'JSONEncoder',
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
index 93f136c..e888873 100644
--- a/simplejson/_speedups.c
+++ b/simplejson/_speedups.c
@@ -124,9 +124,6 @@ JSON_Accu_Destroy(JSON_Accu *acc);
#define ERR_STRING_CONTROL "Invalid control character %r at"
#define ERR_STRING_ESC1 "Invalid \\X escape sequence %r"
#define ERR_STRING_ESC4 "Invalid \\uXXXX escape sequence"
-#define ERR_STRING_SURROGATE "Invalid \\uXXXX\\uXXXX surrogate pair"
-#define ERR_STRING_HIGH_SURROGATE "Unpaired high surrogate"
-#define ERR_STRING_LOW_SURROGATE "Unpaired low surrogate"
typedef struct _PyScannerObject {
PyObject_HEAD
@@ -1025,21 +1022,14 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
#if (PY_MAJOR_VERSION >= 3 || defined(Py_UNICODE_WIDE))
/* Surrogate pair */
if ((c & 0xfc00) == 0xd800) {
- JSON_UNICHR c2 = 0;
- if (end + 6 >= len) {
- raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
- goto bail;
- }
- if (buf[next++] != '\\' || buf[next++] != 'u') {
- raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
- goto bail;
- }
- end += 6;
- /* Decode 4 hex digits */
- for (; next < end; next++) {
- c2 <<= 4;
- JSON_UNICHR digit = buf[next];
- switch (digit) {
+ if (end + 6 < len && buf[next] == '\\' && buf[next+1] == 'u') {
+ JSON_UNICHR c2 = 0;
+ end += 6;
+ /* Decode 4 hex digits */
+ for (next += 2; next < end; next++) {
+ c2 <<= 4;
+ JSON_UNICHR digit = buf[next];
+ switch (digit) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
c2 |= (digit - '0'); break;
@@ -1052,18 +1042,18 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
default:
raise_errmsg(ERR_STRING_ESC4, pystr, end - 5);
goto bail;
- }
- }
- if ((c2 & 0xfc00) != 0xdc00) {
- raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
- goto bail;
- }
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
- }
- else if ((c & 0xfc00) == 0xdc00) {
- raise_errmsg(ERR_STRING_LOW_SURROGATE, pystr, end - 5);
- goto bail;
- }
+ }
+ }
+ if ((c2 & 0xfc00) != 0xdc00) {
+ /* not a low surrogate, rewind */
+ end -= 6;
+ next = end;
+ }
+ else {
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+ }
+ }
+ }
#endif /* PY_MAJOR_VERSION >= 3 || Py_UNICODE_WIDE */
}
if (c > 0x7f) {
@@ -1234,21 +1224,15 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
/* Surrogate pair */
if ((c & 0xfc00) == 0xd800) {
JSON_UNICHR c2 = 0;
- if (end + 6 >= len) {
- raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
- goto bail;
- }
- if (PyUnicode_READ(kind, buf, next++) != '\\' ||
- PyUnicode_READ(kind, buf, next++) != 'u') {
- raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
- goto bail;
- }
- end += 6;
- /* Decode 4 hex digits */
- for (; next < end; next++) {
- JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
- c2 <<= 4;
- switch (digit) {
+ if (end + 6 < len &&
+ PyUnicode_READ(kind, buf, next) == '\\' &&
+ PyUnicode_READ(kind, buf, next + 1) == 'u') {
+ end += 6;
+ /* Decode 4 hex digits */
+ for (next += 2; next < end; next++) {
+ JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
+ c2 <<= 4;
+ switch (digit) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
c2 |= (digit - '0'); break;
@@ -1261,18 +1245,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
default:
raise_errmsg(ERR_STRING_ESC4, pystr, end - 5);
goto bail;
- }
- }
- if ((c2 & 0xfc00) != 0xdc00) {
- raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
- goto bail;
- }
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
- }
- else if ((c & 0xfc00) == 0xdc00) {
- raise_errmsg(ERR_STRING_LOW_SURROGATE, pystr, end - 5);
- goto bail;
- }
+ }
+ }
+ if ((c2 & 0xfc00) != 0xdc00) {
+ /* not a low surrogate, rewind */
+ end -= 6;
+ next = end;
+ }
+ else {
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+ }
+ }
+ }
#endif
}
APPEND_OLD_CHUNK
diff --git a/simplejson/decoder.py b/simplejson/decoder.py
index 54ced0a..5ccb450 100644
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -102,36 +102,32 @@ def py_scanstring(s, end, encoding=None, strict=True,
# Unicode escape sequence
msg = "Invalid \\uXXXX escape sequence"
esc = s[end + 1:end + 5]
- next_end = end + 5
- if len(esc) != 4:
- raise JSONDecodeError(msg, s, end)
+ escX = esc[1:2]
+ if len(esc) != 4 or escX == 'x' or escX == 'X':
+ raise JSONDecodeError(msg, s, end - 1)
try:
uni = int(esc, 16)
except ValueError:
- raise JSONDecodeError(msg, s, end)
+ raise JSONDecodeError(msg, s, end - 1)
+ end += 5
# Check for surrogate pair on UCS-4 systems
- if _maxunicode > 65535:
- unimask = uni & 0xfc00
- if unimask == 0xd800:
- msg = "Unpaired high surrogate"
- if not s[end + 5:end + 7] == '\\u':
- raise JSONDecodeError(msg, s, end)
- esc2 = s[end + 7:end + 11]
- if len(esc2) != 4:
- raise JSONDecodeError(msg, s, end)
+ # Note that this will join high/low surrogate pairs
+ # but will also pass unpaired surrogates through
+ if (_maxunicode > 65535 and
+ uni & 0xfc00 == 0xd800 and
+ s[end:end + 2] == '\\u'):
+ esc2 = s[end + 2:end + 6]
+ escX = esc2[1:2]
+ if len(esc2) == 4 and not (escX == 'x' or escX == 'X'):
try:
uni2 = int(esc2, 16)
except ValueError:
raise JSONDecodeError(msg, s, end)
- if uni2 & 0xfc00 != 0xdc00:
- raise JSONDecodeError(msg, s, end)
- uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
- next_end += 6
- elif unimask == 0xdc00:
- msg = "Unpaired low surrogate"
- raise JSONDecodeError(msg, s, end)
+ if uni2 & 0xfc00 == 0xdc00:
+ uni = 0x10000 + (((uni - 0xd800) << 10) |
+ (uni2 - 0xdc00))
+ end += 6
char = unichr(uni)
- end = next_end
# Append the unescaped character
_append(char)
return _join(chunks), end
diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py
index 3b63d6b..3d98f0d 100644
--- a/simplejson/tests/test_scanstring.py
+++ b/simplejson/tests/test_scanstring.py
@@ -23,10 +23,6 @@ class TestScanString(TestCase):
self._test_scanstring(simplejson.decoder.c_scanstring)
def _test_scanstring(self, scanstring):
- self.assertEqual(
- scanstring('"z\\ud834\\udd20x"', 1, None, True),
- (u'z\U0001d120x', 16))
-
if sys.maxunicode == 65535:
self.assertEqual(
scanstring(u'"z\U0001d120x"', 1, None, True),
@@ -129,9 +125,10 @@ class TestScanString(TestCase):
self.assertRaises(ValueError, scanstring, '\\u012', 0, None, True)
self.assertRaises(ValueError, scanstring, '\\u0123', 0, None, True)
if sys.maxunicode > 65535:
- self.assertRaises(ValueError, scanstring, '\\ud834"', 0, None, True),
- self.assertRaises(ValueError, scanstring, '\\ud834\\u"', 0, None, True),
- self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True),
+ self.assertRaises(ValueError,
+ scanstring, '\\ud834\\u"', 0, None, True)
+ self.assertRaises(ValueError,
+ scanstring, '\\ud834\\x0123"', 0, None, True)
def test_issue3623(self):
self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
@@ -145,3 +142,53 @@ class TestScanString(TestCase):
assert maxsize is not None
self.assertRaises(OverflowError, json.decoder.scanstring, "xxx",
maxsize + 1)
+
+ def test_surrogates(self):
+ scanstring = json.decoder.scanstring
+
+ def assertScan(given, expect, test_utf8=True):
+ givens = [given]
+ if not PY3 and test_utf8:
+ givens.append(given.encode('utf8'))
+ for given in givens:
+ (res, count) = scanstring(given, 1, None, True)
+ self.assertEqual(len(given), count)
+ self.assertEqual(res, expect)
+
+ assertScan(
+ u'"z\\ud834\\u0079x"',
+ u'z\ud834yx')
+ assertScan(
+ u'"z\\ud834\\udd20x"',
+ u'z\U0001d120x')
+ assertScan(
+ u'"z\\ud834\\ud834\\udd20x"',
+ u'z\ud834\U0001d120x')
+ assertScan(
+ u'"z\\ud834x"',
+ u'z\ud834x')
+ assertScan(
+ u'"z\\udd20x"',
+ u'z\udd20x')
+ assertScan(
+ u'"z\ud834x"',
+ u'z\ud834x')
+ # It may look strange to join strings together, but Python is drunk.
+ # https://gist.github.com/etrepum/5538443
+ assertScan(
+ u'"z\\ud834\udd20x12345"',
+ u''.join([u'z\ud834', u'\udd20x12345']))
+ assertScan(
+ u'"z\ud834\\udd20x"',
+ u''.join([u'z\ud834', u'\udd20x']))
+ # these have different behavior given UTF8 input, because the surrogate
+ # pair may be joined (in maxunicode > 65535 builds)
+ assertScan(
+ u''.join([u'"z\ud834', u'\udd20x"']),
+ u''.join([u'z\ud834', u'\udd20x']),
+ test_utf8=False)
+
+ self.assertRaises(ValueError,
+ scanstring, u'"z\\ud83x"', 1, None, True)
+ self.assertRaises(ValueError,
+ scanstring, u'"z\\ud834\\udd2x"', 1, None, True)
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index f240176..f04cc5c 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -123,26 +123,15 @@ class TestUnicode(TestCase):
self.assertRaises(json.JSONDecodeError, json.loads, '"\\u1x34"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ux234"')
if sys.maxunicode > 65535:
- # unpaired low surrogate
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\udc00"')
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\udcff"')
- # unpaired high surrogate
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800"')
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800x"')
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800xx"')
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800xxxxxx"')
+ # invalid escape sequence for low surrogate
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000"')
- # invalid escape sequence for low surrogate
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000x"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00x0"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0x00"')
self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ux000"')
- # invalid value for low surrogate
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0000"')
- self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ufc00"')
def test_ensure_ascii_still_works(self):
# in the ascii range, ensure that everything is the same