pass-through in decoder for lone surrogates #62 v3.3.0 surrogate-62 baserock/morph

author: Bob Ippolito <bob@redivi.com> 2013-05-07 16:38:34 -0700
committer: Bob Ippolito <bob@redivi.com> 2013-05-07 23:02:33 -0700
commit: 35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2 (patch)
tree: fb0698c12a52678392b8ad1a0393f5186138899a
parent: fc7b04d6bb1bfc0ffdddbd7d53ffd56f4142ea34 (diff)
download: simplejson-35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2.tar.gz
8 files changed, 123 insertions, 101 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 59aa614..6e056a4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Version 3.3.0 released 2013-05-07
+
+* Unpaired surrogates once again pass through the decoder, to match older
+  behavior and the RFC-4627 spec.
+  https://github.com/simplejson/simplejson/issues/62
+
 Version 3.2.0 released 2013-05-01
 
 * New ignore_nan kwarg in encoder that serializes out
diff --git a/conf.py b/conf.py
index ec7cfca..2fb5544 100644
--- a/conf.py
+++ b/conf.py
@@ -42,9 +42,9 @@ copyright = '2013, Bob Ippolito'
 # other places throughout the built documents.
 #
 # The short X.Y version.
-version = '3.2'
+version = '3.3'
 # The full version, including alpha/beta/rc tags.
-release = '3.2.0'
+release = '3.3.0'
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
diff --git a/setup.py b/setup.py
index 7527632..0cb9e6a 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ from distutils.errors import CCompilerError, DistutilsExecError, \
     DistutilsPlatformError
 
 IS_PYPY = hasattr(sys, 'pypy_translation_info')
-VERSION = '3.2.0'
+VERSION = '3.3.0'
 DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python"
 
 with open('README.rst', 'r') as f:
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index 37a9e52..7fc8153 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -98,7 +98,7 @@ Using simplejson.tool from the shell to validate and pretty-print::
     Expecting property name: line 1 column 3 (char 2)
 """
 from __future__ import absolute_import
-__version__ = '3.2.0'
+__version__ = '3.3.0'
 __all__ = [
     'dump', 'dumps', 'load', 'loads',
     'JSONDecoder', 'JSONDecodeError', 'JSONEncoder',
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
index 93f136c..e888873 100644
--- a/simplejson/_speedups.c
+++ b/simplejson/_speedups.c
@@ -124,9 +124,6 @@ JSON_Accu_Destroy(JSON_Accu *acc);
 #define ERR_STRING_CONTROL "Invalid control character %r at"
 #define ERR_STRING_ESC1 "Invalid \\X escape sequence %r"
 #define ERR_STRING_ESC4 "Invalid \\uXXXX escape sequence"
-#define ERR_STRING_SURROGATE "Invalid \\uXXXX\\uXXXX surrogate pair"
-#define ERR_STRING_HIGH_SURROGATE "Unpaired high surrogate"
-#define ERR_STRING_LOW_SURROGATE "Unpaired low surrogate"
 
 typedef struct _PyScannerObject {
     PyObject_HEAD
@@ -1025,21 +1022,14 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
 #if (PY_MAJOR_VERSION >= 3 || defined(Py_UNICODE_WIDE))
             /* Surrogate pair */
             if ((c & 0xfc00) == 0xd800) {
-                JSON_UNICHR c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
-                    goto bail;
-                }
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
-                    raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
-                    goto bail;
-                }
-                end += 6;
-                /* Decode 4 hex digits */
-                for (; next < end; next++) {
-                    c2 <<= 4;
-                    JSON_UNICHR digit = buf[next];
-                    switch (digit) {
+                if (end + 6 < len && buf[next] == '\\' && buf[next+1] == 'u') {
+		    JSON_UNICHR c2 = 0;
+		    end += 6;
+		    /* Decode 4 hex digits */
+		    for (next += 2; next < end; next++) {
+			c2 <<= 4;
+			JSON_UNICHR digit = buf[next];
+			switch (digit) {
                         case '0': case '1': case '2': case '3': case '4':
                         case '5': case '6': case '7': case '8': case '9':
                             c2 |= (digit - '0'); break;
@@ -1052,18 +1042,18 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
                         default:
                             raise_errmsg(ERR_STRING_ESC4, pystr, end - 5);
                             goto bail;
-                    }
-                }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg(ERR_STRING_LOW_SURROGATE, pystr, end - 5);
-                goto bail;
-            }
+			}
+		    }
+		    if ((c2 & 0xfc00) != 0xdc00) {
+			/* not a low surrogate, rewind */
+			end -= 6;
+			next = end;
+		    }
+		    else {
+			c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+		    }
+		}
+	    }
 #endif /* PY_MAJOR_VERSION >= 3 || Py_UNICODE_WIDE */
         }
         if (c > 0x7f) {
@@ -1234,21 +1224,15 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
             /* Surrogate pair */
             if ((c & 0xfc00) == 0xd800) {
                 JSON_UNICHR c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
-                    goto bail;
-                }
-                if (PyUnicode_READ(kind, buf, next++) != '\\' ||
-                    PyUnicode_READ(kind, buf, next++) != 'u') {
-                    raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
-                    goto bail;
-                }
-                end += 6;
-                /* Decode 4 hex digits */
-                for (; next < end; next++) {
-                    JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
-                    c2 <<= 4;
-                    switch (digit) {
+		if (end + 6 < len &&
+		    PyUnicode_READ(kind, buf, next) == '\\' &&
+		    PyUnicode_READ(kind, buf, next + 1) == 'u') {
+		    end += 6;
+		    /* Decode 4 hex digits */
+		    for (next += 2; next < end; next++) {
+			JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
+			c2 <<= 4;
+			switch (digit) {
                         case '0': case '1': case '2': case '3': case '4':
                         case '5': case '6': case '7': case '8': case '9':
                             c2 |= (digit - '0'); break;
@@ -1261,18 +1245,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                         default:
                             raise_errmsg(ERR_STRING_ESC4, pystr, end - 5);
                             goto bail;
-                    }
-                }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg(ERR_STRING_HIGH_SURROGATE, pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg(ERR_STRING_LOW_SURROGATE, pystr, end - 5);
-                goto bail;
-            }
+			}
+		    }
+		    if ((c2 & 0xfc00) != 0xdc00) {
+			/* not a low surrogate, rewind */
+			end -= 6;
+			next = end;
+		    }
+		    else {
+			c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+		    }
+		}
+	    }
 #endif
         }
         APPEND_OLD_CHUNK
diff --git a/simplejson/decoder.py b/simplejson/decoder.py
index 54ced0a..5ccb450 100644
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -102,36 +102,32 @@ def py_scanstring(s, end, encoding=None, strict=True,
             # Unicode escape sequence
             msg = "Invalid \\uXXXX escape sequence"
             esc = s[end + 1:end + 5]
-            next_end = end + 5
-            if len(esc) != 4:
-                raise JSONDecodeError(msg, s, end)
+            escX = esc[1:2]
+            if len(esc) != 4 or escX == 'x' or escX == 'X':
+                raise JSONDecodeError(msg, s, end - 1)
             try:
                 uni = int(esc, 16)
             except ValueError:
-                raise JSONDecodeError(msg, s, end)
+                raise JSONDecodeError(msg, s, end - 1)
+            end += 5
             # Check for surrogate pair on UCS-4 systems
-            if _maxunicode > 65535:
-                unimask = uni & 0xfc00
-                if unimask == 0xd800:
-                    msg = "Unpaired high surrogate"
-                    if not s[end + 5:end + 7] == '\\u':
-                        raise JSONDecodeError(msg, s, end)
-                    esc2 = s[end + 7:end + 11]
-                    if len(esc2) != 4:
-                        raise JSONDecodeError(msg, s, end)
+            # Note that this will join high/low surrogate pairs
+            # but will also pass unpaired surrogates through
+            if (_maxunicode > 65535 and
+                uni & 0xfc00 == 0xd800 and
+                s[end:end + 2] == '\\u'):
+                esc2 = s[end + 2:end + 6]
+                escX = esc2[1:2]
+                if len(esc2) == 4 and not (escX == 'x' or escX == 'X'):
                     try:
                         uni2 = int(esc2, 16)
                     except ValueError:
                         raise JSONDecodeError(msg, s, end)
-                    if uni2 & 0xfc00 != 0xdc00:
-                        raise JSONDecodeError(msg, s, end)
-                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
-                    next_end += 6
-                elif unimask == 0xdc00:
-                    msg = "Unpaired low surrogate"
-                    raise JSONDecodeError(msg, s, end)
+                    if uni2 & 0xfc00 == 0xdc00:
+                        uni = 0x10000 + (((uni - 0xd800) << 10) |
+                                         (uni2 - 0xdc00))
+                        end += 6
             char = unichr(uni)
-            end = next_end
         # Append the unescaped character
         _append(char)
     return _join(chunks), end
diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py
index 3b63d6b..3d98f0d 100644
--- a/simplejson/tests/test_scanstring.py
+++ b/simplejson/tests/test_scanstring.py
@@ -23,10 +23,6 @@ class TestScanString(TestCase):
         self._test_scanstring(simplejson.decoder.c_scanstring)
 
     def _test_scanstring(self, scanstring):
-        self.assertEqual(
-            scanstring('"z\\ud834\\udd20x"', 1, None, True),
-            (u'z\U0001d120x', 16))
-
         if sys.maxunicode == 65535:
             self.assertEqual(
                 scanstring(u'"z\U0001d120x"', 1, None, True),
@@ -129,9 +125,10 @@ class TestScanString(TestCase):
         self.assertRaises(ValueError, scanstring, '\\u012', 0, None, True)
         self.assertRaises(ValueError, scanstring, '\\u0123', 0, None, True)
         if sys.maxunicode > 65535:
-            self.assertRaises(ValueError, scanstring, '\\ud834"', 0, None, True),
-            self.assertRaises(ValueError, scanstring, '\\ud834\\u"', 0, None, True),
-            self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True),
+            self.assertRaises(ValueError,
+                              scanstring, '\\ud834\\u"', 0, None, True)
+            self.assertRaises(ValueError,
+                              scanstring, '\\ud834\\x0123"', 0, None, True)
 
     def test_issue3623(self):
         self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
@@ -145,3 +142,53 @@ class TestScanString(TestCase):
         assert maxsize is not None
         self.assertRaises(OverflowError, json.decoder.scanstring, "xxx",
                           maxsize + 1)
+
+    def test_surrogates(self):
+        scanstring = json.decoder.scanstring
+
+        def assertScan(given, expect, test_utf8=True):
+            givens = [given]
+            if not PY3 and test_utf8:
+                givens.append(given.encode('utf8'))
+            for given in givens:
+                (res, count) = scanstring(given, 1, None, True)
+                self.assertEqual(len(given), count)
+                self.assertEqual(res, expect)
+
+        assertScan(
+            u'"z\\ud834\\u0079x"',
+            u'z\ud834yx')
+        assertScan(
+            u'"z\\ud834\\udd20x"',
+            u'z\U0001d120x')
+        assertScan(
+            u'"z\\ud834\\ud834\\udd20x"',
+            u'z\ud834\U0001d120x')
+        assertScan(
+            u'"z\\ud834x"',
+            u'z\ud834x')
+        assertScan(
+            u'"z\\udd20x"',
+            u'z\udd20x')
+        assertScan(
+            u'"z\ud834x"',
+            u'z\ud834x')
+        # It may look strange to join strings together, but Python is drunk.
+        # https://gist.github.com/etrepum/5538443
+        assertScan(
+            u'"z\\ud834\udd20x12345"',
+            u''.join([u'z\ud834', u'\udd20x12345']))
+        assertScan(
+            u'"z\ud834\\udd20x"',
+            u''.join([u'z\ud834', u'\udd20x']))
+        # these have different behavior given UTF8 input, because the surrogate
+        # pair may be joined (in maxunicode > 65535 builds)
+        assertScan(
+            u''.join([u'"z\ud834', u'\udd20x"']),
+            u''.join([u'z\ud834', u'\udd20x']),
+            test_utf8=False)
+
+        self.assertRaises(ValueError,
+                          scanstring, u'"z\\ud83x"', 1, None, True)
+        self.assertRaises(ValueError,
+                          scanstring, u'"z\\ud834\\udd2x"', 1, None, True)
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index f240176..f04cc5c 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -123,26 +123,15 @@ class TestUnicode(TestCase):
         self.assertRaises(json.JSONDecodeError, json.loads, '"\\u1x34"')
         self.assertRaises(json.JSONDecodeError, json.loads, '"\\ux234"')
         if sys.maxunicode > 65535:
-            # unpaired low surrogate
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\udc00"')
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\udcff"')
-            # unpaired high surrogate
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800"')
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800x"')
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800xx"')
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800xxxxxx"')
+            # invalid escape sequence for low surrogate
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u"')
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0"')
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00"')
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000"')
-            # invalid escape sequence for low surrogate
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000x"')
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00x0"')
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0x00"')
             self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ux000"')
-            # invalid value for low surrogate
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0000"')
-            self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ufc00"')
 
     def test_ensure_ascii_still_works(self):
         # in the ascii range, ensure that everything is the same
author	Bob Ippolito <bob@redivi.com>	2013-05-07 16:38:34 -0700
committer	Bob Ippolito <bob@redivi.com>	2013-05-07 23:02:33 -0700
commit	35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2 (patch)
tree	fb0698c12a52678392b8ad1a0393f5186138899a
parent	fc7b04d6bb1bfc0ffdddbd7d53ffd56f4142ea34 (diff)
download	simplejson-35816bfe2d0ddeb5ddcc68239683cbb35b7e3ff2.tar.gz