try and use native Py3.3 unicode everywhere

author: Bob Ippolito <bob@redivi.com> 2012-12-29 01:17:49 -0800
committer: Bob Ippolito <bob@redivi.com> 2012-12-29 01:17:49 -0800
commit: 83a493db6a8b859ec7b10fa85365dd3fdf144c68 (patch)
tree: 88bbbfe2c7654f54a44b922f9cdc6eb7f24ce209
parent: 7ed40f0f1f476595f4693a74a1074e5c2446a95e (diff)
download: simplejson-83a493db6a8b859ec7b10fa85365dd3fdf144c68.tar.gz
3 files changed, 182 insertions, 110 deletions
diff --git a/.gitignore b/.gitignore
index fb20d6f..590f60f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 *.egg
 *.pyc
 *.so
+.DS_Store
 /MANIFEST
 /.coverage
 /coverage.xml
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
index 8282d09..74fa831 100644
--- a/simplejson/_speedups.c
+++ b/simplejson/_speedups.c
@@ -16,12 +16,23 @@
 #define JSON_ASCII_AS_STRING PyUnicode_AsUTF8
 #define PyInt_Type PyLong_Type
 #define PyInt_FromString PyLong_FromString
+#define PY2_UNUSED
+#define PY3_UNUSED UNUSED
+#define JSON_NewEmptyUnicode() PyUnicode_New(0, 127)
 #else /* PY_MAJOR_VERSION >= 3 */
+#define PY2_UNUSED UNUSED
+#define PY3_UNUSED
+#define PyUnicode_READY(obj) 0
+#define PyUnicode_KIND(obj) (sizeof(Py_UNICODE))
+#define PyUnicode_DATA(obj) ((void *)(PyUnicode_AS_UNICODE(obj)))
+#define PyUnicode_READ(kind, data, index) ((JSON_UNICHR)((const Py_UNICODE *)(data))[(index)])
+#define PyUnicode_GetLength PyUnicode_GET_SIZE
 #define JSON_UNICHR Py_UNICODE
 #define JSON_ASCII_Check PyString_Check
 #define JSON_ASCII_AS_STRING PyString_AS_STRING
 #define JSON_InternFromString PyString_InternFromString
 #define JSON_Intern_GET_SIZE PyString_GET_SIZE
+#define JSON_NewEmptyUnicode() PyUnicode_FromUnicode(NULL, 0)
 #endif /* PY_MAJOR_VERSION < 3 */
 
 #if PY_VERSION_HEX < 0x02070000
@@ -144,6 +155,8 @@ static PyMemberDef encoder_members[] = {
 };
 
 static PyObject *
+JSON_UnicodeFromChar(JSON_UNICHR c);
+static PyObject *
 maybe_quote_bigint(PyObject *encoded, PyObject *obj);
 
 static Py_ssize_t
@@ -214,6 +227,25 @@ moduleinit(void);
 
 #define MIN_EXPANSION 6
 
+static int
+IS_DIGIT(JSON_UNICHR c) /* nonzero iff c is one of the ASCII decimal digits '0'..'9' */
+{
+    return c >= '0' && c <= '9'; /* inclusive range test on the character value */
+}
+
+static PyObject *
+JSON_UnicodeFromChar(JSON_UNICHR c) /* build a one-character unicode object holding c; NULL if creation fails */
+{
+#if PY_MAJOR_VERSION >= 3
+    PyObject *rval = PyUnicode_New(1, c); /* length 1, with c itself passed as the maxchar argument */
+    if (rval) /* when creation failed, skip the write and return the NULL below */
+        PyUnicode_WRITE(PyUnicode_KIND(rval), PyUnicode_DATA(rval), 0, c);
+    return rval;
+#else /* PY_MAJOR_VERSION >= 3 */
+    return PyUnicode_FromUnicode(&c, 1); /* JSON_UNICHR is defined as Py_UNICODE on this branch */
+#endif /* PY_MAJOR_VERSION < 3 */
+}
+
 static PyObject *
 maybe_quote_bigint(PyObject *encoded, PyObject *obj)
 {
@@ -345,8 +377,6 @@ ascii_char_size(JSON_UNICHR c)
     }
 }
 
-#if PY_MAJOR_VERSION >= 3
-
 static PyObject *
 ascii_escape_unicode(PyObject *pystr)
 {
@@ -370,11 +400,20 @@ ascii_escape_unicode(PyObject *pystr)
     for (i = 0; i < input_chars; i++) {
         output_size += ascii_char_size(PyUnicode_READ(kind, data, i));
     }
+#if PY_MAJOR_VERSION >= 3
     rval = PyUnicode_New(output_size, 127);
     if (rval == NULL) {
         return NULL;
     }
+    assert(PyUnicode_KIND(rval) == PyUnicode_1BYTE_KIND);
     output = (char *)PyUnicode_DATA(rval);
+#else
+    rval = PyString_FromStringAndSize(NULL, output_size);
+    if (rval == NULL) {
+        return NULL;
+    }
+    output = PyString_AS_STRING(rval);
+#endif
     chars = 0;
     output[chars++] = '"';
     for (i = 0; i < input_chars; i++) {
@@ -385,6 +424,8 @@ ascii_escape_unicode(PyObject *pystr)
     return rval;
 }
 
+#if PY_MAJOR_VERSION >= 3
+
 static PyObject *
 ascii_escape_str(PyObject *pystr)
 {
@@ -400,40 +441,6 @@ ascii_escape_str(PyObject *pystr)
 #else /* PY_MAJOR_VERSION >= 3 */
 
 static PyObject *
-ascii_escape_unicode(PyObject *pystr)
-{
-    /* Take a PyUnicode pystr and return a new ASCII-only escaped PyString */
-    Py_ssize_t i;
-    Py_ssize_t input_chars;
-    Py_ssize_t output_size;
-    Py_ssize_t chars;
-    PyObject *rval;
-    char *output;
-    Py_UNICODE *input_unicode;
-
-    input_chars = PyUnicode_GET_SIZE(pystr);
-    input_unicode = PyUnicode_AS_UNICODE(pystr);
-
-    output_size = 2;
-    for (i = 0; i < input_chars; i++) {
-        output_size += ascii_char_size((JSON_UNICHR)input_unicode[i]);
-    }
-    rval = PyString_FromStringAndSize(NULL, output_size);
-    if (rval == NULL) {
-        return NULL;
-    }
-    output = PyString_AS_STRING(rval);
-    chars = 0;
-    output[chars++] = '"';
-    for (i = 0; i < input_chars; i++) {
-        chars = ascii_escape_char((JSON_UNICHR)input_unicode[i], output, chars);
-    }
-    output[chars++] = '"';
-    assert(chars == output_size);
-    return rval;
-}
-
-static PyObject *
 ascii_escape_str(PyObject *pystr)
 {
     /* Take a PyString pystr and return a new ASCII-only escaped PyString */
@@ -554,13 +561,10 @@ encoder_dict_iteritems(PyEncoderObject *s, PyObject *dct)
             value = PyTuple_GET_ITEM(item, 1);
             if (value == NULL)
                 goto bail;
-            PyObject *tpl = PyTuple_New(2);
+            PyObject *tpl = PyTuple_Pack(2, kstr, value);
             if (tpl == NULL)
                 goto bail;
-            PyTuple_SET_ITEM(tpl, 0, kstr);
-            kstr = NULL;
-            Py_INCREF(value);
-            PyTuple_SET_ITEM(tpl, 1, value);
+            Py_CLEAR(kstr);
             Py_DECREF(item);
             item = tpl;
         }
@@ -618,7 +622,7 @@ join_list_unicode(PyObject *lst)
     /* return u''.join(lst) */
     static PyObject *joinfn = NULL;
     if (joinfn == NULL) {
-        PyObject *ustr = PyUnicode_FromUnicode(NULL, 0);
+        PyObject *ustr = JSON_NewEmptyUnicode();
         if (ustr == NULL)
             return NULL;
 
@@ -878,14 +882,13 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
         }
         APPEND_OLD_CHUNK
 #if PY_MAJOR_VERSION >= 3
-        chunk = PyUnicode_New(1, c);
+        chunk = JSON_UnicodeFromChar(c);
         if (chunk == NULL) {
             goto bail;
         }
-        PyUnicode_WRITE(PyUnicode_KIND(chunk), PyUnicode_DATA(chunk), 0, c);
 #else /* PY_MAJOR_VERSION >= 3 */
         if (has_unicode) {
-            chunk = PyUnicode_FromUnicode(&c, 1);
+            chunk = JSON_UnicodeFromChar(c);
             if (chunk == NULL) {
                 goto bail;
             }
@@ -904,11 +907,7 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
         if (chunk != NULL)
             rval = chunk;
         else
-#if PY_MAJOR_VERSION >= 3
-            rval = PyUnicode_New(0, 127);
-#else
-            rval = PyString_FromStringAndSize("", 0);
-#endif
+            rval = JSON_NewEmptyUnicode();
     }
     else {
         APPEND_OLD_CHUNK
@@ -941,10 +940,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
     Return value is a new PyUnicode
     */
     PyObject *rval;
-    Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
     Py_ssize_t begin = end - 1;
     Py_ssize_t next = begin;
-    const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
+    PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+    Py_ssize_t len = PyUnicode_GetLength(pystr);
+    void *buf = PyUnicode_DATA(pystr);
     PyObject *chunks = NULL;
     PyObject *chunk = NULL;
 
@@ -957,9 +957,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
     }
     while (1) {
         /* Find the end of the string or the next escape */
-        Py_UNICODE c = 0;
+        JSON_UNICHR c = 0;
         for (next = end; next < len; next++) {
-            c = buf[next];
+            c = PyUnicode_READ(kind, buf, next);
             if (c == '"' || c == '\\') {
                 break;
             }
@@ -975,7 +975,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
         /* Pick up this chunk if it's not zero length */
         if (next != end) {
             APPEND_OLD_CHUNK
-            chunk = PyUnicode_FromUnicode(&buf[end], next - end);
+#if PY_MAJOR_VERSION < 3
+            chunk = PyUnicode_FromUnicode(&((const Py_UNICODE *)buf)[end], next - end);
+#else
+            chunk = PyUnicode_Substring(pystr, end, next);
+#endif
             if (chunk == NULL) {
                 goto bail;
             }
@@ -989,7 +993,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
             raise_errmsg("Unterminated string starting at", pystr, begin);
             goto bail;
         }
-        c = buf[next];
+        c = PyUnicode_READ(kind, buf, next);
         if (c != 'u') {
             /* Non-unicode backslash escapes */
             end = next + 1;
@@ -1019,7 +1023,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
             }
             /* Decode 4 hex digits */
             for (; next < end; next++) {
-                Py_UNICODE digit = buf[next];
+                JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
                 c <<= 4;
                 switch (digit) {
                     case '0': case '1': case '2': case '3': case '4':
@@ -1036,15 +1040,16 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                         goto bail;
                 }
             }
-#ifdef Py_UNICODE_WIDE
+#if PY_MAJOR_VERSION >= 3 || defined(Py_UNICODE_WIDE)
             /* Surrogate pair */
             if ((c & 0xfc00) == 0xd800) {
-                Py_UNICODE c2 = 0;
+                JSON_UNICHR c2 = 0;
                 if (end + 6 >= len) {
                     raise_errmsg("Unpaired high surrogate", pystr, end - 5);
                     goto bail;
                 }
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
+                if (PyUnicode_READ(kind, buf, next++) != '\\' ||
+                    PyUnicode_READ(kind, buf, next++) != 'u') {
                     raise_errmsg("Unpaired high surrogate", pystr, end - 5);
                     goto bail;
                 }
@@ -1052,7 +1057,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                 /* Decode 4 hex digits */
                 for (; next < end; next++) {
                     c2 <<= 4;
-                    Py_UNICODE digit = buf[next];
+                    JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
                     switch (digit) {
                         case '0': case '1': case '2': case '3': case '4':
                         case '5': case '6': case '7': case '8': case '9':
@@ -1081,7 +1086,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
 #endif
         }
         APPEND_OLD_CHUNK
-        chunk = PyUnicode_FromUnicode(&c, 1);
+        chunk = JSON_UnicodeFromChar(c);
         if (chunk == NULL) {
             goto bail;
         }
@@ -1091,7 +1096,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
         if (chunk != NULL)
             rval = chunk;
         else
-            rval = PyUnicode_FromUnicode(NULL, 0);
+            rval = JSON_NewEmptyUnicode();
     }
     else {
         APPEND_OLD_CHUNK
@@ -1386,8 +1391,9 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
 
     Returns a new PyObject (usually a dict, but object_hook can change that)
     */
-    Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
-    Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
+    void *str = PyUnicode_DATA(pystr);
+    Py_ssize_t end_idx = PyUnicode_GetLength(pystr) - 1;
+    PY2_UNUSED int kind = PyUnicode_KIND(pystr);
     PyObject *rval = NULL;
     PyObject *pairs = NULL;
     PyObject *item;
@@ -1409,15 +1415,15 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
     }
 
     /* skip whitespace after { */
-    while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+    while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
 
     /* only loop if the object is non-empty */
-    if (idx <= end_idx && str[idx] != '}') {
+    if (idx <= end_idx && PyUnicode_READ(kind, str, idx) != '}') {
         while (idx <= end_idx) {
             PyObject *memokey;
 
             /* read key */
-            if (str[idx] != '"') {
+            if (PyUnicode_READ(kind, str, idx) != '"') {
                 raise_errmsg(
                     "Expecting property name enclosed in double quotes",
                     pystr, idx);
@@ -1440,13 +1446,13 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
 
             /* skip whitespace between key and : delimiter, read :, skip
                whitespace */
-            while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
-            if (idx > end_idx || str[idx] != ':') {
+            while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
+            if (idx > end_idx || PyUnicode_READ(kind, str, idx) != ':') {
                 raise_errmsg("Expecting ':' delimiter", pystr, idx);
                 goto bail;
             }
             idx++;
-            while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+            while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
 
             /* read any JSON term */
             val = scan_once_unicode(s, pystr, idx, &next_idx);
@@ -1474,27 +1480,27 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
             idx = next_idx;
 
             /* skip whitespace before } or , */
-            while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+            while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
 
             /* bail if the object is closed or we didn't get the ,
                delimiter */
             if (idx > end_idx) break;
-            if (str[idx] == '}') {
+            if (PyUnicode_READ(kind, str, idx) == '}') {
                 break;
             }
-            else if (str[idx] != ',') {
+            else if (PyUnicode_READ(kind, str, idx) != ',') {
                 raise_errmsg("Expecting ',' delimiter", pystr, idx);
                 goto bail;
             }
             idx++;
 
             /* skip whitespace after , delimiter */
-            while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+            while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
         }
     }
 
     /* verify that idx < end_idx, str[idx] should be '}' */
-    if (idx > end_idx || str[idx] != '}') {
+    if (idx > end_idx || PyUnicode_READ(kind, str, idx) != '}') {
         raise_errmsg("Expecting object", pystr, end_idx);
         goto bail;
     }
@@ -1613,8 +1619,9 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi
 
     Returns a new PyList
     */
-    Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
-    Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
+    PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+    void *str = PyUnicode_DATA(pystr);
+    Py_ssize_t end_idx = PyUnicode_GetLength(pystr) - 1;
     PyObject *val = NULL;
     PyObject *rval = PyList_New(0);
     Py_ssize_t next_idx;
@@ -1622,10 +1629,10 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi
         return NULL;
 
     /* skip whitespace after [ */
-    while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+    while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
 
     /* only loop if the array is non-empty */
-    if (idx <= end_idx && str[idx] != ']') {
+    if (idx <= end_idx && PyUnicode_READ(kind, str, idx) != ']') {
         while (idx <= end_idx) {
 
             /* read any JSON term  */
@@ -1645,26 +1652,26 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi
             idx = next_idx;
 
             /* skip whitespace between term and , */
-            while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+            while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
 
             /* bail if the array is closed or we didn't get the , delimiter */
             if (idx > end_idx) break;
-            if (str[idx] == ']') {
+            if (PyUnicode_READ(kind, str, idx) == ']') {
                 break;
             }
-            else if (str[idx] != ',') {
+            else if (PyUnicode_READ(kind, str, idx) != ',') {
                 raise_errmsg("Expecting ',' delimiter", pystr, idx);
                 goto bail;
             }
             idx++;
 
             /* skip whitespace after , */
-            while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+            while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
         }
     }
 
     /* verify that idx < end_idx, str[idx] should be ']' */
-    if (idx > end_idx || str[idx] != ']') {
+    if (idx > end_idx || PyUnicode_READ(kind, str, idx) != ']') {
         raise_errmsg("Expecting object", pystr, end_idx);
         goto bail;
     }
@@ -1821,15 +1828,17 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
         PyInt, PyLong, or PyFloat.
         May return other types if parse_int or parse_float are set
     */
-    Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
-    Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
+    PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+    void *str = PyUnicode_DATA(pystr);
+    Py_ssize_t end_idx = PyUnicode_GetLength(pystr) - 1;
     Py_ssize_t idx = start;
     int is_float = 0;
+    JSON_UNICHR c;
     PyObject *rval;
     PyObject *numstr;
 
     /* read a sign if it's there, make sure it's not the end of the string */
-    if (str[idx] == '-') {
+    if (PyUnicode_READ(kind, str, idx) == '-') {
         idx++;
         if (idx > end_idx) {
             PyErr_SetNone(PyExc_StopIteration);
@@ -1838,40 +1847,49 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
     }
 
     /* read as many integer digits as we find as long as it doesn't start with 0 */
-    if (str[idx] >= '1' && str[idx] <= '9') {
+    c = PyUnicode_READ(kind, str, idx);
+    if (c == '0') {
+        /* if it starts with 0 we only expect one integer digit */
         idx++;
-        while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
     }
-    /* if it starts with 0 we only expect one integer digit */
-    else if (str[idx] == '0') {
+    else if (IS_DIGIT(c)) {
         idx++;
+        while (idx <= end_idx && IS_DIGIT(PyUnicode_READ(kind, str, idx))) {
+            idx++;
+        }
     }
-    /* no integer digits, error */
     else {
+        /* no integer digits, error */
         PyErr_SetNone(PyExc_StopIteration);
         return NULL;
     }
 
     /* if the next char is '.' followed by a digit then read all float digits */
-    if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') {
+    if (idx < end_idx &&
+        PyUnicode_READ(kind, str, idx) == '.' &&
+        IS_DIGIT(PyUnicode_READ(kind, str, idx + 1))) {
         is_float = 1;
         idx += 2;
-        while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
+        while (idx <= end_idx && IS_DIGIT(PyUnicode_READ(kind, str, idx))) idx++;
     }
 
     /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */
-    if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) {
+    if (idx < end_idx &&
+        (PyUnicode_READ(kind, str, idx) == 'e' ||
+            PyUnicode_READ(kind, str, idx) == 'E')) {
         Py_ssize_t e_start = idx;
         idx++;
 
         /* read an exponent sign if present */
-        if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++;
+        if (idx < end_idx &&
+            (PyUnicode_READ(kind, str, idx) == '-' ||
+                PyUnicode_READ(kind, str, idx) == '+')) idx++;
 
         /* read all digits */
-        while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
+        while (idx <= end_idx && IS_DIGIT(PyUnicode_READ(kind, str, idx))) idx++;
 
         /* if we got a digit, then parse as float. if not, backtrack */
-        if (str[idx - 1] >= '0' && str[idx - 1] <= '9') {
+        if (IS_DIGIT(PyUnicode_READ(kind, str, idx - 1))) {
             is_float = 1;
         }
         else {
@@ -1880,7 +1898,11 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
     }
 
     /* copy the section we determined to be a number */
-    numstr = PyUnicode_FromUnicode(&str[start], idx - start);
+#if PY_MAJOR_VERSION >= 3
+    numstr = PyUnicode_Substring(pystr, start, idx);
+#else
+    numstr = PyUnicode_FromUnicode(&((Py_UNICODE *)str)[start], idx - start);
+#endif
     if (numstr == NULL)
         return NULL;
     if (is_float) {
@@ -2018,8 +2040,9 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
 
     Returns a new PyObject representation of the term.
     */
-    Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
-    Py_ssize_t length = PyUnicode_GET_SIZE(pystr);
+    PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+    void *str = PyUnicode_DATA(pystr);
+    Py_ssize_t length = PyUnicode_GetLength(pystr);
     PyObject *rval = NULL;
     int fallthrough = 0;
     if (idx >= length) {
@@ -2028,7 +2051,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
     }
     if (Py_EnterRecursiveCall(" while decoding a JSON document"))
         return NULL;
-    switch (str[idx]) {
+    switch (PyUnicode_READ(kind, str, idx)) {
         case '"':
             /* string */
             rval = scanstring_unicode(pystr, idx + 1,
@@ -2045,7 +2068,10 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
             break;
         case 'n':
             /* null */
-            if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') {
+            if ((idx + 3 < length) &&
+                PyUnicode_READ(kind, str, idx + 1) == 'u' &&
+                PyUnicode_READ(kind, str, idx + 2) == 'l' &&
+                PyUnicode_READ(kind, str, idx + 3) == 'l') {
                 Py_INCREF(Py_None);
                 *next_idx_ptr = idx + 4;
                 rval = Py_None;
@@ -2055,7 +2081,10 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
             break;
         case 't':
             /* true */
-            if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') {
+            if ((idx + 3 < length) &&
+                PyUnicode_READ(kind, str, idx + 1) == 'r' &&
+                PyUnicode_READ(kind, str, idx + 2) == 'u' &&
+                PyUnicode_READ(kind, str, idx + 3) == 'e') {
                 Py_INCREF(Py_True);
                 *next_idx_ptr = idx + 4;
                 rval = Py_True;
@@ -2065,7 +2094,11 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
             break;
         case 'f':
             /* false */
-            if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') {
+            if ((idx + 4 < length) &&
+                PyUnicode_READ(kind, str, idx + 1) == 'a' &&
+                PyUnicode_READ(kind, str, idx + 2) == 'l' &&
+                PyUnicode_READ(kind, str, idx + 3) == 's' &&
+                PyUnicode_READ(kind, str, idx + 4) == 'e') {
                 Py_INCREF(Py_False);
                 *next_idx_ptr = idx + 5;
                 rval = Py_False;
@@ -2075,7 +2108,9 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
             break;
         case 'N':
             /* NaN */
-            if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') {
+            if ((idx + 2 < length) &&
+                PyUnicode_READ(kind, str, idx + 1) == 'a' &&
+                PyUnicode_READ(kind, str, idx + 2) == 'N') {
                 rval = _parse_constant(s, "NaN", idx, next_idx_ptr);
             }
             else
@@ -2083,7 +2118,14 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
             break;
         case 'I':
             /* Infinity */
-            if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') {
+            if ((idx + 7 < length) &&
+                PyUnicode_READ(kind, str, idx + 1) == 'n' &&
+                PyUnicode_READ(kind, str, idx + 2) == 'f' &&
+                PyUnicode_READ(kind, str, idx + 3) == 'i' &&
+                PyUnicode_READ(kind, str, idx + 4) == 'n' &&
+                PyUnicode_READ(kind, str, idx + 5) == 'i' &&
+                PyUnicode_READ(kind, str, idx + 6) == 't' &&
+                PyUnicode_READ(kind, str, idx + 7) == 'y') {
                 rval = _parse_constant(s, "Infinity", idx, next_idx_ptr);
             }
             else
@@ -2091,7 +2133,15 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
             break;
         case '-':
             /* -Infinity */
-            if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') {
+            if ((idx + 8 < length) &&
+                PyUnicode_READ(kind, str, idx + 1) == 'I' &&
+                PyUnicode_READ(kind, str, idx + 2) == 'n' &&
+                PyUnicode_READ(kind, str, idx + 3) == 'f' &&
+                PyUnicode_READ(kind, str, idx + 4) == 'i' &&
+                PyUnicode_READ(kind, str, idx + 5) == 'n' &&
+                PyUnicode_READ(kind, str, idx + 6) == 'i' &&
+                PyUnicode_READ(kind, str, idx + 7) == 't' &&
+                PyUnicode_READ(kind, str, idx + 8) == 'y') {
                 rval = _parse_constant(s, "-Infinity", idx, next_idx_ptr);
             }
             else
diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py
index 297bc1b..045725a 100644
--- a/simplejson/tests/test_scanstring.py
+++ b/simplejson/tests/test_scanstring.py
@@ -3,7 +3,7 @@ from unittest import TestCase
 
 import simplejson as json
 import simplejson.decoder
-from simplejson.compat import b
+from simplejson.compat import b, PY3
 
 class TestScanString(TestCase):
     # The bytes type is intentionally not used in most of these tests
@@ -112,6 +112,27 @@ class TestScanString(TestCase):
             scanstring('["Bad value", truth]', 2, None, True),
             (u'Bad value', 12))
 
+        for c in map(chr, range(0x00, 0x1f)):
+            self.assertEquals(
+                scanstring(c + '"', 0, None, False),
+                (c, 2))
+            self.assertRaises(
+                ValueError,
+                scanstring, c + '"', 0, None, True)
+
+        self.assertRaises(ValueError, scanstring, '', 0, None, True)
+        self.assertRaises(ValueError, scanstring, 'a', 0, None, True)
+        self.assertRaises(ValueError, scanstring, '\\', 0, None, True)
+        self.assertRaises(ValueError, scanstring, '\\u', 0, None, True)
+        self.assertRaises(ValueError, scanstring, '\\u0', 0, None, True)
+        self.assertRaises(ValueError, scanstring, '\\u01', 0, None, True)
+        self.assertRaises(ValueError, scanstring, '\\u012', 0, None, True)
+        self.assertRaises(ValueError, scanstring, '\\u0123', 0, None, True)
+        if sys.maxunicode > 65535:
+            self.assertRaises(ValueError, scanstring, '\\ud834"', 0, None, True),
+            self.assertRaises(ValueError, scanstring, '\\ud834\\u"', 0, None, True),
+            self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True),
+
     def test_issue3623(self):
         self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
                           "xxx")
author	Bob Ippolito <bob@redivi.com>	2012-12-29 01:17:49 -0800
committer	Bob Ippolito <bob@redivi.com>	2012-12-29 01:17:49 -0800
commit	83a493db6a8b859ec7b10fa85365dd3fdf144c68 (patch)
tree	88bbbfe2c7654f54a44b922f9cdc6eb7f24ce209
parent	7ed40f0f1f476595f4693a74a1074e5c2446a95e (diff)
download	simplejson-83a493db6a8b859ec7b10fa85365dd3fdf144c68.tar.gz