summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBob Ippolito <bob@redivi.com>2012-12-29 01:17:49 -0800
committerBob Ippolito <bob@redivi.com>2012-12-29 01:17:49 -0800
commit83a493db6a8b859ec7b10fa85365dd3fdf144c68 (patch)
tree88bbbfe2c7654f54a44b922f9cdc6eb7f24ce209
parent7ed40f0f1f476595f4693a74a1074e5c2446a95e (diff)
downloadsimplejson-83a493db6a8b859ec7b10fa85365dd3fdf144c68.tar.gz
try and use native Py3.3 unicode everywhere
-rw-r--r--.gitignore1
-rw-r--r--simplejson/_speedups.c268
-rw-r--r--simplejson/tests/test_scanstring.py23
3 files changed, 182 insertions, 110 deletions
diff --git a/.gitignore b/.gitignore
index fb20d6f..590f60f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
*.egg
*.pyc
*.so
+.DS_Store
/MANIFEST
/.coverage
/coverage.xml
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
index 8282d09..74fa831 100644
--- a/simplejson/_speedups.c
+++ b/simplejson/_speedups.c
@@ -16,12 +16,23 @@
#define JSON_ASCII_AS_STRING PyUnicode_AsUTF8
#define PyInt_Type PyLong_Type
#define PyInt_FromString PyLong_FromString
+#define PY2_UNUSED
+#define PY3_UNUSED UNUSED
+#define JSON_NewEmptyUnicode() PyUnicode_New(0, 127)
#else /* PY_MAJOR_VERSION >= 3 */
+#define PY2_UNUSED UNUSED
+#define PY3_UNUSED
+#define PyUnicode_READY(obj) 0
+#define PyUnicode_KIND(obj) (sizeof(Py_UNICODE))
+#define PyUnicode_DATA(obj) ((void *)(PyUnicode_AS_UNICODE(obj)))
+#define PyUnicode_READ(kind, data, index) ((JSON_UNICHR)((const Py_UNICODE *)(data))[(index)])
+#define PyUnicode_GetLength PyUnicode_GET_SIZE
#define JSON_UNICHR Py_UNICODE
#define JSON_ASCII_Check PyString_Check
#define JSON_ASCII_AS_STRING PyString_AS_STRING
#define JSON_InternFromString PyString_InternFromString
#define JSON_Intern_GET_SIZE PyString_GET_SIZE
+#define JSON_NewEmptyUnicode() PyUnicode_FromUnicode(NULL, 0)
#endif /* PY_MAJOR_VERSION < 3 */
#if PY_VERSION_HEX < 0x02070000
@@ -144,6 +155,8 @@ static PyMemberDef encoder_members[] = {
};
static PyObject *
+JSON_UnicodeFromChar(JSON_UNICHR c);
+static PyObject *
maybe_quote_bigint(PyObject *encoded, PyObject *obj);
static Py_ssize_t
@@ -214,6 +227,25 @@ moduleinit(void);
#define MIN_EXPANSION 6
+static int
+IS_DIGIT(JSON_UNICHR c)
+{
+ return c >= '0' && c <= '9';
+}
+
+static PyObject *
+JSON_UnicodeFromChar(JSON_UNICHR c)
+{
+#if PY_MAJOR_VERSION >= 3
+ PyObject *rval = PyUnicode_New(1, c);
+ if (rval)
+ PyUnicode_WRITE(PyUnicode_KIND(rval), PyUnicode_DATA(rval), 0, c);
+ return rval;
+#else /* PY_MAJOR_VERSION >= 3 */
+ return PyUnicode_FromUnicode(&c, 1);
+#endif /* PY_MAJOR_VERSION < 3 */
+}
+
static PyObject *
maybe_quote_bigint(PyObject *encoded, PyObject *obj)
{
@@ -345,8 +377,6 @@ ascii_char_size(JSON_UNICHR c)
}
}
-#if PY_MAJOR_VERSION >= 3
-
static PyObject *
ascii_escape_unicode(PyObject *pystr)
{
@@ -370,11 +400,20 @@ ascii_escape_unicode(PyObject *pystr)
for (i = 0; i < input_chars; i++) {
output_size += ascii_char_size(PyUnicode_READ(kind, data, i));
}
+#if PY_MAJOR_VERSION >= 3
rval = PyUnicode_New(output_size, 127);
if (rval == NULL) {
return NULL;
}
+ assert(PyUnicode_KIND(rval) == PyUnicode_1BYTE_KIND);
output = (char *)PyUnicode_DATA(rval);
+#else
+ rval = PyString_FromStringAndSize(NULL, output_size);
+ if (rval == NULL) {
+ return NULL;
+ }
+ output = PyString_AS_STRING(rval);
+#endif
chars = 0;
output[chars++] = '"';
for (i = 0; i < input_chars; i++) {
@@ -385,6 +424,8 @@ ascii_escape_unicode(PyObject *pystr)
return rval;
}
+#if PY_MAJOR_VERSION >= 3
+
static PyObject *
ascii_escape_str(PyObject *pystr)
{
@@ -400,40 +441,6 @@ ascii_escape_str(PyObject *pystr)
#else /* PY_MAJOR_VERSION >= 3 */
static PyObject *
-ascii_escape_unicode(PyObject *pystr)
-{
- /* Take a PyUnicode pystr and return a new ASCII-only escaped PyString */
- Py_ssize_t i;
- Py_ssize_t input_chars;
- Py_ssize_t output_size;
- Py_ssize_t chars;
- PyObject *rval;
- char *output;
- Py_UNICODE *input_unicode;
-
- input_chars = PyUnicode_GET_SIZE(pystr);
- input_unicode = PyUnicode_AS_UNICODE(pystr);
-
- output_size = 2;
- for (i = 0; i < input_chars; i++) {
- output_size += ascii_char_size((JSON_UNICHR)input_unicode[i]);
- }
- rval = PyString_FromStringAndSize(NULL, output_size);
- if (rval == NULL) {
- return NULL;
- }
- output = PyString_AS_STRING(rval);
- chars = 0;
- output[chars++] = '"';
- for (i = 0; i < input_chars; i++) {
- chars = ascii_escape_char((JSON_UNICHR)input_unicode[i], output, chars);
- }
- output[chars++] = '"';
- assert(chars == output_size);
- return rval;
-}
-
-static PyObject *
ascii_escape_str(PyObject *pystr)
{
/* Take a PyString pystr and return a new ASCII-only escaped PyString */
@@ -554,13 +561,10 @@ encoder_dict_iteritems(PyEncoderObject *s, PyObject *dct)
value = PyTuple_GET_ITEM(item, 1);
if (value == NULL)
goto bail;
- PyObject *tpl = PyTuple_New(2);
+ PyObject *tpl = PyTuple_Pack(2, kstr, value);
if (tpl == NULL)
goto bail;
- PyTuple_SET_ITEM(tpl, 0, kstr);
- kstr = NULL;
- Py_INCREF(value);
- PyTuple_SET_ITEM(tpl, 1, value);
+ Py_CLEAR(kstr);
Py_DECREF(item);
item = tpl;
}
@@ -618,7 +622,7 @@ join_list_unicode(PyObject *lst)
/* return u''.join(lst) */
static PyObject *joinfn = NULL;
if (joinfn == NULL) {
- PyObject *ustr = PyUnicode_FromUnicode(NULL, 0);
+ PyObject *ustr = JSON_NewEmptyUnicode();
if (ustr == NULL)
return NULL;
@@ -878,14 +882,13 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
}
APPEND_OLD_CHUNK
#if PY_MAJOR_VERSION >= 3
- chunk = PyUnicode_New(1, c);
+ chunk = JSON_UnicodeFromChar(c);
if (chunk == NULL) {
goto bail;
}
- PyUnicode_WRITE(PyUnicode_KIND(chunk), PyUnicode_DATA(chunk), 0, c);
#else /* PY_MAJOR_VERSION >= 3 */
if (has_unicode) {
- chunk = PyUnicode_FromUnicode(&c, 1);
+ chunk = JSON_UnicodeFromChar(c);
if (chunk == NULL) {
goto bail;
}
@@ -904,11 +907,7 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
if (chunk != NULL)
rval = chunk;
else
-#if PY_MAJOR_VERSION >= 3
- rval = PyUnicode_New(0, 127);
-#else
- rval = PyString_FromStringAndSize("", 0);
-#endif
+ rval = JSON_NewEmptyUnicode();
}
else {
APPEND_OLD_CHUNK
@@ -941,10 +940,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
Return value is a new PyUnicode
*/
PyObject *rval;
- Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
Py_ssize_t begin = end - 1;
Py_ssize_t next = begin;
- const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
+ PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+ Py_ssize_t len = PyUnicode_GetLength(pystr);
+ void *buf = PyUnicode_DATA(pystr);
PyObject *chunks = NULL;
PyObject *chunk = NULL;
@@ -957,9 +957,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
while (1) {
/* Find the end of the string or the next escape */
- Py_UNICODE c = 0;
+ JSON_UNICHR c = 0;
for (next = end; next < len; next++) {
- c = buf[next];
+ c = PyUnicode_READ(kind, buf, next);
if (c == '"' || c == '\\') {
break;
}
@@ -975,7 +975,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
/* Pick up this chunk if it's not zero length */
if (next != end) {
APPEND_OLD_CHUNK
- chunk = PyUnicode_FromUnicode(&buf[end], next - end);
+#if PY_MAJOR_VERSION < 3
+ chunk = PyUnicode_FromUnicode(&((const Py_UNICODE *)buf)[end], next - end);
+#else
+ chunk = PyUnicode_Substring(pystr, end, next);
+#endif
if (chunk == NULL) {
goto bail;
}
@@ -989,7 +993,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
raise_errmsg("Unterminated string starting at", pystr, begin);
goto bail;
}
- c = buf[next];
+ c = PyUnicode_READ(kind, buf, next);
if (c != 'u') {
/* Non-unicode backslash escapes */
end = next + 1;
@@ -1019,7 +1023,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
/* Decode 4 hex digits */
for (; next < end; next++) {
- Py_UNICODE digit = buf[next];
+ JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
c <<= 4;
switch (digit) {
case '0': case '1': case '2': case '3': case '4':
@@ -1036,15 +1040,16 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
goto bail;
}
}
-#ifdef Py_UNICODE_WIDE
+#if PY_MAJOR_VERSION >= 3 || defined(Py_UNICODE_WIDE)
/* Surrogate pair */
if ((c & 0xfc00) == 0xd800) {
- Py_UNICODE c2 = 0;
+ JSON_UNICHR c2 = 0;
if (end + 6 >= len) {
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
goto bail;
}
- if (buf[next++] != '\\' || buf[next++] != 'u') {
+ if (PyUnicode_READ(kind, buf, next++) != '\\' ||
+ PyUnicode_READ(kind, buf, next++) != 'u') {
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
goto bail;
}
@@ -1052,7 +1057,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
/* Decode 4 hex digits */
for (; next < end; next++) {
c2 <<= 4;
- Py_UNICODE digit = buf[next];
+ JSON_UNICHR digit = PyUnicode_READ(kind, buf, next);
switch (digit) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
@@ -1081,7 +1086,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
#endif
}
APPEND_OLD_CHUNK
- chunk = PyUnicode_FromUnicode(&c, 1);
+ chunk = JSON_UnicodeFromChar(c);
if (chunk == NULL) {
goto bail;
}
@@ -1091,7 +1096,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
if (chunk != NULL)
rval = chunk;
else
- rval = PyUnicode_FromUnicode(NULL, 0);
+ rval = JSON_NewEmptyUnicode();
}
else {
APPEND_OLD_CHUNK
@@ -1386,8 +1391,9 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
Returns a new PyObject (usually a dict, but object_hook can change that)
*/
- Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
- Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
+ void *str = PyUnicode_DATA(pystr);
+ Py_ssize_t end_idx = PyUnicode_GetLength(pystr) - 1;
+ PY2_UNUSED int kind = PyUnicode_KIND(pystr);
PyObject *rval = NULL;
PyObject *pairs = NULL;
PyObject *item;
@@ -1409,15 +1415,15 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
}
/* skip whitespace after { */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
/* only loop if the object is non-empty */
- if (idx <= end_idx && str[idx] != '}') {
+ if (idx <= end_idx && PyUnicode_READ(kind, str, idx) != '}') {
while (idx <= end_idx) {
PyObject *memokey;
/* read key */
- if (str[idx] != '"') {
+ if (PyUnicode_READ(kind, str, idx) != '"') {
raise_errmsg(
"Expecting property name enclosed in double quotes",
pystr, idx);
@@ -1440,13 +1446,13 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
/* skip whitespace between key and : delimiter, read :, skip
whitespace */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
- if (idx > end_idx || str[idx] != ':') {
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
+ if (idx > end_idx || PyUnicode_READ(kind, str, idx) != ':') {
raise_errmsg("Expecting ':' delimiter", pystr, idx);
goto bail;
}
idx++;
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
/* read any JSON term */
val = scan_once_unicode(s, pystr, idx, &next_idx);
@@ -1474,27 +1480,27 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
idx = next_idx;
/* skip whitespace before } or , */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
/* bail if the object is closed or we didn't get the ,
delimiter */
if (idx > end_idx) break;
- if (str[idx] == '}') {
+ if (PyUnicode_READ(kind, str, idx) == '}') {
break;
}
- else if (str[idx] != ',') {
+ else if (PyUnicode_READ(kind, str, idx) != ',') {
raise_errmsg("Expecting ',' delimiter", pystr, idx);
goto bail;
}
idx++;
/* skip whitespace after , delimiter */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
}
}
/* verify that idx < end_idx, str[idx] should be '}' */
- if (idx > end_idx || str[idx] != '}') {
+ if (idx > end_idx || PyUnicode_READ(kind, str, idx) != '}') {
raise_errmsg("Expecting object", pystr, end_idx);
goto bail;
}
@@ -1613,8 +1619,9 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi
Returns a new PyList
*/
- Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
- Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
+ PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+ void *str = PyUnicode_DATA(pystr);
+ Py_ssize_t end_idx = PyUnicode_GetLength(pystr) - 1;
PyObject *val = NULL;
PyObject *rval = PyList_New(0);
Py_ssize_t next_idx;
@@ -1622,10 +1629,10 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi
return NULL;
/* skip whitespace after [ */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
/* only loop if the array is non-empty */
- if (idx <= end_idx && str[idx] != ']') {
+ if (idx <= end_idx && PyUnicode_READ(kind, str, idx) != ']') {
while (idx <= end_idx) {
/* read any JSON term */
@@ -1645,26 +1652,26 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi
idx = next_idx;
/* skip whitespace between term and , */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
/* bail if the array is closed or we didn't get the , delimiter */
if (idx > end_idx) break;
- if (str[idx] == ']') {
+ if (PyUnicode_READ(kind, str, idx) == ']') {
break;
}
- else if (str[idx] != ',') {
+ else if (PyUnicode_READ(kind, str, idx) != ',') {
raise_errmsg("Expecting ',' delimiter", pystr, idx);
goto bail;
}
idx++;
/* skip whitespace after , */
- while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++;
+ while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++;
}
}
/* verify that idx < end_idx, str[idx] should be ']' */
- if (idx > end_idx || str[idx] != ']') {
+ if (idx > end_idx || PyUnicode_READ(kind, str, idx) != ']') {
raise_errmsg("Expecting object", pystr, end_idx);
goto bail;
}
@@ -1821,15 +1828,17 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
PyInt, PyLong, or PyFloat.
May return other types if parse_int or parse_float are set
*/
- Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
- Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
+ PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+ void *str = PyUnicode_DATA(pystr);
+ Py_ssize_t end_idx = PyUnicode_GetLength(pystr) - 1;
Py_ssize_t idx = start;
int is_float = 0;
+ JSON_UNICHR c;
PyObject *rval;
PyObject *numstr;
/* read a sign if it's there, make sure it's not the end of the string */
- if (str[idx] == '-') {
+ if (PyUnicode_READ(kind, str, idx) == '-') {
idx++;
if (idx > end_idx) {
PyErr_SetNone(PyExc_StopIteration);
@@ -1838,40 +1847,49 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
}
/* read as many integer digits as we find as long as it doesn't start with 0 */
- if (str[idx] >= '1' && str[idx] <= '9') {
+ c = PyUnicode_READ(kind, str, idx);
+ if (c == '0') {
+ /* if it starts with 0 we only expect one integer digit */
idx++;
- while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
}
- /* if it starts with 0 we only expect one integer digit */
- else if (str[idx] == '0') {
+ else if (IS_DIGIT(c)) {
idx++;
+ while (idx <= end_idx && IS_DIGIT(PyUnicode_READ(kind, str, idx))) {
+ idx++;
+ }
}
- /* no integer digits, error */
else {
+ /* no integer digits, error */
PyErr_SetNone(PyExc_StopIteration);
return NULL;
}
/* if the next char is '.' followed by a digit then read all float digits */
- if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') {
+ if (idx < end_idx &&
+ PyUnicode_READ(kind, str, idx) == '.' &&
+ IS_DIGIT(PyUnicode_READ(kind, str, idx + 1))) {
is_float = 1;
idx += 2;
- while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
+ while (idx <= end_idx && IS_DIGIT(PyUnicode_READ(kind, str, idx))) idx++;
}
/* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */
- if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) {
+ if (idx < end_idx &&
+ (PyUnicode_READ(kind, str, idx) == 'e' ||
+ PyUnicode_READ(kind, str, idx) == 'E')) {
Py_ssize_t e_start = idx;
idx++;
/* read an exponent sign if present */
- if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++;
+ if (idx < end_idx &&
+ (PyUnicode_READ(kind, str, idx) == '-' ||
+ PyUnicode_READ(kind, str, idx) == '+')) idx++;
/* read all digits */
- while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++;
+ while (idx <= end_idx && IS_DIGIT(PyUnicode_READ(kind, str, idx))) idx++;
/* if we got a digit, then parse as float. if not, backtrack */
- if (str[idx - 1] >= '0' && str[idx - 1] <= '9') {
+ if (IS_DIGIT(PyUnicode_READ(kind, str, idx - 1))) {
is_float = 1;
}
else {
@@ -1880,7 +1898,11 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
}
/* copy the section we determined to be a number */
- numstr = PyUnicode_FromUnicode(&str[start], idx - start);
+#if PY_MAJOR_VERSION >= 3
+ numstr = PyUnicode_Substring(pystr, start, idx);
+#else
+ numstr = PyUnicode_FromUnicode(&((Py_UNICODE *)str)[start], idx - start);
+#endif
if (numstr == NULL)
return NULL;
if (is_float) {
@@ -2018,8 +2040,9 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
Returns a new PyObject representation of the term.
*/
- Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
- Py_ssize_t length = PyUnicode_GET_SIZE(pystr);
+ PY2_UNUSED int kind = PyUnicode_KIND(pystr);
+ void *str = PyUnicode_DATA(pystr);
+ Py_ssize_t length = PyUnicode_GetLength(pystr);
PyObject *rval = NULL;
int fallthrough = 0;
if (idx >= length) {
@@ -2028,7 +2051,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
}
if (Py_EnterRecursiveCall(" while decoding a JSON document"))
return NULL;
- switch (str[idx]) {
+ switch (PyUnicode_READ(kind, str, idx)) {
case '"':
/* string */
rval = scanstring_unicode(pystr, idx + 1,
@@ -2045,7 +2068,10 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
break;
case 'n':
/* null */
- if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') {
+ if ((idx + 3 < length) &&
+ PyUnicode_READ(kind, str, idx + 1) == 'u' &&
+ PyUnicode_READ(kind, str, idx + 2) == 'l' &&
+ PyUnicode_READ(kind, str, idx + 3) == 'l') {
Py_INCREF(Py_None);
*next_idx_ptr = idx + 4;
rval = Py_None;
@@ -2055,7 +2081,10 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
break;
case 't':
/* true */
- if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') {
+ if ((idx + 3 < length) &&
+ PyUnicode_READ(kind, str, idx + 1) == 'r' &&
+ PyUnicode_READ(kind, str, idx + 2) == 'u' &&
+ PyUnicode_READ(kind, str, idx + 3) == 'e') {
Py_INCREF(Py_True);
*next_idx_ptr = idx + 4;
rval = Py_True;
@@ -2065,7 +2094,11 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
break;
case 'f':
/* false */
- if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') {
+ if ((idx + 4 < length) &&
+ PyUnicode_READ(kind, str, idx + 1) == 'a' &&
+ PyUnicode_READ(kind, str, idx + 2) == 'l' &&
+ PyUnicode_READ(kind, str, idx + 3) == 's' &&
+ PyUnicode_READ(kind, str, idx + 4) == 'e') {
Py_INCREF(Py_False);
*next_idx_ptr = idx + 5;
rval = Py_False;
@@ -2075,7 +2108,9 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
break;
case 'N':
/* NaN */
- if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') {
+ if ((idx + 2 < length) &&
+ PyUnicode_READ(kind, str, idx + 1) == 'a' &&
+ PyUnicode_READ(kind, str, idx + 2) == 'N') {
rval = _parse_constant(s, "NaN", idx, next_idx_ptr);
}
else
@@ -2083,7 +2118,14 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
break;
case 'I':
/* Infinity */
- if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') {
+ if ((idx + 7 < length) &&
+ PyUnicode_READ(kind, str, idx + 1) == 'n' &&
+ PyUnicode_READ(kind, str, idx + 2) == 'f' &&
+ PyUnicode_READ(kind, str, idx + 3) == 'i' &&
+ PyUnicode_READ(kind, str, idx + 4) == 'n' &&
+ PyUnicode_READ(kind, str, idx + 5) == 'i' &&
+ PyUnicode_READ(kind, str, idx + 6) == 't' &&
+ PyUnicode_READ(kind, str, idx + 7) == 'y') {
rval = _parse_constant(s, "Infinity", idx, next_idx_ptr);
}
else
@@ -2091,7 +2133,15 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_
break;
case '-':
/* -Infinity */
- if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') {
+ if ((idx + 8 < length) &&
+ PyUnicode_READ(kind, str, idx + 1) == 'I' &&
+ PyUnicode_READ(kind, str, idx + 2) == 'n' &&
+ PyUnicode_READ(kind, str, idx + 3) == 'f' &&
+ PyUnicode_READ(kind, str, idx + 4) == 'i' &&
+ PyUnicode_READ(kind, str, idx + 5) == 'n' &&
+ PyUnicode_READ(kind, str, idx + 6) == 'i' &&
+ PyUnicode_READ(kind, str, idx + 7) == 't' &&
+ PyUnicode_READ(kind, str, idx + 8) == 'y') {
rval = _parse_constant(s, "-Infinity", idx, next_idx_ptr);
}
else
diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py
index 297bc1b..045725a 100644
--- a/simplejson/tests/test_scanstring.py
+++ b/simplejson/tests/test_scanstring.py
@@ -3,7 +3,7 @@ from unittest import TestCase
import simplejson as json
import simplejson.decoder
-from simplejson.compat import b
+from simplejson.compat import b, PY3
class TestScanString(TestCase):
# The bytes type is intentionally not used in most of these tests
@@ -112,6 +112,27 @@ class TestScanString(TestCase):
scanstring('["Bad value", truth]', 2, None, True),
(u'Bad value', 12))
+ for c in map(chr, range(0x00, 0x1f)):
+ self.assertEquals(
+ scanstring(c + '"', 0, None, False),
+ (c, 2))
+ self.assertRaises(
+ ValueError,
+ scanstring, c + '"', 0, None, True)
+
+ self.assertRaises(ValueError, scanstring, '', 0, None, True)
+ self.assertRaises(ValueError, scanstring, 'a', 0, None, True)
+ self.assertRaises(ValueError, scanstring, '\\', 0, None, True)
+ self.assertRaises(ValueError, scanstring, '\\u', 0, None, True)
+ self.assertRaises(ValueError, scanstring, '\\u0', 0, None, True)
+ self.assertRaises(ValueError, scanstring, '\\u01', 0, None, True)
+ self.assertRaises(ValueError, scanstring, '\\u012', 0, None, True)
+ self.assertRaises(ValueError, scanstring, '\\u0123', 0, None, True)
+ if sys.maxunicode > 65535:
+ self.assertRaises(ValueError, scanstring, '\\ud834"', 0, None, True),
+ self.assertRaises(ValueError, scanstring, '\\ud834\\u"', 0, None, True),
+ self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True),
+
def test_issue3623(self):
self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
"xxx")