diff options
author | Bob Ippolito <bob@redivi.com> | 2008-03-23 21:07:48 +0000 |
---|---|---|
committer | Bob Ippolito <bob@redivi.com> | 2008-03-23 21:07:48 +0000 |
commit | d7879c9c97547d4d8ca9e9072bbd8d33bd315c6c (patch) | |
tree | 7f045db7d61c0d404d8e647ae7401f7c6b83d026 | |
parent | 2f1e6e426465e68ca3fd30c596cc98c30f7ff076 (diff) | |
download | simplejson-d7879c9c97547d4d8ca9e9072bbd8d33bd315c6c.tar.gz |
decoder speed enhancements
git-svn-id: http://simplejson.googlecode.com/svn/trunk@77 a4795897-2c25-0410-b006-0d3caba88fa1
-rwxr-xr-x | scripts/bench.sh | 1 | ||||
-rw-r--r-- | setup.py | 2 | ||||
-rw-r--r-- | simplejson/__init__.py | 2 | ||||
-rw-r--r-- | simplejson/_speedups.c | 399 | ||||
-rw-r--r-- | simplejson/decoder.py | 12 | ||||
-rw-r--r-- | simplejson/tests/test_unicode.py | 10 |
6 files changed, 422 insertions, 4 deletions
diff --git a/scripts/bench.sh b/scripts/bench.sh index e30e246..de41c3e 100755 --- a/scripts/bench.sh +++ b/scripts/bench.sh @@ -1,2 +1,3 @@ #!/bin/sh /usr/bin/env python -mtimeit -s 'from simplejson.tests.test_pass1 import test_parse' 'test_parse()' +/usr/bin/env python -c 'from simplejson.tests.test_pass1 import test_parse; import profile; profile.run("for i in xrange(100): test_parse()")' @@ -18,7 +18,7 @@ from distutils.command.build_ext import build_ext from distutils.errors import CCompilerError, DistutilsExecError, \ DistutilsPlatformError -VERSION = '1.8' +VERSION = '1.8.1' DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python" LONG_DESCRIPTION = """ simplejson is a simple, fast, complete, correct and extensible diff --git a/simplejson/__init__.py b/simplejson/__init__.py index e39a110..be9e87a 100644 --- a/simplejson/__init__.py +++ b/simplejson/__init__.py @@ -99,7 +99,7 @@ pretty-print:: Note that the JSON produced by this module's default settings is a subset of YAML, so it may be used as a serializer for that as well. """ -__version__ = '1.8' +__version__ = '1.8.1' __all__ = [ 'dump', 'dumps', 'load', 'loads', 'JSONDecoder', 'JSONEncoder', diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c index 053369a..c7a7a7c 100644 --- a/simplejson/_speedups.c +++ b/simplejson/_speedups.c @@ -11,6 +11,8 @@ typedef int Py_ssize_t; #define UNUSED #endif +#define DEFAULT_ENCODING "utf-8" + static Py_ssize_t ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars); static PyObject * @@ -181,6 +183,402 @@ ascii_escape_str(PyObject *pystr) { return rval; } +void +raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) { + static PyObject *errmsg_fn = NULL; + PyObject *pymsg; + if (errmsg_fn == NULL) { + PyObject *decoder = PyImport_ImportModule("simplejson.decoder"); + if (decoder == NULL) return; + errmsg_fn = PyObject_GetAttrString(decoder, "errmsg"); + if (errmsg_fn == NULL) return; + Py_XDECREF(decoder); + } +#if PY_VERSION_HEX < 0x02050000 + pymsg = PyObject_CallFunction(errmsg_fn, "(zOi)", msg, s, end); +#else + pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end); +#endif + PyErr_SetObject(PyExc_ValueError, pymsg); +/* + +def linecol(doc, pos): + lineno = doc.count('\n', 0, pos) + 1 + if lineno == 1: + colno = pos + else: + colno = pos - doc.rindex('\n', 0, pos) + return lineno, colno + +def errmsg(msg, doc, pos, end=None): + lineno, colno = linecol(doc, pos) + if end is None: + return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) + endlineno, endcolno = linecol(doc, end) + return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( + msg, lineno, colno, endlineno, endcolno, pos, end) + +*/ +} + +static PyObject * +join_list_unicode(PyObject *lst) { + static PyObject *ustr = NULL; + static PyObject *joinstr = NULL; + if (ustr == NULL) { + Py_UNICODE c = 0; + ustr = PyUnicode_FromUnicode(&c, 0); + } + if (joinstr == NULL) { + joinstr = PyString_FromString("join"); + } + if (joinstr == NULL || ustr == NULL) { + return NULL; + } + return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL); +} + +static PyObject * +scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding) { + PyObject *rval; + Py_ssize_t len = PyString_GET_SIZE(pystr); + Py_ssize_t begin = end - 1; + Py_ssize_t next = begin; + char *buf = PyString_AS_STRING(pystr); + PyObject *chunks = PyList_New(0); + if (chunks == NULL) { + goto bail; + } + while (1) { + /* Find the end of the string or the next escape */ + Py_UNICODE c = 0; + PyObject *chunk = NULL; + for (next = end; next < len; next++) { + c = buf[next]; + if (c == '"' || c == '\\') { + break; + } + } + if (!(c == '"' || c == '\\')) { + raise_errmsg("Unterminated string starting at", pystr, begin); + goto bail; + } + /* Pick up this chunk if it's not zero length */ + if (next != end) { + PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end); + if (strchunk == NULL) { + goto bail; + } + chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); + if (chunk == NULL) { + goto bail; + } + if (PyList_Append(chunks, chunk)) { + goto bail; + } + Py_DECREF(chunk); + } + next++; + if (c == '"') { + end = next; + break; + } + if (next == len) { + raise_errmsg("Unterminated string starting at", pystr, begin); + goto bail; + } + c = buf[next]; + if (c != 'u') { + /* Non-unicode backslash escapes */ + end = next + 1; + switch (c) { + case '"': break; + case '\\': break; + case '/': break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + default: c = 0; + } + if (c == 0) { + raise_errmsg("Invalid \\escape", pystr, end - 2); + goto bail; + } + } else { + c = 0; + next++; + end = next + 4; + if (end >= len) { + raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); + goto bail; + } + /* Decode 4 hex digits */ + for (; next < end; next++) { + Py_ssize_t shl = (end - next - 1) << 2; + Py_UNICODE digit = buf[next]; + switch (digit) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c |= (digit - '0') << shl; break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + c |= (digit - 'a' + 10) << shl; break; + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + c |= (digit - 'A' + 10) << shl; break; + default: + raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); + goto bail; + } + } +#ifdef Py_UNICODE_WIDE + /* Surrogate pair */ + if (c >= 0xd800 && c <= 0xdbff) { + Py_UNICODE c2 = 0; + if (end + 6 >= len) { + raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, + end - 5); + } + if (buf[next++] != '\\' || buf[next++] != 'u') { + raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, + end - 5); + } + end += 6; + /* Decode 4 hex digits */ + for (; next < end; next++) { + Py_ssize_t shl = (end - next - 1) << 2; + Py_UNICODE digit = buf[next]; + switch (digit) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c2 |= (digit - '0') << shl; break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + c2 |= (digit - 'a' + 10) << shl; break; + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + c2 |= (digit - 'A' + 10) << shl; break; + default: + raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); + goto bail; + } + } + c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); + } +#endif + } + chunk = PyUnicode_FromUnicode(&c, 1); + if (chunk == NULL) { + goto bail; + } + if (PyList_Append(chunks, chunk)) { + goto bail; + } + Py_DECREF(chunk); + } + + rval = join_list_unicode(chunks); + if (rval == NULL) { + goto bail; + } + Py_DECREF(chunks); + chunks = NULL; +#if PY_VERSION_HEX < 0x02050000 + return Py_BuildValue("(Ni)", rval, end); +#else + return Py_BuildValue("(Nn)", rval, end); +#endif +bail: + Py_XDECREF(chunks); + return NULL; +} + + +static PyObject * +scanstring_unicode(PyObject *pystr, Py_ssize_t end) { + PyObject *rval; + Py_ssize_t len = PyUnicode_GET_SIZE(pystr); + Py_ssize_t begin = end - 1; + Py_ssize_t next = begin; + const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); + PyObject *chunks = PyList_New(0); + if (chunks == NULL) { + goto bail; + } + while (1) { + /* Find the end of the string or the next escape */ + Py_UNICODE c = 0; + PyObject *chunk = NULL; + for (next = end; next < len; next++) { + c = buf[next]; + if (c == '"' || c == '\\') { + break; + } + } + if (!(c == '"' || c == '\\')) { + raise_errmsg("Unterminated string starting at", pystr, begin); + goto bail; + } + /* Pick up this chunk if it's not zero length */ + if (next != end) { + chunk = PyUnicode_FromUnicode(&buf[end], next - end); + if (chunk == NULL) { + goto bail; + } + if (PyList_Append(chunks, chunk)) { + goto bail; + } + Py_DECREF(chunk); + } + next++; + if (c == '"') { + end = next; + break; + } + if (next == len) { + raise_errmsg("Unterminated string starting at", pystr, begin); + goto bail; + } + c = buf[next]; + if (c != 'u') { + /* Non-unicode backslash escapes */ + end = next + 1; + switch (c) { + case '"': break; + case '\\': break; + case '/': break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + default: c = 0; + } + if (c == 0) { + raise_errmsg("Invalid \\escape", pystr, end - 2); + goto bail; + } + } else { + c = 0; + next++; + end = next + 4; + if (end >= len) { + raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); + goto bail; + } + /* Decode 4 hex digits */ + for (; next < end; next++) { + Py_ssize_t shl = (end - next - 1) << 2; + Py_UNICODE digit = buf[next]; + switch (digit) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c |= (digit - '0') << shl; break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + c |= (digit - 'a' + 10) << shl; break; + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + c |= (digit - 'A' + 10) << shl; break; + default: + raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); + goto bail; + } + } +#ifdef Py_UNICODE_WIDE + /* Surrogate pair */ + if (c >= 0xd800 && c <= 0xdbff) { + Py_UNICODE c2 = 0; + if (end + 6 >= len) { + raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, + end - 5); + } + if (buf[next++] != '\\' || buf[next++] != 'u') { + raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, + end - 5); + } + end += 6; + /* Decode 4 hex digits */ + for (; next < end; next++) { + Py_ssize_t shl = (end - next - 1) << 2; + Py_UNICODE digit = buf[next]; + switch (digit) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c2 |= (digit - '0') << shl; break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + c2 |= (digit - 'a' + 10) << shl; break; + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + c2 |= (digit - 'A' + 10) << shl; break; + default: + raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); + goto bail; + } + } + c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); + } +#endif + } + chunk = PyUnicode_FromUnicode(&c, 1); + if (chunk == NULL) { + goto bail; + } + if (PyList_Append(chunks, chunk)) { + goto bail; + } + Py_DECREF(chunk); + } + + rval = join_list_unicode(chunks); + if (rval == NULL) { + goto bail; + } + Py_DECREF(chunks); + chunks = NULL; +#if PY_VERSION_HEX < 0x02050000 + return Py_BuildValue("(Ni)", rval, end); +#else + return Py_BuildValue("(Nn)", rval, end); +#endif +bail: + Py_XDECREF(chunks); + return NULL; +} + +PyDoc_STRVAR(pydoc_scanstring, + "scanstring(basestring, end, encoding) -> (str, end)\n" + "\n" + "..." +); + +static PyObject * +py_scanstring(PyObject* self UNUSED, PyObject *args) { + PyObject *pystr; + Py_ssize_t end; + char *encoding; +#if PY_VERSION_HEX < 0x02050000 + if (!PyArg_ParseTuple(args, "Oiz:scanstring", &pystr, &end, &encoding)) { +#else + if (!PyArg_ParseTuple(args, "Onz:scanstring", &pystr, &end, &encoding)) { +#endif + return NULL; + } + if (encoding == NULL) { + encoding = DEFAULT_ENCODING; + } + if (PyString_Check(pystr)) { + return scanstring_str(pystr, end, encoding); + } else if (PyUnicode_Check(pystr)) { + return scanstring_unicode(pystr, end); + } + PyErr_SetString(PyExc_TypeError, "first argument must be a string"); + return NULL; +} + PyDoc_STRVAR(pydoc_encode_basestring_ascii, "encode_basestring_ascii(basestring) -> str\n" "\n" @@ -208,6 +606,7 @@ py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) { } static PyMethodDef speedups_methods[] = { DEFN(encode_basestring_ascii, METH_O), + DEFN(scanstring, METH_VARARGS), {} }; #undef DEFN diff --git a/simplejson/decoder.py b/simplejson/decoder.py index 06f9de3..6a73e9a 100644 --- a/simplejson/decoder.py +++ b/simplejson/decoder.py @@ -5,6 +5,10 @@ import re import sys from simplejson.scanner import Scanner, pattern +try: + from simplejson import _speedups +except: + _speedups = None FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL @@ -110,7 +114,7 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match): next_end = end + 5 msg = "Invalid \\uXXXX escape" try: - if len(esc) != 4 or not esc.isalnum(): + if len(esc) != 4: raise ValueError uni = int(esc, 16) if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: @@ -118,7 +122,7 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match): if not s[end + 5:end + 7] == '\\u': raise ValueError esc2 = s[end + 7:end + 11] - if len(esc2) != 4 or not esc2.isalnum(): + if len(esc2) != 4: raise ValueError uni2 = int(esc2, 16) uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) @@ -130,6 +134,10 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match): _append(m) return u''.join(chunks), end +# Use speedup +if _speedups is not None: + scanstring = _speedups.scanstring + def JSONString(match, context): encoding = getattr(context, 'encoding', None) return scanstring(match.string, match.end(), encoding) diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py index 0a92b28..f73b48a 100644 --- a/simplejson/tests/test_unicode.py +++ b/simplejson/tests/test_unicode.py @@ -24,3 +24,13 @@ def test_big_unicode_decode(): u = u'z\U0001d120x' assert S.loads('"' + u + '"') == u assert S.loads('"z\\ud834\\udd20x"') == u + +def test_unicode_decode(): + for i in range(0, 0xd7ff): + u = unichr(i) + json = '"\\u%04x"' % (i,) + res = S.loads(json) + assert res == u, 'S.loads(%r) != %r got %r' % (json, u, res) + +if __name__ == '__main__': + test_unicode_decode() |