decoder speed enhancements

git-svn-id: http://simplejson.googlecode.com/svn/trunk@77 a4795897-2c25-0410-b006-0d3caba88fa1
author: Bob Ippolito <bob@redivi.com> 2008-03-23 21:07:48 +0000
committer: Bob Ippolito <bob@redivi.com> 2008-03-23 21:07:48 +0000
commit: d7879c9c97547d4d8ca9e9072bbd8d33bd315c6c (patch)
tree: 7f045db7d61c0d404d8e647ae7401f7c6b83d026
parent: 2f1e6e426465e68ca3fd30c596cc98c30f7ff076 (diff)
download: simplejson-d7879c9c97547d4d8ca9e9072bbd8d33bd315c6c.tar.gz
6 files changed, 422 insertions, 4 deletions
diff --git a/scripts/bench.sh b/scripts/bench.sh
index e30e246..de41c3e 100755
--- a/scripts/bench.sh
+++ b/scripts/bench.sh
@@ -1,2 +1,3 @@
 #!/bin/sh
 /usr/bin/env python -mtimeit -s 'from simplejson.tests.test_pass1 import test_parse' 'test_parse()'
+/usr/bin/env python -c 'from simplejson.tests.test_pass1 import test_parse; import profile; profile.run("for i in xrange(100): test_parse()")'
diff --git a/setup.py b/setup.py
index f7b0859..7fbd436 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ from distutils.command.build_ext import build_ext
 from distutils.errors import CCompilerError, DistutilsExecError, \
     DistutilsPlatformError
 
-VERSION = '1.8'
+VERSION = '1.8.1'
 DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python"
 LONG_DESCRIPTION = """
 simplejson is a simple, fast, complete, correct and extensible
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index e39a110..be9e87a 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -99,7 +99,7 @@ pretty-print::
 Note that the JSON produced by this module's default settings
 is a subset of YAML, so it may be used as a serializer for that as well.
 """
-__version__ = '1.8'
+__version__ = '1.8.1'
 __all__ = [
     'dump', 'dumps', 'load', 'loads',
     'JSONDecoder', 'JSONEncoder',
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
index 053369a..c7a7a7c 100644
--- a/simplejson/_speedups.c
+++ b/simplejson/_speedups.c
@@ -11,6 +11,8 @@ typedef int Py_ssize_t;
 #define UNUSED
 #endif
 
+#define DEFAULT_ENCODING "utf-8"
+
 static Py_ssize_t
 ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars);
 static PyObject *
@@ -181,6 +183,402 @@ ascii_escape_str(PyObject *pystr) {
     return rval;
 }
 
+/*
+ * Set a ValueError describing a decoding failure at offset `end` of `s`.
+ *
+ * The message is built by calling simplejson.decoder.errmsg(msg, s, end);
+ * that callable is looked up on first use and cached in a static.
+ *
+ *   msg: short description of the failure (C string)
+ *   s:   the string being scanned (borrowed reference)
+ *   end: offset into s that the message should point at
+ *
+ * On return an exception is always set: the ValueError, or the error
+ * that prevented the message from being built.
+ */
+void
+raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) {
+    static PyObject *errmsg_fn = NULL;
+    PyObject *pymsg;
+    if (errmsg_fn == NULL) {
+        PyObject *decoder = PyImport_ImportModule("simplejson.decoder");
+        if (decoder == NULL) return;
+        errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
+        /* Release the module before testing the lookup so that the
+           failure path does not leak it. */
+        Py_DECREF(decoder);
+        if (errmsg_fn == NULL) return;
+    }
+#if PY_VERSION_HEX < 0x02050000
+    pymsg = PyObject_CallFunction(errmsg_fn, "(zOi)", msg, s, end);
+#else
+    pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
+#endif
+    /* If errmsg() itself failed, keep its exception instead of masking it. */
+    if (pymsg == NULL) return;
+    PyErr_SetObject(PyExc_ValueError, pymsg);
+    /* PyErr_SetObject does not steal the reference to its value. */
+    Py_DECREF(pymsg);
+}
+
+/* Return u''.join(lst) as a new reference, or NULL with an exception set. */
+static PyObject *
+join_list_unicode(PyObject *lst) {
+    static PyObject *empty = NULL;  /* cached zero-length unicode object */
+    static PyObject *method = NULL; /* cached method name "join" */
+    if (empty == NULL) {
+        Py_UNICODE nul = 0;
+        empty = PyUnicode_FromUnicode(&nul, 0);
+    }
+    if (method == NULL) {
+        method = PyString_FromString("join");
+    }
+    /* Either cached object may have failed to be created just above. */
+    if (empty == NULL || method == NULL) return NULL;
+    return PyObject_CallMethodObjArgs(empty, method, lst, NULL);
+}
+
+/*
+ * Scan the JSON string in byte string pystr from index end (one past the
+ * opening quote), decoding literal runs with encoding.  Returns a new
+ * (unicode, index past the closing quote) tuple, or NULL with an error set.
+ */
+static PyObject *
+scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding) {
+    PyObject *rval;
+    Py_ssize_t len = PyString_GET_SIZE(pystr);
+    Py_ssize_t begin = end - 1; /* position reported in "starting at" errors */
+    Py_ssize_t next = begin;
+    char *buf = PyString_AS_STRING(pystr);
+    PyObject *chunks = PyList_New(0); /* decoded pieces, joined at the end */
+    if (chunks == NULL) {
+        goto bail;
+    }
+    while (1) {
+        /* Find the end of the string or the next escape */
+        Py_UNICODE c = 0;
+        PyObject *chunk = NULL;
+        for (next = end; next < len; next++) {
+            c = buf[next];
+            if (c == '"' || c == '\\') {
+                break;
+            }
+        }
+        if (!(c == '"' || c == '\\')) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        /* Pick up this chunk if it's not zero length */
+        if (next != end) {
+            PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
+            if (strchunk == NULL) {
+                goto bail;
+            }
+            chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
+            Py_DECREF(strchunk); /* temporary view; release it in every case */
+            if (chunk == NULL) {
+                goto bail;
+            }
+            if (PyList_Append(chunks, chunk)) {
+                Py_DECREF(chunk);
+                goto bail;
+            }
+            Py_DECREF(chunk);
+        }
+        next++;
+        if (c == '"') {
+            end = next;
+            break;
+        }
+        if (next == len) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        c = buf[next];
+        if (c != 'u') {
+            /* Non-unicode backslash escapes */
+            end = next + 1;
+            switch (c) {
+                case '"': break;
+                case '\\': break;
+                case '/': break;
+                case 'b': c = '\b'; break;
+                case 'f': c = '\f'; break;
+                case 'n': c = '\n'; break;
+                case 'r': c = '\r'; break;
+                case 't': c = '\t'; break;
+                default: c = 0;
+            }
+            if (c == 0) {
+                raise_errmsg("Invalid \\escape", pystr, end - 2);
+                goto bail;
+            }
+        } else {
+            c = 0;
+            next++;
+            end = next + 4;
+            if (end >= len) {
+                raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
+                goto bail;
+            }
+            /* Decode 4 hex digits */
+            for (; next < end; next++) {
+                Py_ssize_t shl = (end - next - 1) << 2;
+                Py_UNICODE digit = buf[next];
+                switch (digit) {
+                    case '0': case '1': case '2': case '3': case '4':
+                    case '5': case '6': case '7': case '8': case '9':
+                        c |= (digit - '0') << shl; break;
+                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+                        c |= (digit - 'a' + 10) << shl; break;
+                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+                        c |= (digit - 'A' + 10) << shl; break;
+                    default:
+                        raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                        goto bail;
+                }
+            }
+#ifdef Py_UNICODE_WIDE
+            /* Surrogate pair: the length test short-circuits before buf is read */
+            if (c >= 0xd800 && c <= 0xdbff) {
+                Py_UNICODE c2 = 0;
+                if (end + 6 >= len || buf[next++] != '\\' || buf[next++] != 'u') {
+                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+                        end - 5);
+                    goto bail;
+                }
+                end += 6;
+                /* Decode 4 hex digits */
+                for (; next < end; next++) {
+                    Py_ssize_t shl = (end - next - 1) << 2;
+                    Py_UNICODE digit = buf[next];
+                    switch (digit) {
+                        case '0': case '1': case '2': case '3': case '4':
+                        case '5': case '6': case '7': case '8': case '9':
+                            c2 |= (digit - '0') << shl; break;
+                        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+                            c2 |= (digit - 'a' + 10) << shl; break;
+                        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+                            c2 |= (digit - 'A' + 10) << shl; break;
+                        default:
+                            raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                            goto bail;
+                    }
+                }
+                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+            }
+#endif
+        }
+        chunk = PyUnicode_FromUnicode(&c, 1);
+        if (chunk == NULL) {
+            goto bail;
+        }
+        if (PyList_Append(chunks, chunk)) {
+            Py_DECREF(chunk);
+            goto bail;
+        }
+        Py_DECREF(chunk);
+    }
+
+    rval = join_list_unicode(chunks);
+    if (rval == NULL) {
+        goto bail;
+    }
+    Py_DECREF(chunks);
+#if PY_VERSION_HEX < 0x02050000
+    return Py_BuildValue("(Ni)", rval, end);
+#else
+    return Py_BuildValue("(Nn)", rval, end);
+#endif
+bail:
+    Py_XDECREF(chunks);
+    return NULL;
+}
+
+
+/*
+ * Scan the JSON string in unicode object pystr from index end (one past
+ * the opening quote).
+ * Returns a new (unicode, index past the closing quote) tuple, or NULL
+ * with an exception set if the string is malformed.
+ */
+static PyObject *
+scanstring_unicode(PyObject *pystr, Py_ssize_t end) {
+    PyObject *rval;
+    Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
+    Py_ssize_t begin = end - 1; /* position reported in "starting at" errors */
+    Py_ssize_t next = begin;
+    const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
+    PyObject *chunks = PyList_New(0); /* decoded pieces, joined at the end */
+    if (chunks == NULL) {
+        goto bail;
+    }
+    while (1) {
+        /* Find the end of the string or the next escape */
+        Py_UNICODE c = 0;
+        PyObject *chunk = NULL;
+        for (next = end; next < len; next++) {
+            c = buf[next];
+            if (c == '"' || c == '\\') {
+                break;
+            }
+        }
+        if (!(c == '"' || c == '\\')) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        /* Pick up this chunk if it's not zero length */
+        if (next != end) {
+            chunk = PyUnicode_FromUnicode(&buf[end], next - end);
+            if (chunk == NULL) {
+                goto bail;
+            }
+            if (PyList_Append(chunks, chunk)) {
+                Py_DECREF(chunk);
+                goto bail;
+            }
+            Py_DECREF(chunk);
+        }
+        next++;
+        if (c == '"') {
+            end = next;
+            break;
+        }
+        if (next == len) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        c = buf[next];
+        if (c != 'u') {
+            /* Non-unicode backslash escapes */
+            end = next + 1;
+            switch (c) {
+                case '"': break;
+                case '\\': break;
+                case '/': break;
+                case 'b': c = '\b'; break;
+                case 'f': c = '\f'; break;
+                case 'n': c = '\n'; break;
+                case 'r': c = '\r'; break;
+                case 't': c = '\t'; break;
+                default: c = 0;
+            }
+            if (c == 0) {
+                raise_errmsg("Invalid \\escape", pystr, end - 2);
+                goto bail;
+            }
+        } else {
+            c = 0;
+            next++;
+            end = next + 4;
+            if (end >= len) {
+                raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
+                goto bail;
+            }
+            /* Decode 4 hex digits */
+            for (; next < end; next++) {
+                Py_ssize_t shl = (end - next - 1) << 2;
+                Py_UNICODE digit = buf[next];
+                switch (digit) {
+                    case '0': case '1': case '2': case '3': case '4':
+                    case '5': case '6': case '7': case '8': case '9':
+                        c |= (digit - '0') << shl; break;
+                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+                        c |= (digit - 'a' + 10) << shl; break;
+                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+                        c |= (digit - 'A' + 10) << shl; break;
+                    default:
+                        raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                        goto bail;
+                }
+            }
+#ifdef Py_UNICODE_WIDE
+            /* Surrogate pair: the length test short-circuits before buf is read */
+            if (c >= 0xd800 && c <= 0xdbff) {
+                Py_UNICODE c2 = 0;
+                if (end + 6 >= len || buf[next++] != '\\' || buf[next++] != 'u') {
+                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+                        end - 5);
+                    goto bail;
+                }
+                end += 6;
+                /* Decode 4 hex digits */
+                for (; next < end; next++) {
+                    Py_ssize_t shl = (end - next - 1) << 2;
+                    Py_UNICODE digit = buf[next];
+                    switch (digit) {
+                        case '0': case '1': case '2': case '3': case '4':
+                        case '5': case '6': case '7': case '8': case '9':
+                            c2 |= (digit - '0') << shl; break;
+                        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+                            c2 |= (digit - 'a' + 10) << shl; break;
+                        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+                            c2 |= (digit - 'A' + 10) << shl; break;
+                        default:
+                            raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                            goto bail;
+                    }
+                }
+                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+            }
+#endif
+        }
+        chunk = PyUnicode_FromUnicode(&c, 1);
+        if (chunk == NULL) {
+            goto bail;
+        }
+        if (PyList_Append(chunks, chunk)) {
+            Py_DECREF(chunk);
+            goto bail;
+        }
+        Py_DECREF(chunk);
+    }
+
+    rval = join_list_unicode(chunks);
+    if (rval == NULL) {
+        goto bail;
+    }
+    Py_DECREF(chunks);
+#if PY_VERSION_HEX < 0x02050000
+    return Py_BuildValue("(Ni)", rval, end);
+#else
+    return Py_BuildValue("(Nn)", rval, end);
+#endif
+bail:
+    Py_XDECREF(chunks);
+    return NULL;
+}
+
+PyDoc_STRVAR(pydoc_scanstring,
+    "scanstring(basestring, end, encoding) -> (str, end)\n"
+    "\n"
+    "..."
+);
+
+/* scanstring(basestring, end, encoding): check arguments, scan by type. */
+static PyObject *
+py_scanstring(PyObject* self UNUSED, PyObject *args) {
+    PyObject *pystr;   /* str or unicode object to scan */
+    Py_ssize_t end;    /* index to start scanning from */
+    char *encoding;    /* codec for str input; "z" turns None into NULL */
+#if PY_VERSION_HEX < 0x02050000
+    if (!PyArg_ParseTuple(args, "Oiz:scanstring", &pystr, &end, &encoding)) {
+#else
+    if (!PyArg_ParseTuple(args, "Onz:scanstring", &pystr, &end, &encoding)) {
+#endif
+        return NULL;
+    }
+    if (end < 0) { /* a negative index would be read from before the buffer */
+        PyErr_SetString(PyExc_ValueError, "end is out of bounds");
+        return NULL;
+    }
+    if (encoding == NULL) encoding = DEFAULT_ENCODING;
+    if (PyString_Check(pystr)) return scanstring_str(pystr, end, encoding);
+    if (PyUnicode_Check(pystr)) return scanstring_unicode(pystr, end);
+    PyErr_SetString(PyExc_TypeError, "first argument must be a string");
+    return NULL;
+}
+
 PyDoc_STRVAR(pydoc_encode_basestring_ascii,
     "encode_basestring_ascii(basestring) -> str\n"
     "\n"
@@ -208,6 +606,7 @@ py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) {
     }
 static PyMethodDef speedups_methods[] = {
     DEFN(encode_basestring_ascii, METH_O),
+    DEFN(scanstring, METH_VARARGS),
     {}
 };
 #undef DEFN
diff --git a/simplejson/decoder.py b/simplejson/decoder.py
index 06f9de3..6a73e9a 100644
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -5,6 +5,10 @@ import re
 import sys
 
 from simplejson.scanner import Scanner, pattern
+try:
+    from simplejson import _speedups
+except ImportError:  # C extension unavailable: keep the pure-Python scanstring
+    _speedups = None
 
 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
 
@@ -110,7 +114,7 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match):
             next_end = end + 5
             msg = "Invalid \\uXXXX escape"
             try:
-                if len(esc) != 4 or not esc.isalnum():
+                if len(esc) != 4:
                     raise ValueError
                 uni = int(esc, 16)
                 if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
@@ -118,7 +122,7 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match):
                     if not s[end + 5:end + 7] == '\\u':
                         raise ValueError
                     esc2 = s[end + 7:end + 11]
-                    if len(esc2) != 4 or not esc2.isalnum():
+                    if len(esc2) != 4:
                         raise ValueError
                     uni2 = int(esc2, 16)
                     uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
@@ -130,6 +134,10 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match):
         _append(m)
     return u''.join(chunks), end
 
+# Use speedup
+if _speedups is not None:
+    scanstring = _speedups.scanstring
+
 def JSONString(match, context):
     encoding = getattr(context, 'encoding', None)
     return scanstring(match.string, match.end(), encoding)
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index 0a92b28..f73b48a 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -24,3 +24,13 @@ def test_big_unicode_decode():
     u = u'z\U0001d120x'
     assert S.loads('"' + u + '"') == u
     assert S.loads('"z\\ud834\\udd20x"') == u
+
+def test_unicode_decode():
+    for i in range(0, 0xd800):  # every code point below the surrogate range
+        u = unichr(i)
+        json = '"\\u%04x"' % (i,)
+        res = S.loads(json)
+        assert res == u, 'S.loads(%r) != %r got %r' % (json, u, res)
+
+if __name__ == '__main__':
+    test_unicode_decode()
author	Bob Ippolito <bob@redivi.com>	2008-03-23 21:07:48 +0000
committer	Bob Ippolito <bob@redivi.com>	2008-03-23 21:07:48 +0000
commit	d7879c9c97547d4d8ca9e9072bbd8d33bd315c6c (patch)
tree	7f045db7d61c0d404d8e647ae7401f7c6b83d026
parent	2f1e6e426465e68ca3fd30c596cc98c30f7ff076 (diff)
download	simplejson-d7879c9c97547d4d8ca9e9072bbd8d33bd315c6c.tar.gz