optional C speedups

git-svn-id: http://simplejson.googlecode.com/svn/trunk@45 a4795897-2c25-0410-b006-0d3caba88fa1
author: Bob Ippolito <bob@redivi.com> 2007-03-18 04:51:52 +0000
committer: Bob Ippolito <bob@redivi.com> 2007-03-18 04:51:52 +0000
commit: a0fac116e1e5fd038174312c5b52d15a163fcbc8 (patch)
tree: 615dcd6d0ab1843cc18507ed731eb00f2a12a13e
parent: be103d0368413157c2ac4df0c8b67e97f735bc34 (diff)
download: simplejson-a0fac116e1e5fd038174312c5b52d15a163fcbc8.tar.gz
5 files changed, 291 insertions, 18 deletions
diff --git a/setup.cfg b/setup.cfg
index 406686a..01bb954 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
 [egg_info]
-#tag_build = dev
-#tag_svn_revision = true
+tag_build = dev
+tag_svn_revision = true
diff --git a/setup.py b/setup.py
index 7cf6d49..4a8d7b5 100644
--- a/setup.py
+++ b/setup.py
@@ -3,14 +3,15 @@
 import ez_setup
 ez_setup.use_setuptools()
 
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Extension, Feature
 
-VERSION = '1.6'
+VERSION = '1.7'
 DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python"
 LONG_DESCRIPTION = """
 simplejson is a simple, fast, complete, correct and extensible
 JSON <http://json.org> encoder and decoder for Python 2.3+.  It is
-pure Python code with no dependencies.
+pure Python code with no dependencies, but includes an optional C
+extension for a serious speed boost.
 
 simplejson was formerly known as simple_json, but changed its name to
 comply with PEP 8 module naming guidelines.
@@ -31,6 +32,14 @@ Programming Language :: Python
 Topic :: Software Development :: Libraries :: Python Modules
 """.splitlines()))
 
+# Optional setuptools feature wrapping the C accelerator.  standard=True
+# requests that it be included by default; simplejson/encoder.py falls back
+# to pure Python when the extension cannot be imported.
+speedups = Feature(
+    "optional C speed-enhancement modules",
+    standard=True,
+    ext_modules = [
+        Extension("simplejson._speedups", ["simplejson/_speedups.c"]),
+    ],
+)
+
 setup(
     name="simplejson",
     version=VERSION,
@@ -48,4 +57,5 @@ setup(
     entry_points={
         'paste.filter_app_factory': ['json = simplejson.jsonfilter:factory'],
     },
+    features={'speedups': speedups},
 )
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index dc3d99d..8e4f77b 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -86,7 +86,7 @@ Extending JSONEncoder::
 Note that the JSON produced by this module's default settings
 is a subset of YAML, so it may be used as a serializer for that as well.
 """
-__version__ = '1.6'
+__version__ = '1.7'
 __all__ = [
     'dump', 'dumps', 'load', 'loads',
     'JSONDecoder', 'JSONEncoder',
@@ -95,8 +95,20 @@ __all__ = [
 from decoder import JSONDecoder
 from encoder import JSONEncoder
 
+# Shared encoder built once at import time with the same option values
+# that dump() and dumps() declare as their defaults.  Its bound methods
+# are captured as default arguments of those functions so that calls which
+# override no option can skip constructing a new JSONEncoder.
+_default_encoder = JSONEncoder(
+    skipkeys=False,
+    ensure_ascii=True,
+    check_circular=True,
+    allow_nan=True,
+    indent=None,
+    separators=None,
+    encoding='utf-8'
+)
+
+
 def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
-        allow_nan=True, cls=None, indent=None, encoding='utf-8', **kw):
+        allow_nan=True, cls=None, indent=None, encoding='utf-8',
+        _iterencode=_default_encoder.iterencode, **kw):
     """
     Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
     ``.write()``-supporting file-like object).
@@ -130,19 +142,27 @@ def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
     ``.default()`` method to serialize additional types), specify it with
     the ``cls`` kwarg.
     """
-    if cls is None:
-        cls = JSONEncoder
-    iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
-        check_circular=check_circular, allow_nan=allow_nan, indent=indent,
-        encoding=encoding, **kw).iterencode(obj)
+    # Fast path: every option still has its default value, so reuse the
+    # encoder that was built once at import time.  ``separators`` is not a
+    # parameter of dump(); a caller-supplied value arrives in ``kw`` and is
+    # therefore already covered by the ``not kw`` test (naming it here
+    # would raise NameError).
+    if (skipkeys is False and ensure_ascii is True and
+        check_circular is True and allow_nan is True and
+        cls is None and indent is None and
+        encoding == 'utf-8' and not kw):
+        iterable = _iterencode(obj)
+    else:
+        # Slow path: build an encoder configured with the caller's options.
+        if cls is None:
+            cls = JSONEncoder
+        iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
+            check_circular=check_circular, allow_nan=allow_nan, indent=indent,
+            encoding=encoding, **kw).iterencode(obj)
     # could accelerate with writelines in some versions of Python, at
     # a debuggability cost
     for chunk in iterable:
         fp.write(chunk)
 
+
 def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
         allow_nan=True, cls=None, indent=None, separators=None,
-        encoding='utf-8', **kw):
+        encoding='utf-8', _encode=_default_encoder.encode, **kw):
     """
     Serialize ``obj`` to a JSON formatted ``str``.
 
@@ -178,6 +198,12 @@ def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
     ``.default()`` method to serialize additional types), specify it with
     the ``cls`` kwarg.
     """
+    # cached encoder
+    if (skipkeys is False and ensure_ascii is True and
+        check_circular is True and allow_nan is True and
+        cls is None and indent is None and separators is None and
+        encoding == 'utf-8' and not kw):
+        return _encode(obj)
     if cls is None:
         cls = JSONEncoder
     return cls(
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
new file mode 100644
index 0000000..780bb6b
--- /dev/null
+++ b/simplejson/_speedups.c
@@ -0,0 +1,206 @@
+#include "Python.h"
+#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN)
+typedef int Py_ssize_t;
+#define PY_SSIZE_T_MAX INT_MAX
+#define PY_SSIZE_T_MIN INT_MIN
+#endif
+
+/* Forward declarations for the helpers, the exported function and the
+   module initialiser that are defined further down in this file. */
+static Py_ssize_t
+ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars);
+static PyObject *
+ascii_escape_unicode(PyObject *pystr);
+static PyObject *
+ascii_escape_str(PyObject *pystr);
+static PyObject *
+py_encode_basestring_ascii(PyObject* self __attribute__((__unused__)), PyObject *pystr);
+void init_speedups(void);
+
+/* True when c may be copied to the output unescaped: printable ASCII
+   (' ' through '~') other than backslash, slash and double quote.
+   The argument is evaluated several times, so pass a plain variable. */
+#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '/' && (c) != '"')
+
+/* Append the JSON escape sequence for c to output, starting at index chars,
+   and return the index just past the last byte written.
+
+   Writes a two-byte escape (backslash plus one character) for slash,
+   backslash, double quote, \b, \f, \n, \r and \t, and a six-byte \uXXXX
+   escape otherwise.  On Py_UNICODE_WIDE builds a code point >= 0x10000 is
+   written as a UTF-16 surrogate pair, i.e. twelve bytes.  The caller must
+   guarantee that output has room for the sequence. */
+static Py_ssize_t
+ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) {
+    static const char hex_digits[] = "0123456789abcdef";
+    char short_escape = '\0';
+    int shift;
+
+    output[chars++] = '\\';
+    switch (c) {
+        case '/':
+        case '\\':
+        case '"':
+            /* These escape to themselves. */
+            short_escape = (char)c;
+            break;
+        case '\b': short_escape = 'b'; break;
+        case '\f': short_escape = 'f'; break;
+        case '\n': short_escape = 'n'; break;
+        case '\r': short_escape = 'r'; break;
+        case '\t': short_escape = 't'; break;
+        default: break;
+    }
+    if (short_escape != '\0') {
+        output[chars++] = short_escape;
+        return chars;
+    }
+#ifdef Py_UNICODE_WIDE
+    if (c >= 0x10000) {
+        /* Outside the BMP: emit the lead surrogate here, then fall through
+           so the shared code below emits the trail surrogate. */
+        Py_UNICODE offset = c - 0x10000;
+        Py_UNICODE lead = 0xd800 | ((offset >> 10) & 0x3ff);
+        output[chars++] = 'u';
+        for (shift = 12; shift >= 0; shift -= 4) {
+            output[chars++] = hex_digits[(lead >> shift) & 0xf];
+        }
+        output[chars++] = '\\';
+        c = 0xdc00 | (offset & 0x3ff);
+    }
+#endif
+    /* Four lowercase hex digits, most significant nibble first. */
+    output[chars++] = 'u';
+    for (shift = 12; shift >= 0; shift -= 4) {
+        output[chars++] = hex_digits[(c >> shift) & 0xf];
+    }
+    return chars;
+}
+
+/* Return a new str object holding pystr (a unicode object) as a
+   double-quoted, ASCII-only JSON string literal.  Returns NULL when the
+   input is too long to size safely or when allocating/resizing the result
+   fails.
+
+   The output buffer is grown *before* each character is written, using the
+   true worst-case expansion for this build, so that a run of escaped
+   characters (including surrogate pairs on wide builds) can never write
+   past the end of the buffer. */
+static PyObject *
+ascii_escape_unicode(PyObject *pystr) {
+    Py_ssize_t i;
+    Py_ssize_t input_chars;
+    Py_ssize_t output_size;
+    Py_ssize_t max_output_size;
+    Py_ssize_t chars;
+    PyObject *rval;
+    char *output;
+    Py_UNICODE *input_unicode;
+    /* Worst-case output bytes for one input character: "\uXXXX" is 6, and
+       on wide builds a non-BMP character becomes the surrogate pair
+       "\uXXXX\uXXXX", which is 12. */
+#ifdef Py_UNICODE_WIDE
+    const Py_ssize_t max_expansion = 12;
+#else
+    const Py_ssize_t max_expansion = 6;
+#endif
+
+    input_chars = PyUnicode_GET_SIZE(pystr);
+    input_unicode = PyUnicode_AS_UNICODE(pystr);
+    /* Refuse inputs whose worst-case size would overflow Py_ssize_t. */
+    if (input_chars > (PY_SSIZE_T_MAX - 32) / max_expansion) {
+        PyErr_SetString(PyExc_OverflowError, "string is too long to escape");
+        return NULL;
+    }
+    /* Two quotes plus every character at its worst-case expansion. */
+    max_output_size = 2 + (input_chars * max_expansion);
+    /* Optimistic first guess: mostly unescaped text plus some slack. */
+    output_size = 32 + input_chars;
+    rval = PyString_FromStringAndSize(NULL, output_size);
+    if (rval == NULL) {
+        return NULL;
+    }
+    output = PyString_AS_STRING(rval);
+    chars = 0;
+    output[chars++] = '"';
+    for (i = 0; i < input_chars; i++) {
+        Py_UNICODE c = input_unicode[i];
+        /* Make sure the worst-case escape for this character and the
+           closing quote both fit before anything is written. */
+        if (output_size - chars < max_expansion + 1) {
+            /* Double the buffer, but never beyond the worst-case total;
+               comparing against half the maximum avoids overflowing the
+               multiplication. */
+            if (output_size > max_output_size / 2) {
+                output_size = max_output_size;
+            } else {
+                output_size *= 2;
+            }
+            if (_PyString_Resize(&rval, output_size) == -1) {
+                return NULL;
+            }
+            output = PyString_AS_STRING(rval);
+        }
+        if (S_CHAR(c)) {
+            output[chars++] = (char)c;
+        } else {
+            chars = ascii_escape_char(c, output, chars);
+        }
+    }
+    output[chars++] = '"';
+    /* Trim the result to the number of bytes actually written. */
+    if (_PyString_Resize(&rval, chars) == -1) {
+        return NULL;
+    }
+    return rval;
+}
+
+/* Return a new str object holding pystr (a byte string) as a double-quoted,
+   ASCII-only JSON string literal.  Input that is pure ASCII is escaped
+   directly; as soon as a byte above 0x7F is seen the whole input is decoded
+   as UTF-8 and handed to ascii_escape_unicode() instead.  Returns NULL when
+   allocation, resizing or the UTF-8 decode fails. */
+static PyObject *
+ascii_escape_str(PyObject *pystr) {
+    Py_ssize_t i;
+    Py_ssize_t input_chars;
+    Py_ssize_t output_size;
+    Py_ssize_t chars;
+    PyObject *rval;
+    char *output;
+    char *input_str;
+
+    input_chars = PyString_GET_SIZE(pystr);
+    input_str = PyString_AS_STRING(pystr);
+    /* One input byte can become up to 6 output bytes ("\uXXXX"); start with
+       the input length plus 32 bytes of slack and grow on demand. */
+    output_size = 32 + input_chars;
+    rval = PyString_FromStringAndSize(NULL, output_size);
+    if (rval == NULL) {
+        return NULL;
+    }
+    output = PyString_AS_STRING(rval);
+    chars = 0;
+    output[chars++] = '"';
+    for (i = 0; i < input_chars; i++) {
+        /* NOTE(review): for bytes >= 0x80 the "c > 0x7F" test below relies
+           on the value produced by this cast; if plain char is signed that
+           depends on Py_UNICODE being an unsigned type -- confirm. */
+        Py_UNICODE c = (Py_UNICODE)input_str[i];
+        if (S_CHAR(c)) {
+            output[chars++] = (char)c;
+        } else if (c > 0x7F) {
+            /* Non-ASCII byte: discard the partial result, decode the whole
+               input as UTF-8 and let the unicode path do the escaping. */
+            PyObject *uni;
+            Py_DECREF(rval);
+            uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
+            if (uni == NULL) {
+                return NULL;
+            }
+            rval = ascii_escape_unicode(uni);
+            Py_DECREF(uni);
+            return rval;
+        } else {
+            chars = ascii_escape_char(c, output, chars);
+        }
+        /* Keep room for one more 6-byte escape plus the closing quote. */
+        if (output_size - chars < 7) {
+            /* The slack is used up: double the buffer, capped at the
+               worst case of two quotes plus 6 bytes per input byte. */
+            output_size *= 2;
+            if (output_size > 2 + (input_chars * 6)) {
+                output_size = 2 + (input_chars * 6);
+            }
+            if (_PyString_Resize(&rval, output_size) == -1) {
+                return NULL;
+            }
+            /* The resize may have moved the string; refresh the pointer. */
+            output = PyString_AS_STRING(rval);
+        }
+    }
+    output[chars++] = '"';
+    /* Trim the result to the number of bytes actually written. */
+    if (_PyString_Resize(&rval, chars) == -1) {
+        return NULL;
+    }
+    return rval;
+}
+
+/* Docstring exposed as encode_basestring_ascii.__doc__. */
+PyDoc_STRVAR(pydoc_encode_basestring_ascii,
+    "encode_basestring_ascii(basestring) -> str\n"
+    "\n"
+    "Return an ASCII-only JSON string literal for a str or unicode object."
+);
+
+/* Python-visible entry point (METH_O: pystr is the single argument).
+   Dispatches to the str or unicode escaper and raises TypeError for any
+   other argument type. */
+static PyObject *
+py_encode_basestring_ascii(PyObject* self __attribute__((__unused__)), PyObject *pystr) {
+    /* Reject anything that is neither a byte string nor a unicode string. */
+    if (!PyString_Check(pystr) && !PyUnicode_Check(pystr)) {
+        PyErr_SetString(PyExc_TypeError, "first argument must be a string");
+        return NULL;
+    }
+    return PyString_Check(pystr)
+        ? ascii_escape_str(pystr)
+        : ascii_escape_unicode(pystr);
+}
+
+/* Build one PyMethodDef entry for the C function py_<n>: exported name
+   "<n>", calling convention k, docstring pydoc_<n>. */
+#define DEFN(n, k) \
+    {  \
+        #n, \
+        (PyCFunction)py_ ##n, \
+        k, \
+        pydoc_ ##n \
+    }
+static PyMethodDef speedups_methods[] = {
+    DEFN(encode_basestring_ascii, METH_O),
+    /* Sentinel entry ending the table.  Spelled out in full because an
+       empty initializer "{}" is not valid ISO C89/C99. */
+    {NULL, NULL, 0, NULL}
+};
+#undef DEFN
+
+/* Module initialiser: registers the functions listed in speedups_methods
+   under the module name "_speedups". */
+void
+init_speedups(void)
+{
+    /* The returned module object is not needed here, so it is discarded
+       explicitly rather than stored in an unused local. */
+    (void)Py_InitModule4("_speedups", speedups_methods, NULL, NULL, PYTHON_API_VERSION);
+}
diff --git a/simplejson/encoder.py b/simplejson/encoder.py
index 92ed9d4..eec9c7f 100644
--- a/simplejson/encoder.py
+++ b/simplejson/encoder.py
@@ -2,6 +2,10 @@
 Implementation of JSONEncoder
 """
 import re
+try:
+    from simplejson import _speedups
+except ImportError:
+    _speedups = None
 
 ESCAPE = re.compile(r'[\x00-\x19\\"\b\f\n\r\t]')
 ESCAPE_ASCII = re.compile(r'([\\"/]|[^\ -~])')
@@ -56,9 +60,22 @@ def encode_basestring_ascii(s):
         try:
             return ESCAPE_DCT[s]
         except KeyError:
-            return '\\u%04x' % (ord(s),)
+            n = ord(s)
+            if n < 0x10000:
+                return '\\u%04x' % (n,)
+            else:
+                # surrogate pair
+                n -= 0x10000
+                s1 = 0xd800 | ((n >> 10) & 0x3ff)
+                s2 = 0xdc00 | (n & 0x3ff)
+                return '\\u%04x\\u%04x' % (s1, s2)
     return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
         
+# Prefer the C implementation when the extension was imported.  When it was
+# not, _speedups is None, the attribute lookup raises AttributeError, and
+# the pure-Python encode_basestring_ascii defined above stays in place.
+# _need_utf8 records which one is active: when it is True, JSONEncoder
+# hands utf-8 encoded str values to the encoder without decoding them first.
+try:
+    encode_basestring_ascii = _speedups.encode_basestring_ascii
+    _need_utf8 = True
+except AttributeError:
+    _need_utf8 = False
 
 class JSONEncoder(object):
     """
@@ -212,9 +229,13 @@ class JSONEncoder(object):
             items = [(k, dct[k]) for k in keys]
         else:
             items = dct.iteritems()
+        _encoding = self.encoding
+        _do_decode = (_encoding is not None
+            and not (_need_utf8 and _encoding == 'utf-8'))
         for key, value in items:
-            if self.encoding is not None and isinstance(key, str):
-                key = key.decode(self.encoding)
+            if isinstance(key, str):
+                if _do_decode:
+                    key = key.decode(_encoding)
             elif isinstance(key, basestring):
                 pass
             # JavaScript is weakly typed for these, so it makes sense to
@@ -254,8 +275,10 @@ class JSONEncoder(object):
                 encoder = encode_basestring_ascii
             else:
                 encoder = encode_basestring
-            if self.encoding and isinstance(o, str):
-                o = o.decode(self.encoding)
+            _encoding = self.encoding
+            if (_encoding is not None and isinstance(o, str)
+                    and not (_need_utf8 and _encoding == 'utf-8')):
+                o = o.decode(_encoding)
             yield encoder(o)
         elif o is None:
             yield 'null'
@@ -315,6 +338,14 @@ class JSONEncoder(object):
         >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
         '{"foo":["bar", "baz"]}'
         """
+        # This is for extremely simple cases and benchmarks...
+        if isinstance(o, basestring):
+            if isinstance(o, str):
+                _encoding = self.encoding
+                if (_encoding is not None 
+                        and not (_encoding == 'utf-8' and _need_utf8)):
+                    o = o.decode(_encoding)
+            return encode_basestring_ascii(o)
         # This doesn't pass the iterator directly to ''.join() because it
         # sucks at reporting exceptions.  It's going to do this internally
         # anyway because it uses PySequence_Fast or similar.
author	Bob Ippolito <bob@redivi.com>	2007-03-18 04:51:52 +0000
committer	Bob Ippolito <bob@redivi.com>	2007-03-18 04:51:52 +0000
commit	a0fac116e1e5fd038174312c5b52d15a163fcbc8 (patch)
tree	615dcd6d0ab1843cc18507ed731eb00f2a12a13e
parent	be103d0368413157c2ac4df0c8b67e97f735bc34 (diff)
download	simplejson-a0fac116e1e5fd038174312c5b52d15a163fcbc8.tar.gz