diff options
author | Bob Ippolito <bob@redivi.com> | 2007-03-18 04:51:52 +0000 |
---|---|---|
committer | Bob Ippolito <bob@redivi.com> | 2007-03-18 04:51:52 +0000 |
commit | a0fac116e1e5fd038174312c5b52d15a163fcbc8 (patch) | |
tree | 615dcd6d0ab1843cc18507ed731eb00f2a12a13e | |
parent | be103d0368413157c2ac4df0c8b67e97f735bc34 (diff) | |
download | simplejson-a0fac116e1e5fd038174312c5b52d15a163fcbc8.tar.gz |
optional C speedups
git-svn-id: http://simplejson.googlecode.com/svn/trunk@45 a4795897-2c25-0410-b006-0d3caba88fa1
-rw-r--r-- | setup.cfg | 4 | ||||
-rw-r--r-- | setup.py | 16 | ||||
-rw-r--r-- | simplejson/__init__.py | 42 | ||||
-rw-r--r-- | simplejson/_speedups.c | 206 | ||||
-rw-r--r-- | simplejson/encoder.py | 41 |
5 files changed, 291 insertions, 18 deletions
diff --git a/setup.cfg b/setup.cfg
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
 [egg_info]
-#tag_build = dev
-#tag_svn_revision = true
+tag_build = dev
+tag_svn_revision = true
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -3,14 +3,15 @@
 import ez_setup
 ez_setup.use_setuptools()
 
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Extension, Feature
 
-VERSION = '1.6'
+VERSION = '1.7'
 DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python"
 LONG_DESCRIPTION = """
 simplejson is a simple, fast, complete, correct and extensible
 JSON <http://json.org> encoder and decoder for Python 2.3+. It is
-pure Python code with no dependencies.
+pure Python code with no dependencies, but includes an optional C
+extension for a serious speed boost.
 
 simplejson was formerly known as simple_json, but changed its name to
 comply with PEP 8 module naming guidelines.
@@ -31,6 +32,14 @@ Programming Language :: Python
 Topic :: Software Development :: Libraries :: Python Modules
 """.splitlines()))
 
+speedups = Feature(
+    "options C speed-enhancement modules",
+    standard=True,
+    ext_modules = [
+        Extension("simplejson._speedups", ["simplejson/_speedups.c"]),
+    ],
+)
+
 setup(
     name="simplejson",
     version=VERSION,
@@ -48,4 +57,5 @@ setup(
     entry_points={
         'paste.filter_app_factory': ['json = simplejson.jsonfilter:factory'],
     },
+    features={'speedups': speedups},
 )
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index dc3d99d..8e4f77b 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -86,7 +86,7 @@ Extending JSONEncoder::
 Note that the JSON produced by this module's default settings
 is a subset of YAML, so it may be used as a serializer for that as well.
 """
-__version__ = '1.6'
+__version__ = '1.7'
 __all__ = [
     'dump', 'dumps', 'load', 'loads',
     'JSONDecoder', 'JSONEncoder',
@@ -95,8 +95,20 @@ __all__ = [
 from decoder import JSONDecoder
 from encoder import JSONEncoder
 
+_default_encoder = JSONEncoder(
+    skipkeys=False,
+    ensure_ascii=True,
+    check_circular=True,
+    allow_nan=True,
+    indent=None,
+    separators=None,
+    encoding='utf-8'
+)
+
+
 def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
-        allow_nan=True, cls=None, indent=None, encoding='utf-8', **kw):
+        allow_nan=True, cls=None, indent=None, encoding='utf-8',
+        _iterencode=_default_encoder.iterencode, **kw):
     """
     Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
     ``.write()``-supporting file-like object).
@@ -130,19 +142,27 @@ def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
     ``.default()`` method to serialize additional types), specify it with
     the ``cls`` kwarg.
     """
-    if cls is None:
-        cls = JSONEncoder
-    iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
-        check_circular=check_circular, allow_nan=allow_nan, indent=indent,
-        encoding=encoding, **kw).iterencode(obj)
+    # cached encoder (``separators`` is not a dump() arg; it arrives via kw)
+    if (skipkeys is False and ensure_ascii is True and
+        check_circular is True and allow_nan is True and
+        cls is None and indent is None and
+        encoding == 'utf-8' and not kw):
+        iterable = _iterencode(obj)
+    else:
+        if cls is None:
+            cls = JSONEncoder
+        iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
+            check_circular=check_circular, allow_nan=allow_nan, indent=indent,
+            encoding=encoding, **kw).iterencode(obj)
     # could accelerate with writelines in some versions of Python, at
     # a debuggability cost
     for chunk in iterable:
         fp.write(chunk)
 
+
 def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
         allow_nan=True, cls=None, indent=None, separators=None,
-        encoding='utf-8', **kw):
+        encoding='utf-8', _encode=_default_encoder.encode, **kw):
     """
     Serialize ``obj`` to a JSON formatted ``str``.
 
@@ -178,6 +198,12 @@ def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
     ``.default()`` method to serialize additional types), specify it with
     the ``cls`` kwarg.
     """
+    # cached encoder
+    if (skipkeys is False and ensure_ascii is True and
+        check_circular is True and allow_nan is True and
+        cls is None and indent is None and separators is None and
+        encoding == 'utf-8' and not kw):
+        return _encode(obj)
     if cls is None:
         cls = JSONEncoder
     return cls(
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
new file mode 100644
index 0000000..780bb6b
--- /dev/null
+++ b/simplejson/_speedups.c
@@ -0,0 +1,206 @@
+#include "Python.h"
+#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN)
+typedef int Py_ssize_t;
+#define PY_SSIZE_T_MAX INT_MAX
+#define PY_SSIZE_T_MIN INT_MIN
+#endif
+
+static Py_ssize_t
+ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars);
+static PyObject *
+ascii_escape_unicode(PyObject *pystr);
+static PyObject *
+ascii_escape_str(PyObject *pystr);
+static PyObject *
+py_encode_basestring_ascii(PyObject* self __attribute__((__unused__)), PyObject *pystr);
+void init_speedups(void);
+
+#define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '/' && c != '"')
+
+static Py_ssize_t
+ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) {
+    Py_UNICODE x;
+    output[chars++] = '\\';
+    switch (c) {
+        case '/': output[chars++] = (char)c; break;
+        case '\\': output[chars++] = (char)c; break;
+        case '"': output[chars++] = (char)c; break;
+        case '\b': output[chars++] = 'b'; break;
+        case '\f': output[chars++] = 'f'; break;
+        case '\n': output[chars++] = 'n'; break;
+        case '\r': output[chars++] = 'r'; break;
+        case '\t': output[chars++] = 't'; break;
+        default:
+#ifdef Py_UNICODE_WIDE
+            if (c >= 0x10000) {
+                /* UTF-16 surrogate pair */
+                Py_UNICODE v = c - 0x10000;
+                c = 0xd800 | ((v >> 10) & 0x3ff);
+                output[chars++] = 'u';
+                x = (c & 0xf000) >> 12;
+                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+                x = (c & 0x0f00) >> 8;
+                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+                x = (c & 0x00f0) >> 4;
+                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+                x = (c & 0x000f);
+                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+                c = 0xdc00 | (v & 0x3ff);
+                output[chars++] = '\\';
+            }
+#endif
+            output[chars++] = 'u';
+            x = (c & 0xf000) >> 12;
+            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+            x = (c & 0x0f00) >> 8;
+            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+            x = (c & 0x00f0) >> 4;
+            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+            x = (c & 0x000f);
+            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+    }
+    return chars;
+}
+
+static PyObject *
+ascii_escape_unicode(PyObject *pystr) {
+    Py_ssize_t i;
+    Py_ssize_t input_chars;
+    Py_ssize_t output_size;
+    Py_ssize_t chars;
+    PyObject *rval;
+    char *output;
+    Py_UNICODE *input_unicode;
+
+    input_chars = PyUnicode_GET_SIZE(pystr);
+    input_unicode = PyUnicode_AS_UNICODE(pystr);
+    /* One input char can become 12 output chars (wide-build surrogate pair) */
+    output_size = 32 + input_chars;
+    rval = PyString_FromStringAndSize(NULL, output_size);
+    if (rval == NULL) {
+        return NULL;
+    }
+    output = PyString_AS_STRING(rval);
+    chars = 0;
+    output[chars++] = '"';
+    for (i = 0; i < input_chars; i++) {
+        Py_UNICODE c = input_unicode[i];
+        if (S_CHAR(c)) {
+            output[chars++] = (char)c;
+        } else {
+            chars = ascii_escape_char(c, output, chars);
+        }
+        if (output_size - chars < 13) { /* 12 for next char + closing quote */
+            /* There's more than four, so let's resize by a lot */
+            output_size *= 2;
+            if (output_size > 2 + (input_chars * 12)) {
+                output_size = 2 + (input_chars * 12);
+            }
+            if (_PyString_Resize(&rval, output_size) == -1) {
+                return NULL;
+            }
+            output = PyString_AS_STRING(rval);
+        }
+    }
+    output[chars++] = '"';
+    if (_PyString_Resize(&rval, chars) == -1) {
+        return NULL;
+    }
+    return rval;
+}
+
+static PyObject *
+ascii_escape_str(PyObject *pystr) {
+    Py_ssize_t i;
+    Py_ssize_t input_chars;
+    Py_ssize_t output_size;
+    Py_ssize_t chars;
+    PyObject *rval;
+    char *output;
+    char *input_str;
+
+    input_chars = PyString_GET_SIZE(pystr);
+    input_str = PyString_AS_STRING(pystr);
+    /* One char input can be up to 6 chars output, estimate 4 of these */
+    output_size = 32 + input_chars;
+    rval = PyString_FromStringAndSize(NULL, output_size);
+    if (rval == NULL) {
+        return NULL;
+    }
+    output = PyString_AS_STRING(rval);
+    chars = 0;
+    output[chars++] = '"';
+    for (i = 0; i < input_chars; i++) {
+        Py_UNICODE c = (Py_UNICODE)input_str[i];
+        if (S_CHAR(c)) {
+            output[chars++] = (char)c;
+        } else if (c > 0x7F) {
+            /* We hit a non-ASCII character, bail to unicode mode */
+            PyObject *uni;
+            Py_DECREF(rval);
+            uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
+            if (uni == NULL) {
+                return NULL;
+            }
+            rval = ascii_escape_unicode(uni);
+            Py_DECREF(uni);
+            return rval;
+        } else {
+            chars = ascii_escape_char(c, output, chars);
+        }
+        if (output_size - chars < 7) {
+            /* There's more than four, so let's resize by a lot */
+            output_size *= 2;
+            if (output_size > 2 + (input_chars * 6)) {
+                output_size = 2 + (input_chars * 6);
+            }
+            if (_PyString_Resize(&rval, output_size) == -1) {
+                return NULL;
+            }
+            output = PyString_AS_STRING(rval);
+        }
+    }
+    output[chars++] = '"';
+    if (_PyString_Resize(&rval, chars) == -1) {
+        return NULL;
+    }
+    return rval;
+}
+
+PyDoc_STRVAR(pydoc_encode_basestring_ascii,
+    "encode_basestring_ascii(basestring) -> str\n"
+    "\n"
+    "..."
+);
+
+static PyObject *
+py_encode_basestring_ascii(PyObject* self __attribute__((__unused__)), PyObject *pystr) {
+    /* METH_O */
+    if (PyString_Check(pystr)) {
+        return ascii_escape_str(pystr);
+    } else if (PyUnicode_Check(pystr)) {
+        return ascii_escape_unicode(pystr);
+    }
+    PyErr_SetString(PyExc_TypeError, "first argument must be a string");
+    return NULL;
+}
+
+#define DEFN(n, k) \
+    {  \
+        #n, \
+        (PyCFunction)py_ ##n, \
+        k, \
+        pydoc_ ##n \
+    }
+static PyMethodDef speedups_methods[] = {
+    DEFN(encode_basestring_ascii, METH_O),
+    {NULL, NULL, 0, NULL}
+};
+#undef DEFN
+
+void
+init_speedups(void)
+{
+    PyObject *m;
+    m = Py_InitModule4("_speedups", speedups_methods, NULL, NULL, PYTHON_API_VERSION);
+}
diff --git a/simplejson/encoder.py b/simplejson/encoder.py
index 92ed9d4..eec9c7f 100644
--- a/simplejson/encoder.py
+++ b/simplejson/encoder.py
@@ -2,6 +2,10 @@
 Implementation of JSONEncoder
 """
 import re
+try:
+    from simplejson import _speedups
+except ImportError:
+    _speedups = None
 
 ESCAPE = re.compile(r'[\x00-\x19\\"\b\f\n\r\t]')
 ESCAPE_ASCII = re.compile(r'([\\"/]|[^\ -~])')
@@ -56,9 +60,22 @@ def encode_basestring_ascii(s):
         try:
             return ESCAPE_DCT[s]
         except KeyError:
-            return '\\u%04x' % (ord(s),)
+            n = ord(s)
+            if n < 0x10000:
+                return '\\u%04x' % (n,)
+            else:
+                # surrogate pair
+                n -= 0x10000
+                s1 = 0xd800 | ((n >> 10) & 0x3ff)
+                s2 = 0xdc00 | (n & 0x3ff)
+                return '\\u%04x\\u%04x' % (s1, s2)
     return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
 
+try:
+    encode_basestring_ascii = _speedups.encode_basestring_ascii
+    _need_utf8 = True
+except AttributeError:
+    _need_utf8 = False
 
 class JSONEncoder(object):
     """
@@ -212,9 +229,13 @@ class JSONEncoder(object):
             items = [(k, dct[k]) for k in keys]
         else:
             items = dct.iteritems()
+        _encoding = self.encoding
+        _do_decode = (_encoding is not None
+            and not (_need_utf8 and _encoding == 'utf-8'))
         for key, value in items:
-            if self.encoding is not None and isinstance(key, str):
-                key = key.decode(self.encoding)
+            if isinstance(key, str):
+                if _do_decode:
+                    key = key.decode(_encoding)
             elif isinstance(key, basestring):
                 pass
             # JavaScript is weakly typed for these, so it makes sense to
@@ -254,8 +275,10 @@ class JSONEncoder(object):
                 encoder = encode_basestring_ascii
             else:
                 encoder = encode_basestring
-            if self.encoding and isinstance(o, str):
-                o = o.decode(self.encoding)
+            _encoding = self.encoding
+            if (_encoding is not None and isinstance(o, str)
+                    and not (_need_utf8 and _encoding == 'utf-8')):
+                o = o.decode(_encoding)
             yield encoder(o)
         elif o is None:
             yield 'null'
@@ -315,6 +338,14 @@ class JSONEncoder(object):
         >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
         '{"foo":["bar", "baz"]}'
         """
+        # Fast path for simple cases and benchmarks (ASCII-escaped output only)
+        if isinstance(o, basestring) and self.ensure_ascii:
+            if isinstance(o, str):
+                _encoding = self.encoding
+                if (_encoding is not None
+                        and not (_encoding == 'utf-8' and _need_utf8)):
+                    o = o.decode(_encoding)
+            return encode_basestring_ascii(o)
         # This doesn't pass the iterator directly to ''.join() because it
         # sucks at reporting exceptions. It's going to do this internally
         # anyway because it uses PySequence_Fast or similar.