diff options
author | Bob Ippolito <bob@redivi.com> | 2009-01-05 01:29:32 +0000 |
---|---|---|
committer | Bob Ippolito <bob@redivi.com> | 2009-01-05 01:29:32 +0000 |
commit | 823f4a283da154d0ea83d1537a3914762fbe0c76 (patch) | |
tree | 5496fff98ba864af5bb5fd8757b063a3fba107ba /simplejson | |
parent | b919f3cbb861f3319a0af080e69652c839abf064 (diff) | |
download | simplejson-823f4a283da154d0ea83d1537a3914762fbe0c76.tar.gz |
changes relevant to http://codereview.appspot.com/7311 -- mostly documentation improvements and a small error handling fix
git-svn-id: http://simplejson.googlecode.com/svn/trunk@161 a4795897-2c25-0410-b006-0d3caba88fa1
Diffstat (limited to 'simplejson')
-rw-r--r-- | simplejson/_speedups.c | 150 | ||||
-rw-r--r-- | simplejson/decoder.py | 55 | ||||
-rw-r--r-- | simplejson/encoder.py | 3 |
3 files changed, 176 insertions, 32 deletions
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c index ebf35bc..679a4c8 100644 --- a/simplejson/_speedups.c +++ b/simplejson/_speedups.c @@ -131,6 +131,7 @@ encoder_encode_float(PyEncoderObject *s, PyObject *obj); static int _convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr) { + /* PyObject to Py_ssize_t converter */ *size_ptr = PyInt_AsSsize_t(o); if (*size_ptr == -1 && PyErr_Occurred()); return 1; @@ -140,12 +141,16 @@ _convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr) static PyObject * _convertPyInt_FromSsize_t(Py_ssize_t *size_ptr) { + /* Py_ssize_t to PyObject converter */ return PyInt_FromSsize_t(*size_ptr); } static Py_ssize_t ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) { + /* Escape unicode code point c to ASCII escape sequences + in char *output. output must have at least 12 bytes unused to + accommodate an escaped surrogate pair "\uXXXX\uXXXX" */ output[chars++] = '\\'; switch (c) { case '\\': output[chars++] = (char)c; break; @@ -182,9 +187,11 @@ ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) static PyObject * ascii_escape_unicode(PyObject *pystr) { + /* Take a PyUnicode pystr and return a new ASCII-only escaped PyString */ Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; + Py_ssize_t max_output_size; Py_ssize_t chars; PyObject *rval; char *output; @@ -195,6 +202,7 @@ ascii_escape_unicode(PyObject *pystr) /* One char input can be up to 6 chars output, estimate 4 of these */ output_size = 2 + (MIN_EXPANSION * 4) + input_chars; + max_output_size = 2 + (input_chars * MAX_EXPANSION); rval = PyString_FromStringAndSize(NULL, output_size); if (rval == NULL) { return NULL; @@ -212,15 +220,19 @@ ascii_escape_unicode(PyObject *pystr) } if (output_size - chars < (1 + MAX_EXPANSION)) { /* There's more than four, so let's resize by a lot */ - output_size *= 2; + Py_ssize_t new_output_size = output_size * 2; /* This is an upper bound */ - if (output_size > 2 + (input_chars * MAX_EXPANSION)) 
{ - output_size = 2 + (input_chars * MAX_EXPANSION); + if (new_output_size > max_output_size) { + new_output_size = max_output_size; } - if (_PyString_Resize(&rval, output_size) == -1) { - return NULL; + /* Make sure that the output size changed before resizing */ + if (new_output_size != output_size) { + output_size = new_output_size; + if (_PyString_Resize(&rval, output_size) == -1) { + return NULL; + } + output = PyString_AS_STRING(rval); } - output = PyString_AS_STRING(rval); } } output[chars++] = '"'; @@ -233,6 +245,7 @@ ascii_escape_unicode(PyObject *pystr) static PyObject * ascii_escape_str(PyObject *pystr) { + /* Take a PyString pystr and return a new ASCII-only escaped PyString */ Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; @@ -318,6 +331,8 @@ ascii_escape_str(PyObject *pystr) static void raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) { + /* Use the Python function simplejson.decoder.errmsg to raise a nice + looking ValueError exception */ static PyObject *errmsg_fn = NULL; PyObject *pymsg; if (errmsg_fn == NULL) { @@ -339,6 +354,7 @@ raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) static PyObject * join_list_unicode(PyObject *lst) { + /* return u''.join(lst) */ static PyObject *joinfn = NULL; if (joinfn == NULL) { PyObject *ustr = PyUnicode_FromUnicode(NULL, 0); @@ -356,6 +372,7 @@ join_list_unicode(PyObject *lst) static PyObject * join_list_string(PyObject *lst) { + /* return ''.join(lst) */ static PyObject *joinfn = NULL; if (joinfn == NULL) { PyObject *ustr = PyString_FromStringAndSize(NULL, 0); @@ -372,6 +389,7 @@ join_list_string(PyObject *lst) static PyObject * _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { + /* return (rval, idx) tuple, stealing reference to rval */ PyObject *tpl; PyObject *pyidx; /* @@ -399,6 +417,15 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { static PyObject * scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_ssize_t *next_end_ptr) { + /* Read 
the JSON string from PyString pystr. + end is the index of the first character after the quote. + encoding is the encoding of pystr (must be an ASCII superset) + if strict is zero then literal control characters are allowed + *next_end_ptr is a return-by-reference index of the character + after the end quote + + Return value is a new PyString (if ASCII-only) or PyUnicode + */ PyObject *rval; Py_ssize_t len = PyString_GET_SIZE(pystr); Py_ssize_t begin = end - 1; @@ -596,6 +623,15 @@ bail: static PyObject * scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) { + /* Read the JSON string from PyUnicode pystr. + end is the index of the first character after the quote. + encoding is the encoding of pystr (must be an ASCII superset) + if strict is zero then literal control characters are allowed + *next_end_ptr is a return-by-reference index of the character + after the end quote + + Return value is a new PyUnicode + */ PyObject *rval; Py_ssize_t len = PyUnicode_GET_SIZE(pystr); Py_ssize_t begin = end - 1; @@ -764,9 +800,16 @@ bail: } PyDoc_STRVAR(pydoc_scanstring, - "scanstring(basestring, end, encoding) -> (str, end)\n" + "scanstring(basestring, end, encoding, strict=True) -> (str, end)\n" "\n" - "..." + "Scan the string s for a JSON string. End is the index of the\n" + "character in s after the quote that started the JSON string.\n" + "Unescapes all valid JSON string escape sequences and raises ValueError\n" + "on attempt to decode an invalid string. If strict is False then literal\n" + "control characters are allowed in the string.\n" + "\n" + "Returns a tuple of the decoded string and the index of the character in s\n" + "after the end quote." 
); static PyObject * @@ -777,7 +820,7 @@ py_scanstring(PyObject* self UNUSED, PyObject *args) Py_ssize_t end; Py_ssize_t next_end = -1; char *encoding = NULL; - int strict = 0; + int strict = 1; if (!PyArg_ParseTuple(args, "OO&|zi:scanstring", &pystr, _convertPyInt_AsSsize_t, &end, &encoding, &strict)) { return NULL; } @@ -802,12 +845,13 @@ py_scanstring(PyObject* self UNUSED, PyObject *args) PyDoc_STRVAR(pydoc_encode_basestring_ascii, "encode_basestring_ascii(basestring) -> str\n" "\n" - "..." + "Return an ASCII-only JSON representation of a Python string" ); static PyObject * py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) { + /* Return an ASCII-only JSON representation of a Python string */ /* METH_O */ if (PyString_Check(pystr)) { return ascii_escape_str(pystr); @@ -826,6 +870,7 @@ py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) static void scanner_dealloc(PyObject *self) { + /* Deallocate scanner object */ PyScannerObject *s; assert(PyScanner_Check(self)); s = (PyScannerObject *)self; @@ -840,6 +885,13 @@ scanner_dealloc(PyObject *self) static PyObject * _parse_object_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON object from PyString pystr. + idx is the index of the first character after the opening curly brace. + *next_idx_ptr is a return-by-reference index to the first character after + the closing curly brace. + + Returns a new PyObject (usually a dict, but object_hook can change that) + */ char *str = PyString_AS_STRING(pystr); Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; PyObject *rval = PyDict_New(); @@ -931,6 +983,13 @@ bail: static PyObject * _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON object from PyUnicode pystr. + idx is the index of the first character after the opening curly brace. 
+ *next_idx_ptr is a return-by-reference index to the first character after + the closing curly brace. + + Returns a new PyObject (usually a dict, but object_hook can change that) + */ Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; PyObject *val = NULL; @@ -1023,6 +1082,13 @@ bail: static PyObject * _parse_array_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON array from PyString pystr. + idx is the index of the first character after the opening bracket. + *next_idx_ptr is a return-by-reference index to the first character after + the closing bracket. + + Returns a new PyList + */ char *str = PyString_AS_STRING(pystr); Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; PyObject *val = NULL; @@ -1083,6 +1149,13 @@ bail: static PyObject * _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON array from PyUnicode pystr. + idx is the index of the first character after the opening bracket. + *next_idx_ptr is a return-by-reference index to the first character after + the closing bracket. + + Returns a new PyList + */ Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; PyObject *val = NULL; @@ -1143,6 +1216,15 @@ bail: static PyObject * _parse_constant(PyScannerObject *s, char *constant, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON constant. + constant is the constant string that was found + ("NaN", "Infinity", "-Infinity"). + idx is the index of the first character of the constant + *next_idx_ptr is a return-by-reference index to the first character after + the constant. 
+ + Returns the result of parse_constant + */ PyObject *cstr; PyObject *rval; /* constant is "NaN", "Infinity", or "-Infinity" */ @@ -1160,6 +1242,15 @@ _parse_constant(PyScannerObject *s, char *constant, Py_ssize_t idx, Py_ssize_t * static PyObject * _match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) { + /* Read a JSON number from PyString pystr. + start is the index of the first character of the number + *next_idx_ptr is a return-by-reference index to the first character after + the number. + + Returns a new PyObject representation of that number: + PyInt, PyLong, or PyFloat. + May return other types if parse_int or parse_float are set + */ char *str = PyString_AS_STRING(pystr); Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; Py_ssize_t idx = start; @@ -1249,6 +1340,15 @@ _match_number_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssiz static PyObject * _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) { + /* Read a JSON number from PyUnicode pystr. + start is the index of the first character of the number + *next_idx_ptr is a return-by-reference index to the first character after + the number. + + Returns a new PyObject representation of that number: + PyInt, PyLong, or PyFloat. + May return other types if parse_int or parse_float are set + */ Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; Py_ssize_t idx = start; @@ -1332,6 +1432,13 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ static PyObject * scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read one JSON term (of any kind) from PyString pystr. + idx is the index of the first character of the term + *next_idx_ptr is a return-by-reference index to the first character after + the term. + + Returns a new PyObject representation of the term. 
+ */ char *str = PyString_AS_STRING(pystr); Py_ssize_t length = PyString_GET_SIZE(pystr); if (idx >= length) { @@ -1401,6 +1508,13 @@ scan_once_str(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *n static PyObject * scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read one JSON term (of any kind) from PyUnicode pystr. + idx is the index of the first character of the term + *next_idx_ptr is a return-by-reference index to the first character after + the term. + + Returns a new PyObject representation of the term. + */ Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_ssize_t length = PyUnicode_GET_SIZE(pystr); if (idx >= length) { @@ -1469,6 +1583,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ static PyObject * scanner_call(PyObject *self, PyObject *args, PyObject *kwds) { + /* Python callable interface to scan_once_{str,unicode} */ PyObject *pystr; PyObject *rval; Py_ssize_t idx; @@ -1498,6 +1613,7 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds) static int scanner_init(PyObject *self, PyObject *args, PyObject *kwds) { + /* Initialize Scanner object */ PyObject *ctx; static char *kwlist[] = {"context", NULL}; PyScannerObject *s; @@ -1564,7 +1680,7 @@ static PyTypeObject PyScannerType = { PyObject_HEAD_INIT(0) 0, /* tp_internal */ - "make_scanner", /* tp_name */ + "Scanner", /* tp_name */ sizeof(PyScannerObject), /* tp_basicsize */ 0, /* tp_itemsize */ scanner_dealloc, /* tp_dealloc */ @@ -1607,6 +1723,7 @@ PyTypeObject PyScannerType = { static int encoder_init(PyObject *self, PyObject *args, PyObject *kwds) { + /* initialize Encoder object */ static char *kwlist[] = {"markers", "default", "encoder", "indent", "key_separator", "item_separator", "sort_keys", "skipkeys", "allow_nan", NULL}; PyEncoderObject *s; @@ -1644,6 +1761,7 @@ encoder_init(PyObject *self, PyObject *args, PyObject *kwds) static PyObject * encoder_call(PyObject *self, 
PyObject *args, PyObject *kwds) { + /* Python callable interface to encoder_listencode_obj */ static char *kwlist[] = {"obj", "_current_indent_level", NULL}; PyObject *obj; PyObject *rval; @@ -1667,6 +1785,7 @@ encoder_call(PyObject *self, PyObject *args, PyObject *kwds) static PyObject * _encoded_const(PyObject *obj) { + /* Return the JSON string representation of None, True, False */ if (obj == Py_None) { static PyObject *s_null = NULL; if (s_null == NULL) { @@ -1700,6 +1819,7 @@ _encoded_const(PyObject *obj) static PyObject * encoder_encode_float(PyEncoderObject *s, PyObject *obj) { + /* Return the JSON representation of a PyFloat */ double i = PyFloat_AS_DOUBLE(obj); if (!Py_IS_FINITE(i)) { if (!s->allow_nan) { @@ -1723,6 +1843,7 @@ encoder_encode_float(PyEncoderObject *s, PyObject *obj) static PyObject * encoder_encode_string(PyEncoderObject *s, PyObject *obj) { + /* Return the JSON representation of a string */ if (s->fast_encode) return py_encode_basestring_ascii(NULL, obj); else @@ -1732,6 +1853,7 @@ encoder_encode_string(PyEncoderObject *s, PyObject *obj) static int _steal_list_append(PyObject *lst, PyObject *stolen) { + /* Append stolen and then decrement its reference count */ int rval = PyList_Append(lst, stolen); Py_DECREF(stolen); return rval; @@ -1740,6 +1862,7 @@ _steal_list_append(PyObject *lst, PyObject *stolen) static int encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level) { + /* Encode Python object obj to a JSON term, rval is a PyList */ PyObject *newobj; int rv; @@ -1818,6 +1941,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi static int encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level) { + /* Encode Python dict dct to a JSON term, rval is a PyList */ static PyObject *open_dict = NULL; static PyObject *close_dict = NULL; static PyObject *empty_dict = NULL; @@ -1949,6 +2073,7 @@ bail: static int 
encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level) { + /* Encode Python list seq to a JSON term, rval is a PyList */ static PyObject *open_array = NULL; static PyObject *close_array = NULL; static PyObject *empty_array = NULL; @@ -2038,6 +2163,7 @@ bail: static void encoder_dealloc(PyObject *self) { + /* Deallocate Encoder */ PyEncoderObject *s; assert(PyEncoder_Check(self)); s = (PyEncoderObject *)self; @@ -2058,7 +2184,7 @@ static PyTypeObject PyEncoderType = { PyObject_HEAD_INIT(0) 0, /* tp_internal */ - "make_encoder", /* tp_name */ + "Encoder", /* tp_name */ sizeof(PyEncoderObject), /* tp_basicsize */ 0, /* tp_itemsize */ encoder_dealloc, /* tp_dealloc */ diff --git a/simplejson/decoder.py b/simplejson/decoder.py index 6a328a5..44861a6 100644 --- a/simplejson/decoder.py +++ b/simplejson/decoder.py @@ -58,6 +58,14 @@ BACKSLASH = { DEFAULT_ENCODING = "utf-8" def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): + """Scan the string s for a JSON string. End is the index of the + character in s after the quote that started the JSON string. + Unescapes all valid JSON string escape sequences and raises ValueError + on attempt to decode an invalid string. If strict is False then literal + control characters are allowed in the string. 
+ + Returns a tuple of the decoded string and the index of the character in s + after the end quote.""" if encoding is None: encoding = DEFAULT_ENCODING chunks = [] @@ -70,15 +78,19 @@ def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHU errmsg("Unterminated string starting at", s, begin)) end = chunk.end() content, terminator = chunk.groups() + # Content contains zero or more unescaped string characters if content: if not isinstance(content, unicode): content = unicode(content, encoding) _append(content) + # Terminator is the end of string, a literal control character, + # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: - raise ValueError(errmsg("Invalid control character %r at", s, end)) + msg = "Invalid control character %r at" % (terminator,) + raise ValueError(msg, s, end) else: _append(terminator) continue @@ -87,36 +99,37 @@ def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHU except IndexError: raise ValueError( errmsg("Unterminated string starting at", s, begin)) + # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: - m = _b[esc] + char = _b[esc] except KeyError: raise ValueError( errmsg("Invalid \\escape: %r" % (esc,), s, end)) end += 1 else: + # Unicode escape sequence esc = s[end + 1:end + 5] next_end = end + 5 - msg = "Invalid \\uXXXX escape" - try: - if len(esc) != 4: - raise ValueError - uni = int(esc, 16) - if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: - msg = "Invalid \\uXXXX\\uXXXX surrogate pair" - if not s[end + 5:end + 7] == '\\u': - raise ValueError - esc2 = s[end + 7:end + 11] - if len(esc2) != 4: - raise ValueError - uni2 = int(esc2, 16) - uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) - next_end += 6 - m = unichr(uni) - except ValueError: + if len(esc) != 4: + msg = "Invalid \\uXXXX escape" raise ValueError(errmsg(msg, s, end)) + uni = int(esc, 16) + # Check 
for surrogate pair on UCS-4 systems + if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: + msg = "Invalid \\uXXXX\\uXXXX surrogate pair" + if not s[end + 5:end + 7] == '\\u': + raise ValueError(errmsg(msg, s, end)) + esc2 = s[end + 7:end + 11] + if len(esc2) != 4: + raise ValueError(errmsg(msg, s, end)) + uni2 = int(esc2, 16) + uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) + next_end += 6 + char = unichr(uni) end = next_end - _append(m) + # Append the unescaped character + _append(char) return u''.join(chunks), end @@ -128,6 +141,8 @@ WHITESPACE_STR = ' \t\n\r' def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR): pairs = {} + # Use a slice to prevent IndexError from being raised, the following + # check will raise a more specific ValueError if the string is empty nextchar = s[end:end + 1] # Normally we expect nextchar == '"' if nextchar != '"': diff --git a/simplejson/encoder.py b/simplejson/encoder.py index f75cda1..d28298f 100644 --- a/simplejson/encoder.py +++ b/simplejson/encoder.py @@ -40,6 +40,9 @@ def encode_basestring(s): def py_encode_basestring_ascii(s): + """Return an ASCII-only JSON representation of a Python string + + """ if isinstance(s, str) and HAS_UTF8.search(s) is not None: s = s.decode('utf-8') def replace(match): |