diff options
author | Benjamin Peterson <benjamin@python.org> | 2009-05-02 12:36:44 (GMT) |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2009-05-02 12:36:44 (GMT) |
commit | c6b607d4a9e4d60fb506034ce67fc89734bb68a7 (patch) | |
tree | b4f27e81ab25a9dfe6a97433f4d25c263b59f6cc /Modules/_json.c | |
parent | 7255f18556ae70fc28b563a345577d3ec8f6f0ba (diff) | |
download | cpython-c6b607d4a9e4d60fb506034ce67fc89734bb68a7.zip cpython-c6b607d4a9e4d60fb506034ce67fc89734bb68a7.tar.gz cpython-c6b607d4a9e4d60fb506034ce67fc89734bb68a7.tar.bz2 |
port simplejson upgrade from the trunk #4136
json also now works only with unicode strings
Patch by Antoine Pitrou; updated by me
Diffstat (limited to 'Modules/_json.c')
-rw-r--r-- | Modules/_json.c | 1707 |
1 files changed, 1377 insertions, 330 deletions
diff --git a/Modules/_json.c b/Modules/_json.c index 9c35f64..d075fe7 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -1,23 +1,160 @@ #include "Python.h" +#include "structmember.h" +#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TYPE) +#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) +#endif +#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) +typedef int Py_ssize_t; +#define PY_SSIZE_T_MAX INT_MAX +#define PY_SSIZE_T_MIN INT_MIN +#define PyInt_FromSsize_t PyInt_FromLong +#define PyInt_AsSsize_t PyInt_AsLong +#endif +#ifndef Py_IS_FINITE +#define Py_IS_FINITE(X) (!Py_IS_INFINITY(X) && !Py_IS_NAN(X)) +#endif + +#ifdef __GNUC__ +#define UNUSED __attribute__((__unused__)) +#else +#define UNUSED +#endif + +#define PyScanner_Check(op) PyObject_TypeCheck(op, &PyScannerType) +#define PyScanner_CheckExact(op) (Py_TYPE(op) == &PyScannerType) +#define PyEncoder_Check(op) PyObject_TypeCheck(op, &PyEncoderType) +#define PyEncoder_CheckExact(op) (Py_TYPE(op) == &PyEncoderType) + +static PyTypeObject PyScannerType; +static PyTypeObject PyEncoderType; + +typedef struct _PyScannerObject { + PyObject_HEAD + PyObject *strict; + PyObject *object_hook; + PyObject *object_pairs_hook; + PyObject *parse_float; + PyObject *parse_int; + PyObject *parse_constant; +} PyScannerObject; + +static PyMemberDef scanner_members[] = { + {"strict", T_OBJECT, offsetof(PyScannerObject, strict), READONLY, "strict"}, + {"object_hook", T_OBJECT, offsetof(PyScannerObject, object_hook), READONLY, "object_hook"}, + {"object_pairs_hook", T_OBJECT, offsetof(PyScannerObject, object_pairs_hook), READONLY}, + {"parse_float", T_OBJECT, offsetof(PyScannerObject, parse_float), READONLY, "parse_float"}, + {"parse_int", T_OBJECT, offsetof(PyScannerObject, parse_int), READONLY, "parse_int"}, + {"parse_constant", T_OBJECT, offsetof(PyScannerObject, parse_constant), READONLY, "parse_constant"}, + {NULL} +}; + +typedef struct _PyEncoderObject { + PyObject_HEAD + PyObject *markers; + PyObject *defaultfn; + PyObject *encoder; + PyObject *indent; + PyObject *key_separator; + PyObject *item_separator; + PyObject *sort_keys; + PyObject *skipkeys; + int fast_encode; + int allow_nan; +} PyEncoderObject; + +static PyMemberDef encoder_members[] = { + {"markers", T_OBJECT, offsetof(PyEncoderObject, markers), READONLY, "markers"}, + {"default", T_OBJECT, offsetof(PyEncoderObject, defaultfn), READONLY, "default"}, + {"encoder", T_OBJECT, offsetof(PyEncoderObject, encoder), READONLY, "encoder"}, + {"indent", T_OBJECT, offsetof(PyEncoderObject, indent), READONLY, "indent"}, + {"key_separator", T_OBJECT, offsetof(PyEncoderObject, key_separator), READONLY, "key_separator"}, + {"item_separator", T_OBJECT, offsetof(PyEncoderObject, item_separator), READONLY, "item_separator"}, + {"sort_keys", T_OBJECT, offsetof(PyEncoderObject, sort_keys), READONLY, "sort_keys"}, + {"skipkeys", T_OBJECT, offsetof(PyEncoderObject, skipkeys), READONLY, "skipkeys"}, + {NULL} +}; + +static PyObject * +ascii_escape_unicode(PyObject *pystr); +static PyObject * +py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr); +void init_json(void); +static PyObject * +scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr); +static PyObject * +_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx); +static PyObject * +scanner_new(PyTypeObject *type, PyObject *args, PyObject *kwds); +static int +scanner_init(PyObject *self, PyObject *args, PyObject *kwds); +static void +scanner_dealloc(PyObject *self); +static int +scanner_clear(PyObject *self); +static PyObject * +encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds); +static int +encoder_init(PyObject *self, PyObject *args, PyObject *kwds); +static void +encoder_dealloc(PyObject *self); +static int +encoder_clear(PyObject *self); +static int +encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level); +static int +encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level); +static int +encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level); +static PyObject * +_encoded_const(PyObject *const); +static void +raise_errmsg(char *msg, PyObject *s, Py_ssize_t end); +static PyObject * +encoder_encode_string(PyEncoderObject *s, PyObject *obj); +static int +_convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr); +static PyObject * +_convertPyInt_FromSsize_t(Py_ssize_t *size_ptr); +static PyObject * +encoder_encode_float(PyEncoderObject *s, PyObject *obj); -#define DEFAULT_ENCODING "utf-8" #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"') -#define MIN_EXPANSION 6 +#define IS_WHITESPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n') || ((c) == '\r')) +#define MIN_EXPANSION 6 #ifdef Py_UNICODE_WIDE #define MAX_EXPANSION (2 * MIN_EXPANSION) #else #define MAX_EXPANSION MIN_EXPANSION #endif +static int +_convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr) +{ + /* PyObject to Py_ssize_t converter */ + *size_ptr = PyLong_AsSsize_t(o); + if (*size_ptr == -1 && PyErr_Occurred()); + return 1; + return 0; +} + +static PyObject * +_convertPyInt_FromSsize_t(Py_ssize_t *size_ptr) +{ + /* Py_ssize_t to PyObject converter */ + return PyLong_FromSsize_t(*size_ptr); +} + static Py_ssize_t -ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) +ascii_escape_unichar(Py_UNICODE c, Py_UNICODE *output, Py_ssize_t chars) { - Py_UNICODE x; + /* Escape unicode code point c to ASCII escape sequences + in char *output. output must have at least 12 bytes unused to + accommodate an escaped surrogate pair "\uXXXX\uXXXX" */ output[chars++] = '\\'; switch (c) { - case '\\': output[chars++] = (char)c; break; - case '"': output[chars++] = (char)c; break; + case '\\': output[chars++] = c; break; + case '"': output[chars++] = c; break; case '\b': output[chars++] = 'b'; break; case '\f': output[chars++] = 'f'; break; case '\n': output[chars++] = 'n'; break; @@ -30,27 +167,19 @@ ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) Py_UNICODE v = c - 0x10000; c = 0xd800 | ((v >> 10) & 0x3ff); output[chars++] = 'u'; - x = (c & 0xf000) >> 12; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x0f00) >> 8; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x00f0) >> 4; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x000f); - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); + output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 4) & 0xf]; + output[chars++] = "0123456789abcdef"[(c ) & 0xf]; c = 0xdc00 | (v & 0x3ff); output[chars++] = '\\'; } #endif output[chars++] = 'u'; - x = (c & 0xf000) >> 12; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x0f00) >> 8; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x00f0) >> 4; - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); - x = (c & 0x000f); - output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); + output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; + output[chars++] = "0123456789abcdef"[(c >> 4) & 0xf]; + output[chars++] = "0123456789abcdef"[(c ) & 0xf]; } return chars; } @@ -58,118 +187,66 @@ ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) static PyObject * ascii_escape_unicode(PyObject *pystr) { + /* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */ Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; + Py_ssize_t max_output_size; Py_ssize_t chars; PyObject *rval; - char *output; + Py_UNICODE *output; Py_UNICODE *input_unicode; input_chars = PyUnicode_GET_SIZE(pystr); input_unicode = PyUnicode_AS_UNICODE(pystr); + /* One char input can be up to 6 chars output, estimate 4 of these */ output_size = 2 + (MIN_EXPANSION * 4) + input_chars; - rval = PyBytes_FromStringAndSize(NULL, output_size); + max_output_size = 2 + (input_chars * MAX_EXPANSION); + rval = PyUnicode_FromStringAndSize(NULL, output_size); if (rval == NULL) { return NULL; } - output = PyBytes_AS_STRING(rval); + output = PyUnicode_AS_UNICODE(rval); chars = 0; output[chars++] = '"'; for (i = 0; i < input_chars; i++) { Py_UNICODE c = input_unicode[i]; if (S_CHAR(c)) { - output[chars++] = (char)c; + output[chars++] = c; } - else { - chars = ascii_escape_char(c, output, chars); + else { + chars = ascii_escape_unichar(c, output, chars); } if (output_size - chars < (1 + MAX_EXPANSION)) { /* There's more than four, so let's resize by a lot */ - output_size *= 2; + Py_ssize_t new_output_size = output_size * 2; /* This is an upper bound */ - if (output_size > 2 + (input_chars * MAX_EXPANSION)) { - output_size = 2 + (input_chars * MAX_EXPANSION); - } - if (_PyBytes_Resize(&rval, output_size) == -1) { - return NULL; - } - output = PyBytes_AS_STRING(rval); - } - } - output[chars++] = '"'; - if (_PyBytes_Resize(&rval, chars) == -1) { - return NULL; - } - return rval; -} - -static PyObject * -ascii_escape_str(PyObject *pystr) -{ - Py_ssize_t i; - Py_ssize_t input_chars; - Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; - char *output; - char *input_str; - - input_chars = PyBytes_GET_SIZE(pystr); - input_str = PyBytes_AS_STRING(pystr); - /* One char input can be up to 6 chars output, estimate 4 of these */ - output_size = 2 + (MIN_EXPANSION * 4) + input_chars; - rval = PyBytes_FromStringAndSize(NULL, output_size); - if (rval == NULL) { - return NULL; - } - output = PyBytes_AS_STRING(rval); - chars = 0; - output[chars++] = '"'; - for (i = 0; i < input_chars; i++) { - Py_UNICODE c = (Py_UNICODE)input_str[i]; - if (S_CHAR(c)) { - output[chars++] = (char)c; - } - else if (c > 0x7F) { - /* We hit a non-ASCII character, bail to unicode mode */ - PyObject *uni; - Py_DECREF(rval); - uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict"); - if (uni == NULL) { - return NULL; - } - rval = ascii_escape_unicode(uni); - Py_DECREF(uni); - return rval; - } - else { - chars = ascii_escape_char(c, output, chars); - } - /* An ASCII char can't possibly expand to a surrogate! */ - if (output_size - chars < (1 + MIN_EXPANSION)) { - /* There's more than four, so let's resize by a lot */ - output_size *= 2; - if (output_size > 2 + (input_chars * MIN_EXPANSION)) { - output_size = 2 + (input_chars * MIN_EXPANSION); + if (new_output_size > max_output_size) { + new_output_size = max_output_size; } - if (_PyBytes_Resize(&rval, output_size) == -1) { - return NULL; + /* Make sure that the output size changed before resizing */ + if (new_output_size != output_size) { + output_size = new_output_size; + if (PyUnicode_Resize(&rval, output_size) == -1) { + return NULL; + } + output = PyUnicode_AS_UNICODE(rval); } - output = PyBytes_AS_STRING(rval); } } output[chars++] = '"'; - if (_PyBytes_Resize(&rval, chars) == -1) { + if (PyUnicode_Resize(&rval, chars) == -1) { return NULL; } return rval; } -void +static void raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) { + /* Use the Python function json.decoder.errmsg to raise a nice + looking ValueError exception */ static PyObject *errmsg_fn = NULL; PyObject *pymsg; if (errmsg_fn == NULL) { @@ -177,63 +254,73 @@ raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) if (decoder == NULL) return; errmsg_fn = PyObject_GetAttrString(decoder, "errmsg"); + Py_DECREF(decoder); if (errmsg_fn == NULL) return; - Py_DECREF(decoder); } - pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end); + pymsg = PyObject_CallFunction(errmsg_fn, "(zOO&)", msg, s, _convertPyInt_FromSsize_t, &end); if (pymsg) { PyErr_SetObject(PyExc_ValueError, pymsg); Py_DECREF(pymsg); } -/* - -def linecol(doc, pos): - lineno = doc.count('\n', 0, pos) + 1 - if lineno == 1: - colno = pos - else: - colno = pos - doc.rindex('\n', 0, pos) - return lineno, colno - -def errmsg(msg, doc, pos, end=None): - lineno, colno = linecol(doc, pos) - if end is None: - return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) - endlineno, endcolno = linecol(doc, end) - return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( - msg, lineno, colno, endlineno, endcolno, pos, end) - -*/ } static PyObject * join_list_unicode(PyObject *lst) { - static PyObject *ustr = NULL; - static PyObject *joinstr = NULL; - if (ustr == NULL) { - Py_UNICODE c = 0; - ustr = PyUnicode_FromUnicode(&c, 0); + /* return u''.join(lst) */ + static PyObject *sep = NULL; + if (sep == NULL) { + sep = PyUnicode_FromStringAndSize("", 0); + if (sep == NULL) + return NULL; } - if (joinstr == NULL) { - joinstr = PyUnicode_InternFromString("join"); + return PyUnicode_Join(sep, lst); +} + +static PyObject * +_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { + /* return (rval, idx) tuple, stealing reference to rval */ + PyObject *tpl; + PyObject *pyidx; + /* + steal a reference to rval, returns (rval, idx) + */ + if (rval == NULL) { + return NULL; + } + pyidx = PyLong_FromSsize_t(idx); + if (pyidx == NULL) { + Py_DECREF(rval); + return NULL; } - if (joinstr == NULL || ustr == NULL) { + tpl = PyTuple_New(2); + if (tpl == NULL) { + Py_DECREF(pyidx); + Py_DECREF(rval); return NULL; } - return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL); + PyTuple_SET_ITEM(tpl, 0, rval); + PyTuple_SET_ITEM(tpl, 1, pyidx); + return tpl; } static PyObject * -scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) +scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) { + /* Read the JSON string from PyUnicode pystr. + end is the index of the first character after the quote. + if strict is zero then literal control characters are allowed + *next_end_ptr is a return-by-reference index of the character + after the end quote + + Return value is a new PyUnicode + */ PyObject *rval; - Py_ssize_t len = PyBytes_GET_SIZE(pystr); + Py_ssize_t len = PyUnicode_GET_SIZE(pystr); Py_ssize_t begin = end - 1; Py_ssize_t next = begin; - char *buf = PyBytes_AS_STRING(pystr); - Py_buffer info; + const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); PyObject *chunks = PyList_New(0); if (chunks == NULL) { goto bail; @@ -262,16 +349,7 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) } /* Pick up this chunk if it's not zero length */ if (next != end) { - PyObject *strchunk; - if (PyBuffer_FillInfo(&info, NULL, &buf[end], next - end, 1, 0) < 0) { - goto bail; - } - strchunk = PyMemoryView_FromBuffer(&info); - if (strchunk == NULL) { - goto bail; - } - chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); - Py_DECREF(strchunk); + chunk = PyUnicode_FromUnicode(&buf[end], next - end); if (chunk == NULL) { goto bail; } @@ -320,18 +398,18 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) } /* Decode 4 hex digits */ for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; + c <<= 4; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - c |= (digit - '0') << shl; break; + c |= (digit - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - c |= (digit - 'a' + 10) << shl; break; + c |= (digit - 'a' + 10); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - c |= (digit - 'A' + 10) << shl; break; + c |= (digit - 'A' + 10); break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; @@ -339,38 +417,46 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) } #ifdef Py_UNICODE_WIDE /* Surrogate pair */ - if (c >= 0xd800 && c <= 0xdbff) { + if ((c & 0xfc00) == 0xd800) { Py_UNICODE c2 = 0; if (end + 6 >= len) { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; } if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; + c2 <<= 4; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - c2 |= (digit - '0') << shl; break; + c2 |= (digit - '0'); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - c2 |= (digit - 'a' + 10) << shl; break; + c2 |= (digit - 'a' + 10); break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - c2 |= (digit - 'A' + 10) << shl; break; + c2 |= (digit - 'A' + 10); break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } + if ((c2 & 0xfc00) != 0xdc00) { + raise_errmsg("Unpaired high surrogate", pystr, end - 5); + goto bail; + } c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); } + else if ((c & 0xfc00) == 0xdc00) { + raise_errmsg("Unpaired low surrogate", pystr, end - 5); + goto bail; + } #endif } chunk = PyUnicode_FromUnicode(&c, 1); @@ -388,237 +474,1176 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) if (rval == NULL) { goto bail; } - Py_CLEAR(chunks); - return Py_BuildValue("(Nn)", rval, end); + Py_DECREF(chunks); + *next_end_ptr = end; + return rval; bail: + *next_end_ptr = -1; Py_XDECREF(chunks); return NULL; } +PyDoc_STRVAR(pydoc_scanstring, + "scanstring(basestring, end, strict=True) -> (bytes, end)\n" + "\n" + "Scan the string s for a JSON string. End is the index of the\n" + "character in s after the quote that started the JSON string.\n" + "Unescapes all valid JSON string escape sequences and raises ValueError\n" + "on attempt to decode an invalid string. If strict is False then literal\n" + "control characters are allowed in the string.\n" + "\n" + "Returns a tuple of the decoded string and the index of the character in s\n" + "after the end quote." +); static PyObject * -scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict) +py_scanstring(PyObject* self UNUSED, PyObject *args) { + PyObject *pystr; PyObject *rval; - Py_ssize_t len = PyUnicode_GET_SIZE(pystr); - Py_ssize_t begin = end - 1; - Py_ssize_t next = begin; - const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); - PyObject *chunks = PyList_New(0); - if (chunks == NULL) { - goto bail; + Py_ssize_t end; + Py_ssize_t next_end = -1; + int strict = 1; + if (!PyArg_ParseTuple(args, "OO&|i:scanstring", &pystr, _convertPyInt_AsSsize_t, &end, &strict)) { + return NULL; } - if (end < 0 || len <= end) { - PyErr_SetString(PyExc_ValueError, "end is out of bounds"); - goto bail; + if (PyUnicode_Check(pystr)) { + rval = scanstring_unicode(pystr, end, strict, &next_end); } - while (1) { - /* Find the end of the string or the next escape */ - Py_UNICODE c = 0; - PyObject *chunk = NULL; - for (next = end; next < len; next++) { - c = buf[next]; - if (c == '"' || c == '\\') { - break; - } - else if (strict && c <= 0x1f) { - raise_errmsg("Invalid control character at", pystr, next); + else { + PyErr_Format(PyExc_TypeError, + "first argument must be a string or bytes, not %.80s", + Py_TYPE(pystr)->tp_name); + return NULL; + } + return _build_rval_index_tuple(rval, next_end); +} + +PyDoc_STRVAR(pydoc_encode_basestring_ascii, + "encode_basestring_ascii(basestring) -> bytes\n" + "\n" + "Return an ASCII-only JSON representation of a Python string" +); + +static PyObject * +py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) +{ + PyObject *rval; + /* Return an ASCII-only JSON representation of a Python string */ + /* METH_O */ + if (PyUnicode_Check(pystr)) { + rval = ascii_escape_unicode(pystr); + } + else { + PyErr_Format(PyExc_TypeError, + "first argument must be a string, not %.80s", + Py_TYPE(pystr)->tp_name); + return NULL; + } + return rval; +} + +static void +scanner_dealloc(PyObject *self) +{ + /* Deallocate scanner object */ + scanner_clear(self); + Py_TYPE(self)->tp_free(self); +} + +static int +scanner_traverse(PyObject *self, visitproc visit, void *arg) +{ + PyScannerObject *s; + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + Py_VISIT(s->strict); + Py_VISIT(s->object_hook); + Py_VISIT(s->object_pairs_hook); + Py_VISIT(s->parse_float); + Py_VISIT(s->parse_int); + Py_VISIT(s->parse_constant); + return 0; +} + +static int +scanner_clear(PyObject *self) +{ + PyScannerObject *s; + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + Py_CLEAR(s->strict); + Py_CLEAR(s->object_hook); + Py_CLEAR(s->object_pairs_hook); + Py_CLEAR(s->parse_float); + Py_CLEAR(s->parse_int); + Py_CLEAR(s->parse_constant); + return 0; +} + +static PyObject * +_parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON object from PyUnicode pystr. + idx is the index of the first character after the opening curly brace. + *next_idx_ptr is a return-by-reference index to the first character after + the closing curly brace. + + Returns a new PyObject (usually a dict, but object_hook can change that) + */ + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + PyObject *val = NULL; + PyObject *rval = PyList_New(0); + PyObject *key = NULL; + int strict = PyObject_IsTrue(s->strict); + Py_ssize_t next_idx; + if (rval == NULL) + return NULL; + + /* skip whitespace after { */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* only loop if the object is non-empty */ + if (idx <= end_idx && str[idx] != '}') { + while (idx <= end_idx) { + /* read key */ + if (str[idx] != '"') { + raise_errmsg("Expecting property name", pystr, idx); goto bail; } - } - if (!(c == '"' || c == '\\')) { - raise_errmsg("Unterminated string starting at", pystr, begin); - goto bail; - } - /* Pick up this chunk if it's not zero length */ - if (next != end) { - chunk = PyUnicode_FromUnicode(&buf[end], next - end); - if (chunk == NULL) { + key = scanstring_unicode(pystr, idx + 1, strict, &next_idx); + if (key == NULL) + goto bail; + idx = next_idx; + + /* skip whitespace between key and : delimiter, read :, skip whitespace */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + if (idx > end_idx || str[idx] != ':') { + raise_errmsg("Expecting : delimiter", pystr, idx); goto bail; } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); + idx++; + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* read any JSON term */ + val = scan_once_unicode(s, pystr, idx, &next_idx); + if (val == NULL) goto bail; + + { + PyObject *tuple = PyTuple_Pack(2, key, val); + if (tuple == NULL) + goto bail; + if (PyList_Append(rval, tuple) == -1) { + Py_DECREF(tuple); + goto bail; + } + Py_DECREF(tuple); } - Py_DECREF(chunk); - } - next++; - if (c == '"') { - end = next; - break; - } - if (next == len) { - raise_errmsg("Unterminated string starting at", pystr, begin); - goto bail; - } - c = buf[next]; - if (c != 'u') { - /* Non-unicode backslash escapes */ - end = next + 1; - switch (c) { - case '"': break; - case '\\': break; - case '/': break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - default: c = 0; + + Py_CLEAR(key); + Py_CLEAR(val); + idx = next_idx; + + /* skip whitespace before } or , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* bail if the object is closed or we didn't get the , delimiter */ + if (idx > end_idx) break; + if (str[idx] == '}') { + break; } - if (c == 0) { - raise_errmsg("Invalid \\escape", pystr, end - 2); + else if (str[idx] != ',') { + raise_errmsg("Expecting , delimiter", pystr, idx); goto bail; } + idx++; + + /* skip whitespace after , delimiter */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; } - else { - c = 0; - next++; - end = next + 4; - if (end >= len) { - raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); + } + + /* verify that idx < end_idx, str[idx] should be '}' */ + if (idx > end_idx || str[idx] != '}') { + raise_errmsg("Expecting object", pystr, end_idx); + goto bail; + } + + *next_idx_ptr = idx + 1; + + if (s->object_pairs_hook != Py_None) { + val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL); + if (val == NULL) + goto bail; + Py_DECREF(rval); + return val; + } + + val = PyDict_New(); + if (val == NULL) + goto bail; + if (PyDict_MergeFromSeq2(val, rval, 1) == -1) + goto bail; + Py_DECREF(rval); + rval = val; + + /* if object_hook is not None: rval = object_hook(rval) */ + if (s->object_hook != Py_None) { + val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); + if (val == NULL) + goto bail; + Py_DECREF(rval); + rval = val; + val = NULL; + } + return rval; +bail: + Py_XDECREF(key); + Py_XDECREF(val); + Py_DECREF(rval); + return NULL; +} + +static PyObject * +_parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON array from PyString pystr. + idx is the index of the first character after the opening brace. + *next_idx_ptr is a return-by-reference index to the first character after + the closing brace. + + Returns a new PyList + */ + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + PyObject *val = NULL; + PyObject *rval = PyList_New(0); + Py_ssize_t next_idx; + if (rval == NULL) + return NULL; + + /* skip whitespace after [ */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* only loop if the array is non-empty */ + if (idx <= end_idx && str[idx] != ']') { + while (idx <= end_idx) { + + /* read any JSON term */ + val = scan_once_unicode(s, pystr, idx, &next_idx); + if (val == NULL) goto bail; + + if (PyList_Append(rval, val) == -1) + goto bail; + + Py_CLEAR(val); + idx = next_idx; + + /* skip whitespace between term and , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + + /* bail if the array is closed or we didn't get the , delimiter */ + if (idx > end_idx) break; + if (str[idx] == ']') { + break; } - /* Decode 4 hex digits */ - for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; - Py_UNICODE digit = buf[next]; - switch (digit) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c |= (digit - '0') << shl; break; - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': - c |= (digit - 'a' + 10) << shl; break; - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': - c |= (digit - 'A' + 10) << shl; break; - default: - raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); - goto bail; - } - } -#ifdef Py_UNICODE_WIDE - /* Surrogate pair */ - if (c >= 0xd800 && c <= 0xdbff) { - Py_UNICODE c2 = 0; - if (end + 6 >= len) { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); - } - if (buf[next++] != '\\' || buf[next++] != 'u') { - raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, - end - 5); - } - end += 6; - /* Decode 4 hex digits */ - for (; next < end; next++) { - Py_ssize_t shl = (end - next - 1) << 2; - Py_UNICODE digit = buf[next]; - switch (digit) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - c2 |= (digit - '0') << shl; break; - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': - c2 |= (digit - 'a' + 10) << shl; break; - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': - c2 |= (digit - 'A' + 10) << shl; break; - default: - raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); - goto bail; - } - } - c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); + else if (str[idx] != ',') { + raise_errmsg("Expecting , delimiter", pystr, idx); + goto bail; } -#endif - } - chunk = PyUnicode_FromUnicode(&c, 1); - if (chunk == NULL) { - goto bail; - } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); - goto bail; + idx++; + + /* skip whitespace after , */ + while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; } - Py_DECREF(chunk); } - rval = join_list_unicode(chunks); - if (rval == NULL) { + /* verify that idx < end_idx, str[idx] should be ']' */ + if (idx > end_idx || str[idx] != ']') { + raise_errmsg("Expecting object", pystr, end_idx); goto bail; } - Py_CLEAR(chunks); - return Py_BuildValue("(Nn)", rval, end); + *next_idx_ptr = idx + 1; + return rval; bail: - Py_XDECREF(chunks); + Py_XDECREF(val); + Py_DECREF(rval); return NULL; } -PyDoc_STRVAR(pydoc_scanstring, -"scanstring(str_or_bytes, end, encoding) -> (bytes, end)\n"); +static PyObject * +_parse_constant(PyScannerObject *s, char *constant, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) { + /* Read a JSON constant from PyString pystr. + constant is the constant string that was found + ("NaN", "Infinity", "-Infinity"). + idx is the index of the first character of the constant + *next_idx_ptr is a return-by-reference index to the first character after + the constant. + + Returns the result of parse_constant + */ + PyObject *cstr; + PyObject *rval; + /* constant is "NaN", "Infinity", or "-Infinity" */ + cstr = PyUnicode_InternFromString(constant); + if (cstr == NULL) + return NULL; + + /* rval = parse_constant(constant) */ + rval = PyObject_CallFunctionObjArgs(s->parse_constant, cstr, NULL); + idx += PyUnicode_GET_SIZE(cstr); + Py_DECREF(cstr); + *next_idx_ptr = idx; + return rval; +} static PyObject * -py_scanstring(PyObject* self, PyObject *args) -{ - PyObject *pystr; - Py_ssize_t end; - char *encoding = NULL; - int strict = 0; - if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) { +_match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ssize_t *next_idx_ptr) { + /* Read a JSON number from PyUnicode pystr. + idx is the index of the first character of the number + *next_idx_ptr is a return-by-reference index to the first character after + the number. + + Returns a new PyObject representation of that number: + PyInt, PyLong, or PyFloat. + May return other types if parse_int or parse_float are set + */ + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + Py_ssize_t idx = start; + int is_float = 0; + PyObject *rval; + PyObject *numstr; + + /* read a sign if it's there, make sure it's not the end of the string */ + if (str[idx] == '-') { + idx++; + if (idx > end_idx) { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + } + + /* read as many integer digits as we find as long as it doesn't start with 0 */ + if (str[idx] >= '1' && str[idx] <= '9') { + idx++; + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + } + /* if it starts with 0 we only expect one integer digit */ + else if (str[idx] == '0') { + idx++; + } + /* no integer digits, error */ + else { + PyErr_SetNone(PyExc_StopIteration); return NULL; } - if (encoding == NULL) { - encoding = DEFAULT_ENCODING; + + /* if the next char is '.' followed by a digit then read all float digits */ + if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') { + is_float = 1; + idx += 2; + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; } - if (PyBytes_Check(pystr)) { - return scanstring_str(pystr, end, encoding, strict); + + /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */ + if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) { + Py_ssize_t e_start = idx; + idx++; + + /* read an exponent sign if present */ + if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++; + + /* read all digits */ + while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + + /* if we got a digit, then parse as float. if not, backtrack */ + if (str[idx - 1] >= '0' && str[idx - 1] <= '9') { + is_float = 1; + } + else { + idx = e_start; + } } - else if (PyUnicode_Check(pystr)) { - return scanstring_unicode(pystr, end, strict); + + /* copy the section we determined to be a number */ + numstr = PyUnicode_FromUnicode(&str[start], idx - start); + if (numstr == NULL) + return NULL; + if (is_float) { + /* parse as a float using a fast path if available, otherwise call user defined method */ + if (s->parse_float != (PyObject *)&PyFloat_Type) { + rval = PyObject_CallFunctionObjArgs(s->parse_float, numstr, NULL); + } + else { + rval = PyFloat_FromString(numstr); + } } else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string or bytes, not %.80s", - Py_TYPE(pystr)->tp_name); + /* no fast path for unicode -> int, just call */ + rval = PyObject_CallFunctionObjArgs(s->parse_int, numstr, NULL); + } + Py_DECREF(numstr); + *next_idx_ptr = idx; + return rval; +} + +static PyObject * +scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_t *next_idx_ptr) +{ + /* Read one JSON term (of any kind) from PyUnicode pystr. + idx is the index of the first character of the term + *next_idx_ptr is a return-by-reference index to the first character after + the number. + + Returns a new PyObject representation of the term. + */ + Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); + Py_ssize_t length = PyUnicode_GET_SIZE(pystr); + if (idx >= length) { + PyErr_SetNone(PyExc_StopIteration); return NULL; } + switch (str[idx]) { + case '"': + /* string */ + return scanstring_unicode(pystr, idx + 1, + PyObject_IsTrue(s->strict), + next_idx_ptr); + case '{': + /* object */ + return _parse_object_unicode(s, pystr, idx + 1, next_idx_ptr); + case '[': + /* array */ + return _parse_array_unicode(s, pystr, idx + 1, next_idx_ptr); + case 'n': + /* null */ + if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') { + Py_INCREF(Py_None); + *next_idx_ptr = idx + 4; + return Py_None; + } + break; + case 't': + /* true */ + if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') { + Py_INCREF(Py_True); + *next_idx_ptr = idx + 4; + return Py_True; + } + break; + case 'f': + /* false */ + if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') { + Py_INCREF(Py_False); + *next_idx_ptr = idx + 5; + return Py_False; + } + break; + case 'N': + /* NaN */ + if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { + return _parse_constant(s, "NaN", idx, next_idx_ptr); + } + break; + case 'I': + /* Infinity */ + if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { + return _parse_constant(s, "Infinity", idx, next_idx_ptr); + } + break; + case '-': + /* -Infinity */ + if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { + return _parse_constant(s, "-Infinity", idx, next_idx_ptr); + } + break; + } + /* Didn't find a string, object, array, or named constant. Look for a number. */ + return _match_number_unicode(s, pystr, idx, next_idx_ptr); } -PyDoc_STRVAR(pydoc_encode_basestring_ascii, -"encode_basestring_ascii(str_or_bytes) -> bytes\n"); +static PyObject * +scanner_call(PyObject *self, PyObject *args, PyObject *kwds) +{ + /* Python callable interface to scan_once_{str,unicode} */ + PyObject *pystr; + PyObject *rval; + Py_ssize_t idx; + Py_ssize_t next_idx = -1; + static char *kwlist[] = {"string", "idx", NULL}; + PyScannerObject *s; + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:scan_once", kwlist, &pystr, _convertPyInt_AsSsize_t, &idx)) + return NULL; + + if (PyUnicode_Check(pystr)) { + rval = scan_once_unicode(s, pystr, idx, &next_idx); + } + else { + PyErr_Format(PyExc_TypeError, + "first argument must be a string, not %.80s", + Py_TYPE(pystr)->tp_name); + return NULL; + } + return _build_rval_index_tuple(rval, next_idx); +} + +static PyObject * +scanner_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + PyScannerObject *s; + s = (PyScannerObject *)type->tp_alloc(type, 0); + if (s != NULL) { + s->strict = NULL; + s->object_hook = NULL; + s->object_pairs_hook = NULL; + s->parse_float = NULL; + s->parse_int = NULL; + s->parse_constant = NULL; + } + return (PyObject *)s; +} + +static int +scanner_init(PyObject *self, PyObject *args, PyObject *kwds) +{ + /* Initialize Scanner object */ + PyObject *ctx; + static char *kwlist[] = {"context", NULL}; + PyScannerObject *s; + + assert(PyScanner_Check(self)); + s = (PyScannerObject *)self; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) + return -1; + + /* All of these will fail "gracefully" so we don't need to verify them */ + s->strict = PyObject_GetAttrString(ctx, "strict"); + if (s->strict == NULL) + goto bail; + s->object_hook = PyObject_GetAttrString(ctx, "object_hook"); + if (s->object_hook == NULL) + goto bail; + s->object_pairs_hook = PyObject_GetAttrString(ctx, "object_pairs_hook"); + if (s->object_pairs_hook == NULL) + goto bail; + s->parse_float = PyObject_GetAttrString(ctx, "parse_float"); + if (s->parse_float == NULL) + goto bail; + s->parse_int = PyObject_GetAttrString(ctx, "parse_int"); + if (s->parse_int == NULL) + goto bail; + s->parse_constant = PyObject_GetAttrString(ctx, "parse_constant"); + if (s->parse_constant == NULL) + goto bail; + + return 0; + +bail: + Py_CLEAR(s->strict); + Py_CLEAR(s->object_hook); + Py_CLEAR(s->object_pairs_hook); + Py_CLEAR(s->parse_float); + Py_CLEAR(s->parse_int); + Py_CLEAR(s->parse_constant); + return -1; +} + +PyDoc_STRVAR(scanner_doc, "JSON scanner object"); + +static +PyTypeObject PyScannerType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_json.Scanner", /* tp_name */ + sizeof(PyScannerObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + scanner_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + scanner_call, /* tp_call */ + 0, /* tp_str */ + 0,/* PyObject_GenericGetAttr, */ /* tp_getattro */ + 0,/* PyObject_GenericSetAttr, */ /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */ + scanner_doc, /* tp_doc */ + scanner_traverse, /* tp_traverse */ + scanner_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + scanner_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + scanner_init, /* tp_init */ + 0,/* PyType_GenericAlloc, */ /* tp_alloc */ + scanner_new, /* tp_new */ + 0,/* PyObject_GC_Del, */ /* tp_free */ +}; + +static PyObject * +encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + PyEncoderObject *s; + s = (PyEncoderObject *)type->tp_alloc(type, 0); + if (s != NULL) { + s->markers = NULL; + s->defaultfn = NULL; + s->encoder = NULL; + s->indent = NULL; + s->key_separator = NULL; + s->item_separator = NULL; + s->sort_keys = NULL; + s->skipkeys = NULL; + } + return (PyObject *)s; +} + +static int +encoder_init(PyObject *self, PyObject *args, PyObject *kwds) +{ + /* initialize Encoder object */ + static char *kwlist[] = {"markers", "default", "encoder", "indent", "key_separator", "item_separator", "sort_keys", "skipkeys", "allow_nan", NULL}; + + PyEncoderObject *s; + PyObject *allow_nan; + + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOOOOOOO:make_encoder", kwlist, + &s->markers, &s->defaultfn, &s->encoder, &s->indent, &s->key_separator, &s->item_separator, &s->sort_keys, &s->skipkeys, &allow_nan)) + return -1; + + Py_INCREF(s->markers); + Py_INCREF(s->defaultfn); + Py_INCREF(s->encoder); + Py_INCREF(s->indent); + Py_INCREF(s->key_separator); + Py_INCREF(s->item_separator); + Py_INCREF(s->sort_keys); + Py_INCREF(s->skipkeys); + s->fast_encode = (PyCFunction_Check(s->encoder) && PyCFunction_GetFunction(s->encoder) == (PyCFunction)py_encode_basestring_ascii); + s->allow_nan = PyObject_IsTrue(allow_nan); + return 0; +} static PyObject * -py_encode_basestring_ascii(PyObject* self, PyObject *pystr) +encoder_call(PyObject *self, PyObject *args, PyObject *kwds) { + /* Python callable interface to encode_listencode_obj */ + static char *kwlist[] = {"obj", "_current_indent_level", NULL}; + PyObject *obj; PyObject *rval; - /* METH_O */ - if (PyBytes_Check(pystr)) { - rval = ascii_escape_str(pystr); + Py_ssize_t indent_level; + PyEncoderObject *s; + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist, + &obj, _convertPyInt_AsSsize_t, &indent_level)) + return NULL; + rval = PyList_New(0); + if (rval == NULL) + return NULL; + if (encoder_listencode_obj(s, rval, obj, indent_level)) { + Py_DECREF(rval); + return NULL; } - else if (PyUnicode_Check(pystr)) { - rval = ascii_escape_unicode(pystr); + return rval; +} + +static PyObject * +_encoded_const(PyObject *obj) +{ + /* Return the JSON string representation of None, True, False */ + if (obj == Py_None) { + static PyObject *s_null = NULL; + if (s_null == NULL) { + s_null = PyUnicode_InternFromString("null"); + } + Py_INCREF(s_null); + return s_null; + } + else if (obj == Py_True) { + static PyObject *s_true = NULL; + if (s_true == NULL) { + s_true = PyUnicode_InternFromString("true"); + } + Py_INCREF(s_true); + return s_true; + } + else if (obj == Py_False) { + static PyObject *s_false = NULL; + if (s_false == NULL) { + s_false = PyUnicode_InternFromString("false"); + } + Py_INCREF(s_false); + return s_false; } else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string or unicode, not %.80s", - Py_TYPE(pystr)->tp_name); + PyErr_SetString(PyExc_ValueError, "not a const"); return NULL; } - if (rval != NULL && PyBytes_Check(rval)) { - PyObject *urval = PyUnicode_DecodeASCII(PyBytes_AS_STRING(rval), PyBytes_GET_SIZE(rval), NULL); - Py_DECREF(rval); - return urval; +} + +static PyObject * +encoder_encode_float(PyEncoderObject *s, PyObject *obj) +{ + /* Return the JSON representation of a PyFloat */ + double i = PyFloat_AS_DOUBLE(obj); + if (!Py_IS_FINITE(i)) { + if (!s->allow_nan) { + PyErr_SetString(PyExc_ValueError, "Out of range float values are not JSON compliant"); + return NULL; + } + if (i > 0) { + return PyUnicode_FromString("Infinity"); + } + else if (i < 0) { + return PyUnicode_FromString("-Infinity"); + } + else { + return PyUnicode_FromString("NaN"); + } } + /* Use a better float format here? */ + return PyObject_Repr(obj); +} + +static PyObject * +encoder_encode_string(PyEncoderObject *s, PyObject *obj) +{ + /* Return the JSON representation of a string */ + if (s->fast_encode) + return py_encode_basestring_ascii(NULL, obj); + else + return PyObject_CallFunctionObjArgs(s->encoder, obj, NULL); +} + +static int +_steal_list_append(PyObject *lst, PyObject *stolen) +{ + /* Append stolen and then decrement its reference count */ + int rval = PyList_Append(lst, stolen); + Py_DECREF(stolen); return rval; } -static PyMethodDef json_methods[] = { - {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii, - METH_O, pydoc_encode_basestring_ascii}, - {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS, - pydoc_scanstring}, +static int +encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level) +{ + /* Encode Python object obj to a JSON term, rval is a PyList */ + PyObject *newobj; + int rv; + + if (obj == Py_None || obj == Py_True || obj == Py_False) { + PyObject *cstr = _encoded_const(obj); + if (cstr == NULL) + return -1; + return _steal_list_append(rval, cstr); + } + else if (PyUnicode_Check(obj)) + { + PyObject *encoded = encoder_encode_string(s, obj); + if (encoded == NULL) + return -1; + return _steal_list_append(rval, encoded); + } + else if (PyLong_Check(obj)) { + PyObject *encoded = PyObject_Str(obj); + if (encoded == NULL) + return -1; + return _steal_list_append(rval, encoded); + } + else if (PyFloat_Check(obj)) { + PyObject *encoded = encoder_encode_float(s, obj); + if (encoded == NULL) + return -1; + return _steal_list_append(rval, encoded); + } + else if (PyList_Check(obj) || PyTuple_Check(obj)) { + return encoder_listencode_list(s, rval, obj, indent_level); + } + else if (PyDict_Check(obj)) { + return encoder_listencode_dict(s, rval, obj, indent_level); + } + else { + PyObject *ident = NULL; + if (s->markers != Py_None) { + int has_key; + ident = PyLong_FromVoidPtr(obj); + if (ident == NULL) + return -1; + has_key = PyDict_Contains(s->markers, ident); + if (has_key) { + if (has_key != -1) + PyErr_SetString(PyExc_ValueError, "Circular reference detected"); + Py_DECREF(ident); + return -1; + } + if (PyDict_SetItem(s->markers, ident, obj)) { + Py_DECREF(ident); + return -1; + } + } + newobj = PyObject_CallFunctionObjArgs(s->defaultfn, obj, NULL); + if (newobj == NULL) { + Py_XDECREF(ident); + return -1; + } + rv = encoder_listencode_obj(s, rval, newobj, indent_level); + Py_DECREF(newobj); + if (rv) { + Py_XDECREF(ident); + return -1; + } + if (ident != NULL) { + if (PyDict_DelItem(s->markers, ident)) { + Py_XDECREF(ident); + return -1; + } + Py_XDECREF(ident); + } + return rv; + } +} + +static int +encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level) +{ + /* Encode Python dict dct a JSON term, rval is a PyList */ + static PyObject *open_dict = NULL; + static PyObject *close_dict = NULL; + static PyObject *empty_dict = NULL; + PyObject *kstr = NULL; + PyObject *ident = NULL; + PyObject *key, *value; + Py_ssize_t pos; + int skipkeys; + Py_ssize_t idx; + + if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) { + open_dict = PyUnicode_InternFromString("{"); + close_dict = PyUnicode_InternFromString("}"); + empty_dict = PyUnicode_InternFromString("{}"); + if (open_dict == NULL || close_dict == NULL || empty_dict == NULL) + return -1; + } + if (PyDict_Size(dct) == 0) + return PyList_Append(rval, empty_dict); + + if (s->markers != Py_None) { + int has_key; + ident = PyLong_FromVoidPtr(dct); + if (ident == NULL) + goto bail; + has_key = PyDict_Contains(s->markers, ident); + if (has_key) { + if (has_key != -1) + PyErr_SetString(PyExc_ValueError, "Circular reference detected"); + goto bail; + } + if (PyDict_SetItem(s->markers, ident, dct)) { + goto bail; + } + } + + if (PyList_Append(rval, open_dict)) + goto bail; + + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level += 1; + /* + newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) + separator = _item_separator + newline_indent + buf += newline_indent + */ + } + + /* TODO: C speedup not implemented for sort_keys */ + + pos = 0; + skipkeys = PyObject_IsTrue(s->skipkeys); + idx = 0; + while (PyDict_Next(dct, &pos, &key, &value)) { + PyObject *encoded; + + if (PyUnicode_Check(key)) { + Py_INCREF(key); + kstr = key; + } + else if (PyFloat_Check(key)) { + kstr = encoder_encode_float(s, key); + if (kstr == NULL) + goto bail; + } + else if (PyLong_Check(key)) { + kstr = PyObject_Str(key); + if (kstr == NULL) + goto bail; + } + else if (key == Py_True || key == Py_False || key == Py_None) { + kstr = _encoded_const(key); + if (kstr == NULL) + goto bail; + } + else if (skipkeys) { + continue; + } + else { + /* TODO: include repr of key */ + PyErr_SetString(PyExc_ValueError, "keys must be a string"); + goto bail; + } + + if (idx) { + if (PyList_Append(rval, s->item_separator)) + goto bail; + } + + encoded = encoder_encode_string(s, kstr); + Py_CLEAR(kstr); + if (encoded == NULL) + goto bail; + if (PyList_Append(rval, encoded)) { + Py_DECREF(encoded); + goto bail; + } + Py_DECREF(encoded); + if (PyList_Append(rval, s->key_separator)) + goto bail; + if (encoder_listencode_obj(s, rval, value, indent_level)) + goto bail; + idx += 1; + } + if (ident != NULL) { + if (PyDict_DelItem(s->markers, ident)) + goto bail; + Py_CLEAR(ident); + } + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level -= 1; + /* + yield '\n' + (' ' * (_indent * _current_indent_level)) + */ + } + if (PyList_Append(rval, close_dict)) + goto bail; + return 0; + +bail: + Py_XDECREF(kstr); + Py_XDECREF(ident); + return -1; +} + + +static int +encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level) +{ + /* Encode Python list seq to a JSON term, rval is a PyList */ + static PyObject *open_array = NULL; + static PyObject *close_array = NULL; + static PyObject *empty_array = NULL; + PyObject *ident = NULL; + PyObject *s_fast = NULL; + Py_ssize_t num_items; + PyObject **seq_items; + Py_ssize_t i; + + if (open_array == NULL || close_array == NULL || empty_array == NULL) { + open_array = PyUnicode_InternFromString("["); + close_array = PyUnicode_InternFromString("]"); + empty_array = PyUnicode_InternFromString("[]"); + if (open_array == NULL || close_array == NULL || empty_array == NULL) + return -1; + } + ident = NULL; + s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence"); + if (s_fast == NULL) + return -1; + num_items = PySequence_Fast_GET_SIZE(s_fast); + if (num_items == 0) { + Py_DECREF(s_fast); + return PyList_Append(rval, empty_array); + } + + if (s->markers != Py_None) { + int has_key; + ident = PyLong_FromVoidPtr(seq); + if (ident == NULL) + goto bail; + has_key = PyDict_Contains(s->markers, ident); + if (has_key) { + if (has_key != -1) + PyErr_SetString(PyExc_ValueError, "Circular reference detected"); + goto bail; + } + if (PyDict_SetItem(s->markers, ident, seq)) { + goto bail; + } + } + + seq_items = PySequence_Fast_ITEMS(s_fast); + if (PyList_Append(rval, open_array)) + goto bail; + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level += 1; + /* + newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) + separator = _item_separator + newline_indent + buf += newline_indent + */ + } + for (i = 0; i < num_items; i++) { + PyObject *obj = seq_items[i]; + if (i) { + if (PyList_Append(rval, s->item_separator)) + goto bail; + } + if (encoder_listencode_obj(s, rval, obj, indent_level)) + goto bail; + } + if (ident != NULL) { + if (PyDict_DelItem(s->markers, ident)) + goto bail; + Py_CLEAR(ident); + } + if (s->indent != Py_None) { + /* TODO: DOES NOT RUN */ + indent_level -= 1; + /* + yield '\n' + (' ' * (_indent * _current_indent_level)) + */ + } + if (PyList_Append(rval, close_array)) + goto bail; + Py_DECREF(s_fast); + return 0; + +bail: + Py_XDECREF(ident); + Py_DECREF(s_fast); + return -1; +} + +static void +encoder_dealloc(PyObject *self) +{ + /* Deallocate Encoder */ + encoder_clear(self); + Py_TYPE(self)->tp_free(self); +} + +static int +encoder_traverse(PyObject *self, visitproc visit, void *arg) +{ + PyEncoderObject *s; + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + Py_VISIT(s->markers); + Py_VISIT(s->defaultfn); + Py_VISIT(s->encoder); + Py_VISIT(s->indent); + Py_VISIT(s->key_separator); + Py_VISIT(s->item_separator); + Py_VISIT(s->sort_keys); + Py_VISIT(s->skipkeys); + return 0; +} + +static int +encoder_clear(PyObject *self) +{ + /* Deallocate Encoder */ + PyEncoderObject *s; + assert(PyEncoder_Check(self)); + s = (PyEncoderObject *)self; + Py_CLEAR(s->markers); + Py_CLEAR(s->defaultfn); + Py_CLEAR(s->encoder); + Py_CLEAR(s->indent); + Py_CLEAR(s->key_separator); + Py_CLEAR(s->item_separator); + Py_CLEAR(s->sort_keys); + Py_CLEAR(s->skipkeys); + return 0; +} + +PyDoc_STRVAR(encoder_doc, "_iterencode(obj, _current_indent_level) -> iterable"); + +static +PyTypeObject PyEncoderType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_json.Encoder", /* tp_name */ + sizeof(PyEncoderObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + encoder_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + encoder_call, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */ + encoder_doc, /* tp_doc */ + encoder_traverse, /* tp_traverse */ + encoder_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + encoder_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + encoder_init, /* tp_init */ + 0, /* tp_alloc */ + encoder_new, /* tp_new */ + 0, /* tp_free */ +}; + +static PyMethodDef speedups_methods[] = { + {"encode_basestring_ascii", + (PyCFunction)py_encode_basestring_ascii, + METH_O, + pydoc_encode_basestring_ascii}, + {"scanstring", + (PyCFunction)py_scanstring, + METH_VARARGS, + pydoc_scanstring}, {NULL, NULL, 0, NULL} }; @@ -630,7 +1655,7 @@ static struct PyModuleDef jsonmodule = { "_json", module_doc, -1, - json_methods, + speedups_methods, NULL, NULL, NULL, @@ -640,5 +1665,27 @@ static struct PyModuleDef jsonmodule = { PyObject* PyInit__json(void) { - return PyModule_Create(&jsonmodule); + PyObject *m = PyModule_Create(&jsonmodule); + if (!m) + return NULL; + PyScannerType.tp_new = PyType_GenericNew; + if (PyType_Ready(&PyScannerType) < 0) + goto fail; + PyEncoderType.tp_new = PyType_GenericNew; + if (PyType_Ready(&PyEncoderType) < 0) + goto fail; + Py_INCREF((PyObject*)&PyScannerType); + if (PyModule_AddObject(m, "make_scanner", (PyObject*)&PyScannerType) < 0) { + Py_DECREF((PyObject*)&PyScannerType); + goto fail; + } + Py_INCREF((PyObject*)&PyEncoderType); + if (PyModule_AddObject(m, "make_encoder", (PyObject*)&PyEncoderType) < 0) { + Py_DECREF((PyObject*)&PyEncoderType); + goto fail; + } + return m; + fail: + Py_DECREF(m); + return NULL; } |