summaryrefslogtreecommitdiffstats
path: root/Modules/_json.c
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/_json.c')
-rw-r--r--Modules/_json.c620
1 files changed, 620 insertions, 0 deletions
diff --git a/Modules/_json.c b/Modules/_json.c
new file mode 100644
index 0000000..39d39cb
--- /dev/null
+++ b/Modules/_json.c
@@ -0,0 +1,620 @@
+#include "Python.h"
+
+#define DEFAULT_ENCODING "utf-8"
+#define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"')
+#define MIN_EXPANSION 6
+
+#ifdef Py_UNICODE_WIDE
+#define MAX_EXPANSION (2 * MIN_EXPANSION)
+#else
+#define MAX_EXPANSION MIN_EXPANSION
+#endif
+
+static Py_ssize_t
+ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
+{
+ Py_UNICODE x;
+ output[chars++] = '\\';
+ switch (c) {
+ case '\\': output[chars++] = (char)c; break;
+ case '"': output[chars++] = (char)c; break;
+ case '\b': output[chars++] = 'b'; break;
+ case '\f': output[chars++] = 'f'; break;
+ case '\n': output[chars++] = 'n'; break;
+ case '\r': output[chars++] = 'r'; break;
+ case '\t': output[chars++] = 't'; break;
+ default:
+#ifdef Py_UNICODE_WIDE
+ if (c >= 0x10000) {
+ /* UTF-16 surrogate pair */
+ Py_UNICODE v = c - 0x10000;
+ c = 0xd800 | ((v >> 10) & 0x3ff);
+ output[chars++] = 'u';
+ x = (c & 0xf000) >> 12;
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ x = (c & 0x0f00) >> 8;
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ x = (c & 0x00f0) >> 4;
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ x = (c & 0x000f);
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ c = 0xdc00 | (v & 0x3ff);
+ output[chars++] = '\\';
+ }
+#endif
+ output[chars++] = 'u';
+ x = (c & 0xf000) >> 12;
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ x = (c & 0x0f00) >> 8;
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ x = (c & 0x00f0) >> 4;
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ x = (c & 0x000f);
+ output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
+ }
+ return chars;
+}
+
+static PyObject *
+ascii_escape_unicode(PyObject *pystr)
+{
+ Py_ssize_t i;
+ Py_ssize_t input_chars;
+ Py_ssize_t output_size;
+ Py_ssize_t chars;
+ PyObject *rval;
+ char *output;
+ Py_UNICODE *input_unicode;
+
+ input_chars = PyUnicode_GET_SIZE(pystr);
+ input_unicode = PyUnicode_AS_UNICODE(pystr);
+ /* One char input can be up to 6 chars output, estimate 4 of these */
+ output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
+ rval = PyString_FromStringAndSize(NULL, output_size);
+ if (rval == NULL) {
+ return NULL;
+ }
+ output = PyString_AS_STRING(rval);
+ chars = 0;
+ output[chars++] = '"';
+ for (i = 0; i < input_chars; i++) {
+ Py_UNICODE c = input_unicode[i];
+ if (S_CHAR(c)) {
+ output[chars++] = (char)c;
+ }
+ else {
+ chars = ascii_escape_char(c, output, chars);
+ }
+ if (output_size - chars < (1 + MAX_EXPANSION)) {
+ /* There's more than four, so let's resize by a lot */
+ output_size *= 2;
+ /* This is an upper bound */
+ if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
+ output_size = 2 + (input_chars * MAX_EXPANSION);
+ }
+ if (_PyString_Resize(&rval, output_size) == -1) {
+ return NULL;
+ }
+ output = PyString_AS_STRING(rval);
+ }
+ }
+ output[chars++] = '"';
+ if (_PyString_Resize(&rval, chars) == -1) {
+ return NULL;
+ }
+ return rval;
+}
+
+static PyObject *
+ascii_escape_str(PyObject *pystr)
+{
+ Py_ssize_t i;
+ Py_ssize_t input_chars;
+ Py_ssize_t output_size;
+ Py_ssize_t chars;
+ PyObject *rval;
+ char *output;
+ char *input_str;
+
+ input_chars = PyString_GET_SIZE(pystr);
+ input_str = PyString_AS_STRING(pystr);
+ /* One char input can be up to 6 chars output, estimate 4 of these */
+ output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
+ rval = PyString_FromStringAndSize(NULL, output_size);
+ if (rval == NULL) {
+ return NULL;
+ }
+ output = PyString_AS_STRING(rval);
+ chars = 0;
+ output[chars++] = '"';
+ for (i = 0; i < input_chars; i++) {
+ Py_UNICODE c = (Py_UNICODE)input_str[i];
+ if (S_CHAR(c)) {
+ output[chars++] = (char)c;
+ }
+ else if (c > 0x7F) {
+ /* We hit a non-ASCII character, bail to unicode mode */
+ PyObject *uni;
+ Py_DECREF(rval);
+ uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
+ if (uni == NULL) {
+ return NULL;
+ }
+ rval = ascii_escape_unicode(uni);
+ Py_DECREF(uni);
+ return rval;
+ }
+ else {
+ chars = ascii_escape_char(c, output, chars);
+ }
+ /* An ASCII char can't possibly expand to a surrogate! */
+ if (output_size - chars < (1 + MIN_EXPANSION)) {
+ /* There's more than four, so let's resize by a lot */
+ output_size *= 2;
+ if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
+ output_size = 2 + (input_chars * MIN_EXPANSION);
+ }
+ if (_PyString_Resize(&rval, output_size) == -1) {
+ return NULL;
+ }
+ output = PyString_AS_STRING(rval);
+ }
+ }
+ output[chars++] = '"';
+ if (_PyString_Resize(&rval, chars) == -1) {
+ return NULL;
+ }
+ return rval;
+}
+
+void
+raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
+{
+ static PyObject *errmsg_fn = NULL;
+ PyObject *pymsg;
+ if (errmsg_fn == NULL) {
+ PyObject *decoder = PyImport_ImportModule("json.decoder");
+ if (decoder == NULL)
+ return;
+ errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
+ if (errmsg_fn == NULL)
+ return;
+ Py_XDECREF(decoder);
+ }
+ pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
+ PyErr_SetObject(PyExc_ValueError, pymsg);
+ Py_DECREF(pymsg);
+/*
+
+def linecol(doc, pos):
+ lineno = doc.count('\n', 0, pos) + 1
+ if lineno == 1:
+ colno = pos
+ else:
+ colno = pos - doc.rindex('\n', 0, pos)
+ return lineno, colno
+
+def errmsg(msg, doc, pos, end=None):
+ lineno, colno = linecol(doc, pos)
+ if end is None:
+ return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
+ endlineno, endcolno = linecol(doc, end)
+ return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
+ msg, lineno, colno, endlineno, endcolno, pos, end)
+
+*/
+}
+
+static PyObject *
+join_list_unicode(PyObject *lst)
+{
+ static PyObject *ustr = NULL;
+ static PyObject *joinstr = NULL;
+ if (ustr == NULL) {
+ Py_UNICODE c = 0;
+ ustr = PyUnicode_FromUnicode(&c, 0);
+ }
+ if (joinstr == NULL) {
+ joinstr = PyUnicode_InternFromString("join");
+ }
+ if (joinstr == NULL || ustr == NULL) {
+ return NULL;
+ }
+ return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
+}
+
+static PyObject *
+scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
+{
+ PyObject *rval;
+ Py_ssize_t len = PyString_GET_SIZE(pystr);
+ Py_ssize_t begin = end - 1;
+ Py_ssize_t next = begin;
+ char *buf = PyString_AS_STRING(pystr);
+ Py_buffer info;
+ PyObject *chunks = PyList_New(0);
+ if (chunks == NULL) {
+ goto bail;
+ }
+ while (1) {
+ /* Find the end of the string or the next escape */
+ Py_UNICODE c = 0;
+ PyObject *chunk = NULL;
+ for (next = end; next < len; next++) {
+ c = buf[next];
+ if (c == '"' || c == '\\') {
+ break;
+ }
+ else if (strict && c <= 0x1f) {
+ raise_errmsg("Invalid control character at", pystr, begin);
+ goto bail;
+ }
+ }
+ if (!(c == '"' || c == '\\')) {
+ raise_errmsg("Unterminated string starting at", pystr, begin);
+ goto bail;
+ }
+ /* Pick up this chunk if it's not zero length */
+ if (next != end) {
+ if (PyBuffer_FillInfo(&info, &buf[end], next - end, 1, 0) < 0) {
+ goto bail;
+ }
+ PyObject *strchunk = PyMemoryView_FromMemory(&info);
+ if (strchunk == NULL) {
+ goto bail;
+ }
+ chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
+ Py_DECREF(strchunk);
+ if (chunk == NULL) {
+ goto bail;
+ }
+ if (PyList_Append(chunks, chunk)) {
+ goto bail;
+ }
+ Py_DECREF(chunk);
+ }
+ next++;
+ if (c == '"') {
+ end = next;
+ break;
+ }
+ if (next == len) {
+ raise_errmsg("Unterminated string starting at", pystr, begin);
+ goto bail;
+ }
+ c = buf[next];
+ if (c != 'u') {
+ /* Non-unicode backslash escapes */
+ end = next + 1;
+ switch (c) {
+ case '"': break;
+ case '\\': break;
+ case '/': break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ default: c = 0;
+ }
+ if (c == 0) {
+ raise_errmsg("Invalid \\escape", pystr, end - 2);
+ goto bail;
+ }
+ }
+ else {
+ c = 0;
+ next++;
+ end = next + 4;
+ if (end >= len) {
+ raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
+ goto bail;
+ }
+ /* Decode 4 hex digits */
+ for (; next < end; next++) {
+ Py_ssize_t shl = (end - next - 1) << 2;
+ Py_UNICODE digit = buf[next];
+ switch (digit) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ c |= (digit - '0') << shl; break;
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f':
+ c |= (digit - 'a' + 10) << shl; break;
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F':
+ c |= (digit - 'A' + 10) << shl; break;
+ default:
+ raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+ goto bail;
+ }
+ }
+#ifdef Py_UNICODE_WIDE
+ /* Surrogate pair */
+ if (c >= 0xd800 && c <= 0xdbff) {
+ Py_UNICODE c2 = 0;
+ if (end + 6 >= len) {
+ raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+ end - 5);
+ }
+ if (buf[next++] != '\\' || buf[next++] != 'u') {
+ raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+ end - 5);
+ }
+ end += 6;
+ /* Decode 4 hex digits */
+ for (; next < end; next++) {
+ Py_ssize_t shl = (end - next - 1) << 2;
+ Py_UNICODE digit = buf[next];
+ switch (digit) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ c2 |= (digit - '0') << shl; break;
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f':
+ c2 |= (digit - 'a' + 10) << shl; break;
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F':
+ c2 |= (digit - 'A' + 10) << shl; break;
+ default:
+ raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+ goto bail;
+ }
+ }
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+ }
+#endif
+ }
+ chunk = PyUnicode_FromUnicode(&c, 1);
+ if (chunk == NULL) {
+ goto bail;
+ }
+ if (PyList_Append(chunks, chunk)) {
+ goto bail;
+ }
+ Py_DECREF(chunk);
+ }
+
+ rval = join_list_unicode(chunks);
+ if (rval == NULL) {
+ goto bail;
+ }
+ Py_DECREF(chunks);
+ chunks = NULL;
+ return Py_BuildValue("(Nn)", rval, end);
+bail:
+ Py_XDECREF(chunks);
+ return NULL;
+}
+
+
+static PyObject *
+scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
+{
+ PyObject *rval;
+ Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
+ Py_ssize_t begin = end - 1;
+ Py_ssize_t next = begin;
+ const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
+ PyObject *chunks = PyList_New(0);
+ if (chunks == NULL) {
+ goto bail;
+ }
+ while (1) {
+ /* Find the end of the string or the next escape */
+ Py_UNICODE c = 0;
+ PyObject *chunk = NULL;
+ for (next = end; next < len; next++) {
+ c = buf[next];
+ if (c == '"' || c == '\\') {
+ break;
+ }
+ else if (strict && c <= 0x1f) {
+ raise_errmsg("Invalid control character at", pystr, begin);
+ goto bail;
+ }
+ }
+ if (!(c == '"' || c == '\\')) {
+ raise_errmsg("Unterminated string starting at", pystr, begin);
+ goto bail;
+ }
+ /* Pick up this chunk if it's not zero length */
+ if (next != end) {
+ chunk = PyUnicode_FromUnicode(&buf[end], next - end);
+ if (chunk == NULL) {
+ goto bail;
+ }
+ if (PyList_Append(chunks, chunk)) {
+ goto bail;
+ }
+ Py_DECREF(chunk);
+ }
+ next++;
+ if (c == '"') {
+ end = next;
+ break;
+ }
+ if (next == len) {
+ raise_errmsg("Unterminated string starting at", pystr, begin);
+ goto bail;
+ }
+ c = buf[next];
+ if (c != 'u') {
+ /* Non-unicode backslash escapes */
+ end = next + 1;
+ switch (c) {
+ case '"': break;
+ case '\\': break;
+ case '/': break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ default: c = 0;
+ }
+ if (c == 0) {
+ raise_errmsg("Invalid \\escape", pystr, end - 2);
+ goto bail;
+ }
+ }
+ else {
+ c = 0;
+ next++;
+ end = next + 4;
+ if (end >= len) {
+ raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
+ goto bail;
+ }
+ /* Decode 4 hex digits */
+ for (; next < end; next++) {
+ Py_ssize_t shl = (end - next - 1) << 2;
+ Py_UNICODE digit = buf[next];
+ switch (digit) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ c |= (digit - '0') << shl; break;
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f':
+ c |= (digit - 'a' + 10) << shl; break;
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F':
+ c |= (digit - 'A' + 10) << shl; break;
+ default:
+ raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+ goto bail;
+ }
+ }
+#ifdef Py_UNICODE_WIDE
+ /* Surrogate pair */
+ if (c >= 0xd800 && c <= 0xdbff) {
+ Py_UNICODE c2 = 0;
+ if (end + 6 >= len) {
+ raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+ end - 5);
+ }
+ if (buf[next++] != '\\' || buf[next++] != 'u') {
+ raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+ end - 5);
+ }
+ end += 6;
+ /* Decode 4 hex digits */
+ for (; next < end; next++) {
+ Py_ssize_t shl = (end - next - 1) << 2;
+ Py_UNICODE digit = buf[next];
+ switch (digit) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ c2 |= (digit - '0') << shl; break;
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f':
+ c2 |= (digit - 'a' + 10) << shl; break;
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F':
+ c2 |= (digit - 'A' + 10) << shl; break;
+ default:
+ raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+ goto bail;
+ }
+ }
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+ }
+#endif
+ }
+ chunk = PyUnicode_FromUnicode(&c, 1);
+ if (chunk == NULL) {
+ goto bail;
+ }
+ if (PyList_Append(chunks, chunk)) {
+ goto bail;
+ }
+ Py_DECREF(chunk);
+ }
+
+ rval = join_list_unicode(chunks);
+ if (rval == NULL) {
+ goto bail;
+ }
+ Py_DECREF(chunks);
+ chunks = NULL;
+ return Py_BuildValue("(Nn)", rval, end);
+bail:
+ Py_XDECREF(chunks);
+ return NULL;
+}
+
+PyDoc_STRVAR(pydoc_scanstring,
+"scanstring(str_or_bytes, end, encoding) -> (bytes, end)\n");
+
+static PyObject *
+py_scanstring(PyObject* self, PyObject *args)
+{
+ PyObject *pystr;
+ Py_ssize_t end;
+ char *encoding = NULL;
+ int strict = 0;
+ if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
+ return NULL;
+ }
+ if (encoding == NULL) {
+ encoding = DEFAULT_ENCODING;
+ }
+ if (PyString_Check(pystr)) {
+ return scanstring_str(pystr, end, encoding, strict);
+ }
+ else if (PyUnicode_Check(pystr)) {
+ return scanstring_unicode(pystr, end, strict);
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "first argument must be a string or bytes, not %.80s",
+ Py_TYPE(pystr)->tp_name);
+ return NULL;
+ }
+}
+
+PyDoc_STRVAR(pydoc_encode_basestring_ascii,
+"encode_basestring_ascii(str_or_bytes) -> bytes\n");
+
+static PyObject *
+py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
+{
+ PyObject *rval;
+ /* METH_O */
+ if (PyString_Check(pystr)) {
+ rval = ascii_escape_str(pystr);
+ }
+ else if (PyUnicode_Check(pystr)) {
+ rval = ascii_escape_unicode(pystr);
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "first argument must be a string or unicode, not %.80s",
+ Py_TYPE(pystr)->tp_name);
+ return NULL;
+ }
+ if (PyString_Check(rval)) {
+ PyObject *urval = PyUnicode_DecodeASCII(PyString_AS_STRING(rval), PyString_GET_SIZE(rval), NULL);
+ Py_DECREF(rval);
+ return urval;
+ }
+ return rval;
+}
+
+static PyMethodDef json_methods[] = {
+ {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
+ METH_O, pydoc_encode_basestring_ascii},
+ {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
+ pydoc_scanstring},
+ {NULL, NULL, 0, NULL}
+};
+
+PyDoc_STRVAR(module_doc,
+"json speedups\n");
+
+void
+init_json(void)
+{
+ PyObject *m;
+ m = Py_InitModule3("_json", json_methods, module_doc);
+}