summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAntoine Pitrou <solipsis@pitrou.net>2010-09-04 20:16:53 (GMT)
committerAntoine Pitrou <solipsis@pitrou.net>2010-09-04 20:16:53 (GMT)
commit7d6e076f6d8dd48cfd748b02dad17dbeb0b346a3 (patch)
tree3d355bd888b6e2aaba52f33807d183055d61b700
parentd9107aadeda663173eb68c412396e5e3353a4059 (diff)
downloadcpython-7d6e076f6d8dd48cfd748b02dad17dbeb0b346a3.zip
cpython-7d6e076f6d8dd48cfd748b02dad17dbeb0b346a3.tar.gz
cpython-7d6e076f6d8dd48cfd748b02dad17dbeb0b346a3.tar.bz2
Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same strings as keys.
-rw-r--r--Lib/json/decoder.py10
-rw-r--r--Lib/json/scanner.py10
-rw-r--r--Lib/json/tests/test_decode.py28
-rw-r--r--Misc/NEWS4
-rw-r--r--Modules/_json.c127
5 files changed, 131 insertions, 48 deletions
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index 3e7405b..6596154 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -147,10 +147,14 @@ WHITESPACE_STR = ' \t\n\r'
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
- _w=WHITESPACE.match, _ws=WHITESPACE_STR):
+ memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end
pairs = []
pairs_append = pairs.append
+ # Backwards compatibility
+ if memo is None:
+ memo = {}
+ memo_get = memo.setdefault
# Use a slice to prevent IndexError from being raised, the following
# check will raise a more specific ValueError if the string is empty
nextchar = s[end:end + 1]
@@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
end += 1
while True:
key, end = scanstring(s, end, strict)
+ key = memo_get(key, key)
# To skip some function call overhead we optimize the fast paths where
# the JSON key separator is ": " or just ":".
if s[end:end + 1] != ':':
@@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
pairs = object_hook(pairs)
return pairs, end
-def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
+def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end
values = []
nextchar = s[end:end + 1]
@@ -314,6 +319,7 @@ class JSONDecoder(object):
self.parse_object = JSONObject
self.parse_array = JSONArray
self.parse_string = scanstring
+ self.memo = {}
self.scan_once = make_scanner(self)
diff --git a/Lib/json/scanner.py b/Lib/json/scanner.py
index b4f3561..23eef61 100644
--- a/Lib/json/scanner.py
+++ b/Lib/json/scanner.py
@@ -22,6 +22,8 @@ def py_make_scanner(context):
parse_int = context.parse_int
parse_constant = context.parse_constant
object_hook = context.object_hook
+ object_pairs_hook = context.object_pairs_hook
+ memo = context.memo
def _scan_once(string, idx):
try:
@@ -33,7 +35,7 @@ def py_make_scanner(context):
return parse_string(string, idx + 1, strict)
elif nextchar == '{':
return parse_object((string, idx + 1), strict,
- _scan_once, object_hook, object_pairs_hook)
+ _scan_once, object_hook, object_pairs_hook, memo)
elif nextchar == '[':
return parse_array((string, idx + 1), _scan_once)
elif nextchar == 'n' and string[idx:idx + 4] == 'null':
@@ -60,6 +62,12 @@ def py_make_scanner(context):
else:
raise StopIteration
+ def scan_once(string, idx):
+ try:
+ return _scan_once(string, idx)
+ finally:
+ memo.clear()
+
return _scan_once
make_scanner = c_make_scanner or py_make_scanner
diff --git a/Lib/json/tests/test_decode.py b/Lib/json/tests/test_decode.py
index 4610c6c..beb82a7 100644
--- a/Lib/json/tests/test_decode.py
+++ b/Lib/json/tests/test_decode.py
@@ -1,10 +1,25 @@
import decimal
from unittest import TestCase
from io import StringIO
+from contextlib import contextmanager
import json
+import json.decoder
+import json.scanner
from collections import OrderedDict
+
+@contextmanager
+def use_python_scanner():
+ py_scanner = json.scanner.py_make_scanner
+ old_scanner = json.decoder.make_scanner
+ json.decoder.make_scanner = py_scanner
+ try:
+ yield
+ finally:
+ json.decoder.make_scanner = old_scanner
+
+
class TestDecode(TestCase):
def test_decimal(self):
rval = json.loads('1.1', parse_float=decimal.Decimal)
@@ -39,3 +54,16 @@ class TestDecode(TestCase):
# exercise the uncommon cases. The array cases are already covered.
rval = json.loads('{ "key" : "value" , "k":"v" }')
self.assertEquals(rval, {"key":"value", "k":"v"})
+
+ def check_keys_reuse(self, source, loads):
+ rval = loads(source)
+ (a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
+ self.assertIs(a, c)
+ self.assertIs(b, d)
+
+ def test_keys_reuse(self):
+ s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
+ self.check_keys_reuse(s, json.loads)
+ # Disabled: the pure Python version of json simply doesn't work
+ with use_python_scanner():
+ self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)
diff --git a/Misc/NEWS b/Misc/NEWS
index 77e9dd5..f413da2 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -165,6 +165,10 @@ Extensions
Library
-------
+- Issue #7451: Improve decoding performance of JSON objects, and reduce
+ the memory consumption of said decoded objects when they use the same
+ strings as keys.
+
- Issue #1100562: Fix deep-copying of objects derived from the list and
dict types. Patch by Michele Orrù and Björn Lindqvist.
diff --git a/Modules/_json.c b/Modules/_json.c
index 2e39525..bbaf7dd 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -36,6 +36,7 @@ typedef struct _PyScannerObject {
PyObject *parse_float;
PyObject *parse_int;
PyObject *parse_constant;
+ PyObject *memo;
} PyScannerObject;
static PyMemberDef scanner_members[] = {
@@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
return tpl;
}
+#define APPEND_OLD_CHUNK \
+ if (chunk != NULL) { \
+ if (chunks == NULL) { \
+ chunks = PyList_New(0); \
+ if (chunks == NULL) { \
+ goto bail; \
+ } \
+ } \
+ if (PyList_Append(chunks, chunk)) { \
+ Py_DECREF(chunk); \
+ goto bail; \
+ } \
+ Py_CLEAR(chunk); \
+ }
+
static PyObject *
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
{
@@ -316,15 +332,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
Return value is a new PyUnicode
*/
- PyObject *rval;
+ PyObject *rval = NULL;
Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
Py_ssize_t begin = end - 1;
Py_ssize_t next = begin;
const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
- PyObject *chunks = PyList_New(0);
- if (chunks == NULL) {
- goto bail;
- }
+ PyObject *chunks = NULL;
+ PyObject *chunk = NULL;
+
if (end < 0 || len <= end) {
PyErr_SetString(PyExc_ValueError, "end is out of bounds");
goto bail;
@@ -332,7 +347,6 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
while (1) {
/* Find the end of the string or the next escape */
Py_UNICODE c = 0;
- PyObject *chunk = NULL;
for (next = end; next < len; next++) {
c = buf[next];
if (c == '"' || c == '\\') {
@@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
/* Pick up this chunk if it's not zero length */
if (next != end) {
+ APPEND_OLD_CHUNK
chunk = PyUnicode_FromUnicode(&buf[end], next - end);
if (chunk == NULL) {
goto bail;
}
- if (PyList_Append(chunks, chunk)) {
- Py_DECREF(chunk);
- goto bail;
- }
- Py_DECREF(chunk);
}
next++;
if (c == '"') {
@@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
#endif
}
+ APPEND_OLD_CHUNK
chunk = PyUnicode_FromUnicode(&c, 1);
if (chunk == NULL) {
goto bail;
}
- if (PyList_Append(chunks, chunk)) {
- Py_DECREF(chunk);
+ }
+
+ if (chunks == NULL) {
+ if (chunk != NULL)
+ rval = chunk;
+ else
+ rval = PyUnicode_FromStringAndSize("", 0);
+ }
+ else {
+ APPEND_OLD_CHUNK
+ rval = join_list_unicode(chunks);
+ if (rval == NULL) {
goto bail;
}
- Py_DECREF(chunk);
+ Py_CLEAR(chunks);
}
- rval = join_list_unicode(chunks);
- if (rval == NULL) {
- goto bail;
- }
- Py_DECREF(chunks);
*next_end_ptr = end;
return rval;
bail:
*next_end_ptr = -1;
Py_XDECREF(chunks);
+ Py_XDECREF(chunk);
return NULL;
}
@@ -578,6 +595,7 @@ scanner_clear(PyObject *self)
Py_CLEAR(s->parse_float);
Py_CLEAR(s->parse_int);
Py_CLEAR(s->parse_constant);
+ Py_CLEAR(s->memo);
return 0;
}
@@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
PyObject *val = NULL;
- PyObject *rval = PyList_New(0);
+ PyObject *rval = NULL;
PyObject *key = NULL;
int strict = PyObject_IsTrue(s->strict);
+ int has_pairs_hook = (s->object_pairs_hook != Py_None);
Py_ssize_t next_idx;
+
+ if (has_pairs_hook)
+ rval = PyList_New(0);
+ else
+ rval = PyDict_New();
if (rval == NULL)
return NULL;
@@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
/* only loop if the object is non-empty */
if (idx <= end_idx && str[idx] != '}') {
while (idx <= end_idx) {
+ PyObject *memokey;
+
/* read key */
if (str[idx] != '"') {
raise_errmsg("Expecting property name", pystr, idx);
@@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
if (key == NULL)
goto bail;
+ memokey = PyDict_GetItem(s->memo, key);
+ if (memokey != NULL) {
+ Py_INCREF(memokey);
+ Py_DECREF(key);
+ key = memokey;
+ }
+ else {
+ if (PyDict_SetItem(s->memo, key, key) < 0)
+ goto bail;
+ }
idx = next_idx;
/* skip whitespace between key and : delimiter, read :, skip whitespace */
@@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
if (val == NULL)
goto bail;
- {
- PyObject *tuple = PyTuple_Pack(2, key, val);
- if (tuple == NULL)
+ if (has_pairs_hook) {
+ PyObject *item = PyTuple_Pack(2, key, val);
+ if (item == NULL)
goto bail;
- if (PyList_Append(rval, tuple) == -1) {
- Py_DECREF(tuple);
+ Py_CLEAR(key);
+ Py_CLEAR(val);
+ if (PyList_Append(rval, item) == -1) {
+ Py_DECREF(item);
goto bail;
}
- Py_DECREF(tuple);
+ Py_DECREF(item);
+ }
+ else {
+ if (PyDict_SetItem(rval, key, val) < 0)
+ goto bail;
+ Py_CLEAR(key);
+ Py_CLEAR(val);
}
-
- Py_CLEAR(key);
- Py_CLEAR(val);
idx = next_idx;
/* skip whitespace before } or , */
@@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
*next_idx_ptr = idx + 1;
- if (s->object_pairs_hook != Py_None) {
+ if (has_pairs_hook) {
val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
- if (val == NULL)
- goto bail;
Py_DECREF(rval);
return val;
}
- val = PyDict_New();
- if (val == NULL)
- goto bail;
- if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
- goto bail;
- Py_DECREF(rval);
- rval = val;
-
/* if object_hook is not None: rval = object_hook(rval) */
if (s->object_hook != Py_None) {
val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
- if (val == NULL)
- goto bail;
Py_DECREF(rval);
- rval = val;
- val = NULL;
+ return val;
}
return rval;
bail:
Py_XDECREF(key);
Py_XDECREF(val);
- Py_DECREF(rval);
+ Py_XDECREF(rval);
return NULL;
}
@@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
Py_TYPE(pystr)->tp_name);
return NULL;
}
+ PyDict_Clear(s->memo);
+ if (rval == NULL)
+ return NULL;
return _build_rval_index_tuple(rval, next_idx);
}
@@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
return -1;
+ if (s->memo == NULL) {
+ s->memo = PyDict_New();
+ if (s->memo == NULL)
+ goto bail;
+ }
+
/* All of these will fail "gracefully" so we don't need to verify them */
s->strict = PyObject_GetAttrString(ctx, "strict");
if (s->strict == NULL)