From 7d6e076f6d8dd48cfd748b02dad17dbeb0b346a3 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <solipsis@pitrou.net>
Date: Sat, 4 Sep 2010 20:16:53 +0000
Subject: Issue #7451: Improve decoding performance of JSON objects, and reduce
 the memory consumption of said decoded objects when they use the same strings
 as keys.

---
 Lib/json/decoder.py           |  10 +++-
 Lib/json/scanner.py           |  10 +++-
 Lib/json/tests/test_decode.py |  28 ++++++++++
 Misc/NEWS                     |   4 ++
 Modules/_json.c               | 127 +++++++++++++++++++++++++++---------------
 5 files changed, 131 insertions(+), 48 deletions(-)

diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index 3e7405b..6596154 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -147,10 +147,14 @@ WHITESPACE_STR = ' \t\n\r'
 
 
 def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
-        _w=WHITESPACE.match, _ws=WHITESPACE_STR):
+               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     s, end = s_and_end
     pairs = []
     pairs_append = pairs.append
+    # Backwards compatibility
+    if memo is None:
+        memo = {}
+    memo_get = memo.setdefault
     # Use a slice to prevent IndexError from being raised, the following
     # check will raise a more specific ValueError if the string is empty
     nextchar = s[end:end + 1]
@@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
     end += 1
     while True:
         key, end = scanstring(s, end, strict)
+        key = memo_get(key, key)
         # To skip some function call overhead we optimize the fast paths where
         # the JSON key separator is ": " or just ":".
         if s[end:end + 1] != ':':
@@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
         pairs = object_hook(pairs)
     return pairs, end
 
-def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
+def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     s, end = s_and_end
     values = []
     nextchar = s[end:end + 1]
@@ -314,6 +319,7 @@ class JSONDecoder(object):
         self.parse_object = JSONObject
         self.parse_array = JSONArray
         self.parse_string = scanstring
+        self.memo = {}
         self.scan_once = make_scanner(self)
 
 
diff --git a/Lib/json/scanner.py b/Lib/json/scanner.py
index b4f3561..23eef61 100644
--- a/Lib/json/scanner.py
+++ b/Lib/json/scanner.py
@@ -22,6 +22,8 @@ def py_make_scanner(context):
     parse_int = context.parse_int
     parse_constant = context.parse_constant
     object_hook = context.object_hook
+    object_pairs_hook = context.object_pairs_hook
+    memo = context.memo
 
     def _scan_once(string, idx):
         try:
@@ -33,7 +35,7 @@ def py_make_scanner(context):
             return parse_string(string, idx + 1, strict)
         elif nextchar == '{':
             return parse_object((string, idx + 1), strict,
-                _scan_once, object_hook, object_pairs_hook)
+                _scan_once, object_hook, object_pairs_hook, memo)
         elif nextchar == '[':
             return parse_array((string, idx + 1), _scan_once)
         elif nextchar == 'n' and string[idx:idx + 4] == 'null':
@@ -60,6 +62,12 @@ def py_make_scanner(context):
         else:
             raise StopIteration
 
+    def scan_once(string, idx):
+        try:
+            return _scan_once(string, idx)
+        finally:
+            memo.clear()
+
     return _scan_once
 
 make_scanner = c_make_scanner or py_make_scanner
diff --git a/Lib/json/tests/test_decode.py b/Lib/json/tests/test_decode.py
index 4610c6c..beb82a7 100644
--- a/Lib/json/tests/test_decode.py
+++ b/Lib/json/tests/test_decode.py
@@ -1,10 +1,25 @@
 import decimal
 from unittest import TestCase
 from io import StringIO
+from contextlib import contextmanager
 
 import json
+import json.decoder
+import json.scanner
 from collections import OrderedDict
 
+
+@contextmanager
+def use_python_scanner():
+    py_scanner = json.scanner.py_make_scanner
+    old_scanner = json.decoder.make_scanner
+    json.decoder.make_scanner = py_scanner
+    try:
+        yield
+    finally:
+        json.decoder.make_scanner = old_scanner
+
+
 class TestDecode(TestCase):
     def test_decimal(self):
         rval = json.loads('1.1', parse_float=decimal.Decimal)
@@ -39,3 +54,16 @@ class TestDecode(TestCase):
         # exercise the uncommon cases. The array cases are already covered.
         rval = json.loads('{   "key"    :    "value"    ,  "k":"v"    }')
         self.assertEquals(rval, {"key":"value", "k":"v"})
+
+    def check_keys_reuse(self, source, loads):
+        rval = loads(source)
+        (a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
+        self.assertIs(a, c)
+        self.assertIs(b, d)
+
+    def test_keys_reuse(self):
+        s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
+        self.check_keys_reuse(s, json.loads)
+        # Disabled: the pure Python version of json simply doesn't work
+        with use_python_scanner():
+            self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)
diff --git a/Misc/NEWS b/Misc/NEWS
index 77e9dd5..f413da2 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -165,6 +165,10 @@ Extensions
 Library
 -------
 
+- Issue #7451: Improve decoding performance of JSON objects, and reduce
+  the memory consumption of said decoded objects when they use the same
+  strings as keys.
+
 - Issue #1100562: Fix deep-copying of objects derived from the list and
   dict types.  Patch by Michele Orrù and Björn Lindqvist.
 
diff --git a/Modules/_json.c b/Modules/_json.c
index 2e39525..bbaf7dd 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -36,6 +36,7 @@ typedef struct _PyScannerObject {
     PyObject *parse_float;
     PyObject *parse_int;
     PyObject *parse_constant;
+    PyObject *memo;
 } PyScannerObject;
 
 static PyMemberDef scanner_members[] = {
@@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
     return tpl;
 }
 
+#define APPEND_OLD_CHUNK \
+    if (chunk != NULL) { \
+        if (chunks == NULL) { \
+            chunks = PyList_New(0); \
+            if (chunks == NULL) { \
+                goto bail; \
+            } \
+        } \
+        if (PyList_Append(chunks, chunk)) { \
+            Py_DECREF(chunk); \
+            goto bail; \
+        } \
+        Py_CLEAR(chunk); \
+    }
+
 static PyObject *
 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
 {
@@ -316,15 +332,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
 
     Return value is a new PyUnicode
     */
-    PyObject *rval;
+    PyObject *rval = NULL;
     Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
     Py_ssize_t begin = end - 1;
     Py_ssize_t next = begin;
     const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
-    PyObject *chunks = PyList_New(0);
-    if (chunks == NULL) {
-        goto bail;
-    }
+    PyObject *chunks = NULL;
+    PyObject *chunk = NULL;
+
     if (end < 0 || len <= end) {
         PyErr_SetString(PyExc_ValueError, "end is out of bounds");
         goto bail;
@@ -332,7 +347,6 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
     while (1) {
         /* Find the end of the string or the next escape */
         Py_UNICODE c = 0;
-        PyObject *chunk = NULL;
         for (next = end; next < len; next++) {
             c = buf[next];
             if (c == '"' || c == '\\') {
@@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
         }
         /* Pick up this chunk if it's not zero length */
         if (next != end) {
+            APPEND_OLD_CHUNK
             chunk = PyUnicode_FromUnicode(&buf[end], next - end);
             if (chunk == NULL) {
                 goto bail;
             }
-            if (PyList_Append(chunks, chunk)) {
-                Py_DECREF(chunk);
-                goto bail;
-            }
-            Py_DECREF(chunk);
         }
         next++;
         if (c == '"') {
@@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
             }
 #endif
         }
+        APPEND_OLD_CHUNK
         chunk = PyUnicode_FromUnicode(&c, 1);
         if (chunk == NULL) {
             goto bail;
         }
-        if (PyList_Append(chunks, chunk)) {
-            Py_DECREF(chunk);
+    }
+
+    if (chunks == NULL) {
+        if (chunk != NULL)
+            rval = chunk;
+        else
+            rval = PyUnicode_FromStringAndSize("", 0);
+    }
+    else {
+        APPEND_OLD_CHUNK
+        rval = join_list_unicode(chunks);
+        if (rval == NULL) {
             goto bail;
         }
-        Py_DECREF(chunk);
+        Py_CLEAR(chunks);
     }
 
-    rval = join_list_unicode(chunks);
-    if (rval == NULL) {
-        goto bail;
-    }
-    Py_DECREF(chunks);
     *next_end_ptr = end;
     return rval;
 bail:
     *next_end_ptr = -1;
     Py_XDECREF(chunks);
+    Py_XDECREF(chunk);
     return NULL;
 }
 
@@ -578,6 +595,7 @@ scanner_clear(PyObject *self)
     Py_CLEAR(s->parse_float);
     Py_CLEAR(s->parse_int);
     Py_CLEAR(s->parse_constant);
+    Py_CLEAR(s->memo);
     return 0;
 }
 
@@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
     Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
     Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
     PyObject *val = NULL;
-    PyObject *rval = PyList_New(0);
+    PyObject *rval = NULL;
     PyObject *key = NULL;
     int strict = PyObject_IsTrue(s->strict);
+    int has_pairs_hook = (s->object_pairs_hook != Py_None);
     Py_ssize_t next_idx;
+
+    if (has_pairs_hook)
+        rval = PyList_New(0);
+    else
+        rval = PyDict_New();
     if (rval == NULL)
         return NULL;
 
@@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
     /* only loop if the object is non-empty */
     if (idx <= end_idx && str[idx] != '}') {
         while (idx <= end_idx) {
+            PyObject *memokey;
+
             /* read key */
             if (str[idx] != '"') {
                 raise_errmsg("Expecting property name", pystr, idx);
@@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
             key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
             if (key == NULL)
                 goto bail;
+            memokey = PyDict_GetItem(s->memo, key);
+            if (memokey != NULL) {
+                Py_INCREF(memokey);
+                Py_DECREF(key);
+                key = memokey;
+            }
+            else {
+                if (PyDict_SetItem(s->memo, key, key) < 0)
+                    goto bail;
+            }
             idx = next_idx;
 
             /* skip whitespace between key and : delimiter, read :, skip whitespace */
@@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
             if (val == NULL)
                 goto bail;
 
-            {
-                PyObject *tuple = PyTuple_Pack(2, key, val);
-                if (tuple == NULL)
+            if (has_pairs_hook) {
+                PyObject *item = PyTuple_Pack(2, key, val);
+                if (item == NULL)
                     goto bail;
-                if (PyList_Append(rval, tuple) == -1) {
-                    Py_DECREF(tuple);
+                Py_CLEAR(key);
+                Py_CLEAR(val);
+                if (PyList_Append(rval, item) == -1) {
+                    Py_DECREF(item);
                     goto bail;
                 }
-                Py_DECREF(tuple);
+                Py_DECREF(item);
+            }
+            else {
+                if (PyDict_SetItem(rval, key, val) < 0)
+                    goto bail;
+                Py_CLEAR(key);
+                Py_CLEAR(val);
             }
-
-            Py_CLEAR(key);
-            Py_CLEAR(val);
             idx = next_idx;
 
             /* skip whitespace before } or , */
@@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
 
     *next_idx_ptr = idx + 1;
 
-    if (s->object_pairs_hook != Py_None) {
+    if (has_pairs_hook) {
         val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
-        if (val == NULL)
-            goto bail;
         Py_DECREF(rval);
         return val;
     }
 
-    val = PyDict_New();
-    if (val == NULL)
-        goto bail;
-    if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
-        goto bail;
-    Py_DECREF(rval);
-    rval = val;
-
     /* if object_hook is not None: rval = object_hook(rval) */
     if (s->object_hook != Py_None) {
         val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
-        if (val == NULL)
-            goto bail;
         Py_DECREF(rval);
-        rval = val;
-        val = NULL;
+        return val;
     }
     return rval;
 bail:
     Py_XDECREF(key);
     Py_XDECREF(val);
-    Py_DECREF(rval);
+    Py_XDECREF(rval);
     return NULL;
 }
 
@@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
                  Py_TYPE(pystr)->tp_name);
         return NULL;
     }
+    PyDict_Clear(s->memo);
+    if (rval == NULL)
+        return NULL;
     return _build_rval_index_tuple(rval, next_idx);
 }
 
@@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
         return -1;
 
+    if (s->memo == NULL) {
+        s->memo = PyDict_New();
+        if (s->memo == NULL)
+            goto bail;
+    }
+
     /* All of these will fail "gracefully" so we don't need to verify them */
     s->strict = PyObject_GetAttrString(ctx, "strict");
     if (s->strict == NULL)
-- 
cgit v0.12