From 109826c8508dd02e06ae0f1784f1d202495a8680 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Tue, 20 Oct 2020 06:22:44 +0100 Subject: bpo-42093: Add opcode cache for LOAD_ATTR (GH-22803) --- Doc/whatsnew/3.10.rst | 3 + Include/cpython/dictobject.h | 1 + Include/internal/pycore_code.h | 7 + .../2020-10-20-04-24-07.bpo-42093.ooZZNh.rst | 2 + Objects/codeobject.c | 4 +- Objects/dictobject.c | 65 ++++++ Python/ceval.c | 221 ++++++++++++++++++++- 7 files changed, 296 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-10-20-04-24-07.bpo-42093.ooZZNh.rst diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index f57e1b4..e275a7c 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -252,6 +252,9 @@ Optimizations average. (Contributed by Victor Stinner in :issue:`41006`.) +* The ``LOAD_ATTR`` instruction now uses new "per opcode cache" mechanism. + It is about 36% faster now. (Contributed by Pablo Galindo and Yury Selivanov + in :issue:`42093`.) Deprecated ========== diff --git a/Include/cpython/dictobject.h b/Include/cpython/dictobject.h index 5a15630..f67c321 100644 --- a/Include/cpython/dictobject.h +++ b/Include/cpython/dictobject.h @@ -71,6 +71,7 @@ PyAPI_FUNC(void) _PyDict_DebugMallocStats(FILE *out); int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value); PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *); +Py_ssize_t _PyDict_GetItemHint(PyDictObject *, PyObject *, Py_ssize_t, PyObject **); /* _PyDictView */ diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 88956f1..f1e89d9 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -10,9 +10,16 @@ typedef struct { uint64_t builtins_ver; /* ma_version of builtin dict */ } _PyOpcache_LoadGlobal; +typedef struct { + PyTypeObject *type; + Py_ssize_t hint; + unsigned int tp_version_tag; +} _PyOpCodeOpt_LoadAttr; + struct _PyOpcache { union { _PyOpcache_LoadGlobal lg; + _PyOpCodeOpt_LoadAttr la; } u; char optimized; }; diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-10-20-04-24-07.bpo-42093.ooZZNh.rst b/Misc/NEWS.d/next/Core and Builtins/2020-10-20-04-24-07.bpo-42093.ooZZNh.rst new file mode 100644 index 0000000..36a12c1 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-10-20-04-24-07.bpo-42093.ooZZNh.rst @@ -0,0 +1,2 @@ +The ``LOAD_ATTR`` instruction now uses new "per opcode cache" mechanism and +it is about 36% faster now. Patch by Pablo Galindo and Yury Selivanov. diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 4ca22fc..c86d0e1 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -301,8 +301,8 @@ _PyCode_InitOpcache(PyCodeObject *co) unsigned char opcode = _Py_OPCODE(opcodes[i]); i++; // 'i' is now aligned to (next_instr - first_instr) - // TODO: LOAD_METHOD, LOAD_ATTR - if (opcode == LOAD_GLOBAL) { + // TODO: LOAD_METHOD + if (opcode == LOAD_GLOBAL || opcode == LOAD_ATTR) { opts++; co->co_opcache_map[i] = (unsigned char)opts; if (opts > 254) { diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 6c3fc62..8e74962 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -1437,6 +1437,71 @@ PyDict_GetItem(PyObject *op, PyObject *key) return value; } +Py_ssize_t +_PyDict_GetItemHint(PyDictObject *mp, PyObject *key, + Py_ssize_t hint, PyObject **value) +{ + Py_hash_t hash; + PyThreadState *tstate; + + assert(*value == NULL); + assert(PyDict_CheckExact((PyObject*)mp)); + assert(PyUnicode_CheckExact(key)); + + if (hint >= 0 && hint < _PyDict_KeysSize(mp->ma_keys)) { + PyObject *res = NULL; + + PyDictKeyEntry *ep = DK_ENTRIES(mp->ma_keys) + (size_t)hint; + if (ep->me_key == key) { + if (mp->ma_keys->dk_lookup == lookdict_split) { + assert(mp->ma_values != NULL); + res = mp->ma_values[(size_t)hint]; + } + else { + res = ep->me_value; + } + if (res != NULL) { + *value = res; + return hint; + } + } + } + + if ((hash = ((PyASCIIObject *) key)->hash) == -1) + { + hash = PyObject_Hash(key); + if (hash == -1) { + PyErr_Clear(); + return -1; + } + } + + // We can arrive here with a NULL tstate during initialization: try + // running "python -Wi" for an example related to string interning + tstate = _PyThreadState_UncheckedGet(); + Py_ssize_t ix = 0; + if (tstate != NULL && tstate->curexc_type != NULL) { + /* preserve the existing exception */ + PyObject *err_type, *err_value, *err_tb; + PyErr_Fetch(&err_type, &err_value, &err_tb); + ix = (mp->ma_keys->dk_lookup)(mp, key, hash, value); + /* ignore errors */ + PyErr_Restore(err_type, err_value, err_tb); + if (ix < 0) { + return -1; + } + } + else { + ix = (mp->ma_keys->dk_lookup)(mp, key, hash, value); + if (ix < 0) { + PyErr_Clear(); + return -1; + } + } + + return ix; +} + /* Same as PyDict_GetItemWithError() but with hash supplied by caller. This returns NULL *with* an exception set if an exception occurred. It returns NULL *without* an exception set if the key wasn't present. diff --git a/Python/ceval.c b/Python/ceval.c index 762de57..fafbf75 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -111,6 +111,7 @@ static long dxp[256]; #else #define OPCACHE_MIN_RUNS 1024 /* create opcache when code executed this time */ #endif +#define OPCODE_CACHE_MAX_TRIES 20 #define OPCACHE_STATS 0 /* Enable stats */ #if OPCACHE_STATS @@ -120,6 +121,12 @@ static size_t opcache_code_objects_extra_mem = 0; static size_t opcache_global_opts = 0; static size_t opcache_global_hits = 0; static size_t opcache_global_misses = 0; + +static size_t opcache_attr_opts = 0; +static size_t opcache_attr_hits = 0; +static size_t opcache_attr_misses = 0; +static size_t opcache_attr_deopts = 0; +static size_t opcache_attr_total = 0; #endif @@ -365,6 +372,25 @@ _PyEval_Fini(void) opcache_global_opts); fprintf(stderr, "\n"); + + fprintf(stderr, "-- Opcode cache LOAD_ATTR hits = %zd (%d%%)\n", + opcache_attr_hits, + (int) (100.0 * opcache_attr_hits / + opcache_attr_total)); + + fprintf(stderr, "-- Opcode cache LOAD_ATTR misses = %zd (%d%%)\n", + opcache_attr_misses, + (int) (100.0 * opcache_attr_misses / + opcache_attr_total)); + + fprintf(stderr, "-- Opcode cache LOAD_ATTR opts = %zd\n", + opcache_attr_opts); + + fprintf(stderr, "-- Opcode cache LOAD_ATTR deopts = %zd\n", + opcache_attr_deopts); + + fprintf(stderr, "-- Opcode cache LOAD_ATTR total = %zd\n", + opcache_attr_total); #endif } @@ -1224,16 +1250,43 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag) do { \ co_opcache = NULL; \ if (co->co_opcache != NULL) { \ - unsigned char co_opt_offset = \ + unsigned char co_opcache_offset = \ co->co_opcache_map[next_instr - first_instr]; \ - if (co_opt_offset > 0) { \ - assert(co_opt_offset <= co->co_opcache_size); \ - co_opcache = &co->co_opcache[co_opt_offset - 1]; \ + if (co_opcache_offset > 0) { \ + assert(co_opcache_offset <= co->co_opcache_size); \ + co_opcache = &co->co_opcache[co_opcache_offset - 1]; \ assert(co_opcache != NULL); \ } \ } \ } while (0) +#define OPCACHE_DEOPT() \ + do { \ + if (co_opcache != NULL) { \ + co_opcache->optimized = -1; \ + unsigned char co_opcache_offset = \ + co->co_opcache_map[next_instr - first_instr]; \ + assert(co_opcache_offset <= co->co_opcache_size); \ + co->co_opcache_map[co_opcache_offset] = 0; \ + co_opcache = NULL; \ + } \ + } while (0) + +#define OPCACHE_DEOPT_LOAD_ATTR() \ + do { \ + if (co_opcache != NULL) { \ + OPCACHE_STAT_ATTR_DEOPT(); \ + OPCACHE_DEOPT(); \ + } \ + } while (0) + +#define OPCACHE_MAYBE_DEOPT_LOAD_ATTR() \ + do { \ + if (co_opcache != NULL && --co_opcache->optimized <= 0) { \ + OPCACHE_DEOPT_LOAD_ATTR(); \ + } \ + } while (0) + #if OPCACHE_STATS #define OPCACHE_STAT_GLOBAL_HIT() \ @@ -1251,12 +1304,43 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag) if (co->co_opcache != NULL) opcache_global_opts++; \ } while (0) +#define OPCACHE_STAT_ATTR_HIT() \ + do { \ + if (co->co_opcache != NULL) opcache_attr_hits++; \ + } while (0) + +#define OPCACHE_STAT_ATTR_MISS() \ + do { \ + if (co->co_opcache != NULL) opcache_attr_misses++; \ + } while (0) + +#define OPCACHE_STAT_ATTR_OPT() \ + do { \ + if (co->co_opcache!= NULL) opcache_attr_opts++; \ + } while (0) + +#define OPCACHE_STAT_ATTR_DEOPT() \ + do { \ + if (co->co_opcache != NULL) opcache_attr_deopts++; \ + } while (0) + +#define OPCACHE_STAT_ATTR_TOTAL() \ + do { \ + if (co->co_opcache != NULL) opcache_attr_total++; \ + } while (0) + #else /* OPCACHE_STATS */ #define OPCACHE_STAT_GLOBAL_HIT() #define OPCACHE_STAT_GLOBAL_MISS() #define OPCACHE_STAT_GLOBAL_OPT() +#define OPCACHE_STAT_ATTR_HIT() +#define OPCACHE_STAT_ATTR_MISS() +#define OPCACHE_STAT_ATTR_OPT() +#define OPCACHE_STAT_ATTR_DEOPT() +#define OPCACHE_STAT_ATTR_TOTAL() + #endif /* Start of code */ @@ -3023,7 +3107,134 @@ main_loop: case TARGET(LOAD_ATTR): { PyObject *name = GETITEM(names, oparg); PyObject *owner = TOP(); - PyObject *res = PyObject_GetAttr(owner, name); + + PyTypeObject *type = Py_TYPE(owner); + PyObject *res; + PyObject **dictptr; + PyObject *dict; + _PyOpCodeOpt_LoadAttr *la; + + OPCACHE_STAT_ATTR_TOTAL(); + + OPCACHE_CHECK(); + if (co_opcache != NULL && PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)) + { + if (co_opcache->optimized > 0) { + /* Fast path -- cache hit makes LOAD_ATTR ~30% faster */ + la = &co_opcache->u.la; + if (la->type == type && la->tp_version_tag == type->tp_version_tag) + { + assert(type->tp_dict != NULL); + assert(type->tp_dictoffset > 0); + + dictptr = (PyObject **) ((char *)owner + type->tp_dictoffset); + dict = *dictptr; + if (dict != NULL && PyDict_CheckExact(dict)) { + Py_ssize_t hint = la->hint; + Py_INCREF(dict); + res = NULL; + la->hint = _PyDict_GetItemHint((PyDictObject*)dict, name, hint, &res); + + if (res != NULL) { + if (la->hint == hint && hint >= 0) { + /* Our hint has helped -- cache hit. */ + OPCACHE_STAT_ATTR_HIT(); + } else { + /* The hint we provided didn't work. + Maybe next time? */ + OPCACHE_MAYBE_DEOPT_LOAD_ATTR(); + } + + Py_INCREF(res); + SET_TOP(res); + Py_DECREF(owner); + Py_DECREF(dict); + DISPATCH(); + } else { + // This attribute can be missing sometimes -- we + // don't want to optimize this lookup. + OPCACHE_DEOPT_LOAD_ATTR(); + Py_DECREF(dict); + } + } else { + // There is no dict, or __dict__ doesn't satisfy PyDict_CheckExact + OPCACHE_DEOPT_LOAD_ATTR(); + } + } else { + // The type of the object has either been updated, + // or is different. Maybe it will stabilize? + OPCACHE_MAYBE_DEOPT_LOAD_ATTR(); + } + + OPCACHE_STAT_ATTR_MISS(); + } + + if (co_opcache != NULL && /* co_opcache can be NULL after a DEOPT() call. */ + type->tp_getattro == PyObject_GenericGetAttr) + { + PyObject *descr; + Py_ssize_t ret; + + if (type->tp_dictoffset > 0) { + if (type->tp_dict == NULL) { + if (PyType_Ready(type) < 0) { + Py_DECREF(owner); + SET_TOP(NULL); + goto error; + } + } + + descr = _PyType_Lookup(type, name); + if (descr == NULL || + descr->ob_type->tp_descr_get == NULL || + !PyDescr_IsData(descr)) + { + dictptr = (PyObject **) ((char *)owner + type->tp_dictoffset); + dict = *dictptr; + + if (dict != NULL && PyDict_CheckExact(dict)) { + Py_INCREF(dict); + res = NULL; + ret = _PyDict_GetItemHint((PyDictObject*)dict, name, -1, &res); + if (res != NULL) { + Py_INCREF(res); + Py_DECREF(dict); + Py_DECREF(owner); + SET_TOP(res); + + if (co_opcache->optimized == 0) { + // First time we optimize this opcode. */ + OPCACHE_STAT_ATTR_OPT(); + co_opcache->optimized = OPCODE_CACHE_MAX_TRIES; + } + + la = &co_opcache->u.la; + la->type = type; + la->tp_version_tag = type->tp_version_tag; + la->hint = ret; + + DISPATCH(); + } + Py_DECREF(dict); + } else { + // There is no dict, or __dict__ doesn't satisfy PyDict_CheckExact + OPCACHE_DEOPT_LOAD_ATTR(); + } + } else { + // We failed to find an attribute without a data-like descriptor + OPCACHE_DEOPT_LOAD_ATTR(); + } + } else { + // The object's class does not have a tp_dictoffset we can use + OPCACHE_DEOPT_LOAD_ATTR(); + } + } else if (type->tp_getattro != PyObject_GenericGetAttr) { + OPCACHE_DEOPT_LOAD_ATTR(); + } + } + + /* slow path */ + res = PyObject_GetAttr(owner, name); Py_DECREF(owner); SET_TOP(res); if (res == NULL) -- cgit v0.12