diff options
author | Inada Naoki <songofacandy@gmail.com> | 2019-06-03 12:30:58 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-03 12:30:58 (GMT) |
commit | 91234a16367b56ca03ee289f7c03a34d4cfec4c8 (patch) | |
tree | 81d13661e09df4712009641eaba30e4599f1b776 | |
parent | 29ec4228106ed5f970d1c3666614f4a51bad192c (diff) | |
download | cpython-91234a16367b56ca03ee289f7c03a34d4cfec4c8.zip cpython-91234a16367b56ca03ee289f7c03a34d4cfec4c8.tar.gz cpython-91234a16367b56ca03ee289f7c03a34d4cfec4c8.tar.bz2 |
bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)
This patch implements per opcode cache mechanism, and use it in
only LOAD_GLOBAL opcode.
Based on Yury's opcache3.patch in bpo-26219.
-rw-r--r-- | Doc/whatsnew/3.8.rst | 4 | ||||
-rw-r--r-- | Include/code.h | 17 | ||||
-rw-r--r-- | Include/internal/pycore_ceval.h | 3 | ||||
-rw-r--r-- | Include/internal/pycore_code.h | 27 | ||||
-rw-r--r-- | Lib/test/test_dict_version.py | 6 | ||||
-rw-r--r-- | Makefile.pre.in | 1 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst | 3 | ||||
-rw-r--r-- | Objects/codeobject.c | 65 | ||||
-rw-r--r-- | Objects/dictobject.c | 25 | ||||
-rw-r--r-- | PCbuild/pythoncore.vcxproj | 1 | ||||
-rw-r--r-- | PCbuild/pythoncore.vcxproj.filters | 3 | ||||
-rw-r--r-- | Python/ceval.c | 143 | ||||
-rw-r--r-- | Python/pylifecycle.c | 3 |
13 files changed, 285 insertions, 16 deletions
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 9474a2f..da97a72 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -860,6 +860,10 @@ Optimizations methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`, :issue:`35582` and :issue:`36127`.) +* ``LOAD_GLOBAL`` instruction now uses new "per opcode cache" mechanism. + It is about 40% faster now. (Contributed by Yury Selivanov and Inada Naoki in + :issue:`26219`.) + Build and C API Changes ======================= diff --git a/Include/code.h b/Include/code.h index 933de97..b79d977 100644 --- a/Include/code.h +++ b/Include/code.h @@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT; # define _Py_OPARG(word) ((word) >> 8) #endif +typedef struct _PyOpcache _PyOpcache; + /* Bytecode object */ typedef struct { PyObject_HEAD @@ -49,6 +51,21 @@ typedef struct { Type is a void* to keep the format private in codeobject.c to force people to go through the proper APIs. */ void *co_extra; + + /* Per opcodes just-in-time cache + * + * To reduce cache size, we use indirect mapping from opcode index to + * cache object: + * cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1] + */ + + // co_opcache_map is indexed by (next_instr - first_instr). + // * 0 means there is no cache for this opcode. + // * n > 0 means there is cache in co_opcache[n-1]. + unsigned char *co_opcache_map; + _PyOpcache *co_opcache; + int co_opcache_flag; // used to determine when create a cache. + unsigned char co_opcache_size; // length of co_opcache. } PyCodeObject; /* Masks for co_flags above */ diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index d44afdf..2c6df9a 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc( PyAPI_FUNC(void) _PyEval_ReInitThreads( _PyRuntimeState *runtime); +/* Private function */ +void _PyEval_Fini(void); + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h new file mode 100644 index 0000000..88956f1 --- /dev/null +++ b/Include/internal/pycore_code.h @@ -0,0 +1,27 @@ +#ifndef Py_INTERNAL_CODE_H +#define Py_INTERNAL_CODE_H +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + PyObject *ptr; /* Cached pointer (borrowed reference) */ + uint64_t globals_ver; /* ma_version of global dict */ + uint64_t builtins_ver; /* ma_version of builtin dict */ +} _PyOpcache_LoadGlobal; + +struct _PyOpcache { + union { + _PyOpcache_LoadGlobal lg; + } u; + char optimized; +}; + +/* Private API */ +int _PyCode_InitOpcache(PyCodeObject *co); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_CODE_H */ diff --git a/Lib/test/test_dict_version.py b/Lib/test/test_dict_version.py index 5671f9f..cb79e74 100644 --- a/Lib/test/test_dict_version.py +++ b/Lib/test/test_dict_version.py @@ -80,14 +80,14 @@ class DictVersionTests(unittest.TestCase): # setting a key to the same value with dict.__setitem__ # must change the version - self.check_version_changed(d, d.__setitem__, 'key', value) + self.check_version_dont_change(d, d.__setitem__, 'key', value) # setting a key to the same value with dict.update # must change the version - self.check_version_changed(d, d.update, key=value) + self.check_version_dont_change(d, d.update, key=value) d2 = self.new_dict(key=value) - self.check_version_changed(d, d.update, d2) + self.check_version_dont_change(d, d.update, d2) def test_setitem_equal(self): class AlwaysEqual: diff --git a/Makefile.pre.in b/Makefile.pre.in index 0bf5df4..a0bc9c1 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_accu.h \ $(srcdir)/Include/internal/pycore_atomic.h \ $(srcdir)/Include/internal/pycore_ceval.h \ + $(srcdir)/Include/internal/pycore_code.h \ $(srcdir)/Include/internal/pycore_condvar.h \ $(srcdir)/Include/internal/pycore_context.h \ $(srcdir)/Include/internal/pycore_fileutils.h \ diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst b/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst new file mode 100644 index 0000000..a4ee7b5 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst @@ -0,0 +1,3 @@ +Implemented per opcode cache mechanism and ``LOAD_GLOBAL`` instruction use +it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov, +and Inada Naoki. diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 2333075..0d9e5d1 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2,7 +2,9 @@ #include "Python.h" #include "code.h" +#include "opcode.h" #include "structmember.h" +#include "pycore_code.h" #include "pycore_pystate.h" #include "pycore_tupleobject.h" #include "clinic/codeobject.c.h" @@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount, co->co_zombieframe = NULL; co->co_weakreflist = NULL; co->co_extra = NULL; + + co->co_opcache_map = NULL; + co->co_opcache = NULL; + co->co_opcache_flag = 0; + co->co_opcache_size = 0; return co; } +int +_PyCode_InitOpcache(PyCodeObject *co) +{ + Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT); + co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1); + if (co->co_opcache_map == NULL) { + return -1; + } + + _Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code); + Py_ssize_t opts = 0; + + for (Py_ssize_t i = 0; i < co_size;) { + unsigned char opcode = _Py_OPCODE(opcodes[i]); + i++; // 'i' is now aligned to (next_instr - first_instr) + + // TODO: LOAD_METHOD, LOAD_ATTR + if (opcode == LOAD_GLOBAL) { + co->co_opcache_map[i] = ++opts; + if (opts > 254) { + break; + } + } + } + + if (opts) { + co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache)); + if (co->co_opcache == NULL) { + PyMem_FREE(co->co_opcache_map); + return -1; + } + } + else { + PyMem_FREE(co->co_opcache_map); + co->co_opcache_map = NULL; + co->co_opcache = NULL; + } + + co->co_opcache_size = opts; + return 0; +} + PyCodeObject * PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno) { @@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw) static void code_dealloc(PyCodeObject *co) { + if (co->co_opcache != NULL) { + PyMem_FREE(co->co_opcache); + } + if (co->co_opcache_map != NULL) { + PyMem_FREE(co->co_opcache_map); + } + co->co_opcache_flag = 0; + co->co_opcache_size = 0; + if (co->co_extra != NULL) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); _PyCodeObjectExtra *co_extra = co->co_extra; @@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args)) res += sizeof(_PyCodeObjectExtra) + (co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]); } + if (co->co_opcache != NULL) { + assert(co->co_opcache_map != NULL); + // co_opcache_map + res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT); + // co_opcache + res += co->co_opcache_size * sizeof(_PyOpcache); + } return PyLong_FromSsize_t(res); } diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 2b04b0b..0cc1443 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -1080,20 +1080,21 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value) return 0; } - if (_PyDict_HasSplitTable(mp)) { - mp->ma_values[ix] = value; - if (old_value == NULL) { - /* pending state */ - assert(ix == mp->ma_used); - mp->ma_used++; + if (old_value != value) { + if (_PyDict_HasSplitTable(mp)) { + mp->ma_values[ix] = value; + if (old_value == NULL) { + /* pending state */ + assert(ix == mp->ma_used); + mp->ma_used++; + } } + else { + assert(old_value != NULL); + DK_ENTRIES(mp->ma_keys)[ix].me_value = value; + } + mp->ma_version_tag = DICT_NEXT_VERSION(); } - else { - assert(old_value != NULL); - DK_ENTRIES(mp->ma_keys)[ix].me_value = value; - } - - mp->ma_version_tag = DICT_NEXT_VERSION(); Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */ ASSERT_CONSISTENT(mp); Py_DECREF(key); diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 329f9fe..89625da 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -158,6 +158,7 @@ <ClInclude Include="..\Include\import.h" /> <ClInclude Include="..\Include\internal\pycore_accu.h" /> <ClInclude Include="..\Include\internal\pycore_atomic.h" /> + <ClInclude Include="..\Include\internal\pycore_code.h" /> <ClInclude Include="..\Include\internal\pycore_ceval.h" /> <ClInclude Include="..\Include\internal\pycore_condvar.h" /> <ClInclude Include="..\Include\internal\pycore_context.h" /> diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index d80d05f..63ab88b 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -177,6 +177,9 @@ <ClInclude Include="..\Include\internal\pycore_atomic.h"> <Filter>Include</Filter> </ClInclude> + <ClInclude Include="..\Include\internal\pycore_code.h"> + <Filter>Include</Filter> + </ClInclude> <ClInclude Include="..\Include\internal\pycore_ceval.h"> <Filter>Include</Filter> </ClInclude> diff --git a/Python/ceval.c b/Python/ceval.c index a092a23..411ba3d 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -11,6 +11,7 @@ #include "Python.h" #include "pycore_ceval.h" +#include "pycore_code.h" #include "pycore_object.h" #include "pycore_pyerrors.h" #include "pycore_pylifecycle.h" @@ -101,6 +102,20 @@ static long dxp[256]; #endif #endif +/* per opcode cache */ +#define OPCACHE_MIN_RUNS 1024 /* create opcache when code executed this time */ +#define OPCACHE_STATS 0 /* Enable stats */ + +#if OPCACHE_STATS +static size_t opcache_code_objects = 0; +static size_t opcache_code_objects_extra_mem = 0; + +static size_t opcache_global_opts = 0; +static size_t opcache_global_hits = 0; +static size_t opcache_global_misses = 0; +#endif + + /* This can set eval_breaker to 0 even though gil_drop_request became 1. We believe this is all right because the eval loop will release the GIL eventually anyway. */ @@ -226,6 +241,35 @@ exit_thread_if_finalizing(PyThreadState *tstate) } void +_PyEval_Fini(void) +{ +#if OPCACHE_STATS + fprintf(stderr, "-- Opcode cache number of objects = %zd\n", + opcache_code_objects); + + fprintf(stderr, "-- Opcode cache total extra mem = %zd\n", + opcache_code_objects_extra_mem); + + fprintf(stderr, "\n"); + + fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits = %zd (%d%%)\n", + opcache_global_hits, + (int) (100.0 * opcache_global_hits / + (opcache_global_hits + opcache_global_misses))); + + fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zd (%d%%)\n", + opcache_global_misses, + (int) (100.0 * opcache_global_misses / + (opcache_global_hits + opcache_global_misses))); + + fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts = %zd\n", + opcache_global_opts); + + fprintf(stderr, "\n"); +#endif +} + +void PyEval_AcquireLock(void) { _PyRuntimeState *runtime = &_PyRuntime; @@ -799,6 +843,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) const _Py_CODEUNIT *first_instr; PyObject *names; PyObject *consts; + _PyOpcache *co_opcache; #ifdef LLTRACE _Py_IDENTIFIER(__ltrace__); @@ -1061,6 +1106,49 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) Py_XDECREF(traceback); \ } while(0) + /* macros for opcode cache */ +#define OPCACHE_CHECK() \ + do { \ + co_opcache = NULL; \ + if (co->co_opcache != NULL) { \ + unsigned char co_opt_offset = \ + co->co_opcache_map[next_instr - first_instr]; \ + if (co_opt_offset > 0) { \ + assert(co_opt_offset <= co->co_opcache_size); \ + co_opcache = &co->co_opcache[co_opt_offset - 1]; \ + assert(co_opcache != NULL); \ + if (co_opcache->optimized < 0) { \ + co_opcache = NULL; \ + } \ + } \ + } \ + } while (0) + +#if OPCACHE_STATS + +#define OPCACHE_STAT_GLOBAL_HIT() \ + do { \ + if (co->co_opcache != NULL) opcache_global_hits++; \ + } while (0) + +#define OPCACHE_STAT_GLOBAL_MISS() \ + do { \ + if (co->co_opcache != NULL) opcache_global_misses++; \ + } while (0) + +#define OPCACHE_STAT_GLOBAL_OPT() \ + do { \ + if (co->co_opcache != NULL) opcache_global_opts++; \ + } while (0) + +#else /* OPCACHE_STATS */ + +#define OPCACHE_STAT_GLOBAL_HIT() +#define OPCACHE_STAT_GLOBAL_MISS() +#define OPCACHE_STAT_GLOBAL_OPT() + +#endif + /* Start of code */ /* push frame */ @@ -1142,6 +1230,20 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */ f->f_executing = 1; + if (co->co_opcache_flag < OPCACHE_MIN_RUNS) { + co->co_opcache_flag++; + if (co->co_opcache_flag == OPCACHE_MIN_RUNS) { + if (_PyCode_InitOpcache(co) < 0) { + return NULL; + } +#if OPCACHE_STATS + opcache_code_objects_extra_mem += + PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) + + sizeof(_PyOpcache) * co->co_opcache_size; + opcache_code_objects++; +#endif + } + } #ifdef LLTRACE lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL; @@ -2451,11 +2553,30 @@ main_loop: } case TARGET(LOAD_GLOBAL): { - PyObject *name = GETITEM(names, oparg); + PyObject *name; PyObject *v; if (PyDict_CheckExact(f->f_globals) && PyDict_CheckExact(f->f_builtins)) { + OPCACHE_CHECK(); + if (co_opcache != NULL && co_opcache->optimized > 0) { + _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg; + + if (lg->globals_ver == + ((PyDictObject *)f->f_globals)->ma_version_tag + && lg->builtins_ver == + ((PyDictObject *)f->f_builtins)->ma_version_tag) + { + PyObject *ptr = lg->ptr; + OPCACHE_STAT_GLOBAL_HIT(); + assert(ptr != NULL); + Py_INCREF(ptr); + PUSH(ptr); + DISPATCH(); + } + } + + name = GETITEM(names, oparg); v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals, (PyDictObject *)f->f_builtins, name); @@ -2468,12 +2589,32 @@ main_loop: } goto error; } + + if (co_opcache != NULL) { + _PyOpcache_LoadGlobal *lg = &co_opcache->u.lg; + + if (co_opcache->optimized == 0) { + /* Wasn't optimized before. */ + OPCACHE_STAT_GLOBAL_OPT(); + } else { + OPCACHE_STAT_GLOBAL_MISS(); + } + + co_opcache->optimized = 1; + lg->globals_ver = + ((PyDictObject *)f->f_globals)->ma_version_tag; + lg->builtins_ver = + ((PyDictObject *)f->f_builtins)->ma_version_tag; + lg->ptr = v; /* borrowed */ + } + Py_INCREF(v); } else { /* Slow-path if globals or builtins is not a dict */ /* namespace 1: globals */ + name = GETITEM(names, oparg); v = PyObject_GetItem(f->f_globals, name); if (v == NULL) { if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 3de5528..fc7e551 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1242,6 +1242,9 @@ Py_FinalizeEx(void) /* Destroy all modules */ PyImport_Cleanup(); + /* Print debug stats if any */ + _PyEval_Fini(); + /* Flush sys.stdout and sys.stderr (again, in case more was printed) */ if (flush_std_files() < 0) { status = -1; |