author | Guido van Rossum <guido@python.org> | 2024-04-04 15:03:27 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-04 15:03:27 (GMT) |
commit | 060a96f1a9a901b01ed304aa82b886d248ca1cb6 | |
tree | cb3e95ecac1f90440b7d3752c4aad015ea734bf0 | |
parent | 63bbe77d9bb2be4db83ed09b96dd22f2a44ef55b | |
gh-116968: Reimplement Tier 2 counters (#117144)
Introduce a unified 16-bit backoff counter type (``_Py_BackoffCounter``),
shared between the Tier 1 adaptive specializer and the Tier 2 optimizer. The
API used for adaptive specialization counters is changed but the behavior is
(supposed to be) identical.
The behavior of the Tier 2 counters is changed:
- There are no longer dynamic thresholds (we never varied these).
- All counters now use the same exponential backoff.
- The counter for ``JUMP_BACKWARD`` starts counting down from 16.
- The ``temperature`` in side exits starts counting down from 64.
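
The counter mechanics described above are easiest to see in isolation. Below is a minimal standalone sketch of the new counter type and its helpers, adapted from the ``Include/internal/pycore_backoff.h`` added by this patch; the names (``BackoffCounter``, ``make_counter``, and so on) are illustrative stand-ins rather than the CPython identifiers, and the demo loop only shows how the retry interval roughly doubles after each failed attempt.

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 16-bit counter: a 12-bit countdown value plus a 4-bit backoff exponent,
 * mirroring the layout of _Py_BackoffCounter in Include/cpython/code.h. */
typedef struct {
    union {
        struct {
            uint16_t backoff : 4;
            uint16_t value : 12;
        };
        uint16_t as_counter;    /* the raw 16-bit cache entry */
    };
} BackoffCounter;

static BackoffCounter
make_counter(uint16_t value, uint16_t backoff)
{
    assert(backoff <= 15);
    assert(value <= 0xFFF);
    return (BackoffCounter){.value = value, .backoff = backoff};
}

/* On failure (failed specialization or failed trace creation), bump the
 * exponent (capped at 12) and reload the value with 2**backoff - 1. */
static BackoffCounter
restart_counter(BackoffCounter c)
{
    if (c.backoff < 12) {
        return make_counter((1 << (c.backoff + 1)) - 1, c.backoff + 1);
    }
    return make_counter((1 << 12) - 1, 12);
}

/* Count down by one; the counter "triggers" when the value reaches zero. */
static BackoffCounter
advance_counter(BackoffCounter c)
{
    return make_counter((c.value - 1) & 0xFFF, c.backoff);
}

static bool
counter_triggers(BackoffCounter c)
{
    return c.value == 0;
}

int
main(void)
{
    /* JUMP_BACKWARD starts at value 16 with backoff 4; each failed attempt
     * roughly doubles the wait: 16, 31, 63, 127, ... up to 4095. */
    BackoffCounter c = make_counter(16, 4);
    for (int attempt = 0; attempt < 6; attempt++) {
        int executions = 0;
        while (!counter_triggers(c)) {
            c = advance_counter(c);
            executions++;
        }
        printf("attempt %d triggered after %d executions\n", attempt, executions);
        c = restart_counter(c);    /* pretend the attempt failed */
    }
    return 0;
}
```

Because the interpreter only ever advances, pauses, or restarts the counter itself, the per-interpreter threshold fields removed below in ``pycore_interp.h`` and ``cpython/optimizer.h`` are no longer needed.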
-rw-r--r-- | Include/cpython/code.h | 11
-rw-r--r-- | Include/cpython/optimizer.h | 15
-rw-r--r-- | Include/internal/pycore_backoff.h | 128
-rw-r--r-- | Include/internal/pycore_code.h | 64
-rw-r--r-- | Include/internal/pycore_interp.h | 6
-rw-r--r-- | Lib/test/test_capi/test_opt.py | 16
-rw-r--r-- | Makefile.pre.in | 1
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2024-04-03-13-44-04.gh-issue-116968.zgcdG2.rst | 11
-rw-r--r-- | Modules/_testinternalcapi.c | 6
-rw-r--r-- | PCbuild/pythoncore.vcxproj | 1
-rw-r--r-- | Python/bytecodes.c | 101
-rw-r--r-- | Python/ceval.c | 5
-rw-r--r-- | Python/ceval_macros.h | 31
-rw-r--r-- | Python/executor_cases.c.h | 11
-rw-r--r-- | Python/generated_cases.c.h | 87
-rw-r--r-- | Python/instrumentation.c | 8
-rw-r--r-- | Python/optimizer.c | 37
-rw-r--r-- | Python/specialize.c | 8
-rw-r--r-- | Tools/jit/template.c | 1
19 files changed, 313 insertions, 235 deletions
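
In the diff that follows, the first 16-bit entry of every inline cache changes type from ``uint16_t`` to ``_Py_BackoffCounter``, while generated Tier 1 code still reads that entry as a plain integer; ``ADAPTIVE_COUNTER_TRIGGERS`` in ``Python/ceval_macros.h`` therefore takes a ``uint16_t`` and reinterprets it via ``forge_backoff_counter()``. The snippet below is an illustrative, self-contained equivalent of that packing, assuming the usual bit-field layout in which the 4 backoff bits occupy the low bits and the 12-bit value the high bits (the same packing the old adaptive-counter macros used).

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BACKOFF_BITS 4   /* low 4 bits: exponent; high 12 bits: countdown value */

/* Pack value/backoff into a raw cache entry, like the old
 * adaptive_counter_bits() did and like the new bit-field layout. */
static uint16_t
pack_counter(uint16_t value, uint16_t backoff)
{
    return (uint16_t)((value << BACKOFF_BITS) | (backoff & 0xF));
}

/* Trigger test on the raw entry: the 12-bit value has reached zero. */
static bool
raw_counter_triggers(uint16_t raw)
{
    return (raw >> BACKOFF_BITS) == 0;
}

int
main(void)
{
    /* ADAPTIVE_COOLDOWN encoding from the patch: value 52, backoff 0. */
    uint16_t cooldown = pack_counter(52, 0);
    printf("cooldown: raw=0x%04x value=%u backoff=%u triggers=%d\n",
           (unsigned)cooldown,
           (unsigned)(cooldown >> BACKOFF_BITS),
           (unsigned)(cooldown & 0xF),
           raw_counter_triggers(cooldown));
    return 0;
}
```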
diff --git a/Include/cpython/code.h b/Include/cpython/code.h index d5dac17..b0e226e 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -24,6 +24,16 @@ typedef struct _Py_GlobalMonitors { uint8_t tools[_PY_MONITORING_UNGROUPED_EVENTS]; } _Py_GlobalMonitors; +typedef struct { + union { + struct { + uint16_t backoff : 4; + uint16_t value : 12; + }; + uint16_t as_counter; // For printf("%#x", ...) + }; +} _Py_BackoffCounter; + /* Each instruction in a code object is a fixed-width value, * currently 2 bytes: 1-byte opcode + 1-byte oparg. The EXTENDED_ARG * opcode allows for larger values but the current limit is 3 uses @@ -39,6 +49,7 @@ typedef union { uint8_t code; uint8_t arg; } op; + _Py_BackoffCounter counter; // First cache entry of specializable op } _Py_CODEUNIT; diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index bc960c5..819251a 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -89,7 +89,7 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst) typedef struct _exit_data { uint32_t target; - int16_t temperature; + _Py_BackoffCounter temperature; const struct _PyExecutorObject *executor; } _PyExitData; @@ -115,11 +115,6 @@ typedef int (*optimize_func)( struct _PyOptimizerObject { PyObject_HEAD optimize_func optimize; - /* These thresholds are treated as signed so do not exceed INT16_MAX - * Use INT16_MAX to indicate that the optimizer should never be called */ - uint16_t resume_threshold; - uint16_t side_threshold; - uint16_t backedge_threshold; /* Data needed by the optimizer goes here, but is opaque to the VM */ }; @@ -151,14 +146,6 @@ extern void _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_inval PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewCounter(void); PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); -#define OPTIMIZER_BITS_IN_COUNTER 4 -/* Minimum of 16 additional executions before retry */ -#define MIN_TIER2_BACKOFF 4 -#define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER) -#define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1) -/* A value <= UINT16_MAX but large enough that when shifted is > UINT16_MAX */ -#define OPTIMIZER_UNREACHABLE_THRESHOLD UINT16_MAX - #define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3 #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6 diff --git a/Include/internal/pycore_backoff.h b/Include/internal/pycore_backoff.h new file mode 100644 index 0000000..5d93c88 --- /dev/null +++ b/Include/internal/pycore_backoff.h @@ -0,0 +1,128 @@ + +#ifndef Py_INTERNAL_BACKOFF_H +#define Py_INTERNAL_BACKOFF_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> + +/* 16-bit countdown counters using exponential backoff. + + These are used by the adaptive specializer to count down until + it is time to specialize an instruction. If specialization fails + the counter is reset using exponential backoff. + + Another use is for the Tier 2 optimizer to decide when to create + a new Tier 2 trace (executor). Again, exponential backoff is used. + + The 16-bit counter is structured as a 12-bit unsigned 'value' + and a 4-bit 'backoff' field. When resetting the counter, the + backoff field is incremented (until it reaches a limit) and the + value is set to a bit mask representing the value 2**backoff - 1. + The maximum backoff is 12 (the number of value bits). 
+ + There is an exceptional value which must not be updated, 0xFFFF. +*/ + +#define UNREACHABLE_BACKOFF 0xFFFF + +static inline bool +is_unreachable_backoff_counter(_Py_BackoffCounter counter) +{ + return counter.as_counter == UNREACHABLE_BACKOFF; +} + +static inline _Py_BackoffCounter +make_backoff_counter(uint16_t value, uint16_t backoff) +{ + assert(backoff <= 15); + assert(value <= 0xFFF); + return (_Py_BackoffCounter){.value = value, .backoff = backoff}; +} + +static inline _Py_BackoffCounter +forge_backoff_counter(uint16_t counter) +{ + return (_Py_BackoffCounter){.as_counter = counter}; +} + +static inline _Py_BackoffCounter +restart_backoff_counter(_Py_BackoffCounter counter) +{ + assert(!is_unreachable_backoff_counter(counter)); + if (counter.backoff < 12) { + return make_backoff_counter((1 << (counter.backoff + 1)) - 1, counter.backoff + 1); + } + else { + return make_backoff_counter((1 << 12) - 1, 12); + } +} + +static inline _Py_BackoffCounter +pause_backoff_counter(_Py_BackoffCounter counter) +{ + return make_backoff_counter(counter.value | 1, counter.backoff); +} + +static inline _Py_BackoffCounter +advance_backoff_counter(_Py_BackoffCounter counter) +{ + if (!is_unreachable_backoff_counter(counter)) { + return make_backoff_counter((counter.value - 1) & 0xFFF, counter.backoff); + } + else { + return counter; + } +} + +static inline bool +backoff_counter_triggers(_Py_BackoffCounter counter) +{ + return counter.value == 0; +} + +/* Initial JUMP_BACKWARD counter. + * This determines when we create a trace for a loop. +* Backoff sequence 16, 32, 64, 128, 256, 512, 1024, 2048, 4096. */ +#define JUMP_BACKWARD_INITIAL_VALUE 16 +#define JUMP_BACKWARD_INITIAL_BACKOFF 4 +static inline _Py_BackoffCounter +initial_jump_backoff_counter(void) +{ + return make_backoff_counter(JUMP_BACKWARD_INITIAL_VALUE, + JUMP_BACKWARD_INITIAL_BACKOFF); +} + +/* Initial exit temperature. + * Must be larger than ADAPTIVE_COOLDOWN_VALUE, + * otherwise when a side exit warms up we may construct + * a new trace before the Tier 1 code has properly re-specialized. + * Backoff sequence 64, 128, 256, 512, 1024, 2048, 4096. */ +#define COLD_EXIT_INITIAL_VALUE 64 +#define COLD_EXIT_INITIAL_BACKOFF 6 + +static inline _Py_BackoffCounter +initial_temperature_backoff_counter(void) +{ + return make_backoff_counter(COLD_EXIT_INITIAL_VALUE, + COLD_EXIT_INITIAL_BACKOFF); +} + +/* Unreachable backoff counter. 
*/ +static inline _Py_BackoffCounter +initial_unreachable_backoff_counter(void) +{ + return forge_backoff_counter(UNREACHABLE_BACKOFF); +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_BACKOFF_H */ diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 6c90c9e..688051b 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -31,7 +31,7 @@ extern "C" { #define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT)) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; uint16_t module_keys_version; uint16_t builtin_keys_version; uint16_t index; @@ -40,44 +40,44 @@ typedef struct { #define INLINE_CACHE_ENTRIES_LOAD_GLOBAL CACHE_ENTRIES(_PyLoadGlobalCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyBinaryOpCache; #define INLINE_CACHE_ENTRIES_BINARY_OP CACHE_ENTRIES(_PyBinaryOpCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyUnpackSequenceCache; #define INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE \ CACHE_ENTRIES(_PyUnpackSequenceCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyCompareOpCache; #define INLINE_CACHE_ENTRIES_COMPARE_OP CACHE_ENTRIES(_PyCompareOpCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyBinarySubscrCache; #define INLINE_CACHE_ENTRIES_BINARY_SUBSCR CACHE_ENTRIES(_PyBinarySubscrCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PySuperAttrCache; #define INLINE_CACHE_ENTRIES_LOAD_SUPER_ATTR CACHE_ENTRIES(_PySuperAttrCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; uint16_t version[2]; uint16_t index; } _PyAttrCache; typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; uint16_t type_version[2]; union { uint16_t keys_version[2]; @@ -93,39 +93,39 @@ typedef struct { #define INLINE_CACHE_ENTRIES_STORE_ATTR CACHE_ENTRIES(_PyAttrCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; uint16_t func_version[2]; } _PyCallCache; #define INLINE_CACHE_ENTRIES_CALL CACHE_ENTRIES(_PyCallCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyStoreSubscrCache; #define INLINE_CACHE_ENTRIES_STORE_SUBSCR CACHE_ENTRIES(_PyStoreSubscrCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyForIterCache; #define INLINE_CACHE_ENTRIES_FOR_ITER CACHE_ENTRIES(_PyForIterCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PySendCache; #define INLINE_CACHE_ENTRIES_SEND CACHE_ENTRIES(_PySendCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; uint16_t version[2]; } _PyToBoolCache; #define INLINE_CACHE_ENTRIES_TO_BOOL CACHE_ENTRIES(_PyToBoolCache) typedef struct { - uint16_t counter; + _Py_BackoffCounter counter; } _PyContainsOpCache; #define INLINE_CACHE_ENTRIES_CONTAINS_OP CACHE_ENTRIES(_PyContainsOpCache) @@ -451,18 +451,14 @@ write_location_entry_start(uint8_t *ptr, int code, int length) /** Counters * The first 16-bit value in each inline cache is a counter. - * When counting misses, the counter is treated as a simple unsigned value. * * When counting executions until the next specialization attempt, * exponential backoff is used to reduce the number of specialization failures. - * The high 12 bits store the counter, the low 4 bits store the backoff exponent. - * On a specialization failure, the backoff exponent is incremented and the - * counter set to (2**backoff - 1). 
- * Backoff == 6 -> starting counter == 63, backoff == 10 -> starting counter == 1023. + * See pycore_backoff.h for more details. + * On a specialization failure, the backoff counter is restarted. */ -/* With a 16-bit counter, we have 12 bits for the counter value, and 4 bits for the backoff */ -#define ADAPTIVE_BACKOFF_BITS 4 +#include "pycore_backoff.h" // A value of 1 means that we attempt to specialize the *second* time each // instruction is executed. Executing twice is a much better indicator of @@ -480,36 +476,30 @@ write_location_entry_start(uint8_t *ptr, int code, int length) #define ADAPTIVE_COOLDOWN_VALUE 52 #define ADAPTIVE_COOLDOWN_BACKOFF 0 -#define MAX_BACKOFF_VALUE (16 - ADAPTIVE_BACKOFF_BITS) +// Can't assert this in pycore_backoff.h because of header order dependencies +static_assert(COLD_EXIT_INITIAL_VALUE > ADAPTIVE_COOLDOWN_VALUE, + "Cold exit value should be larger than adaptive cooldown value"); - -static inline uint16_t +static inline _Py_BackoffCounter adaptive_counter_bits(uint16_t value, uint16_t backoff) { - return ((value << ADAPTIVE_BACKOFF_BITS) - | (backoff & ((1 << ADAPTIVE_BACKOFF_BITS) - 1))); + return make_backoff_counter(value, backoff); } -static inline uint16_t +static inline _Py_BackoffCounter adaptive_counter_warmup(void) { return adaptive_counter_bits(ADAPTIVE_WARMUP_VALUE, ADAPTIVE_WARMUP_BACKOFF); } -static inline uint16_t +static inline _Py_BackoffCounter adaptive_counter_cooldown(void) { return adaptive_counter_bits(ADAPTIVE_COOLDOWN_VALUE, ADAPTIVE_COOLDOWN_BACKOFF); } -static inline uint16_t -adaptive_counter_backoff(uint16_t counter) { - uint16_t backoff = counter & ((1 << ADAPTIVE_BACKOFF_BITS) - 1); - backoff++; - if (backoff > MAX_BACKOFF_VALUE) { - backoff = MAX_BACKOFF_VALUE; - } - uint16_t value = (uint16_t)(1 << backoff) - 1; - return adaptive_counter_bits(value, backoff); +static inline _Py_BackoffCounter +adaptive_counter_backoff(_Py_BackoffCounter counter) { + return restart_backoff_counter(counter); } diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index b8d0fdc..b5cea86 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -239,12 +239,6 @@ struct _is { _PyOptimizerObject *optimizer; _PyExecutorObject *executor_list_head; - /* These two values are shifted and offset to speed up check in JUMP_BACKWARD */ - uint32_t optimizer_resume_threshold; - uint32_t optimizer_backedge_threshold; - - uint16_t optimizer_side_threshold; - _rare_events rare_events; PyDict_WatchCallback builtins_dict_watcher; diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index ceb49c3..7ca0f69 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -10,6 +10,7 @@ import _testinternalcapi from test.support import script_helper, requires_specialization +from _testinternalcapi import TIER2_THRESHOLD @contextlib.contextmanager def temporary_optimizer(opt): @@ -69,7 +70,8 @@ class TestOptimizerAPI(unittest.TestCase): self.assertEqual(opt.get_count(), 0) with clear_executors(loop): loop() - self.assertEqual(opt.get_count(), 1000) + # Subtract because optimizer doesn't kick in sooner + self.assertEqual(opt.get_count(), 1000 - TIER2_THRESHOLD) def test_long_loop(self): "Check that we aren't confused by EXTENDED_ARG" @@ -81,7 +83,7 @@ class TestOptimizerAPI(unittest.TestCase): pass def long_loop(): - for _ in range(10): + for _ in range(20): nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); 
nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); @@ -96,7 +98,7 @@ class TestOptimizerAPI(unittest.TestCase): with temporary_optimizer(opt): self.assertEqual(opt.get_count(), 0) long_loop() - self.assertEqual(opt.get_count(), 10) + self.assertEqual(opt.get_count(), 20 - TIER2_THRESHOLD) # Need iterations to warm up def test_code_restore_for_ENTER_EXECUTOR(self): def testfunc(x): @@ -932,10 +934,10 @@ class TestUopsOptimization(unittest.TestCase): exec(src, ns, ns) testfunc = ns['testfunc'] ns['_test_global'] = 0 - _, ex = self._run_with_optimizer(testfunc, 16) + _, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) self.assertIsNone(ex) ns['_test_global'] = 1 - _, ex = self._run_with_optimizer(testfunc, 16) + _, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) self.assertIsNotNone(ex) uops = get_opnames(ex) self.assertNotIn("_GUARD_BOTH_INT", uops) @@ -946,10 +948,10 @@ class TestUopsOptimization(unittest.TestCase): exec(src, ns, ns) testfunc = ns['testfunc'] ns['_test_global'] = 0 - _, ex = self._run_with_optimizer(testfunc, 16) + _, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) self.assertIsNone(ex) ns['_test_global'] = 3.14 - _, ex = self._run_with_optimizer(testfunc, 16) + _, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) self.assertIsNone(ex) def test_combine_stack_space_checks_sequential(self): diff --git a/Makefile.pre.in b/Makefile.pre.in index 2a22a1e..84058ac 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1124,6 +1124,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_ast.h \ $(srcdir)/Include/internal/pycore_ast_state.h \ $(srcdir)/Include/internal/pycore_atexit.h \ + $(srcdir)/Include/internal/pycore_backoff.h \ $(srcdir)/Include/internal/pycore_bitutils.h \ $(srcdir)/Include/internal/pycore_blocks_output_buffer.h \ $(srcdir)/Include/internal/pycore_brc.h \ diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-03-13-44-04.gh-issue-116968.zgcdG2.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-03-13-44-04.gh-issue-116968.zgcdG2.rst new file mode 100644 index 0000000..dc5beee --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-03-13-44-04.gh-issue-116968.zgcdG2.rst @@ -0,0 +1,11 @@ +Introduce a unified 16-bit backoff counter type (``_Py_BackoffCounter``), +shared between the Tier 1 adaptive specializer and the Tier 2 optimizer. The +API used for adaptive specialization counters is changed but the behavior is +(supposed to be) identical. + +The behavior of the Tier 2 counters is changed: + +* There are no longer dynamic thresholds (we never varied these). +* All counters now use the same exponential backoff. +* The counter for ``JUMP_BACKWARD`` starts counting down from 16. +* The ``temperature`` in side exits starts counting down from 64. 
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 6b5d99f..758e88e 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -10,6 +10,7 @@ #undef NDEBUG #include "Python.h" +#include "pycore_backoff.h" // JUMP_BACKWARD_INITIAL_VALUE #include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_bytesobject.h" // _PyBytes_Find() #include "pycore_ceval.h" // _PyEval_AddPendingCall() @@ -1819,6 +1820,11 @@ module_exec(PyObject *module) return 1; } + if (PyModule_Add(module, "TIER2_THRESHOLD", + PyLong_FromLong(JUMP_BACKWARD_INITIAL_VALUE)) < 0) { + return 1; + } + return 0; } diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 657ffd1..827c9f0 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -204,6 +204,7 @@ <ClInclude Include="..\Include\internal\pycore_ast.h" /> <ClInclude Include="..\Include\internal\pycore_ast_state.h" /> <ClInclude Include="..\Include\internal\pycore_atexit.h" /> + <ClInclude Include="..\Include\internal\pycore_backoff.h" /> <ClInclude Include="..\Include\internal\pycore_bitutils.h" /> <ClInclude Include="..\Include\internal\pycore_brc.h" /> <ClInclude Include="..\Include\internal\pycore_bytes_methods.h" /> diff --git a/Python/bytecodes.c b/Python/bytecodes.c index fa53c96..8af48d9 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -8,6 +8,7 @@ #include "Python.h" #include "pycore_abstract.h" // _PyIndex_Check() +#include "pycore_backoff.h" #include "pycore_cell.h" // PyCell_GetRef() #include "pycore_code.h" #include "pycore_emscripten_signal.h" // _Py_CHECK_EMSCRIPTEN_SIGNALS @@ -326,13 +327,13 @@ dummy_func( specializing op(_SPECIALIZE_TO_BOOL, (counter/1, value -- value)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ToBool(value, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(TO_BOOL, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -551,13 +552,13 @@ dummy_func( specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinarySubscr(container, sub, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(BINARY_SUBSCR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -698,13 +699,13 @@ dummy_func( specializing op(_SPECIALIZE_STORE_SUBSCR, (counter/1, container, sub -- container, sub)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_StoreSubscr(container, sub, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(STORE_SUBSCR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -982,13 +983,13 @@ dummy_func( specializing op(_SPECIALIZE_SEND, (counter/1, receiver, unused -- receiver, unused)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Send(receiver, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(SEND, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + 
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -1211,13 +1212,13 @@ dummy_func( specializing op(_SPECIALIZE_UNPACK_SEQUENCE, (counter/1, seq -- seq)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_UnpackSequence(seq, next_instr, oparg); DISPATCH_SAME_OPARG(); } STAT_INC(UNPACK_SEQUENCE, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ (void)seq; (void)counter; @@ -1280,14 +1281,14 @@ dummy_func( specializing op(_SPECIALIZE_STORE_ATTR, (counter/1, owner -- owner)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); next_instr = this_instr; _Py_Specialize_StoreAttr(owner, next_instr, name); DISPATCH_SAME_OPARG(); } STAT_INC(STORE_ATTR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -1398,14 +1399,14 @@ dummy_func( specializing op(_SPECIALIZE_LOAD_GLOBAL, (counter/1 -- )) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; _Py_Specialize_LoadGlobal(GLOBALS(), BUILTINS(), next_instr, name); DISPATCH_SAME_OPARG(); } STAT_INC(LOAD_GLOBAL, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -1711,7 +1712,7 @@ dummy_func( inst(INSTRUMENTED_LOAD_SUPER_ATTR, (unused/1, unused, unused, unused -- unused, unused if (oparg & 1))) { // cancel out the decrement that will happen in LOAD_SUPER_ATTR; we // don't want to specialize instrumented instructions - INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); GO_TO_INSTRUCTION(LOAD_SUPER_ATTR); } @@ -1723,13 +1724,13 @@ dummy_func( specializing op(_SPECIALIZE_LOAD_SUPER_ATTR, (counter/1, global_super, class, unused -- global_super, class, unused)) { #if ENABLE_SPECIALIZATION int load_method = oparg & 1; - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_LoadSuperAttr(global_super, class, next_instr, load_method); DISPATCH_SAME_OPARG(); } STAT_INC(LOAD_SUPER_ATTR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -1836,14 +1837,14 @@ dummy_func( specializing op(_SPECIALIZE_LOAD_ATTR, (counter/1, owner -- owner)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; _Py_Specialize_LoadAttr(owner, next_instr, name); DISPATCH_SAME_OPARG(); } STAT_INC(LOAD_ATTR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -2157,13 +2158,13 @@ dummy_func( specializing op(_SPECIALIZE_COMPARE_OP, (counter/1, left, right -- left, right)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CompareOp(left, right, next_instr, 
oparg); DISPATCH_SAME_OPARG(); } STAT_INC(COMPARE_OP, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -2254,13 +2255,13 @@ dummy_func( specializing op(_SPECIALIZE_CONTAINS_OP, (counter/1, left, right -- left, right)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ContainsOp(right, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(CONTAINS_OP, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -2340,16 +2341,8 @@ dummy_func( assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #if ENABLE_SPECIALIZATION - uint16_t counter = this_instr[1].cache; - this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER); - /* We are using unsigned values, but we really want signed values, so - * do the 2s complement adjustment manually */ - uint32_t offset_counter = counter ^ (1 << 15); - uint32_t threshold = tstate->interp->optimizer_backedge_threshold; - assert((threshold & OPTIMIZER_BITS_MASK) == 0); - // Use '>=' not '>' so that the optimizer/backoff bits do not effect the result. - // Double-check that the opcode isn't instrumented or something: - if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) { + _Py_BackoffCounter counter = this_instr[1].counter; + if (backoff_counter_triggers(counter) && this_instr->op.code == JUMP_BACKWARD) { _Py_CODEUNIT *start = this_instr; /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */ while (oparg > 255) { @@ -2365,17 +2358,12 @@ dummy_func( GOTO_TIER_TWO(executor); } else { - int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK; - backoff++; - if (backoff < MIN_TIER2_BACKOFF) { - backoff = MIN_TIER2_BACKOFF; - } - else if (backoff > MAX_TIER2_BACKOFF) { - backoff = MAX_TIER2_BACKOFF; - } - this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff; + this_instr[1].counter = restart_backoff_counter(counter); } } + else { + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); + } #endif /* ENABLE_SPECIALIZATION */ } @@ -2535,13 +2523,13 @@ dummy_func( specializing op(_SPECIALIZE_FOR_ITER, (counter/1, iter -- iter)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ForIter(iter, next_instr, oparg); DISPATCH_SAME_OPARG(); } STAT_INC(FOR_ITER, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ -3001,7 +2989,7 @@ dummy_func( tstate, PY_MONITORING_EVENT_CALL, frame, this_instr, function, arg); ERROR_IF(err, error); - INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); GO_TO_INSTRUCTION(CALL); } @@ -3030,13 +3018,13 @@ dummy_func( specializing op(_SPECIALIZE_CALL, (counter/1, callable, self_or_null, args[oparg] -- callable, self_or_null, args[oparg])) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Call(callable, next_instr, oparg + (self_or_null != NULL)); DISPATCH_SAME_OPARG(); } STAT_INC(CALL, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } @@ 
-3933,13 +3921,13 @@ dummy_func( specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) { #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); DISPATCH_SAME_OPARG(); } STAT_INC(BINARY_OP, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); @@ -3965,7 +3953,7 @@ dummy_func( ERROR_IF(next_opcode < 0, error); next_instr = this_instr; if (_PyOpcode_Caches[next_opcode]) { - INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + PAUSE_ADAPTIVE_COUNTER(next_instr[1].counter); } assert(next_opcode > 0 && next_opcode < 256); opcode = next_opcode; @@ -4157,21 +4145,22 @@ dummy_func( tier2 op(_COLD_EXIT, (--)) { _PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor; _PyExitData *exit = &previous->exits[oparg]; - exit->temperature++; PyCodeObject *code = _PyFrame_GetCode(frame); _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; - if (exit->temperature < (int32_t)tstate->interp->optimizer_side_threshold) { + _Py_BackoffCounter temperature = exit->temperature; + if (!backoff_counter_triggers(temperature)) { + exit->temperature = advance_backoff_counter(temperature); GOTO_TIER_ONE(target); } _PyExecutorObject *executor; if (target->op.code == ENTER_EXECUTOR) { executor = code->co_executors->executors[target->op.arg]; Py_INCREF(executor); - } else { + } + else { int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); if (optimized <= 0) { - int32_t new_temp = -1 * tstate->interp->optimizer_side_threshold; - exit->temperature = (new_temp < INT16_MIN) ? INT16_MIN : new_temp; + exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { Py_DECREF(previous); tstate->previous_executor = Py_None; @@ -4181,7 +4170,7 @@ dummy_func( } } /* We need two references. One to store in exit->executor and - * one to keep the executor alive when executing. */ + * one to keep the executor alive when executing. */ Py_INCREF(executor); exit->executor = executor; GOTO_TIER_TWO(executor); diff --git a/Python/ceval.c b/Python/ceval.c index f3b7316..57ae08e 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -4,6 +4,7 @@ #include "Python.h" #include "pycore_abstract.h" // _PyIndex_Check() +#include "pycore_backoff.h" #include "pycore_call.h" // _PyObject_CallNoArgs() #include "pycore_cell.h" // PyCell_GetRef() #include "pycore_ceval.h" @@ -822,7 +823,7 @@ resume_frame: _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(next_instr+1); /* Prevent the underlying instruction from specializing * and overwriting the instrumentation. 
*/ - INCREMENT_ADAPTIVE_COUNTER(cache->counter); + PAUSE_ADAPTIVE_COUNTER(cache->counter); } opcode = original_opcode; DISPATCH_GOTO(); @@ -1099,7 +1100,7 @@ exit_to_trace: printf("SIDE EXIT: [UOp "); _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", - exit_index, exit->temperature, exit->target, + exit_index, exit->temperature.as_counter, exit->target, _PyOpcode_OpName[_PyCode_CODE(_PyFrame_GetCode(frame))[exit->target].op.code]); } #endif diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 1194c11..224cd1d 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -262,7 +262,7 @@ GETITEM(PyObject *v, Py_ssize_t i) { STAT_INC(opcode, miss); \ STAT_INC((INSTNAME), miss); \ /* The counter is always the first cache entry: */ \ - if (ADAPTIVE_COUNTER_IS_ZERO(next_instr->cache)) { \ + if (ADAPTIVE_COUNTER_TRIGGERS(next_instr->cache)) { \ STAT_INC((INSTNAME), deopt); \ } \ } while (0) @@ -290,29 +290,28 @@ GETITEM(PyObject *v, Py_ssize_t i) { dtrace_function_entry(frame); \ } -#define ADAPTIVE_COUNTER_IS_ZERO(COUNTER) \ - (((COUNTER) >> ADAPTIVE_BACKOFF_BITS) == 0) - -#define ADAPTIVE_COUNTER_IS_MAX(COUNTER) \ - (((COUNTER) >> ADAPTIVE_BACKOFF_BITS) == ((1 << MAX_BACKOFF_VALUE) - 1)) +/* This takes a uint16_t instead of a _Py_BackoffCounter, + * because it is used directly on the cache entry in generated code, + * which is always an integral type. */ +#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \ + backoff_counter_triggers(forge_backoff_counter((COUNTER))) #ifdef Py_GIL_DISABLED -#define DECREMENT_ADAPTIVE_COUNTER(COUNTER) \ - do { \ - /* gh-115999 tracks progress on addressing this. */ \ +#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ + do { \ + /* gh-115999 tracks progress on addressing this. */ \ static_assert(0, "The specializing interpreter is not yet thread-safe"); \ } while (0); #else -#define DECREMENT_ADAPTIVE_COUNTER(COUNTER) \ - do { \ - assert(!ADAPTIVE_COUNTER_IS_ZERO((COUNTER))); \ - (COUNTER) -= (1 << ADAPTIVE_BACKOFF_BITS); \ +#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ + do { \ + (COUNTER) = advance_backoff_counter((COUNTER)); \ } while (0); #endif -#define INCREMENT_ADAPTIVE_COUNTER(COUNTER) \ - do { \ - (COUNTER) += (1 << ADAPTIVE_BACKOFF_BITS); \ +#define PAUSE_ADAPTIVE_COUNTER(COUNTER) \ + do { \ + (COUNTER) = pause_backoff_counter((COUNTER)); \ } while (0); #define UNBOUNDLOCAL_ERROR_MSG \ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 9847679..8c3d41b 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3694,21 +3694,22 @@ oparg = CURRENT_OPARG(); _PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor; _PyExitData *exit = &previous->exits[oparg]; - exit->temperature++; PyCodeObject *code = _PyFrame_GetCode(frame); _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; - if (exit->temperature < (int32_t)tstate->interp->optimizer_side_threshold) { + _Py_BackoffCounter temperature = exit->temperature; + if (!backoff_counter_triggers(temperature)) { + exit->temperature = advance_backoff_counter(temperature); GOTO_TIER_ONE(target); } _PyExecutorObject *executor; if (target->op.code == ENTER_EXECUTOR) { executor = code->co_executors->executors[target->op.arg]; Py_INCREF(executor); - } else { + } + else { int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); if (optimized <= 0) { - int32_t new_temp = -1 * tstate->interp->optimizer_side_threshold; - exit->temperature = (new_temp < INT16_MIN) ? 
INT16_MIN : new_temp; + exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { Py_DECREF(previous); tstate->previous_executor = Py_None; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 6ee794a..0116acd 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -115,13 +115,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); DISPATCH_SAME_OPARG(); } STAT_INC(BINARY_OP, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); @@ -432,13 +432,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinarySubscr(container, sub, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(BINARY_SUBSCR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _BINARY_SUBSCR @@ -760,13 +760,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Call(callable, next_instr, oparg + (self_or_null != NULL)); DISPATCH_SAME_OPARG(); } STAT_INC(CALL, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } /* Skip 2 cache entries */ @@ -2036,13 +2036,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CompareOp(left, right, next_instr, oparg); DISPATCH_SAME_OPARG(); } STAT_INC(COMPARE_OP, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _COMPARE_OP @@ -2185,13 +2185,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ContainsOp(right, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(CONTAINS_OP, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _CONTAINS_OP @@ -2596,13 +2596,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ForIter(iter, next_instr, oparg); DISPATCH_SAME_OPARG(); } STAT_INC(FOR_ITER, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _FOR_ITER @@ -3026,7 +3026,7 @@ tstate, PY_MONITORING_EVENT_CALL, frame, this_instr, function, arg); if (err) goto error; - INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + 
PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); GO_TO_INSTRUCTION(CALL); } @@ -3142,7 +3142,7 @@ if (next_opcode < 0) goto error; next_instr = this_instr; if (_PyOpcode_Caches[next_opcode]) { - INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + PAUSE_ADAPTIVE_COUNTER(next_instr[1].counter); } assert(next_opcode > 0 && next_opcode < 256); opcode = next_opcode; @@ -3177,7 +3177,7 @@ /* Skip 1 cache entry */ // cancel out the decrement that will happen in LOAD_SUPER_ATTR; we // don't want to specialize instrumented instructions - INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter); GO_TO_INSTRUCTION(LOAD_SUPER_ATTR); } @@ -3415,16 +3415,8 @@ assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #if ENABLE_SPECIALIZATION - uint16_t counter = this_instr[1].cache; - this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER); - /* We are using unsigned values, but we really want signed values, so - * do the 2s complement adjustment manually */ - uint32_t offset_counter = counter ^ (1 << 15); - uint32_t threshold = tstate->interp->optimizer_backedge_threshold; - assert((threshold & OPTIMIZER_BITS_MASK) == 0); - // Use '>=' not '>' so that the optimizer/backoff bits do not effect the result. - // Double-check that the opcode isn't instrumented or something: - if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) { + _Py_BackoffCounter counter = this_instr[1].counter; + if (backoff_counter_triggers(counter) && this_instr->op.code == JUMP_BACKWARD) { _Py_CODEUNIT *start = this_instr; /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */ while (oparg > 255) { @@ -3440,17 +3432,12 @@ GOTO_TIER_TWO(executor); } else { - int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK; - backoff++; - if (backoff < MIN_TIER2_BACKOFF) { - backoff = MIN_TIER2_BACKOFF; - } - else if (backoff > MAX_TIER2_BACKOFF) { - backoff = MAX_TIER2_BACKOFF; - } - this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff; + this_instr[1].counter = restart_backoff_counter(counter); } } + else { + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); + } #endif /* ENABLE_SPECIALIZATION */ DISPATCH(); } @@ -3543,14 +3530,14 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; _Py_Specialize_LoadAttr(owner, next_instr, name); DISPATCH_SAME_OPARG(); } STAT_INC(LOAD_ATTR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } /* Skip 8 cache entries */ @@ -4238,14 +4225,14 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; _Py_Specialize_LoadGlobal(GLOBALS(), BUILTINS(), next_instr, name); DISPATCH_SAME_OPARG(); } STAT_INC(LOAD_GLOBAL, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } /* Skip 1 cache entry */ @@ -4442,13 +4429,13 @@ (void)counter; #if ENABLE_SPECIALIZATION int load_method = oparg & 1; - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; 
_Py_Specialize_LoadSuperAttr(global_super, class, next_instr, load_method); DISPATCH_SAME_OPARG(); } STAT_INC(LOAD_SUPER_ATTR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _LOAD_SUPER_ATTR @@ -5083,13 +5070,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Send(receiver, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(SEND, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _SEND @@ -5271,14 +5258,14 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); next_instr = this_instr; _Py_Specialize_StoreAttr(owner, next_instr, name); DISPATCH_SAME_OPARG(); } STAT_INC(STORE_ATTR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } /* Skip 3 cache entries */ @@ -5562,13 +5549,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_StoreSubscr(container, sub, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(STORE_SUBSCR, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } // _STORE_SUBSCR @@ -5665,13 +5652,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ToBool(value, next_instr); DISPATCH_SAME_OPARG(); } STAT_INC(TO_BOOL, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ } /* Skip 2 cache entries */ @@ -5882,13 +5869,13 @@ uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; #if ENABLE_SPECIALIZATION - if (ADAPTIVE_COUNTER_IS_ZERO(counter)) { + if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_UnpackSequence(seq, next_instr, oparg); DISPATCH_SAME_OPARG(); } STAT_INC(UNPACK_SEQUENCE, deferred); - DECREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); + ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); #endif /* ENABLE_SPECIALIZATION */ (void)seq; (void)counter; diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 018cd66..0f60290 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -590,7 +590,7 @@ de_instrument(PyCodeObject *code, int i, int event) CHECK(_PyOpcode_Deopt[deinstrumented] == deinstrumented); *opcode_ptr = deinstrumented; if (_PyOpcode_Caches[deinstrumented]) { - instr[1].cache = adaptive_counter_warmup(); + instr[1].counter = adaptive_counter_warmup(); } } @@ -611,7 +611,7 @@ de_instrument_line(PyCodeObject *code, int i) CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); instr->op.code = original_opcode; if (_PyOpcode_Caches[original_opcode]) { - instr[1].cache = adaptive_counter_warmup(); + instr[1].counter = 
adaptive_counter_warmup(); } assert(instr->op.code != INSTRUMENTED_LINE); } @@ -634,7 +634,7 @@ de_instrument_per_instruction(PyCodeObject *code, int i) CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); *opcode_ptr = original_opcode; if (_PyOpcode_Caches[original_opcode]) { - instr[1].cache = adaptive_counter_warmup(); + instr[1].counter = adaptive_counter_warmup(); } assert(*opcode_ptr != INSTRUMENTED_INSTRUCTION); assert(instr->op.code != INSTRUMENTED_INSTRUCTION); @@ -667,7 +667,7 @@ instrument(PyCodeObject *code, int i) assert(instrumented); *opcode_ptr = instrumented; if (_PyOpcode_Caches[deopt]) { - instr[1].cache = adaptive_counter_warmup(); + instr[1].counter = adaptive_counter_warmup(); } } } diff --git a/Python/optimizer.c b/Python/optimizer.c index 38ab6d3..5c69d9d 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1,6 +1,7 @@ #include "Python.h" #include "opcode.h" #include "pycore_interp.h" +#include "pycore_backoff.h" #include "pycore_bitutils.h" // _Py_popcount32() #include "pycore_object.h" // _PyObject_GC_UNTRACK() #include "pycore_opcode_metadata.h" // _PyOpcode_OpName[] @@ -110,9 +111,7 @@ never_optimize( _PyExecutorObject **exec, int Py_UNUSED(stack_entries)) { - /* Although it should be benign for this to be called, - * it shouldn't happen, so fail in debug builds. */ - assert(0 && "never optimize should never be called"); + // This may be called if the optimizer is reset return 0; } @@ -127,25 +126,12 @@ PyTypeObject _PyDefaultOptimizer_Type = { static _PyOptimizerObject _PyOptimizer_Default = { PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type) .optimize = never_optimize, - .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, - .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, - .side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, }; -static uint32_t -shift_and_offset_threshold(uint32_t threshold) -{ - return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15); -} - _PyOptimizerObject * PyUnstable_GetOptimizer(void) { PyInterpreterState *interp = _PyInterpreterState_GET(); - assert(interp->optimizer_backedge_threshold == - shift_and_offset_threshold(interp->optimizer->backedge_threshold)); - assert(interp->optimizer_resume_threshold == - shift_and_offset_threshold(interp->optimizer->resume_threshold)); if (interp->optimizer == &_PyOptimizer_Default) { return NULL; } @@ -190,13 +176,6 @@ _Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer) } Py_INCREF(optimizer); interp->optimizer = optimizer; - interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold); - interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold); - interp->optimizer_side_threshold = optimizer->side_threshold; - if (optimizer == &_PyOptimizer_Default) { - assert(interp->optimizer_backedge_threshold > (1 << 16)); - assert(interp->optimizer_resume_threshold > (1 << 16)); - } return old; } @@ -1109,7 +1088,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil assert(exit_count < COLD_EXIT_COUNT); for (int i = 0; i < exit_count; i++) { executor->exits[i].executor = &COLD_EXITS[i]; - executor->exits[i].temperature = 0; + executor->exits[i].temperature = initial_temperature_backoff_counter(); } int next_exit = exit_count-1; _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length]; @@ -1291,11 +1270,6 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) return NULL; } opt->optimize = uop_optimize; - opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; 
- // Need a few iterations to settle specializations, - // and to ammortize the cost of optimization. - opt->side_threshold = 16; - opt->backedge_threshold = 16; return (PyObject *)opt; } @@ -1385,9 +1359,6 @@ PyUnstable_Optimizer_NewCounter(void) return NULL; } opt->base.optimize = counter_optimize; - opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; - opt->base.side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; - opt->base.backedge_threshold = 0; opt->count = 0; return (PyObject *)opt; } @@ -1554,7 +1525,7 @@ _Py_ExecutorClear(_PyExecutorObject *executor) for (uint32_t i = 0; i < executor->exit_count; i++) { Py_DECREF(executor->exits[i].executor); executor->exits[i].executor = &COLD_EXITS[i]; - executor->exits[i].temperature = INT16_MIN; + executor->exits[i].temperature = initial_unreachable_backoff_counter(); } _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index]; assert(instruction->op.code == ENTER_EXECUTOR); diff --git a/Python/specialize.c b/Python/specialize.c index f1e32d0..0b4b199 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -419,22 +419,20 @@ _PyCode_Quicken(PyCodeObject *code) int caches = _PyOpcode_Caches[opcode]; if (caches) { // The initial value depends on the opcode - int initial_value; switch (opcode) { case JUMP_BACKWARD: - initial_value = 0; + instructions[i + 1].counter = initial_jump_backoff_counter(); break; case POP_JUMP_IF_FALSE: case POP_JUMP_IF_TRUE: case POP_JUMP_IF_NONE: case POP_JUMP_IF_NOT_NONE: - initial_value = 0x5555; // Alternating 0, 1 bits + instructions[i + 1].cache = 0x5555; // Alternating 0, 1 bits break; default: - initial_value = adaptive_counter_warmup(); + instructions[i + 1].counter = adaptive_counter_warmup(); break; } - instructions[i + 1].cache = initial_value; i += caches; } } diff --git a/Tools/jit/template.c b/Tools/jit/template.c index 5416008..351bc2f 100644 --- a/Tools/jit/template.c +++ b/Tools/jit/template.c @@ -1,5 +1,6 @@ #include "Python.h" +#include "pycore_backoff.h" #include "pycore_call.h" #include "pycore_ceval.h" #include "pycore_cell.h" |
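
Finally, a condensed, illustrative rendering (stand-in names, not CPython source) of the control flow that ``JUMP_BACKWARD`` and the ``_COLD_EXIT`` micro-op share after this change: count down while the value is non-zero, and when it reaches zero either enter Tier 2 or restart the counter with a longer period. It reuses the same illustrative 4-plus-12-bit layout as the sketch after the commit message.

```c
#include <stdbool.h>
#include <stdint.h>

/* Same illustrative layout as in the earlier sketch. */
typedef struct {
    uint16_t backoff : 4;
    uint16_t value : 12;
} Counter;

static bool triggers(Counter c) { return c.value == 0; }

static Counter advance(Counter c) { c.value = (c.value - 1) & 0xFFF; return c; }

static Counter
restart(Counter c)
{
    uint16_t b = c.backoff < 12 ? (uint16_t)(c.backoff + 1) : 12;
    return (Counter){ .backoff = b, .value = (uint16_t)((1 << b) - 1) };
}

/* Stand-in for _PyOptimizer_Optimize(): nonzero on success, 0 on failure. */
static int try_build_executor(void) { return 0; }

/* Roughly what JUMP_BACKWARD does with its first cache entry now, and what
 * _COLD_EXIT does with exit->temperature: no per-interpreter thresholds,
 * just the counter itself. */
static void
backedge_or_side_exit(Counter *counter)
{
    if (triggers(*counter)) {
        if (try_build_executor()) {
            /* GOTO_TIER_TWO(executor) in the real code */
        }
        else {
            *counter = restart(*counter);   /* back off before trying again */
        }
    }
    else {
        *counter = advance(*counter);       /* still warming up */
    }
}

int
main(void)
{
    Counter c = { .backoff = 4, .value = 16 };   /* JUMP_BACKWARD initial state */
    for (int i = 0; i < 100; i++) {
        backedge_or_side_exit(&c);
    }
    return 0;
}
```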