From 70d378cdaa99f995bdce278439ef7c4defe4f805 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Mon, 18 Dec 2023 13:16:45 +0000 Subject: GH-111485: Break up instructions with unused cache entries into component micro-ops (GH-113169) --- Lib/test/test_generated_cases.py | 4 ++ Python/generated_cases.c.h | 71 ++++++++++++++++++++++++++++++++ Tools/cases_generator/analyzer.py | 31 ++++++++++---- Tools/cases_generator/parser.py | 1 + Tools/cases_generator/tier1_generator.py | 3 +- 5 files changed, 102 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_generated_cases.py b/Lib/test/test_generated_cases.py index 74cebbe..3541a4e 100644 --- a/Lib/test/test_generated_cases.py +++ b/Lib/test/test_generated_cases.py @@ -457,6 +457,7 @@ class TestGeneratedCases(unittest.TestCase): PyObject *left; PyObject *arg2; PyObject *res; + /* Skip 5 cache entries */ right = stack_pointer[-1]; left = stack_pointer[-2]; arg2 = stack_pointer[-3]; @@ -467,6 +468,7 @@ class TestGeneratedCases(unittest.TestCase): } """ self.run_cases_test(input, output) + def test_unused_caches(self): input = """ inst(OP, (unused/1, unused/2 --)) { @@ -478,6 +480,8 @@ class TestGeneratedCases(unittest.TestCase): frame->instr_ptr = next_instr; next_instr += 4; INSTRUCTION_STATS(OP); + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ body(); DISPATCH(); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index b202d14..a274427 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -452,6 +452,7 @@ PyObject *sub; PyObject *dict; PyObject *res; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; dict = stack_pointer[-2]; DEOPT_IF(!PyDict_CheckExact(dict), BINARY_SUBSCR); @@ -476,6 +477,7 @@ static_assert(INLINE_CACHE_ENTRIES_BINARY_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; container = stack_pointer[-2]; DEOPT_IF(tstate->interp->eval_frame, BINARY_SUBSCR); @@ -509,6 +511,7 @@ PyObject *sub; PyObject *list; PyObject *res; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; list = stack_pointer[-2]; DEOPT_IF(!PyLong_CheckExact(sub), BINARY_SUBSCR); @@ -536,6 +539,7 @@ PyObject *sub; PyObject *str; PyObject *res; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; str = stack_pointer[-2]; DEOPT_IF(!PyLong_CheckExact(sub), BINARY_SUBSCR); @@ -563,6 +567,7 @@ PyObject *sub; PyObject *tuple; PyObject *res; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; tuple = stack_pointer[-2]; DEOPT_IF(!PyLong_CheckExact(sub), BINARY_SUBSCR); @@ -840,6 +845,8 @@ PyObject **args; PyObject *null; PyObject *callable; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1001,6 +1008,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1035,6 +1044,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1080,6 +1091,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1119,6 +1132,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1279,6 +1294,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1408,6 +1425,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1444,6 +1463,8 @@ PyObject **args; PyObject *self; PyObject *callable; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1475,6 +1496,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1517,6 +1540,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1559,6 +1584,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1603,6 +1630,8 @@ PyObject *self_or_null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1728,6 +1757,7 @@ PyObject **args; PyObject *self_or_null; PyObject *callable; + /* Skip 1 cache entry */ args = &stack_pointer[-oparg]; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1774,6 +1804,8 @@ PyObject *null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1801,6 +1833,8 @@ PyObject *null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1828,6 +1862,8 @@ PyObject *null; PyObject *callable; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ args = &stack_pointer[-oparg]; null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; @@ -1982,6 +2018,7 @@ PyObject *right; PyObject *left; PyObject *res; + /* Skip 1 cache entry */ right = stack_pointer[-1]; left = stack_pointer[-2]; DEOPT_IF(!PyFloat_CheckExact(left), COMPARE_OP); @@ -2008,6 +2045,7 @@ PyObject *right; PyObject *left; PyObject *res; + /* Skip 1 cache entry */ right = stack_pointer[-1]; left = stack_pointer[-2]; DEOPT_IF(!PyLong_CheckExact(left), COMPARE_OP); @@ -2038,6 +2076,7 @@ PyObject *right; PyObject *left; PyObject *res; + /* Skip 1 cache entry */ right = stack_pointer[-1]; left = stack_pointer[-2]; DEOPT_IF(!PyUnicode_CheckExact(left), COMPARE_OP); @@ -2469,6 +2508,7 @@ INSTRUCTION_STATS(FOR_ITER_GEN); static_assert(INLINE_CACHE_ENTRIES_FOR_ITER == 1, "incorrect cache size"); PyObject *iter; + /* Skip 1 cache entry */ iter = stack_pointer[-1]; DEOPT_IF(tstate->interp->eval_frame, FOR_ITER); PyGenObject *gen = (PyGenObject *)iter; @@ -2843,6 +2883,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 4; INSTRUCTION_STATS(INSTRUMENTED_CALL); + /* Skip 3 cache entries */ int is_meth = PEEK(oparg + 1) != NULL; int total_args = oparg + is_meth; PyObject *function = PEEK(oparg + 2); @@ -2929,6 +2970,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_FOR_ITER); + /* Skip 1 cache entry */ _Py_CODEUNIT *target; PyObject *iter = TOP(); PyObject *next = (*Py_TYPE(iter)->tp_iternext)(iter); @@ -2976,6 +3018,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_JUMP_BACKWARD); + /* Skip 1 cache entry */ CHECK_EVAL_BREAKER(); INSTRUMENTED_JUMP(this_instr, next_instr - oparg, PY_MONITORING_EVENT_JUMP); DISPATCH(); @@ -2993,6 +3036,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_LOAD_SUPER_ATTR); + /* Skip 1 cache entry */ // cancel out the decrement that will happen in LOAD_SUPER_ATTR; we // don't want to specialize instrumented instructions INCREMENT_ADAPTIVE_COUNTER(this_instr[1].cache); @@ -3003,6 +3047,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_POP_JUMP_IF_FALSE); + /* Skip 1 cache entry */ PyObject *cond = POP(); assert(PyBool_Check(cond)); int flag = Py_IsFalse(cond); @@ -3018,6 +3063,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_POP_JUMP_IF_NONE); + /* Skip 1 cache entry */ PyObject *value = POP(); int flag = Py_IsNone(value); int offset; @@ -3039,6 +3085,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_POP_JUMP_IF_NOT_NONE); + /* Skip 1 cache entry */ PyObject *value = POP(); int offset; int nflag = Py_IsNone(value); @@ -3060,6 +3107,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(INSTRUMENTED_POP_JUMP_IF_TRUE); + /* Skip 1 cache entry */ PyObject *cond = POP(); assert(PyBool_Check(cond)); int flag = Py_IsTrue(cond); @@ -3216,6 +3264,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 2; INSTRUCTION_STATS(JUMP_BACKWARD); + /* Skip 1 cache entry */ CHECK_EVAL_BREAKER(); assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); @@ -3429,6 +3478,7 @@ INSTRUCTION_STATS(LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN); static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size"); PyObject *owner; + /* Skip 1 cache entry */ owner = stack_pointer[-1]; uint32_t type_version = read_u32(&this_instr[2].cache); uint32_t func_version = read_u32(&this_instr[4].cache); @@ -3743,6 +3793,7 @@ INSTRUCTION_STATS(LOAD_ATTR_PROPERTY); static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size"); PyObject *owner; + /* Skip 1 cache entry */ owner = stack_pointer[-1]; uint32_t type_version = read_u32(&this_instr[2].cache); uint32_t func_version = read_u32(&this_instr[4].cache); @@ -4300,6 +4351,7 @@ PyObject *class; PyObject *global_super; PyObject *attr; + /* Skip 1 cache entry */ self = stack_pointer[-1]; class = stack_pointer[-2]; global_super = stack_pointer[-3]; @@ -4328,6 +4380,7 @@ PyObject *global_super; PyObject *attr; PyObject *self_or_null; + /* Skip 1 cache entry */ self = stack_pointer[-1]; class = stack_pointer[-2]; global_super = stack_pointer[-3]; @@ -4927,6 +4980,7 @@ static_assert(INLINE_CACHE_ENTRIES_SEND == 1, "incorrect cache size"); PyObject *v; PyObject *receiver; + /* Skip 1 cache entry */ v = stack_pointer[-1]; receiver = stack_pointer[-2]; DEOPT_IF(tstate->interp->eval_frame, SEND); @@ -5157,6 +5211,7 @@ static_assert(INLINE_CACHE_ENTRIES_STORE_ATTR == 4, "incorrect cache size"); PyObject *owner; PyObject *value; + /* Skip 1 cache entry */ owner = stack_pointer[-1]; value = stack_pointer[-2]; uint32_t type_version = read_u32(&this_instr[2].cache); @@ -5374,6 +5429,7 @@ PyObject *sub; PyObject *dict; PyObject *value; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; dict = stack_pointer[-2]; value = stack_pointer[-3]; @@ -5394,6 +5450,7 @@ PyObject *sub; PyObject *list; PyObject *value; + /* Skip 1 cache entry */ sub = stack_pointer[-1]; list = stack_pointer[-2]; value = stack_pointer[-3]; @@ -5470,6 +5527,7 @@ static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; + /* Skip 1 cache entry */ value = stack_pointer[-1]; uint32_t version = read_u32(&this_instr[2].cache); // This one is a bit weird, because we expect *some* failures: @@ -5488,6 +5546,8 @@ INSTRUCTION_STATS(TO_BOOL_BOOL); static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ value = stack_pointer[-1]; DEOPT_IF(!PyBool_Check(value), TO_BOOL); STAT_INC(TO_BOOL, hit); @@ -5501,6 +5561,8 @@ static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ value = stack_pointer[-1]; DEOPT_IF(!PyLong_CheckExact(value), TO_BOOL); STAT_INC(TO_BOOL, hit); @@ -5523,6 +5585,8 @@ static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ value = stack_pointer[-1]; DEOPT_IF(!PyList_CheckExact(value), TO_BOOL); STAT_INC(TO_BOOL, hit); @@ -5539,6 +5603,8 @@ static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ value = stack_pointer[-1]; // This one is a bit weird, because we expect *some* failures: DEOPT_IF(!Py_IsNone(value), TO_BOOL); @@ -5555,6 +5621,8 @@ static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; + /* Skip 1 cache entry */ + /* Skip 2 cache entries */ value = stack_pointer[-1]; DEOPT_IF(!PyUnicode_CheckExact(value), TO_BOOL); STAT_INC(TO_BOOL, hit); @@ -5669,6 +5737,7 @@ static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; PyObject **values; + /* Skip 1 cache entry */ seq = stack_pointer[-1]; values = &stack_pointer[-1]; DEOPT_IF(!PyList_CheckExact(seq), UNPACK_SEQUENCE); @@ -5690,6 +5759,7 @@ static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; PyObject **values; + /* Skip 1 cache entry */ seq = stack_pointer[-1]; values = &stack_pointer[-1]; DEOPT_IF(!PyTuple_CheckExact(seq), UNPACK_SEQUENCE); @@ -5711,6 +5781,7 @@ static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; PyObject **values; + /* Skip 1 cache entry */ seq = stack_pointer[-1]; values = &stack_pointer[-1]; DEOPT_IF(!PyTuple_CheckExact(seq), UNPACK_SEQUENCE); diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py index 2147f6f..e077eb0 100644 --- a/Tools/cases_generator/analyzer.py +++ b/Tools/cases_generator/analyzer.py @@ -234,9 +234,9 @@ def analyze_stack(op: parser.InstDef) -> StackEffect: return StackEffect(inputs, outputs) -def analyze_caches(op: parser.InstDef) -> list[CacheEntry]: +def analyze_caches(inputs: list[parser.InputEffect]) -> list[CacheEntry]: caches: list[parser.CacheEffect] = [ - i for i in op.inputs if isinstance(i, parser.CacheEffect) + i for i in inputs if isinstance(i, parser.CacheEffect) ] return [CacheEntry(i.name, int(i.size)) for i in caches] @@ -314,13 +314,13 @@ def compute_properties(op: parser.InstDef) -> Properties: ) -def make_uop(name: str, op: parser.InstDef) -> Uop: +def make_uop(name: str, op: parser.InstDef, inputs: list[parser.InputEffect]) -> Uop: return Uop( name=name, context=op.context, annotations=op.annotations, stack=analyze_stack(op), - caches=analyze_caches(op), + caches=analyze_caches(inputs), body=op.block.tokens, properties=compute_properties(op), ) @@ -333,7 +333,7 @@ def add_op(op: parser.InstDef, uops: dict[str, Uop]) -> None: raise override_error( op.name, op.context, uops[op.name].context, op.tokens[0] ) - uops[op.name] = make_uop(op.name, op) + uops[op.name] = make_uop(op.name, op, op.inputs) def add_instruction( @@ -347,10 +347,27 @@ def desugar_inst( ) -> None: assert inst.kind == "inst" name = inst.name - uop = make_uop("_" + inst.name, inst) + op_inputs: list[parser.InputEffect] = [] + parts: list[Part] = [] + uop_index = -1 + # Move unused cache entries to the Instruction, removing them from the Uop. + for input in inst.inputs: + if isinstance(input, parser.CacheEffect) and input.name == "unused": + parts.append(Skip(input.size)) + else: + op_inputs.append(input) + if uop_index < 0: + uop_index = len(parts) + # Place holder for the uop. + parts.append(Skip(0)) + uop = make_uop("_" + inst.name, inst, op_inputs) uop.implicitly_created = True uops[inst.name] = uop - add_instruction(name, [uop], instructions) + if uop_index < 0: + parts.append(uop) + else: + parts[uop_index] = uop + add_instruction(name, parts, instructions) def add_macro( diff --git a/Tools/cases_generator/parser.py b/Tools/cases_generator/parser.py index 12173a6..fe4e8e4 100644 --- a/Tools/cases_generator/parser.py +++ b/Tools/cases_generator/parser.py @@ -7,6 +7,7 @@ from parsing import ( Context, CacheEffect, StackEffect, + InputEffect, OpName, AstNode, ) diff --git a/Tools/cases_generator/tier1_generator.py b/Tools/cases_generator/tier1_generator.py index bcfd2d8..49cede9 100644 --- a/Tools/cases_generator/tier1_generator.py +++ b/Tools/cases_generator/tier1_generator.py @@ -151,7 +151,8 @@ def generate_tier1( stack = Stack() for part in inst.parts: # Only emit braces if more than one uop - offset = write_uop(part, out, offset, stack, inst, len(inst.parts) > 1) + insert_braces = len([p for p in inst.parts if isinstance(p, Uop)]) > 1 + offset = write_uop(part, out, offset, stack, inst, insert_braces) out.start_line() if not inst.parts[-1].properties.always_exits: stack.flush(out) -- cgit v0.12