diff options
author | Petr Viktorin <encukou@gmail.com> | 2024-06-21 15:19:31 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-21 15:19:31 (GMT) |
commit | 6f1d448bc110633eda110310fd833bd46e7b30f2 (patch) | |
tree | 39555edee103339beea386042b7ffcbcd56ddcda /Python | |
parent | 7595e6743ac78ac0dd19418176f66d251668fafc (diff) | |
download | cpython-6f1d448bc110633eda110310fd833bd46e7b30f2.zip cpython-6f1d448bc110633eda110310fd833bd46e7b30f2.tar.gz cpython-6f1d448bc110633eda110310fd833bd46e7b30f2.tar.bz2 |
gh-113993: Allow interned strings to be mortal, and fix related issues (GH-120520)
* Add an InternalDocs file describing how interning should work and how to use it.
* Add internal functions to *explicitly* request what kind of interning is done:
- `_PyUnicode_InternMortal`
- `_PyUnicode_InternImmortal`
- `_PyUnicode_InternStatic`
* Switch uses of `PyUnicode_InternInPlace` to those.
* Disallow using `_Py_SetImmortal` on strings directly.
You should use `_PyUnicode_InternImmortal` instead:
- Strings should be interned before immortalization, otherwise you're possibly
interning a immortalizing copy.
- `_Py_SetImmortal` doesn't handle the `SSTATE_INTERNED_MORTAL` to
`SSTATE_INTERNED_IMMORTAL` update, and those flags can't be changed in
backports, as they are now part of public API and version-specific ABI.
* Add private `_only_immortal` argument for `sys.getunicodeinternedsize`, used in refleak test machinery.
* Make sure the statically allocated string singletons are unique. This means these sets are now disjoint:
- `_Py_ID`
- `_Py_STR` (including the empty string)
- one-character latin-1 singletons
Now, when you intern a singleton, that exact singleton will be interned.
* Add a `_Py_LATIN1_CHR` macro, use it instead of `_Py_ID`/`_Py_STR` for one-character latin-1 singletons everywhere (including Clinic).
* Intern `_Py_STR` singletons at startup.
* For free-threaded builds, intern `_Py_LATIN1_CHR` singletons at startup.
* Beef up the tests. Cover internal details (marked with `@cpython_only`).
* Add lots of assertions
Co-Authored-By: Eric Snow <ericsnowcurrently@gmail.com>
Diffstat (limited to 'Python')
-rw-r--r-- | Python/ast_opt.c | 3 | ||||
-rw-r--r-- | Python/ast_unparse.c | 10 | ||||
-rw-r--r-- | Python/clinic/sysmodule.c.h | 52 | ||||
-rw-r--r-- | Python/codecs.c | 6 | ||||
-rw-r--r-- | Python/compile.c | 3 | ||||
-rw-r--r-- | Python/getargs.c | 3 | ||||
-rw-r--r-- | Python/import.c | 8 | ||||
-rw-r--r-- | Python/marshal.c | 17 | ||||
-rw-r--r-- | Python/sysmodule.c | 28 |
9 files changed, 96 insertions, 34 deletions
diff --git a/Python/ast_opt.c b/Python/ast_opt.c index 41e906c..6d1bfaf 100644 --- a/Python/ast_opt.c +++ b/Python/ast_opt.c @@ -273,10 +273,9 @@ parse_literal(PyObject *fmt, Py_ssize_t *ppos, PyArena *arena) PyObject *str = PyUnicode_Substring(fmt, start, pos); /* str = str.replace('%%', '%') */ if (str && has_percents) { - _Py_DECLARE_STR(percent, "%"); _Py_DECLARE_STR(dbl_percent, "%%"); Py_SETREF(str, PyUnicode_Replace(str, &_Py_STR(dbl_percent), - &_Py_STR(percent), -1)); + _Py_LATIN1_CHR('%'), -1)); } if (!str) { return NULL; diff --git a/Python/ast_unparse.c b/Python/ast_unparse.c index 27c3400..86f7a58 100644 --- a/Python/ast_unparse.c +++ b/Python/ast_unparse.c @@ -10,9 +10,7 @@ * See ast.unparse for a full unparser (written in Python) */ -_Py_DECLARE_STR(open_br, "{"); _Py_DECLARE_STR(dbl_open_br, "{{"); -_Py_DECLARE_STR(close_br, "}"); _Py_DECLARE_STR(dbl_close_br, "}}"); /* We would statically initialize this if doing so were simple enough. */ @@ -580,11 +578,13 @@ escape_braces(PyObject *orig) { PyObject *temp; PyObject *result; - temp = PyUnicode_Replace(orig, &_Py_STR(open_br), &_Py_STR(dbl_open_br), -1); + temp = PyUnicode_Replace(orig, _Py_LATIN1_CHR('{'), + &_Py_STR(dbl_open_br), -1); if (!temp) { return NULL; } - result = PyUnicode_Replace(temp, &_Py_STR(close_br), &_Py_STR(dbl_close_br), -1); + result = PyUnicode_Replace(temp, _Py_LATIN1_CHR('}'), + &_Py_STR(dbl_close_br), -1); Py_DECREF(temp); return result; } @@ -678,7 +678,7 @@ append_formattedvalue(_PyUnicodeWriter *writer, expr_ty e) if (!temp_fv_str) { return -1; } - if (PyUnicode_Find(temp_fv_str, &_Py_STR(open_br), 0, 1, 1) == 0) { + if (PyUnicode_Find(temp_fv_str, _Py_LATIN1_CHR('{'), 0, 1, 1) == 0) { /* Expression starts with a brace, split it with a space from the outer one. */ outer_brace = "{ "; diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h index 56a831e..8277d28 100644 --- a/Python/clinic/sysmodule.c.h +++ b/Python/clinic/sysmodule.c.h @@ -968,24 +968,64 @@ exit: } PyDoc_STRVAR(sys_getunicodeinternedsize__doc__, -"getunicodeinternedsize($module, /)\n" +"getunicodeinternedsize($module, /, *, _only_immortal=False)\n" "--\n" "\n" "Return the number of elements of the unicode interned dictionary"); #define SYS_GETUNICODEINTERNEDSIZE_METHODDEF \ - {"getunicodeinternedsize", (PyCFunction)sys_getunicodeinternedsize, METH_NOARGS, sys_getunicodeinternedsize__doc__}, + {"getunicodeinternedsize", _PyCFunction_CAST(sys_getunicodeinternedsize), METH_FASTCALL|METH_KEYWORDS, sys_getunicodeinternedsize__doc__}, static Py_ssize_t -sys_getunicodeinternedsize_impl(PyObject *module); +sys_getunicodeinternedsize_impl(PyObject *module, int _only_immortal); static PyObject * -sys_getunicodeinternedsize(PyObject *module, PyObject *Py_UNUSED(ignored)) +sys_getunicodeinternedsize(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(_only_immortal), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"_only_immortal", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "getunicodeinternedsize", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; + int _only_immortal = 0; Py_ssize_t _return_value; - _return_value = sys_getunicodeinternedsize_impl(module); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 0, 0, argsbuf); + if (!args) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + _only_immortal = PyObject_IsTrue(args[0]); + if (_only_immortal < 0) { + goto exit; + } +skip_optional_kwonly: + _return_value = sys_getunicodeinternedsize_impl(module, _only_immortal); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1574,4 +1614,4 @@ exit: #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF #define SYS_GETANDROIDAPILEVEL_METHODDEF #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */ -/*[clinic end generated code: output=ef7c35945443d300 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=9cc9069aef1482bc input=a9049054013a1b77]*/ diff --git a/Python/codecs.c b/Python/codecs.c index bed2453..9c0a3fa 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -147,7 +147,9 @@ PyObject *_PyCodec_Lookup(const char *encoding) if (v == NULL) { return NULL; } - PyUnicode_InternInPlace(&v); + + /* Intern the string. We'll make it immortal later if lookup succeeds. */ + _PyUnicode_InternMortal(interp, &v); /* First, try to lookup the name in the registry dictionary */ PyObject *result; @@ -200,6 +202,8 @@ PyObject *_PyCodec_Lookup(const char *encoding) goto onError; } + _PyUnicode_InternImmortal(interp, &v); + /* Cache and return the result */ if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) { Py_DECREF(result); diff --git a/Python/compile.c b/Python/compile.c index 9a81b8b..b572481 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -645,8 +645,7 @@ compiler_set_qualname(struct compiler *c) } if (base != NULL) { - _Py_DECLARE_STR(dot, "."); - name = PyUnicode_Concat(base, &_Py_STR(dot)); + name = PyUnicode_Concat(base, _Py_LATIN1_CHR('.')); Py_DECREF(base); if (name == NULL) { return ERROR; diff --git a/Python/getargs.c b/Python/getargs.c index 88f4c58..75c1797 100644 --- a/Python/getargs.c +++ b/Python/getargs.c @@ -1934,7 +1934,8 @@ new_kwtuple(const char * const *keywords, int total, int pos) Py_DECREF(kwtuple); return NULL; } - PyUnicode_InternInPlace(&str); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyUnicode_InternImmortal(interp, &str); PyTuple_SET_ITEM(kwtuple, i, str); } return kwtuple; diff --git a/Python/import.c b/Python/import.c index b7e2c3e..366c016 100644 --- a/Python/import.c +++ b/Python/import.c @@ -1967,8 +1967,6 @@ import_run_extension(PyThreadState *tstate, PyModInitFunction p0, if (res.kind == _Py_ext_module_kind_SINGLEPHASE) { /* Remember the filename as the __file__ attribute */ if (info->filename != NULL) { - // XXX There's a refleak somewhere with the filename. - // Until we can track it down, we intern it. PyObject *filename = NULL; if (switched) { // The original filename may be allocated by subinterpreter's @@ -1980,7 +1978,11 @@ import_run_extension(PyThreadState *tstate, PyModInitFunction p0, } else { filename = Py_NewRef(info->filename); } - PyUnicode_InternInPlace(&filename); + // XXX There's a refleak somewhere with the filename. + // Until we can track it down, we immortalize it. + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyUnicode_InternImmortal(interp, &filename); + if (PyModule_AddObjectRef(mod, "__file__", filename) < 0) { PyErr_Clear(); /* Not important enough to report */ } diff --git a/Python/marshal.c b/Python/marshal.c index ca22d6d..a46fc0c 100644 --- a/Python/marshal.c +++ b/Python/marshal.c @@ -14,6 +14,7 @@ #include "pycore_long.h" // _PyLong_DigitCount #include "pycore_setobject.h" // _PySet_NextEntry() #include "marshal.h" // Py_MARSHAL_VERSION +#include "pycore_pystate.h" // _PyInterpreterState_GET() #ifdef __APPLE__ # include "TargetConditionals.h" @@ -1184,8 +1185,12 @@ r_object(RFILE *p) v = PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, ptr, n); if (v == NULL) break; - if (is_interned) - PyUnicode_InternInPlace(&v); + if (is_interned) { + // marshal is meant to serialize .pyc files with code + // objects, and code-related strings are currently immortal. + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyUnicode_InternImmortal(interp, &v); + } retval = v; R_REF(retval); break; @@ -1217,8 +1222,12 @@ r_object(RFILE *p) } if (v == NULL) break; - if (is_interned) - PyUnicode_InternInPlace(&v); + if (is_interned) { + // marshal is meant to serialize .pyc files with code + // objects, and code-related strings are currently immortal. + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyUnicode_InternImmortal(interp, &v); + } retval = v; R_REF(retval); break; diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 3bb7b4d..1fff7e4 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -747,7 +747,7 @@ sys_displayhook(PyObject *module, PyObject *o) if (o == Py_None) { Py_RETURN_NONE; } - if (PyObject_SetAttr(builtins, &_Py_ID(_), Py_None) != 0) + if (PyObject_SetAttr(builtins, _Py_LATIN1_CHR('_'), Py_None) != 0) return NULL; outf = _PySys_GetAttr(tstate, &_Py_ID(stdout)); if (outf == NULL || outf == Py_None) { @@ -769,10 +769,9 @@ sys_displayhook(PyObject *module, PyObject *o) return NULL; } } - _Py_DECLARE_STR(newline, "\n"); - if (PyFile_WriteObject(&_Py_STR(newline), outf, Py_PRINT_RAW) != 0) + if (PyFile_WriteObject(_Py_LATIN1_CHR('\n'), outf, Py_PRINT_RAW) != 0) return NULL; - if (PyObject_SetAttr(builtins, &_Py_ID(_), o) != 0) + if (PyObject_SetAttr(builtins, _Py_LATIN1_CHR('_'), o) != 0) return NULL; Py_RETURN_NONE; } @@ -930,7 +929,7 @@ sys_getfilesystemencoding_impl(PyObject *module) if (u == NULL) { return NULL; } - _PyUnicode_InternInPlace(interp, &u); + _PyUnicode_InternImmortal(interp, &u); return u; } @@ -950,7 +949,7 @@ sys_getfilesystemencodeerrors_impl(PyObject *module) if (u == NULL) { return NULL; } - _PyUnicode_InternInPlace(interp, &u); + _PyUnicode_InternImmortal(interp, &u); return u; } @@ -972,8 +971,9 @@ sys_intern_impl(PyObject *module, PyObject *s) /*[clinic end generated code: output=be680c24f5c9e5d6 input=849483c006924e2f]*/ { if (PyUnicode_CheckExact(s)) { + PyInterpreterState *interp = _PyInterpreterState_GET(); Py_INCREF(s); - PyUnicode_InternInPlace(&s); + _PyUnicode_InternMortal(interp, &s); return s; } else { @@ -2007,14 +2007,22 @@ sys_getallocatedblocks_impl(PyObject *module) /*[clinic input] sys.getunicodeinternedsize -> Py_ssize_t + * + _only_immortal: bool = False + Return the number of elements of the unicode interned dictionary [clinic start generated code]*/ static Py_ssize_t -sys_getunicodeinternedsize_impl(PyObject *module) -/*[clinic end generated code: output=ad0e4c9738ed4129 input=726298eaa063347a]*/ +sys_getunicodeinternedsize_impl(PyObject *module, int _only_immortal) +/*[clinic end generated code: output=29a6377a94a14f70 input=0330b3408dd5bcc6]*/ { - return _PyUnicode_InternedSize(); + if (_only_immortal) { + return _PyUnicode_InternedSize_Immortal(); + } + else { + return _PyUnicode_InternedSize(); + } } /*[clinic input] |