diff options
author | Eric Snow <ericsnowcurrently@gmail.com> | 2023-11-27 23:51:12 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-27 23:51:12 (GMT) |
commit | 4f71f1680de11baf7fb421ef500f36ffd5936b58 (patch) | |
tree | 425096370065266c1c762cd13bd0503b7a5952b4 /Objects/unicodeobject.c | |
parent | 60a08e6ff2ef8306d276a40f861bf46f34ba09bd (diff) | |
download | cpython-4f71f1680de11baf7fb421ef500f36ffd5936b58.zip cpython-4f71f1680de11baf7fb421ef500f36ffd5936b58.tar.gz cpython-4f71f1680de11baf7fb421ef500f36ffd5936b58.tar.bz2 |
[3.12] gh-106931: Intern Statically Allocated Strings Globally (gh-107272) (gh-110713)
We tried this before with a dict and for all interned strings. That ran into problems due to interpreter isolation. However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning. Here we circle back to using a global cache, but only for statically allocated strings. We also use a more-basic _Py_hashtable_t for that global cache instead of a dict.
Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter. Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter.
(cherry-picked from commit b72947a8d26915156323ccfd04d273199ecb870c)
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 72 |
1 files changed, 69 insertions, 3 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 26aa139..ec82eda 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -235,15 +235,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp) return _Py_INTERP_CACHED_OBJECT(interp, interned_strings); } +#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings + Py_ssize_t _PyUnicode_InternedSize(void) { - return PyObject_Length(get_interned_dict(_PyInterpreterState_GET())); + PyObject *dict = get_interned_dict(_PyInterpreterState_GET()); + return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict); +} + +static Py_hash_t unicode_hash(PyObject *); +static int unicode_compare_eq(PyObject *, PyObject *); + +static Py_uhash_t +hashtable_unicode_hash(const void *key) +{ + return unicode_hash((PyObject *)key); +} + +static int +hashtable_unicode_compare(const void *key1, const void *key2) +{ + PyObject *obj1 = (PyObject *)key1; + PyObject *obj2 = (PyObject *)key2; + if (obj1 != NULL && obj2 != NULL) { + return unicode_compare_eq(obj1, obj2); + } + else { + return obj1 == obj2; + } } static int init_interned_dict(PyInterpreterState *interp) { + if (_Py_IsMainInterpreter(interp)) { + assert(INTERNED_STRINGS == NULL); + _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree}; + INTERNED_STRINGS = _Py_hashtable_new_full( + hashtable_unicode_hash, + hashtable_unicode_compare, + NULL, + NULL, + &hashtable_alloc + ); + if (INTERNED_STRINGS == NULL) { + return -1; + } + } assert(get_interned_dict(interp) == NULL); PyObject *interned = interned = PyDict_New(); if (interned == NULL) { @@ -262,6 +301,10 @@ clear_interned_dict(PyInterpreterState *interp) Py_DECREF(interned); _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL; } + if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) { + _Py_hashtable_destroy(INTERNED_STRINGS); + INTERNED_STRINGS = NULL; + } } #define _Py_RETURN_UNICODE_EMPTY() \ @@ -1222,6 +1265,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_STATE(unicode).kind = kind; _PyUnicode_STATE(unicode).compact = 1; _PyUnicode_STATE(unicode).ascii = is_ascii; + _PyUnicode_STATE(unicode).statically_allocated = 0; if (is_ascii) { ((char*)data)[size] = 0; } @@ -1552,7 +1596,9 @@ unicode_dealloc(PyObject *unicode) * we accidentally decref an immortal string out of existence. Since * the string is an immortal object, just re-set the reference count. */ - if (PyUnicode_CHECK_INTERNED(unicode)) { + if (PyUnicode_CHECK_INTERNED(unicode) + || _PyUnicode_STATE(unicode).statically_allocated) + { _Py_SetImmortal(unicode); return; } @@ -14502,6 +14548,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_STATE(self).kind = kind; _PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; + _PyUnicode_STATE(self).statically_allocated = 0; _PyUnicode_UTF8_LENGTH(self) = 0; _PyUnicode_UTF8(self) = NULL; _PyUnicode_DATA_ANY(self) = NULL; @@ -14725,6 +14772,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p) return; } + /* Look in the global cache first. */ + PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s); + if (r != NULL && r != s) { + Py_SETREF(*p, Py_NewRef(r)); + return; + } + + /* Handle statically allocated strings. */ + if (_PyUnicode_STATE(s).statically_allocated) { + assert(_Py_IsImmortal(s)); + if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) { + _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC; + } + return; + } + + /* Look in the per-interpreter cache. */ PyObject *interned = get_interned_dict(interp); assert(interned != NULL); @@ -14740,9 +14804,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p) } if (_Py_IsImmortal(s)) { + // XXX Restrict this to the main interpreter? _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC; - return; + return; } + #ifdef Py_REF_DEBUG /* The reference count value excluding the 2 references from the interned dictionary should be excluded from the RefTotal. The |