summary | refs | log | tree | commit | diff | stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Eric Snow <ericsnowcurrently@gmail.com> 2023-11-27 23:51:12 (GMT)
committer: GitHub <noreply@github.com> 2023-11-27 23:51:12 (GMT)
commit: 4f71f1680de11baf7fb421ef500f36ffd5936b58 (patch)
tree: 425096370065266c1c762cd13bd0503b7a5952b4 /Objects/unicodeobject.c
parent: 60a08e6ff2ef8306d276a40f861bf46f34ba09bd (diff)
download: cpython-4f71f1680de11baf7fb421ef500f36ffd5936b58.zip
cpython-4f71f1680de11baf7fb421ef500f36ffd5936b58.tar.gz
cpython-4f71f1680de11baf7fb421ef500f36ffd5936b58.tar.bz2
[3.12] gh-106931: Intern Statically Allocated Strings Globally (gh-107272) (gh-110713)
We tried this before with a dict and for all interned strings. That ran into problems due to interpreter isolation. However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning. Here we circle back to using a global cache, but only for statically allocated strings. We also use a more-basic _Py_hashtable_t for that global cache instead of a dict. Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter. Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter. (cherry-picked from commit b72947a8d26915156323ccfd04d273199ecb870c)
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 72
1 files changed, 69 insertions, 3 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 26aa139..ec82eda 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -235,15 +235,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
}
+#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
+
Py_ssize_t
_PyUnicode_InternedSize(void)
{
- return PyObject_Length(get_interned_dict(_PyInterpreterState_GET()));
+ PyObject *dict = get_interned_dict(_PyInterpreterState_GET());  /* interned dict of the interpreter returned by _PyInterpreterState_GET() */
+ return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);  /* entries in the runtime-global hashtable plus entries in that interpreter's dict */
+}
+
+static Py_hash_t unicode_hash(PyObject *);
+static int unicode_compare_eq(PyObject *, PyObject *);
+
+static Py_uhash_t  /* hash callback handed to _Py_hashtable_new_full() when INTERNED_STRINGS is created */
+hashtable_unicode_hash(const void *key)
+{
+ return unicode_hash((PyObject *)key);  /* the key is treated as a PyObject * and hashed with unicode_hash() */
+}
+
+static int  /* key-comparison callback handed to _Py_hashtable_new_full() when INTERNED_STRINGS is created */
+hashtable_unicode_compare(const void *key1, const void *key2)
+{
+ PyObject *obj1 = (PyObject *)key1;
+ PyObject *obj2 = (PyObject *)key2;
+ if (obj1 != NULL && obj2 != NULL) {
+ return unicode_compare_eq(obj1, obj2);  /* both keys present: result comes from unicode_compare_eq() */
+ }
+ else {
+ return obj1 == obj2;  /* at least one key is NULL: they match only when both are NULL */
+ }
 }
static int
init_interned_dict(PyInterpreterState *interp)
{
+ if (_Py_IsMainInterpreter(interp)) {
+ assert(INTERNED_STRINGS == NULL);
+ _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
+ INTERNED_STRINGS = _Py_hashtable_new_full(
+ hashtable_unicode_hash,
+ hashtable_unicode_compare,
+ NULL,
+ NULL,
+ &hashtable_alloc
+ );
+ if (INTERNED_STRINGS == NULL) {
+ return -1;
+ }
+ }
assert(get_interned_dict(interp) == NULL);
PyObject *interned = interned = PyDict_New();
if (interned == NULL) {
@@ -262,6 +301,10 @@ clear_interned_dict(PyInterpreterState *interp)
Py_DECREF(interned);
_Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
}
+ if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
+ _Py_hashtable_destroy(INTERNED_STRINGS);
+ INTERNED_STRINGS = NULL;
+ }
}
#define _Py_RETURN_UNICODE_EMPTY() \
@@ -1222,6 +1265,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
_PyUnicode_STATE(unicode).kind = kind;
_PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
+ _PyUnicode_STATE(unicode).statically_allocated = 0;
if (is_ascii) {
((char*)data)[size] = 0;
}
@@ -1552,7 +1596,9 @@ unicode_dealloc(PyObject *unicode)
* we accidentally decref an immortal string out of existence. Since
* the string is an immortal object, just re-set the reference count.
*/
- if (PyUnicode_CHECK_INTERNED(unicode)) {
+ if (PyUnicode_CHECK_INTERNED(unicode)
+ || _PyUnicode_STATE(unicode).statically_allocated)
+ {
_Py_SetImmortal(unicode);
return;
}
@@ -14502,6 +14548,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).kind = kind;
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
+ _PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
_PyUnicode_DATA_ANY(self) = NULL;
@@ -14725,6 +14772,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
return;
}
+ /* Look in the global cache first. */
+ PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
+ if (r != NULL && r != s) {
+ Py_SETREF(*p, Py_NewRef(r));
+ return;
+ }
+
+ /* Handle statically allocated strings. */
+ if (_PyUnicode_STATE(s).statically_allocated) {
+ assert(_Py_IsImmortal(s));
+ if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
+ _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
+ }
+ return;
+ }
+
+ /* Look in the per-interpreter cache. */
PyObject *interned = get_interned_dict(interp);
assert(interned != NULL);
@@ -14740,9 +14804,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
}
if (_Py_IsImmortal(s)) {
+ // XXX Restrict this to the main interpreter?
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
- return;
+ return;
}
+
#ifdef Py_REF_DEBUG
/* The reference count value excluding the 2 references from the
interned dictionary should be excluded from the RefTotal. The