summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--InternalDocs/string_interning.md66
-rw-r--r--Objects/unicodeobject.c36
2 files changed, 40 insertions, 62 deletions
diff --git a/InternalDocs/string_interning.md b/InternalDocs/string_interning.md
index 930ea11..358e2c0 100644
--- a/InternalDocs/string_interning.md
+++ b/InternalDocs/string_interning.md
@@ -8,51 +8,50 @@
This is used to optimize dict and attribute lookups, among other things.
-Python uses three different mechanisms to intern strings:
+Python uses two different mechanisms to intern strings: singletons and
+dynamic interning.
-- Singleton strings marked in C source with `_Py_STR` and `_Py_ID` macros.
- These are statically allocated, and collected using `make regen-global-objects`
- (`Tools/build/generate_global_objects.py`), which generates code
- for declaration, initialization and finalization.
+## Singletons
- The difference between the two kinds is not important. (A `_Py_ID` string is
- a valid C name, with which we can refer to it; a `_Py_STR` may e.g. contain
- non-identifier characters, so it needs a separate C-compatible name.)
+The 256 possible one-character latin-1 strings, which can be retrieved with
+`_Py_LATIN1_CHR(c)`, are stored in statically allocated arrays,
+`_PyRuntime.static_objects.strings.ascii` and
+`_PyRuntime.static_objects.strings.latin1`.
- The empty string is in this category (as `_Py_STR(empty)`).
+Longer singleton strings are marked in C source with `_Py_ID` (if the string
+is a valid C identifier fragment) or `_Py_STR` (if it needs a separate
+C-compatible name).
+These are also stored in statically allocated arrays.
+They are collected from CPython sources using `make regen-global-objects`
+(`Tools/build/generate_global_objects.py`), which generates code
+for declaration, initialization and finalization.
- These singletons are interned in a runtime-global lookup table,
- `_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
- at runtime initialization.
+The empty string is one of the singletons: `_Py_STR(empty)`.
-- The 256 possible one-character latin-1 strings are singletons,
- which can be retrieved with `_Py_LATIN1_CHR(c)`, are stored in runtime-global
- arrays, `_PyRuntime.static_objects.strings.ascii` and
- `_PyRuntime.static_objects.strings.latin1`.
+The three sets of singletons (`_Py_LATIN1_CHR`, `_Py_ID`, `_Py_STR`)
+are disjoint.
+If you have such a singleton, it (and no other copy) will be interned.
- These are NOT interned at startup in the normal build.
- In the free-threaded build, they are; this avoids modifying the
- global lookup table after threads are started.
+These singletons are interned in a runtime-global lookup table,
+`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
+at runtime initialization; the table is immutable from then until it's torn
+down at runtime finalization.
+It is shared across threads and interpreters without any synchronization.
- Interning a one-char latin-1 string will always intern the corresponding
- singleton.
-- All other strings are allocated dynamically, and have their
- `_PyUnicode_STATE(s).statically_allocated` flag set to zero.
- When interned, such strings are added to an interpreter-wide dict,
- `PyInterpreterState.cached_objects.interned_strings`.
+## Dynamically allocated strings
- The key and value of each entry in this dict reference the same object.
+All other strings are allocated dynamically, and have their
+`_PyUnicode_STATE(s).statically_allocated` flag set to zero.
+When interned, such strings are added to an interpreter-wide dict,
+`PyInterpreterState.cached_objects.interned_strings`.
-The three sets of singletons (`_Py_STR`, `_Py_ID`, `_Py_LATIN1_CHR`)
-are disjoint.
-If you have such a singleton, it (and no other copy) will be interned.
+The key and value of each entry in this dict reference the same object.
## Immortality and reference counting
-Invariant: Every immortal string is interned, *except* the one-char latin-1
-singletons (which might but might not be interned).
+Invariant: Every immortal string is interned.
In practice, this means that you must not use `_Py_SetImmortal` on
a string. (If you know it's already immortal, don't immortalize it;
@@ -115,8 +114,5 @@ The valid transitions between these states are:
Using `_PyUnicode_InternStatic` on these is an error; the other cases
don't change the state.
-- One-char latin-1 singletons can be interned (0 -> 3) using any interning
- function; after that the functions don't change the state.
-
-- Other statically allocated strings are interned (0 -> 3) at runtime init;
+- Singletons are interned (0 -> 3) at runtime init;
after that all interning functions don't change the state.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6196a8e..ffb879a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -325,7 +325,8 @@ init_global_interned_strings(PyInterpreterState *interp)
return _PyStatus_ERR("failed to create global interned dict");
}
- /* Intern statically allocated string identifiers and deepfreeze strings.
+ /* Intern statically allocated string identifiers, deepfreeze strings,
+ * and one-byte latin-1 strings.
* This must be done before any module initialization so that statically
* allocated string identifiers are used instead of heap allocated strings.
* Deepfreeze uses the interned identifiers if present to save space
@@ -333,14 +334,11 @@ init_global_interned_strings(PyInterpreterState *interp)
*/
_PyUnicode_InitStaticStrings(interp);
-#ifdef Py_GIL_DISABLED
-// In the free-threaded build, intern the 1-byte strings as well
for (int i = 0; i < 256; i++) {
PyObject *s = LATIN1(i);
_PyUnicode_InternStatic(interp, &s);
assert(s == LATIN1(i));
}
-#endif
#ifdef Py_DEBUG
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
@@ -15355,27 +15353,15 @@ intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
assert(s != NULL);
assert(_PyUnicode_CHECK(s));
assert(_PyUnicode_STATE(s).statically_allocated);
-
- switch (PyUnicode_CHECK_INTERNED(s)) {
- case SSTATE_NOT_INTERNED:
- break;
- case SSTATE_INTERNED_IMMORTAL_STATIC:
- return s;
- default:
- Py_FatalError("_PyUnicode_InternStatic called on wrong string");
- }
+ assert(!PyUnicode_CHECK_INTERNED(s));
#ifdef Py_DEBUG
/* We must not add process-global interned string if there's already a
* per-interpreter interned_dict, which might contain duplicates.
- * Except "short string" singletons: those are special-cased. */
+ */
PyObject *interned = get_interned_dict(interp);
- assert(interned == NULL || unicode_is_singleton(s));
-#ifdef Py_GIL_DISABLED
- // In the free-threaded build, don't allow even the short strings.
assert(interned == NULL);
#endif
-#endif
/* Look in the global cache first. */
PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
@@ -15446,11 +15432,6 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
return s;
}
- /* Handle statically allocated strings. */
- if (_PyUnicode_STATE(s).statically_allocated) {
- return intern_static(interp, s);
- }
-
/* Is it already interned? */
switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_NOT_INTERNED:
@@ -15467,6 +15448,9 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
return s;
}
+ /* Statically allocated strings must be already interned. */
+ assert(!_PyUnicode_STATE(s).statically_allocated);
+
#if Py_GIL_DISABLED
/* In the free-threaded build, all interned strings are immortal */
immortalize = 1;
@@ -15477,13 +15461,11 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
immortalize = 1;
}
- /* if it's a short string, get the singleton -- and intern it */
+ /* if it's a short string, get the singleton */
if (PyUnicode_GET_LENGTH(s) == 1 &&
PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
- if (!PyUnicode_CHECK_INTERNED(r)) {
- r = intern_static(interp, r);
- }
+ assert(PyUnicode_CHECK_INTERNED(r));
Py_DECREF(s);
return r;
}