diff options
author | Victor Stinner <vstinner@python.org> | 2020-12-25 23:41:46 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-12-25 23:41:46 (GMT) |
commit | ba3d67c2fb04a7842741b1b6da5d67f22c579f33 (patch) | |
tree | d5d8f09c94f413f54ce6014e253ad1de59548bba /Objects | |
parent | f0853bcedf8531856bdda149a540eaa0fc26e692 (diff) | |
download | cpython-ba3d67c2fb04a7842741b1b6da5d67f22c579f33.zip cpython-ba3d67c2fb04a7842741b1b6da5d67f22c579f33.tar.gz cpython-ba3d67c2fb04a7842741b1b6da5d67f22c579f33.tar.bz2 |
bpo-39465: Fix _PyUnicode_FromId() for subinterpreters (GH-20058)
Make _PyUnicode_FromId() function compatible with subinterpreters.
Each interpreter now has an array of identifier objects (interned
strings decoded from UTF-8).
* Add PyInterpreterState.unicode.identifiers: array of identifiers
objects.
* Add _PyRuntimeState.unicode_ids used to allocate unique indexes
to _Py_Identifier.
* Rewrite the _Py_Identifier structure.
Microbenchmark on _PyUnicode_FromId(&PyId_a) with _Py_IDENTIFIER(a):
[ref] 2.42 ns +- 0.00 ns -> [atomic] 3.39 ns +- 0.00 ns: 1.40x slower
This change adds 1 ns per _PyUnicode_FromId() call in average.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 85 |
1 files changed, 62 insertions, 23 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4093555..3238d1e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #define PY_SSIZE_T_CLEAN #include "Python.h" #include "pycore_abstract.h" // _PyIndex_Check() +#include "pycore_atomic_funcs.h" // _Py_atomic_size_get() #include "pycore_bytes_methods.h" // _Py_bytes_lower() #include "pycore_format.h" // F_LJUST #include "pycore_initconfig.h" // _PyStatus_OK() @@ -302,9 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed); -/* List of static strings. */ -static _Py_Identifier *static_strings = NULL; - /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -2312,42 +2310,85 @@ PyUnicode_FromString(const char *u) return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); } + PyObject * _PyUnicode_FromId(_Py_Identifier *id) { - if (id->object) { - return id->object; + PyInterpreterState *interp = _PyInterpreterState_GET(); + struct _Py_unicode_ids *ids = &interp->unicode.ids; + + int index = _Py_atomic_size_get(&id->index); + if (index < 0) { + struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids; + + PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); + // Check again to detect concurrent access. Another thread can have + // initialized the index while this thread waited for the lock. + index = _Py_atomic_size_get(&id->index); + if (index < 0) { + assert(rt_ids->next_index < PY_SSIZE_T_MAX); + index = rt_ids->next_index; + rt_ids->next_index++; + _Py_atomic_size_set(&id->index, index); + } + PyThread_release_lock(rt_ids->lock); } + assert(index >= 0); PyObject *obj; - obj = PyUnicode_DecodeUTF8Stateful(id->string, - strlen(id->string), + if (index < ids->size) { + obj = ids->array[index]; + if (obj) { + // Return a borrowed reference + return obj; + } + } + + obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string), NULL, NULL); if (!obj) { return NULL; } PyUnicode_InternInPlace(&obj); - assert(!id->next); - id->object = obj; - id->next = static_strings; - static_strings = id; - return id->object; + if (index >= ids->size) { + // Overallocate to reduce the number of realloc + Py_ssize_t new_size = Py_MAX(index * 2, 16); + Py_ssize_t item_size = sizeof(ids->array[0]); + PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size); + if (new_array == NULL) { + PyErr_NoMemory(); + return NULL; + } + memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size); + ids->array = new_array; + ids->size = new_size; + } + + // The array stores a strong reference + ids->array[index] = obj; + + // Return a borrowed reference + return obj; } + static void -unicode_clear_static_strings(void) +unicode_clear_identifiers(PyThreadState *tstate) { - _Py_Identifier *tmp, *s = static_strings; - while (s) { - Py_CLEAR(s->object); - tmp = s->next; - s->next = NULL; - s = tmp; + PyInterpreterState *interp = _PyInterpreterState_GET(); + struct _Py_unicode_ids *ids = &interp->unicode.ids; + for (Py_ssize_t i=0; i < ids->size; i++) { + Py_XDECREF(ids->array[i]); } - static_strings = NULL; + ids->size = 0; + PyMem_Free(ids->array); + ids->array = NULL; + // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid + // after Py_Finalize(). } + /* Internal function, doesn't check maximum character */ PyObject* @@ -16238,9 +16279,7 @@ _PyUnicode_Fini(PyThreadState *tstate) Py_CLEAR(state->latin1[i]); } - if (_Py_IsMainInterpreter(tstate)) { - unicode_clear_static_strings(); - } + unicode_clear_identifiers(tstate); _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec); } |