summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Victor Stinner <vstinner@python.org>, 2020-06-23 22:10:40 (GMT)
committer: GitHub <noreply@github.com>, 2020-06-23 22:10:40 (GMT)
commit: f363d0a6e9cfa50677a6de203735fbc0d06c2f49 (patch)
tree: 9092c9d82a215dcfce789e4ad81ac2b4e8be2fed
parent: d051801052211b533c46a593b1c1bccf649a171c (diff)
download: cpython-f363d0a6e9cfa50677a6de203735fbc0d06c2f49.zip
cpython-f363d0a6e9cfa50677a6de203735fbc0d06c2f49.tar.gz
cpython-f363d0a6e9cfa50677a6de203735fbc0d06c2f49.tar.bz2
bpo-40521: Make empty Unicode string per interpreter (GH-21096)
Each interpreter now has its own empty Unicode string singleton.
-rw-r--r-- Include/internal/pycore_interp.h 2
-rw-r--r-- Include/internal/pycore_pylifecycle.h 2
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst 2
-rw-r--r-- Objects/stringlib/asciilib.h 1
-rw-r--r-- Objects/stringlib/partition.h 7
-rw-r--r-- Objects/stringlib/stringdefs.h 4
-rw-r--r-- Objects/stringlib/ucs1lib.h 1
-rw-r--r-- Objects/stringlib/ucs2lib.h 1
-rw-r--r-- Objects/stringlib/ucs4lib.h 1
-rw-r--r-- Objects/stringlib/unicodedefs.h 1
-rw-r--r-- Objects/unicodeobject.c 190
-rw-r--r-- Python/pylifecycle.c 8
12 files changed, 130 insertions, 90 deletions
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 435a72a..d8947e7 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -71,6 +71,8 @@ struct _Py_bytes_state {
};
struct _Py_unicode_state {
+ // The empty Unicode object is a singleton to improve performance.
+ PyObject *empty;
struct _Py_unicode_fs_codec fs_codec;
};
diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h
index cd47044..f29c7cb 100644
--- a/Include/internal/pycore_pylifecycle.h
+++ b/Include/internal/pycore_pylifecycle.h
@@ -31,7 +31,7 @@ PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc);
/* Various one-time initializers */
-extern PyStatus _PyUnicode_Init(void);
+extern PyStatus _PyUnicode_Init(PyThreadState *tstate);
extern int _PyStructSequence_Init(void);
extern int _PyLong_Init(PyThreadState *tstate);
extern PyStatus _PyFaulthandler_Init(int enable);
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
index 9b94bcc..e970551 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
@@ -2,7 +2,7 @@ Each interpreter now its has own free lists, singletons and caches:
* Free lists: float, tuple, list, dict, frame, context,
asynchronous generator, MemoryError.
-* Singletons: empty tuple, empty bytes string,
+* Singletons: empty tuple, empty bytes string, empty Unicode string,
single byte character.
* Slice cache.
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h
index 8599d38..7749e8f 100644
--- a/Objects/stringlib/asciilib.h
+++ b/Objects/stringlib/asciilib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h
index 3731df5..bcc2176 100644
--- a/Objects/stringlib/partition.h
+++ b/Objects/stringlib/partition.h
@@ -1,9 +1,14 @@
/* stringlib: partition implementation */
#ifndef STRINGLIB_FASTSEARCH_H
-#error must include "stringlib/fastsearch.h" before including this module
+# error must include "stringlib/fastsearch.h" before including this module
#endif
+#if !STRINGLIB_MUTABLE && !defined(STRINGLIB_GET_EMPTY)
+# error "STRINGLIB_GET_EMPTY must be defined if STRINGLIB_MUTABLE is zero"
+#endif
+
+
Py_LOCAL_INLINE(PyObject*)
STRINGLIB(partition)(PyObject* str_obj,
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h
index c12ecc5..88641b2 100644
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@@ -1,10 +1,6 @@
#ifndef STRINGLIB_STRINGDEFS_H
#define STRINGLIB_STRINGDEFS_H
-#ifndef STRINGLIB_GET_EMPTY
-# error "STRINGLIB_GET_EMPTY macro must be defined"
-#endif
-
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */
diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h
index bdf3035..5b0b8a0 100644
--- a/Objects/stringlib/ucs1lib.h
+++ b/Objects/stringlib/ucs1lib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h
index 9d68888..6af0151 100644
--- a/Objects/stringlib/ucs2lib.h
+++ b/Objects/stringlib/ucs2lib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h
index c7dfa52..39071a0 100644
--- a/Objects/stringlib/ucs4lib.h
+++ b/Objects/stringlib/ucs4lib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h
index e4d4163..5ea79cd 100644
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@@ -13,7 +13,6 @@
#define STRINGLIB_CHAR Py_UNICODE
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_GET_EMPTY() unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1433848..06ca7a5 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -222,26 +222,43 @@ extern "C" {
static PyObject *interned = NULL;
#endif
-/* The empty Unicode object is shared to improve performance. */
-static PyObject *unicode_empty = NULL;
+static struct _Py_unicode_state*
+get_unicode_state(void)
+{
+ PyInterpreterState *interp = _PyInterpreterState_GET();
+ return &interp->unicode;
+}
-#define _Py_INCREF_UNICODE_EMPTY() \
- do { \
- if (unicode_empty != NULL) \
- Py_INCREF(unicode_empty); \
- else { \
- unicode_empty = PyUnicode_New(0, 0); \
- if (unicode_empty != NULL) { \
- Py_INCREF(unicode_empty); \
- assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
- } \
- } \
- } while (0)
-#define _Py_RETURN_UNICODE_EMPTY() \
- do { \
- _Py_INCREF_UNICODE_EMPTY(); \
- return unicode_empty; \
+// Return a borrowed reference to the empty string singleton.
+// Return NULL if the singleton was not created yet.
+static inline PyObject* unicode_get_empty(void)
+{
+ struct _Py_unicode_state *state = get_unicode_state();
+ return state->empty;
+}
+
+static inline PyObject* unicode_new_empty(void)
+{
+ struct _Py_unicode_state *state = get_unicode_state();
+ PyObject *empty = state->empty;
+ if (empty != NULL) {
+ Py_INCREF(empty);
+ }
+ else {
+ empty = PyUnicode_New(0, 0);
+ if (empty != NULL) {
+ Py_INCREF(empty);
+ assert(_PyUnicode_CheckConsistency(empty, 1));
+ state->empty = empty;
+ }
+ }
+ return empty;
+}
+
+#define _Py_RETURN_UNICODE_EMPTY() \
+ do { \
+ return unicode_new_empty(); \
} while (0)
static inline void
@@ -676,11 +693,15 @@ unicode_result_ready(PyObject *unicode)
length = PyUnicode_GET_LENGTH(unicode);
if (length == 0) {
- if (unicode != unicode_empty) {
+ PyObject *empty = unicode_get_empty();
+ if (unicode != empty) {
Py_DECREF(unicode);
- _Py_RETURN_UNICODE_EMPTY();
+
+ Py_INCREF(empty);
+ return empty;
}
- return unicode_empty;
+ // unicode is the empty string singleton
+ return unicode;
}
#ifdef LATIN1_SINGLETONS
@@ -864,7 +885,7 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str,
to keep things simple, we use a single bitmask, using the least 5
bits from each unicode characters as the bit index. */
-/* the linebreak mask is set up by Unicode_Init below */
+/* the linebreak mask is set up by _PyUnicode_Init() below */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
@@ -938,6 +959,8 @@ ensure_unicode(PyObject *obj)
/* Compilation of templated routines */
+#define STRINGLIB_GET_EMPTY() unicode_get_empty()
+
#include "stringlib/asciilib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
@@ -986,6 +1009,8 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS
#include "stringlib/undef.h"
_Py_COMP_DIAG_POP
+#undef STRINGLIB_GET_EMPTY
+
/* --- Unicode Object ----------------------------------------------------- */
static inline Py_ssize_t
@@ -1234,9 +1259,12 @@ _PyUnicode_New(Py_ssize_t length)
size_t new_size;
/* Optimization for empty strings */
- if (length == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return (PyUnicodeObject*)unicode_empty;
+ if (length == 0) {
+ PyObject *empty = unicode_get_empty();
+ if (empty != NULL) {
+ Py_INCREF(empty);
+ return (PyUnicodeObject *)empty;
+ }
}
/* Ensure we won't overflow the size. */
@@ -1386,6 +1414,15 @@ _PyUnicode_Dump(PyObject *op)
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
+ /* Optimization for empty strings */
+ if (size == 0) {
+ PyObject *empty = unicode_get_empty();
+ if (empty != NULL) {
+ Py_INCREF(empty);
+ return empty;
+ }
+ }
+
PyObject *obj;
PyCompactUnicodeObject *unicode;
void *data;
@@ -1394,12 +1431,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
Py_ssize_t char_size;
Py_ssize_t struct_size;
- /* Optimization for empty strings */
- if (size == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
-
is_ascii = 0;
is_sharing = 0;
struct_size = sizeof(PyCompactUnicodeObject);
@@ -1970,7 +2001,8 @@ unicode_dealloc(PyObject *unicode)
static int
unicode_is_singleton(PyObject *unicode)
{
- if (unicode == unicode_empty) {
+ struct _Py_unicode_state *state = get_unicode_state();
+ if (unicode == state->empty) {
return 1;
}
#ifdef LATIN1_SINGLETONS
@@ -2026,10 +2058,10 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
return 0;
if (length == 0) {
- _Py_INCREF_UNICODE_EMPTY();
- if (!unicode_empty)
+ PyObject *empty = unicode_new_empty();
+ if (!empty)
return -1;
- Py_SETREF(*p_unicode, unicode_empty);
+ Py_SETREF(*p_unicode, empty);
return 0;
}
@@ -10836,10 +10868,10 @@ replace(PyObject *self, PyObject *str1,
}
new_size = slen + n * (len2 - len1);
if (new_size == 0) {
- _Py_INCREF_UNICODE_EMPTY();
- if (!unicode_empty)
+ PyObject *empty = unicode_new_empty();
+ if (!empty)
goto error;
- u = unicode_empty;
+ u = empty;
goto done;
}
if (new_size > (PY_SSIZE_T_MAX / rkind)) {
@@ -11497,10 +11529,13 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
return NULL;
/* Shortcuts */
- if (left == unicode_empty)
+ PyObject *empty = unicode_get_empty(); // Borrowed reference
+ if (left == empty) {
return PyUnicode_FromObject(right);
- if (right == unicode_empty)
+ }
+ if (right == empty) {
return PyUnicode_FromObject(left);
+ }
left_len = PyUnicode_GET_LENGTH(left);
right_len = PyUnicode_GET_LENGTH(right);
@@ -11551,14 +11586,16 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
goto error;
/* Shortcuts */
- if (left == unicode_empty) {
+ PyObject *empty = unicode_get_empty(); // Borrowed reference
+ if (left == empty) {
Py_DECREF(left);
Py_INCREF(right);
*p_left = right;
return;
}
- if (right == unicode_empty)
+ if (right == empty) {
return;
+ }
left_len = PyUnicode_GET_LENGTH(left);
right_len = PyUnicode_GET_LENGTH(right);
@@ -13255,12 +13292,12 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj);
if (kind1 < kind2 || len1 < len2) {
- _Py_INCREF_UNICODE_EMPTY();
- if (!unicode_empty)
+ PyObject *empty = unicode_get_empty(); // Borrowed reference
+ if (!empty) {
out = NULL;
+ }
else {
- out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
- Py_DECREF(unicode_empty);
+ out = PyTuple_Pack(3, str_obj, empty, empty);
}
return out;
}
@@ -13313,12 +13350,12 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj);
if (kind1 < kind2 || len1 < len2) {
- _Py_INCREF_UNICODE_EMPTY();
- if (!unicode_empty)
+ PyObject *empty = unicode_get_empty(); // Borrowed reference
+ if (!empty) {
out = NULL;
+ }
else {
- out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
- Py_DECREF(unicode_empty);
+ out = PyTuple_Pack(3, empty, empty, str_obj);
}
return out;
}
@@ -15538,10 +15575,10 @@ PyTypeObject PyUnicode_Type = {
/* Initialize the Unicode implementation */
PyStatus
-_PyUnicode_Init(void)
+_PyUnicode_Init(PyThreadState *tstate)
{
/* XXX - move this array to unicodectype.c ? */
- Py_UCS2 linebreak[] = {
+ const Py_UCS2 linebreak[] = {
0x000A, /* LINE FEED */
0x000D, /* CARRIAGE RETURN */
0x001C, /* FILE SEPARATOR */
@@ -15553,29 +15590,31 @@ _PyUnicode_Init(void)
};
/* Init the implementation */
- _Py_INCREF_UNICODE_EMPTY();
- if (!unicode_empty) {
- return _PyStatus_ERR("Can't create empty string");
+ PyObject *empty = unicode_new_empty();
+ if (!empty) {
+ return _PyStatus_NO_MEMORY();
}
- Py_DECREF(unicode_empty);
+ Py_DECREF(empty);
- if (PyType_Ready(&PyUnicode_Type) < 0) {
- return _PyStatus_ERR("Can't initialize unicode type");
- }
+ if (_Py_IsMainInterpreter(tstate)) {
+ /* initialize the linebreak bloom filter */
+ bloom_linebreak = make_bloom_mask(
+ PyUnicode_2BYTE_KIND, linebreak,
+ Py_ARRAY_LENGTH(linebreak));
- /* initialize the linebreak bloom filter */
- bloom_linebreak = make_bloom_mask(
- PyUnicode_2BYTE_KIND, linebreak,
- Py_ARRAY_LENGTH(linebreak));
+ if (PyType_Ready(&PyUnicode_Type) < 0) {
+ return _PyStatus_ERR("Can't initialize unicode type");
+ }
- if (PyType_Ready(&EncodingMapType) < 0) {
- return _PyStatus_ERR("Can't initialize encoding map type");
- }
- if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
- return _PyStatus_ERR("Can't initialize field name iterator type");
- }
- if (PyType_Ready(&PyFormatterIter_Type) < 0) {
- return _PyStatus_ERR("Can't initialize formatter iter type");
+ if (PyType_Ready(&EncodingMapType) < 0) {
+ return _PyStatus_ERR("Can't initialize encoding map type");
+ }
+ if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
+ return _PyStatus_ERR("Can't initialize field name iterator type");
+ }
+ if (PyType_Ready(&PyFormatterIter_Type) < 0) {
+ return _PyStatus_ERR("Can't initialize formatter iter type");
+ }
}
return _PyStatus_OK();
}
@@ -16205,7 +16244,10 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
void
_PyUnicode_Fini(PyThreadState *tstate)
{
- if (_Py_IsMainInterpreter(tstate)) {
+ struct _Py_unicode_state *state = &tstate->interp->unicode;
+
+ int is_main_interp = _Py_IsMainInterpreter(tstate);
+ if (is_main_interp) {
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Insure++ is a memory analysis tool that aids in discovering
* memory leaks and other memory problems. On Python exit, the
@@ -16218,9 +16260,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
*/
unicode_release_interned();
#endif /* __INSURE__ */
+ }
- Py_CLEAR(unicode_empty);
+ Py_CLEAR(state->empty);
+ if (is_main_interp) {
#ifdef LATIN1_SINGLETONS
for (Py_ssize_t i = 0; i < 256; i++) {
Py_CLEAR(unicode_latin1[i]);
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index f0b40b3..eda4c6a 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -595,11 +595,9 @@ pycore_init_types(PyThreadState *tstate)
return _PyStatus_ERR("can't init longs");
}
- if (is_main_interp) {
- status = _PyUnicode_Init();
- if (_PyStatus_EXCEPTION(status)) {
- return status;
- }
+ status = _PyUnicode_Init(tstate);
+ if (_PyStatus_EXCEPTION(status)) {
+ return status;
}
status = _PyExc_Init(tstate);