diff options
Diffstat (limited to 'Include')
-rw-r--r-- | Include/cpython/unicodeobject.h | 233 | ||||
-rw-r--r-- | Include/internal/pycore_runtime_init.h | 1 | ||||
-rw-r--r-- | Include/unicodeobject.h | 11 |
3 files changed, 15 insertions, 230 deletions
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 1e3bdad..8c53962 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -11,10 +11,6 @@ /* --- Internal Unicode Operations ---------------------------------------- */ -#ifndef USE_UNICODE_WCHAR_CACHE -# define USE_UNICODE_WCHAR_CACHE 1 -#endif /* USE_UNICODE_WCHAR_CACHE */ - // Static inline functions to work with surrogates static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) { return (0xD800 <= ch && ch <= 0xDFFF); @@ -51,7 +47,7 @@ static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) { /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject structure. state.ascii and state.compact are set, and the data - immediately follow the structure. utf8_length and wstr_length can be found + immediately follow the structure. utf8_length can be found in the length field; the utf8 pointer is equal to the data pointer. */ typedef struct { /* There are 4 forms of Unicode strings: @@ -63,8 +59,7 @@ typedef struct { * kind = PyUnicode_1BYTE_KIND * compact = 1 * ascii = 1 - * ready = 1 - * (length is the length of the utf8 and wstr strings) + * (length is the length of the utf8) * (data starts just after the structure) * (since ASCII is decoded from UTF-8, the utf8 string are the data) @@ -75,55 +70,27 @@ typedef struct { * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND * compact = 1 - * ready = 1 * ascii = 0 * utf8 is not shared with data * utf8_length = 0 if utf8 is NULL - * wstr is shared with data and wstr_length=length - if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 - or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 - * wstr_length = 0 if wstr is NULL * (data starts just after the structure) - - legacy string, not ready: - - * structure = PyUnicodeObject - * test: kind == PyUnicode_WCHAR_KIND - * length = 0 (use wstr_length) - * hash = -1 - * kind = PyUnicode_WCHAR_KIND - * compact = 0 - * ascii = 0 - * ready = 0 - * interned = SSTATE_NOT_INTERNED - * wstr is not NULL - * data.any is NULL - * utf8 is NULL - * utf8_length = 0 - - - legacy string, ready: + - legacy string: * structure = PyUnicodeObject structure - * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND + * test: !PyUnicode_IS_COMPACT(op) * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND * compact = 0 - * ready = 1 * data.any is not NULL * utf8 is shared and utf8_length = length with data.any if ascii = 1 * utf8_length = 0 if utf8 is NULL - * wstr is shared with data.any and wstr_length = length - if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 - or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 - * wstr_length = 0 if wstr is NULL Compact strings use only one memory block (structure + characters), whereas legacy strings use one block for the structure and one block for characters. - Legacy strings are created by PyUnicode_FromUnicode() and - PyUnicode_FromStringAndSize(NULL, size) functions. They become ready - when PyUnicode_READY() is called. + Legacy strings are created by subclasses of Unicode. See also _PyUnicode_CheckConsistency(). */ @@ -142,11 +109,6 @@ typedef struct { unsigned int interned:2; /* Character size: - - PyUnicode_WCHAR_KIND (0): - - * character type = wchar_t (16 or 32 bits, depending on the - platform) - - PyUnicode_1BYTE_KIND (1): * character type = Py_UCS1 (8 bits, unsigned) @@ -177,16 +139,10 @@ typedef struct { and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is set, use the PyASCIIObject structure. */ unsigned int ascii:1; - /* The ready flag indicates whether the object layout is initialized - completely. This means that this is either a compact object, or - the data pointer is filled out. The bit is redundant, and helps - to minimize the test in PyUnicode_IS_READY(). */ - unsigned int ready:1; /* Padding to ensure that PyUnicode_DATA() is always aligned to 4 bytes (see issue #19537 on m68k). */ - unsigned int :24; + unsigned int :25; } state; - wchar_t *wstr; /* wchar_t representation (null-terminated) */ } PyASCIIObject; /* Non-ASCII strings allocated through PyUnicode_New use the @@ -197,13 +153,9 @@ typedef struct { Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the * terminating \0. */ char *utf8; /* UTF-8 representation (null-terminated) */ - Py_ssize_t wstr_length; /* Number of code points in wstr, possible - * surrogates count as two code points. */ } PyCompactUnicodeObject; -/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the - PyUnicodeObject structure. The actual string data is initially in the wstr - block, and copied into the data block using _PyUnicode_Ready. */ +/* Object format for Unicode subclasses. */ typedef struct { PyCompactUnicodeObject _base; union { @@ -247,10 +199,9 @@ static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) { # define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op)) #endif -/* Fast check to determine whether an object is ready. Equivalent to: - PyUnicode_IS_COMPACT(op) || _PyUnicodeObject_CAST(op)->data.any */ +/* For backward compatibility */ static inline unsigned int PyUnicode_IS_READY(PyObject *op) { - return _PyASCIIObject_CAST(op)->state.ready; + return 1; } #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 # define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op)) @@ -260,7 +211,6 @@ static inline unsigned int PyUnicode_IS_READY(PyObject *op) { string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be ready. */ static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) { - assert(PyUnicode_IS_READY(op)); return _PyASCIIObject_CAST(op)->state.ascii; } #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 @@ -286,10 +236,6 @@ static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) { #endif enum PyUnicode_Kind { -/* String contains only wstr byte characters. This is only possible - when the string was created with a legacy API and _PyUnicode_Ready() - has not been called yet. */ - PyUnicode_WCHAR_KIND = 0, /* Return values of the PyUnicode_KIND() function: */ PyUnicode_1BYTE_KIND = 1, PyUnicode_2BYTE_KIND = 2, @@ -298,8 +244,7 @@ enum PyUnicode_Kind { /* Return one of the PyUnicode_*_KIND values defined above. */ #define PyUnicode_KIND(op) \ - (assert(PyUnicode_IS_READY(op)), \ - _PyASCIIObject_CAST(op)->state.kind) + (_PyASCIIObject_CAST(op)->state.kind) /* Return a void pointer to the raw unicode buffer. */ static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) { @@ -335,11 +280,8 @@ static inline void* PyUnicode_DATA(PyObject *op) { #define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op)) #define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op)) -/* Returns the length of the unicode string. The caller has to make sure that - the string has it's canonical representation set before calling - this function. Call PyUnicode_(FAST_)Ready to ensure that. */ +/* Returns the length of the unicode string. */ static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { - assert(PyUnicode_IS_READY(op)); return _PyASCIIObject_CAST(op)->length; } #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 @@ -400,7 +342,6 @@ static inline Py_UCS4 PyUnicode_READ(int kind, cache kind and use PyUnicode_READ instead. */ static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) { - assert(PyUnicode_IS_READY(unicode)); int kind = PyUnicode_KIND(unicode); if (kind == PyUnicode_1BYTE_KIND) { return PyUnicode_1BYTE_DATA(unicode)[index]; @@ -421,7 +362,6 @@ static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) than iterating over the string. */ static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op) { - assert(PyUnicode_IS_READY(op)); if (PyUnicode_IS_ASCII(op)) { return 0x7fU; } @@ -453,27 +393,10 @@ PyAPI_FUNC(PyObject*) PyUnicode_New( Py_UCS4 maxchar /* maximum code point value in the string */ ); -/* Initializes the canonical string representation from the deprecated - wstr/Py_UNICODE representation. This function is used to convert Unicode - objects which were created using the old API to the new flexible format - introduced with PEP 393. - - Don't call this function directly, use the public PyUnicode_READY() function - instead. */ -PyAPI_FUNC(int) _PyUnicode_Ready( - PyObject *unicode /* Unicode object */ - ); - -/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best - case. If the canonical representation is not yet set, it will still call - _PyUnicode_Ready(). - Returns 0 on success and -1 on errors. */ +/* For backward compatibility */ static inline int PyUnicode_READY(PyObject *op) { - if (PyUnicode_IS_READY(op)) { - return 0; - } - return _PyUnicode_Ready(op); + return 0; } #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 # define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op)) @@ -565,133 +488,6 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( Py_ssize_t start, Py_ssize_t end); -/* --- Legacy deprecated API ---------------------------------------------- */ - -/* Create a Unicode Object from the Py_UNICODE buffer u of the given - size. - - u may be NULL which causes the contents to be undefined. It is the - user's responsibility to fill in the needed data afterwards. Note - that modifying the Unicode object contents after construction is - only allowed if u was set to NULL. - - The buffer is copied into the new object. */ -Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( - const Py_UNICODE *u, /* Unicode buffer */ - Py_ssize_t size /* size of buffer */ - ); - -/* Return a read-only pointer to the Unicode object's internal - Py_UNICODE buffer. - If the wchar_t/Py_UNICODE representation is not yet available, this - function will calculate it. */ -Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( - PyObject *unicode /* Unicode object */ - ); - -/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string - contains null characters. */ -PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode( - PyObject *unicode /* Unicode object */ - ); - -/* Return a read-only pointer to the Unicode object's internal - Py_UNICODE buffer and save the length at size. - If the wchar_t/Py_UNICODE representation is not yet available, this - function will calculate it. */ - -Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( - PyObject *unicode, /* Unicode object */ - Py_ssize_t *size /* location where to save the length */ - ); - - -/* Fast access macros */ - -Py_DEPRECATED(3.3) -static inline Py_ssize_t PyUnicode_WSTR_LENGTH(PyObject *op) -{ - if (PyUnicode_IS_COMPACT_ASCII(op)) { - return _PyASCIIObject_CAST(op)->length; - } - else { - return _PyCompactUnicodeObject_CAST(op)->wstr_length; - } -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_WSTR_LENGTH(op) PyUnicode_WSTR_LENGTH(_PyObject_CAST(op)) -#endif - -/* Returns the deprecated Py_UNICODE representation's size in code units - (this includes surrogate pairs as 2 units). - If the Py_UNICODE representation is not available, it will be computed - on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ - -Py_DEPRECATED(3.3) -static inline Py_ssize_t PyUnicode_GET_SIZE(PyObject *op) -{ - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - if (_PyASCIIObject_CAST(op)->wstr == _Py_NULL) { - (void)PyUnicode_AsUnicode(op); - assert(_PyASCIIObject_CAST(op)->wstr != _Py_NULL); - } - return PyUnicode_WSTR_LENGTH(op); - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_GET_SIZE(op) PyUnicode_GET_SIZE(_PyObject_CAST(op)) -#endif - -Py_DEPRECATED(3.3) -static inline Py_ssize_t PyUnicode_GET_DATA_SIZE(PyObject *op) -{ - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - return PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE; - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_GET_DATA_SIZE(op) PyUnicode_GET_DATA_SIZE(_PyObject_CAST(op)) -#endif - -/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE - representation on demand. Using this macro is very inefficient now, - try to port your code to use the new PyUnicode_*BYTE_DATA() macros or - use PyUnicode_WRITE() and PyUnicode_READ(). */ - -Py_DEPRECATED(3.3) -static inline Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *op) -{ - wchar_t *wstr = _PyASCIIObject_CAST(op)->wstr; - if (wstr != _Py_NULL) { - return wstr; - } - - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - return PyUnicode_AsUnicode(op); - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_AS_UNICODE(op) PyUnicode_AS_UNICODE(_PyObject_CAST(op)) -#endif - -Py_DEPRECATED(3.3) -static inline const char* PyUnicode_AS_DATA(PyObject *op) -{ - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - Py_UNICODE *data = PyUnicode_AS_UNICODE(op); - // In C++, casting directly PyUnicode* to const char* is not valid - return _Py_STATIC_CAST(const char*, _Py_STATIC_CAST(const void*, data)); - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_AS_DATA(op) PyUnicode_AS_DATA(_PyObject_CAST(op)) -#endif - - /* --- _PyUnicodeWriter API ----------------------------------------------- */ typedef struct { @@ -748,8 +544,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Return 0 on success, raise an exception and return -1 on error. */ #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ - (assert((KIND) != PyUnicode_WCHAR_KIND), \ - (KIND) <= (WRITER)->kind \ + ((KIND) <= (WRITER)->kind \ ? 0 \ : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h index 57cacb9..737507f 100644 --- a/Include/internal/pycore_runtime_init.h +++ b/Include/internal/pycore_runtime_init.h @@ -102,7 +102,6 @@ extern "C" { .kind = 1, \ .compact = 1, \ .ascii = ASCII, \ - .ready = 1, \ }, \ } #define _PyASCIIObject_INIT(LITERAL) \ diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 1d2f546..f71f379 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -171,13 +171,6 @@ PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( ); #endif -/* Get the number of Py_UNICODE units in the - string representation. */ - -Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( - PyObject *unicode /* Unicode object */ - ); - #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 /* Read a character from the string. */ @@ -198,9 +191,7 @@ PyAPI_FUNC(int) PyUnicode_WriteChar( ); #endif -/* Resize a Unicode object. The length is the number of characters, except - if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length - is the number of Py_UNICODE characters. +/* Resize a Unicode object. The length is the number of codepoints. *unicode is modified to point to the new (resized) object and 0 returned on success. |