-rw-r--r--  Include/unicodeobject.h |  46
-rw-r--r--  Objects/unicodeobject.c | 135
2 files changed, 144 insertions(+), 37 deletions(-)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 99dcdd8..ba73e56 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -206,6 +206,52 @@ extern "C" {
    immediately follow the structure. utf8_length and wstr_length can be found
    in the length field; the utf8 pointer is equal to the data pointer. */
 typedef struct {
+    /* Unicode strings can be in 4 states:
+
+       - compact ascii:
+
+         * structure = PyASCIIObject
+         * kind = PyUnicode_1BYTE_KIND
+         * compact = 1
+         * ascii = 1
+         * ready = 1
+         * utf8 = data
+
+       - compact:
+
+         * structure = PyCompactUnicodeObject
+         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
+           PyUnicode_4BYTE_KIND
+         * compact = 1
+         * ready = 1
+         * (ascii = 0)
+
+       - string created by the legacy API (not ready):
+
+         * structure = PyUnicodeObject
+         * kind = PyUnicode_WCHAR_KIND
+         * compact = 0
+         * ready = 0
+         * wstr is not NULL
+         * data.any is NULL
+         * utf8 is NULL
+         * interned = SSTATE_NOT_INTERNED
+         * (ascii = 0)
+
+       - string created by the legacy API, ready:
+
+         * structure = PyUnicodeObject
+         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
+           PyUnicode_4BYTE_KIND
+         * compact = 0
+         * ready = 1
+         * data.any is not NULL
+         * (ascii = 0)
+
+       A string created by the legacy API becomes ready when
+       PyUnicode_READY() is called on it.
+
+       See also _PyUnicode_CheckConsistency(). */
     PyObject_HEAD
     Py_ssize_t length;          /* Number of code points in the string */
     Py_hash_t hash;             /* Hash value; -1 if not set */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 29788b3..284809d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -89,25 +89,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 extern "C" {
 #endif
 
-/* Generic helper macro to convert characters of different types.
-   from_type and to_type have to be valid type names, begin and end
-   are pointers to the source characters which should be of type
-   "from_type *". to is a pointer of type "to_type *" and points to the
-   buffer where the result characters are written to. */
-#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
-    do {                                                \
-        const from_type *iter_; to_type *to_;           \
-        for (iter_ = (begin), to_ = (to_type *)(to);    \
-             iter_ < (end);                             \
-             ++iter_, ++to_) {                          \
-            *to_ = (to_type)*iter_;                     \
-        }                                               \
-    } while (0)
+#ifdef Py_DEBUG
+# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
+#else
+# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
+#endif
 
 #define _PyUnicode_UTF8(op)                             \
     (((PyCompactUnicodeObject*)(op))->utf8)
 #define PyUnicode_UTF8(op)                              \
-    (assert(PyUnicode_Check(op)),                       \
+    (assert(_PyUnicode_CHECK(op)),                      \
      assert(PyUnicode_IS_READY(op)),                    \
      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
          ((char*)((PyASCIIObject*)(op) + 1)) :          \
@@ -115,7 +106,7 @@ extern "C" {
 #define _PyUnicode_UTF8_LENGTH(op)                      \
     (((PyCompactUnicodeObject*)(op))->utf8_length)
 #define PyUnicode_UTF8_LENGTH(op)                       \
-    (assert(PyUnicode_Check(op)),                       \
+    (assert(_PyUnicode_CHECK(op)),                      \
      assert(PyUnicode_IS_READY(op)),                    \
      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
          ((PyASCIIObject*)(op))->length :               \
@@ -125,22 +116,42 @@ extern "C" {
 #define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
 #define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
 #define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
-#define _PyUnicode_KIND(op) \
-    (assert(PyUnicode_Check(op)), \
+#define _PyUnicode_KIND(op)                             \
+    (assert(_PyUnicode_CHECK(op)),                      \
      ((PyASCIIObject *)(op))->state.kind)
-#define _PyUnicode_GET_LENGTH(op) \
-    (assert(PyUnicode_Check(op)), \
+#define _PyUnicode_GET_LENGTH(op)                       \
+    (assert(_PyUnicode_CHECK(op)),                      \
      ((PyASCIIObject *)(op))->length)
 #define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
 
+#undef PyUnicode_READY
+#define PyUnicode_READY(op)                             \
+    (assert(_PyUnicode_CHECK(op)),                      \
+     (PyUnicode_IS_READY(op) ?                          \
+      0 : _PyUnicode_Ready((PyObject *)(op))))
+
 /* true if the Unicode object has an allocated UTF-8 memory block
    (not shared with other data) */
-#define _PyUnicode_HAS_UTF8_MEMORY(op) \
-    (assert(PyUnicode_Check(op)), \
-     (!PyUnicode_IS_COMPACT_ASCII(op) \
-      && _PyUnicode_UTF8(op) \
+#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
+    (assert(_PyUnicode_CHECK(op)),                      \
+     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
+      && _PyUnicode_UTF8(op)                            \
       && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
 
+/* Generic helper macro to convert characters of different types.
+   from_type and to_type have to be valid type names, begin and end
+   are pointers to the source characters which should be of type
+   "from_type *". to is a pointer of type "to_type *" and points to the
+   buffer where the result characters are written to. */
+#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
+    do {                                                \
+        const from_type *iter_; to_type *to_;           \
+        for (iter_ = (begin), to_ = (to_type *)(to);    \
+             iter_ < (end);                             \
+             ++iter_, ++to_) {                          \
+            *to_ = (to_type)*iter_;                     \
+        }                                               \
+    } while (0)
+
 /* The Unicode string has been modified: reset the hash */
 #define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
@@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
 #endif
 }
 
+#ifdef Py_DEBUG
+static int
+_PyUnicode_CheckConsistency(void *op)
+{
+    PyASCIIObject *ascii;
+    unsigned int kind;
+
+    assert(PyUnicode_Check(op));
+
+    ascii = (PyASCIIObject *)op;
+    kind = ascii->state.kind;
+
+    if (ascii->state.ascii == 1) {
+        assert(kind == PyUnicode_1BYTE_KIND);
+        assert(ascii->state.compact == 1);
+        assert(ascii->state.ready == 1);
+    }
+    else if (ascii->state.compact == 1) {
+        assert(kind == PyUnicode_1BYTE_KIND
+               || kind == PyUnicode_2BYTE_KIND
+               || kind == PyUnicode_4BYTE_KIND);
+        assert(ascii->state.compact == 1);
+        assert(ascii->state.ascii == 0);
+        assert(ascii->state.ready == 1);
+    } else {
+        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
+        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
+
+        if (kind == PyUnicode_WCHAR_KIND) {
+            assert(ascii->state.compact == 0);
+            assert(ascii->state.ascii == 0);
+            assert(ascii->state.ready == 0);
+            assert(ascii->wstr != NULL);
+            assert(unicode->data.any == NULL);
+            assert(compact->utf8 == NULL);
+            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
+        }
+        else {
+            assert(kind == PyUnicode_1BYTE_KIND
+                   || kind == PyUnicode_2BYTE_KIND
+                   || kind == PyUnicode_4BYTE_KIND);
+            assert(ascii->state.compact == 0);
+            assert(ascii->state.ready == 1);
+            assert(unicode->data.any != NULL);
+            assert(ascii->state.ascii == 0);
+        }
+    }
+    return 1;
+}
+#endif
+
 /* --- Bloom Filters ----------------------------------------------------- */
 
 /* stuff to implement simple "bloom filters" for Unicode characters.
@@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
 static const char*
 unicode_kind_name(PyObject *unicode)
 {
-    assert(PyUnicode_Check(unicode));
+    assert(_PyUnicode_CHECK(unicode));
     if (!PyUnicode_IS_COMPACT(unicode))
     {
         if (!PyUnicode_IS_READY(unicode))
@@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
     const wchar_t *iter;
     Py_UCS4 *ucs4_out;
 
-    assert(unicode && PyUnicode_Check(unicode));
+    assert(unicode != NULL);
+    assert(_PyUnicode_CHECK(unicode));
     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
 
@@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
 static int
 _PyUnicode_Dirty(PyObject *unicode)
 {
-    assert(PyUnicode_Check(unicode));
+    assert(_PyUnicode_CHECK(unicode));
     if (Py_REFCNT(unicode) != 1) {
         PyErr_SetString(PyExc_ValueError,
                         "Cannot modify a string having more than 1 reference");
@@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
        strings were created using _PyObject_New() and where no canonical
        representation (the str field) has been set yet aka strings
        which are not yet ready. */
-    assert(PyUnicode_Check(obj));
-    assert(!PyUnicode_IS_READY(obj));
-    assert(!PyUnicode_IS_COMPACT(obj));
-    assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
+    assert(_PyUnicode_CHECK(unicode));
+    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
     assert(_PyUnicode_WSTR(unicode) != NULL);
     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
     assert(_PyUnicode_UTF8(unicode) == NULL);
@@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
     assert(PyUnicode_Check(unicode));
     assert(0 <= length);
 
-    if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
+    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
         old_length = PyUnicode_WSTR_LENGTH(unicode);
     else
         old_length = PyUnicode_GET_LENGTH(unicode);
@@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
         case 'U':
         {
             PyObject *obj = va_arg(count, PyObject *);
-            assert(obj && PyUnicode_Check(obj));
+            assert(obj && _PyUnicode_CHECK(obj));
             if (PyUnicode_READY(obj) == -1)
                 goto fail;
             argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
@@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
             const char *str = va_arg(count, const char *);
             PyObject *str_obj;
             assert(obj || str);
-            assert(!obj || PyUnicode_Check(obj));
+            assert(!obj || _PyUnicode_CHECK(obj));
             if (obj) {
                 if (PyUnicode_READY(obj) == -1)
                     goto fail;
@@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
     void *data;
     Py_UCS4 chr;
 
-    assert(PyUnicode_Check(uni));
+    assert(_PyUnicode_CHECK(uni));
     if (PyUnicode_READY(uni) == -1)
         return -1;
     kind = PyUnicode_KIND(uni);
@@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
     if (unicode == NULL)
        return NULL;
-    assert(PyUnicode_Check(unicode));
+    assert(_PyUnicode_CHECK(unicode));
     if (PyUnicode_READY(unicode))
        return NULL;
 
@@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
     seq = it->it_seq;
     if (seq == NULL)
         return NULL;
-    assert(_PyUnicode_CHECK(seq));
 
     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
         int kind = PyUnicode_KIND(seq);
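
The patch's central pattern is that _PyUnicode_CHECK() expands to the full _PyUnicode_CheckConsistency() walk over the documented state invariants in Py_DEBUG builds, and falls back to the cheap PyUnicode_Check() type test otherwise, so the extra assertions cost nothing in release builds. The sketch below is a minimal standalone illustration of that pattern and of the four string states; it is not CPython code, and str_state, STR_CHECK and MY_DEBUG are invented names used only for this example.

/* Minimal standalone sketch (not CPython code): a debug-only consistency
   check hidden behind a cheap macro, mirroring the _PyUnicode_CHECK() /
   _PyUnicode_CheckConsistency() pair added by this patch.  The str_state
   struct only mimics the state bits documented in unicodeobject.h. */
#include <assert.h>
#include <stdio.h>

enum { KIND_WCHAR = 0, KIND_1BYTE = 1, KIND_2BYTE = 2, KIND_4BYTE = 4 };

typedef struct {
    unsigned int kind : 3;      /* character storage width */
    unsigned int compact : 1;   /* data block follows the header directly */
    unsigned int ascii : 1;     /* compact and pure ASCII */
    unsigned int ready : 1;     /* canonical representation is filled in */
} str_state;

/* Assert the invariants of the four legal states described above. */
static int
str_check_consistency(const str_state *s)
{
    if (s->ascii) {                       /* compact ascii */
        assert(s->kind == KIND_1BYTE);
        assert(s->compact && s->ready);
    }
    else if (s->compact) {                /* compact, not ASCII */
        assert(s->kind == KIND_1BYTE || s->kind == KIND_2BYTE
               || s->kind == KIND_4BYTE);
        assert(s->ready);
    }
    else if (s->kind == KIND_WCHAR) {     /* legacy string, not ready */
        assert(!s->ready);
    }
    else {                                /* legacy string, ready */
        assert(s->kind == KIND_1BYTE || s->kind == KIND_2BYTE
               || s->kind == KIND_4BYTE);
        assert(s->ready);
    }
    return 1;
}

#ifdef MY_DEBUG
#  define STR_CHECK(s) str_check_consistency(s)  /* full check, debug only */
#else
#  define STR_CHECK(s) 1                         /* no-op in release builds */
#endif

int
main(void)
{
    str_state compact_ascii = { KIND_1BYTE, 1, 1, 1 };
    if (STR_CHECK(&compact_ascii) && str_check_consistency(&compact_ascii))
        puts("compact ascii state is consistent");
    return 0;
}

Compiling with -DMY_DEBUG enables the full check; without it STR_CHECK() collapses to a constant, which mirrors how _PyUnicode_CHECK() degrades to the lightweight PyUnicode_Check() outside Py_DEBUG builds (the sketch degrades to 1 only because it has no type object to test).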