 Include/unicodeobject.h |  46
 Objects/unicodeobject.c | 135
 2 files changed, 144 insertions(+), 37 deletions(-)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 99dcdd8..ba73e56 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -206,6 +206,52 @@ extern "C" {
immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
+ /* Unicode strings can be in 4 states:
+
+ - compact ascii:
+
+ * structure = PyASCIIObject
+ * kind = PyUnicode_1BYTE_KIND
+ * compact = 1
+ * ascii = 1
+ * ready = 1
+ * utf8 = data
+
+ - compact:
+
+ * structure = PyCompactUnicodeObject
+ * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
+ PyUnicode_4BYTE_KIND
+ * compact = 1
+ * ready = 1
+ * (ascii = 0)
+
+ - string created by the legacy API (not ready):
+
+ * structure = PyUnicodeObject
+ * kind = PyUnicode_WCHAR_KIND
+ * compact = 0
+ * ready = 0
+ * wstr is not NULL
+ * data.any is NULL
+ * utf8 is NULL
+ * interned = SSTATE_NOT_INTERNED
+ * (ascii = 0)
+
+ - string created by the legacy API, ready:
+
+      * structure = PyUnicodeObject
+ * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
+ PyUnicode_4BYTE_KIND
+ * compact = 0
+ * ready = 1
+ * data.any is not NULL
+ * (ascii = 0)
+
+   Strings created by the legacy API become ready when
+   PyUnicode_READY() is called.
+
+ See also _PyUnicode_CheckConsistency(). */
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
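The four states documented in the new header comment map directly onto the existing PyUnicode_IS_* macros. A minimal sketch of classifying an object by state, assuming the 3.3-era macros from this header (the classify() helper is illustrative, not part of the patch):

    #include "Python.h"

    /* Sketch: map a unicode object to one of the four documented states. */
    static const char *
    classify(PyObject *s)
    {
        if (PyUnicode_IS_COMPACT_ASCII(s))  /* compact = 1, ascii = 1, ready = 1 */
            return "compact ascii";
        if (PyUnicode_IS_COMPACT(s))        /* compact = 1, ascii = 0, ready = 1 */
            return "compact";
        if (!PyUnicode_IS_READY(s))         /* legacy, kind == PyUnicode_WCHAR_KIND */
            return "legacy (not ready)";
        return "legacy (ready)";            /* legacy, 1/2/4-byte kind */
    }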
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 29788b3..284809d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -89,25 +89,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
extern "C" {
#endif
-/* Generic helper macro to convert characters of different types.
- from_type and to_type have to be valid type names, begin and end
- are pointers to the source characters which should be of type
- "from_type *". to is a pointer of type "to_type *" and points to the
- buffer where the result characters are written to. */
-#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
- do { \
- const from_type *iter_; to_type *to_; \
- for (iter_ = (begin), to_ = (to_type *)(to); \
- iter_ < (end); \
- ++iter_, ++to_) { \
- *to_ = (to_type)*iter_; \
- } \
- } while (0)
+#ifdef Py_DEBUG
+# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
+#else
+# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
+#endif
#define _PyUnicode_UTF8(op) \
(((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op) \
- (assert(PyUnicode_Check(op)), \
+ (assert(_PyUnicode_CHECK(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)((PyASCIIObject*)(op) + 1)) : \
@@ -115,7 +106,7 @@ extern "C" {
#define _PyUnicode_UTF8_LENGTH(op) \
(((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
- (assert(PyUnicode_Check(op)), \
+ (assert(_PyUnicode_CHECK(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)(op))->length : \
@@ -125,22 +116,42 @@ extern "C" {
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
-#define _PyUnicode_KIND(op) \
- (assert(PyUnicode_Check(op)), \
+#define _PyUnicode_KIND(op) \
+ (assert(_PyUnicode_CHECK(op)), \
((PyASCIIObject *)(op))->state.kind)
-#define _PyUnicode_GET_LENGTH(op) \
- (assert(PyUnicode_Check(op)), \
+#define _PyUnicode_GET_LENGTH(op) \
+ (assert(_PyUnicode_CHECK(op)), \
((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
+#undef PyUnicode_READY
+#define PyUnicode_READY(op) \
+ (assert(_PyUnicode_CHECK(op)), \
+ (PyUnicode_IS_READY(op) ? \
+ 0 : _PyUnicode_Ready((PyObject *)(op))))
+
/* true if the Unicode object has an allocated UTF-8 memory block
(not shared with other data) */
-#define _PyUnicode_HAS_UTF8_MEMORY(op) \
- (assert(PyUnicode_Check(op)), \
- (!PyUnicode_IS_COMPACT_ASCII(op) \
- && _PyUnicode_UTF8(op) \
+#define _PyUnicode_HAS_UTF8_MEMORY(op) \
+ (assert(_PyUnicode_CHECK(op)), \
+ (!PyUnicode_IS_COMPACT_ASCII(op) \
+ && _PyUnicode_UTF8(op) \
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
+/* Generic helper macro to convert characters of different types.
+ from_type and to_type have to be valid type names, begin and end
+ are pointers to the source characters which should be of type
+ "from_type *". to is a pointer of type "to_type *" and points to the
+ buffer where the result characters are written to. */
+#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
+ do { \
+ const from_type *iter_; to_type *to_; \
+ for (iter_ = (begin), to_ = (to_type *)(to); \
+ iter_ < (end); \
+ ++iter_, ++to_) { \
+ *to_ = (to_type)*iter_; \
+ } \
+ } while (0)
/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
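The _PyUnicode_CONVERT_BYTES helper, relocated below the new assertion macros, is the generic widening/narrowing loop used when copying character buffers of different kinds. A minimal usage sketch with hypothetical local buffers (not taken from the file):

    /* Sketch: widen a latin-1 buffer into UCS2 storage with the macro above. */
    Py_UCS1 src[4] = {0x61, 0x62, 0x63, 0xe9};   /* "abc" plus U+00E9 in latin-1 */
    Py_UCS2 dst[4];
    _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, src, src + 4, dst);
    /* dst now holds the same four code points as 16-bit units */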
@@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
#endif
}
+#ifdef Py_DEBUG
+static int
+_PyUnicode_CheckConsistency(void *op)
+{
+ PyASCIIObject *ascii;
+ unsigned int kind;
+
+ assert(PyUnicode_Check(op));
+
+ ascii = (PyASCIIObject *)op;
+ kind = ascii->state.kind;
+
+ if (ascii->state.ascii == 1) {
+ assert(kind == PyUnicode_1BYTE_KIND);
+ assert(ascii->state.compact == 1);
+ assert(ascii->state.ready == 1);
+ }
+ else if (ascii->state.compact == 1) {
+ assert(kind == PyUnicode_1BYTE_KIND
+ || kind == PyUnicode_2BYTE_KIND
+ || kind == PyUnicode_4BYTE_KIND);
+ assert(ascii->state.compact == 1);
+ assert(ascii->state.ascii == 0);
+ assert(ascii->state.ready == 1);
+ } else {
+ PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
+ PyUnicodeObject *unicode = (PyUnicodeObject *)op;
+
+ if (kind == PyUnicode_WCHAR_KIND) {
+            assert(ascii->state.compact == 0);
+            assert(ascii->state.ascii == 0);
+            assert(ascii->state.ready == 0);
+ assert(ascii->wstr != NULL);
+ assert(unicode->data.any == NULL);
+ assert(compact->utf8 == NULL);
+ assert(ascii->state.interned == SSTATE_NOT_INTERNED);
+ }
+ else {
+ assert(kind == PyUnicode_1BYTE_KIND
+ || kind == PyUnicode_2BYTE_KIND
+ || kind == PyUnicode_4BYTE_KIND);
+            assert(ascii->state.compact == 0);
+ assert(ascii->state.ready == 1);
+ assert(unicode->data.any != NULL);
+ assert(ascii->state.ascii == 0);
+ }
+ }
+ return 1;
+}
+#endif
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
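With _PyUnicode_CHECK and _PyUnicode_CheckConsistency in place, every internal assert that previously used PyUnicode_Check() now expands differently per build. Roughly (hand-expanded, not literal preprocessor output):

    /* --with-pydebug build: walk all state invariants; the function
       returns 1 on success so it can sit inside assert(). */
    assert(_PyUnicode_CheckConsistency(op));

    /* release build: only the cheap type check */
    assert(PyUnicode_Check(op));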
@@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
static const char*
unicode_kind_name(PyObject *unicode)
{
- assert(PyUnicode_Check(unicode));
+ assert(_PyUnicode_CHECK(unicode));
if (!PyUnicode_IS_COMPACT(unicode))
{
if (!PyUnicode_IS_READY(unicode))
@@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
const wchar_t *iter;
Py_UCS4 *ucs4_out;
- assert(unicode && PyUnicode_Check(unicode));
+ assert(unicode != NULL);
+ assert(_PyUnicode_CHECK(unicode));
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
@@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
static int
_PyUnicode_Dirty(PyObject *unicode)
{
- assert(PyUnicode_Check(unicode));
+ assert(_PyUnicode_CHECK(unicode));
if (Py_REFCNT(unicode) != 1) {
PyErr_SetString(PyExc_ValueError,
"Cannot modify a string having more than 1 reference");
@@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
strings were created using _PyObject_New() and where no canonical
representation (the str field) has been set yet aka strings
which are not yet ready. */
- assert(PyUnicode_Check(obj));
- assert(!PyUnicode_IS_READY(obj));
- assert(!PyUnicode_IS_COMPACT(obj));
- assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
+ assert(_PyUnicode_CHECK(unicode));
+ assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
assert(_PyUnicode_WSTR(unicode) != NULL);
assert(_PyUnicode_DATA_ANY(unicode) == NULL);
assert(_PyUnicode_UTF8(unicode) == NULL);
@@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
assert(PyUnicode_Check(unicode));
assert(0 <= length);
- if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
+ if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
old_length = PyUnicode_WSTR_LENGTH(unicode);
else
old_length = PyUnicode_GET_LENGTH(unicode);
@@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 'U':
{
PyObject *obj = va_arg(count, PyObject *);
- assert(obj && PyUnicode_Check(obj));
+ assert(obj && _PyUnicode_CHECK(obj));
if (PyUnicode_READY(obj) == -1)
goto fail;
argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
@@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
const char *str = va_arg(count, const char *);
PyObject *str_obj;
assert(obj || str);
- assert(!obj || PyUnicode_Check(obj));
+ assert(!obj || _PyUnicode_CHECK(obj));
if (obj) {
if (PyUnicode_READY(obj) == -1)
goto fail;
@@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
void *data;
Py_UCS4 chr;
- assert(PyUnicode_Check(uni));
+ assert(_PyUnicode_CHECK(uni));
if (PyUnicode_READY(uni) == -1)
return -1;
kind = PyUnicode_KIND(uni);
@@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
if (unicode == NULL)
return NULL;
- assert(PyUnicode_Check(unicode));
+ assert(_PyUnicode_CHECK(unicode));
if (PyUnicode_READY(unicode))
return NULL;
@@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
seq = it->it_seq;
if (seq == NULL)
return NULL;
- assert(PyUnicode_Check(seq));
+ assert(_PyUnicode_CHECK(seq));
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
int kind = PyUnicode_KIND(seq);
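Throughout the .c changes the calling pattern is the same: validate with _PyUnicode_CHECK, call PyUnicode_READY() before touching kind or data, and treat -1 as failure. A self-contained sketch of that consumer pattern (count_nonlatin1 is a hypothetical helper, not part of the patch):

    #include "Python.h"

    /* Sketch: count code points >= U+0100 in a possibly-not-ready string,
       following the READY/KIND/DATA/READ pattern used in the patch. */
    static Py_ssize_t
    count_nonlatin1(PyObject *s)
    {
        Py_ssize_t i, n, count = 0;
        int kind;
        void *data;

        assert(PyUnicode_Check(s));
        if (PyUnicode_READY(s) == -1)   /* converts a legacy wstr string; may fail */
            return -1;
        kind = PyUnicode_KIND(s);
        data = PyUnicode_DATA(s);
        n = PyUnicode_GET_LENGTH(s);
        for (i = 0; i < n; i++) {
            if (PyUnicode_READ(kind, data, i) >= 0x100)
                count++;
        }
        return count;
    }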