Merging the py3k-pep3137 branch back into the py3k branch.

No detailed change log; just check out the change log for the py3k-pep3137 branch. The most obvious changes: - str8 renamed to bytes (PyString at the C level); - bytes renamed to buffer (PyBytes at the C level); - PyString and PyUnicode are no longer compatible. I.e. we now have an immutable bytes type and a mutable bytes type. The behavior of PyString was modified quite a bit, to make it more bytes-like. Some changes are still on the to-do list.
author: Guido van Rossum <guido@python.org> 2007-11-06 21:34:58 (GMT)
committer: Guido van Rossum <guido@python.org> 2007-11-06 21:34:58 (GMT)
commit: 98297ee7815939b124156e438b22bd652d67b5db (patch)
tree: a9d239ebd87c73af2571ab48003984c4e18e27e5 /Objects/unicodeobject.c
parent: a19f80c6df2df5e8a5d0cff37131097835ef971e (diff)
download: cpython-98297ee7815939b124156e438b22bd652d67b5db.zip
cpython-98297ee7815939b124156e438b22bd652d67b5db.tar.gz
cpython-98297ee7815939b124156e438b22bd652d67b5db.tar.bz2
1 files changed, 94 insertions, 131 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c568a8e..ae34c9e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -101,7 +101,7 @@ extern "C" {
    function will delete the reference from this dictionary.
 
    Another way to look at this is that to say that the actual reference
-   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
+   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 */
 static PyObject *interned;
 
@@ -998,7 +998,10 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
 	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 				     PyUnicode_GET_SIZE(obj));
     }
-    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
+    PyErr_Format(PyExc_TypeError,
+                 "Can't convert '%.100s' object to str implicitly",
+                 Py_Type(obj)->tp_name);
+    return NULL;
 }
 
 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
@@ -1219,22 +1222,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
     v = PyCodec_Encode(unicode, encoding, errors);
     if (v == NULL)
         goto onError;
-    if (!PyBytes_Check(v)) {
-        if (PyString_Check(v)) {
-            /* Old codec, turn it into bytes */
-            PyObject *b = PyBytes_FromObject(v);
-            Py_DECREF(v);
-            return b;
-        }
-        PyErr_Format(PyExc_TypeError,
-                     "encoder did not return a bytes object "
-                     "(type=%.400s, encoding=%.20s, errors=%.20s)",
-                     v->ob_type->tp_name,
-                     encoding ? encoding : "NULL",
-                     errors ? errors : "NULL");
-        Py_DECREF(v);
-        goto onError;
-    }
+    assert(PyString_Check(v));
     return v;
 
  onError:
@@ -1245,19 +1233,15 @@ PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 					    const char *errors)
 {
     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
-    PyObject *b;
     if (v)
         return v;
     if (errors != NULL)
         Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
-    b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
+    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                              PyUnicode_GET_SIZE(unicode),
                              NULL);
-    if (!b)
+    if (!v)
         return NULL;
-    v = PyString_FromStringAndSize(PyBytes_AsString(b),
-                                   PyBytes_Size(b));
-    Py_DECREF(b);
     ((PyUnicodeObject *)unicode)->defenc = v;
     return v;
 }
@@ -1420,11 +1404,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
     if (!inputobj)
         goto onError;
-    if (!PyBytes_Check(inputobj)) {
+    if (!PyString_Check(inputobj)) {
 	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
     }
-    *input = PyBytes_AS_STRING(inputobj);
-    insize = PyBytes_GET_SIZE(inputobj);
+    *input = PyString_AS_STRING(inputobj);
+    insize = PyString_GET_SIZE(inputobj);
     *inend = *input + insize;
     /* we can DECREF safely, as the exception has another reference,
        so the object won't go away. */
@@ -1674,7 +1658,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                    int encodeWhiteSpace,
                    const char *errors)
 {
-    PyObject *v;
+    PyObject *v, *result;
     /* It might be possible to tighten this worst case */
     Py_ssize_t cbAllocated = 5 * size;
     int inShift = 0;
@@ -1685,7 +1669,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
     char * start;
 
     if (size == 0)
-	return PyBytes_FromStringAndSize(NULL, 0);
+	return PyString_FromStringAndSize(NULL, 0);
 
     v = PyBytes_FromStringAndSize(NULL, cbAllocated);
     if (v == NULL)
@@ -1757,11 +1741,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
         *out++ = '-';
     }
 
-    if (PyBytes_Resize(v, out - start)) {
-        Py_DECREF(v);
-        return NULL;
-    }
-    return v;
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
+    Py_DECREF(v);
+    return result;
 }
 
 #undef SPECIAL
@@ -2001,11 +1983,11 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 {
 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
 
-    Py_ssize_t i;           /* index into s of next input byte */
-    PyObject *v;        /* result string object */
-    char *p;            /* next free byte in output buffer */
-    Py_ssize_t nallocated;  /* number of result bytes allocated */
-    Py_ssize_t nneeded;        /* number of result bytes needed */
+    Py_ssize_t i;                /* index into s of next input byte */
+    PyObject *result;            /* result string object */
+    char *p;                     /* next free byte in output buffer */
+    Py_ssize_t nallocated;      /* number of result bytes allocated */
+    Py_ssize_t nneeded;            /* number of result bytes needed */
     char stackbuf[MAX_SHORT_UNICHARS * 4];
 
     assert(s != NULL);
@@ -2017,7 +1999,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
          * turns out we need.
          */
         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
-        v = NULL;   /* will allocate after we're done */
+        result = NULL;   /* will allocate after we're done */
         p = stackbuf;
     }
     else {
@@ -2025,10 +2007,10 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
         nallocated = size * 4;
         if (nallocated / 4 != size)  /* overflow! */
             return PyErr_NoMemory();
-        v = PyBytes_FromStringAndSize(NULL, nallocated);
-        if (v == NULL)
+        result = PyString_FromStringAndSize(NULL, nallocated);
+        if (result == NULL)
             return NULL;
-        p = PyBytes_AS_STRING(v);
+        p = PyString_AS_STRING(result);
     }
 
     for (i = 0; i < size;) {
@@ -2072,19 +2054,19 @@ encodeUCS4:
         }
     }
 
-    if (v == NULL) {
+    if (result == NULL) {
         /* This was stack allocated. */
         nneeded = p - stackbuf;
         assert(nneeded <= nallocated);
-        v = PyBytes_FromStringAndSize(stackbuf, nneeded);
+        result = PyString_FromStringAndSize(stackbuf, nneeded);
     }
     else {
     	/* Cut back to size actually needed. */
-        nneeded = p - PyBytes_AS_STRING(v);
+        nneeded = p - PyString_AS_STRING(result);
         assert(nneeded <= nallocated);
-        PyBytes_Resize(v, nneeded);
+        _PyString_Resize(&result, nneeded);
     }
-    return v;
+    return result;
 
 #undef MAX_SHORT_UNICHARS
 }
@@ -2279,7 +2261,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
 		      const char *errors,
 		      int byteorder)
 {
-    PyObject *v;
+    PyObject *v, *result;
     unsigned char *p;
 #ifndef Py_UNICODE_WIDE
     int i, pairs;
@@ -2319,7 +2301,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
     if (byteorder == 0)
 	STORECHAR(0xFEFF);
     if (size == 0)
-        return v;
+        goto done;
 
     if (byteorder == -1) {
         /* force LE */
@@ -2350,7 +2332,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
 #endif
         STORECHAR(ch);
     }
-    return v;
+
+  done:
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
+    Py_DECREF(v);
+    return result;
 #undef STORECHAR
 }
 
@@ -2549,7 +2535,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
 		      const char *errors,
 		      int byteorder)
 {
-    PyObject *v;
+    PyObject *v, *result;
     unsigned char *p;
 #ifdef Py_UNICODE_WIDE
     int i, pairs;
@@ -2584,7 +2570,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
     if (byteorder == 0)
 	STORECHAR(0xFEFF);
     if (size == 0)
-        return v;
+        goto done;
 
     if (byteorder == -1) {
         /* force LE */
@@ -2610,7 +2596,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
         if (ch2)
             STORECHAR(ch2);
     }
-    return v;
+
+  done:
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
+    Py_DECREF(v);
+    return result;
 #undef STORECHAR
 }
 
@@ -2900,7 +2890,7 @@ static const char *hexdigits = "0123456789abcdef";
 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
 					Py_ssize_t size)
 {
-    PyObject *repr;
+    PyObject *repr, *result;
     char *p;
 
     /* XXX(nnorwitz): rather than over-allocating, it would be
@@ -3023,12 +3013,10 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
             *p++ = (char) ch;
     }
 
-    *p = '\0';
-    if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
-        Py_DECREF(repr);
-        return NULL;
-    }
-    return repr;
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
+                                        p - PyBytes_AS_STRING(repr));
+    Py_DECREF(repr);
+    return result;
 }
 
 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
@@ -3159,7 +3147,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
 					   Py_ssize_t size)
 {
-    PyObject *repr;
+    PyObject *repr, *result;
     char *p;
     char *q;
 
@@ -3171,7 +3159,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
     if (repr == NULL)
         return NULL;
     if (size == 0)
-	return repr;
+        goto done;
 
     p = q = PyBytes_AS_STRING(repr);
     while (size-- > 0) {
@@ -3205,12 +3193,12 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
 	else
             *p++ = (char) ch;
     }
-    *p = '\0';
-    if (PyBytes_Resize(repr, p - q)) {
-        Py_DECREF(repr);
-        return NULL;
-    }
-    return repr;
+    size = p - q;
+
+  done:
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
+    Py_DECREF(repr);
+    return result;
 }
 
 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@@ -3445,23 +3433,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
     /* pointer into the output */
     char *str;
     /* current output position */
-    Py_ssize_t respos = 0;
     Py_ssize_t ressize;
     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
+    PyObject *result = NULL;
     /* the following variable is used for caching string comparisons
      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
     int known_errorHandler = -1;
 
     /* allocate enough for a simple encoding without
        replacements, if we need more, we'll resize */
+    if (size == 0)
+        return PyString_FromStringAndSize(NULL, 0);
     res = PyBytes_FromStringAndSize(NULL, size);
     if (res == NULL)
-        goto onError;
-    if (size == 0)
-	return res;
+        return NULL;
     str = PyBytes_AS_STRING(res);
     ressize = size;
 
@@ -3589,20 +3577,13 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
 	    }
 	}
     }
-    /* Resize if we allocated to much */
-    respos = str - PyBytes_AS_STRING(res);
-    if (respos<ressize)
-       /* If this falls res will be NULL */
-	PyBytes_Resize(res, respos);
-    Py_XDECREF(errorHandler);
-    Py_XDECREF(exc);
-    return res;
-
-    onError:
-    Py_XDECREF(res);
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
+                                        str - PyBytes_AS_STRING(res));
+  onError:
+    Py_DECREF(res);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
-    return NULL;
+    return result;
 }
 
 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
@@ -3848,20 +3829,20 @@ static int encode_mbcs(PyObject **repr,
 
     if (*repr == NULL) {
 	/* Create string object */
-	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
+	*repr = PyString_FromStringAndSize(NULL, mbcssize);
 	if (*repr == NULL)
 	    return -1;
     }
     else {
 	/* Extend string object */
-	n = PyBytes_Size(*repr);
-	if (PyBytes_Resize(*repr, n + mbcssize) < 0)
+	n = PyString_Size(*repr);
+	if (_PyString_Resize(repr, n + mbcssize) < 0)
 	    return -1;
     }
 
     /* Do the conversion */
     if (size > 0) {
-	char *s = PyBytes_AS_STRING(*repr) + n;
+	char *s = PyString_AS_STRING(*repr) + n;
 	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
 	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
 	    return -1;
@@ -4341,16 +4322,14 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
 }
 
 static int
-charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
+charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
 {
-	Py_ssize_t outsize = PyBytes_GET_SIZE(  outobj);
+	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
 	/* exponentially overallocate to minimize reallocations */
 	if (requiredsize < 2*outsize)
 	    requiredsize = 2*outsize;
-	if (PyBytes_Resize(outobj, requiredsize)) {
-	    Py_DECREF(outobj);
+	if (_PyString_Resize(outobj, requiredsize))
 	    return -1;
-	}
 	return 0;
 }
 
@@ -4365,21 +4344,21 @@ typedef enum charmapencode_result {
    reallocation error occurred. The caller must decref the result */
 static
 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
-    PyObject *outobj, Py_ssize_t *outpos)
+    PyObject **outobj, Py_ssize_t *outpos)
 {
     PyObject *rep;
     char *outstart;
-    Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
+    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
 
     if (Py_Type(mapping) == &EncodingMapType) {
         int res = encoding_map_lookup(c, mapping);
 	Py_ssize_t requiredsize = *outpos+1;
         if (res == -1)
             return enc_FAILED;
-	if (outsize<requiredsize) 
+	if (outsize<requiredsize)
 	    if (charmapencode_resize(outobj, outpos, requiredsize))
 		return enc_EXCEPTION;
-        outstart = PyBytes_AS_STRING(outobj);
+        outstart = PyString_AS_STRING(*outobj);
 	outstart[(*outpos)++] = (char)res;
 	return enc_SUCCESS;
     }
@@ -4398,7 +4377,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
 		    Py_DECREF(rep);
 		    return enc_EXCEPTION;
 		}
-            outstart = PyBytes_AS_STRING(outobj);
+            outstart = PyString_AS_STRING(*outobj);
 	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
 	}
 	else {
@@ -4410,7 +4389,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
 		    Py_DECREF(rep);
 		    return enc_EXCEPTION;
 		}
-            outstart = PyBytes_AS_STRING(outobj);
+            outstart = PyString_AS_STRING(*outobj);
 	    memcpy(outstart + *outpos, repchars, repsize);
 	    *outpos += repsize;
 	}
@@ -4426,7 +4405,7 @@ int charmap_encoding_error(
     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
     PyObject **exceptionObject,
     int *known_errorHandler, PyObject **errorHandler, const char *errors,
-    PyObject *res, Py_ssize_t *respos)
+    PyObject **res, Py_ssize_t *respos)
 {
     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
     Py_ssize_t repsize;
@@ -4561,7 +4540,7 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
 
     /* allocate enough for a simple encoding without
        replacements, if we need more, we'll resize */
-    res = PyBytes_FromStringAndSize(NULL, size);
+    res = PyString_FromStringAndSize(NULL, size);
     if (res == NULL)
         goto onError;
     if (size == 0)
@@ -4569,14 +4548,14 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
 
     while (inpos<size) {
 	/* try to encode it */
-	charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
+	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
 	if (x==enc_EXCEPTION) /* error */
 	    goto onError;
 	if (x==enc_FAILED) { /* unencodable character */
 	    if (charmap_encoding_error(p, size, &inpos, mapping,
 		&exc,
 		&known_errorHandler, &errorHandler, errors,
-		res, &respos)) {
+		&res, &respos)) {
 		goto onError;
 	    }
 	}
@@ -4586,10 +4565,9 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
     }
 
     /* Resize if we allocated to much */
-    if (respos<PyBytes_GET_SIZE(res)) {
-	if (PyBytes_Resize(res, respos))
-	    goto onError;
-    }
+    if (respos<PyString_GET_SIZE(res))
+	_PyString_Resize(&res, respos);
+
     Py_XDECREF(exc);
     Py_XDECREF(errorHandler);
     return res;
@@ -5483,20 +5461,14 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
 
 	item = PySequence_Fast_GET_ITEM(fseq, i);
 	/* Convert item to Unicode. */
-	if (!PyString_Check(item) && !PyUnicode_Check(item))
-	{
-		if (PyBytes_Check(item))
-		{
-			PyErr_Format(PyExc_TypeError,
-                            "sequence item %d: join() will not operate on "
-                            "bytes objects", i);
-			goto onError;
-		}
-		item = PyObject_Unicode(item);
+	if (!PyUnicode_Check(item)) {
+	    PyErr_Format(PyExc_TypeError,
+			 "sequence item %zd: expected str instance,"
+			 " %.80s found",
+			 i, Py_Type(item)->tp_name);
+	    goto onError;
 	}
-	else
-		item = PyUnicode_FromObject(item);
-
+	item = PyUnicode_FromObject(item);
 	if (item == NULL)
 	    goto onError;
 	/* We own a reference to item from here on. */
@@ -6396,9 +6368,6 @@ PyObject *PyUnicode_Concat(PyObject *left,
 {
     PyUnicodeObject *u = NULL, *v = NULL, *w;
 
-    if (PyBytes_Check(left) || PyBytes_Check(right))
-        return PyBytes_Concat(left, right);
-
     /* Coerce the two arguments */
     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
     if (u == NULL)
@@ -6515,7 +6484,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
     if (v == NULL)
         goto onError;
-    if (!PyBytes_Check(v)) {
+    if (!PyString_Check(v)) {
         PyErr_Format(PyExc_TypeError,
                      "encoder did not return a bytes object "
                      "(type=%.400s)",
@@ -8232,12 +8201,6 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
     return NULL;
 }
 
-#define F_LJUST (1<<0)
-#define F_SIGN	(1<<1)
-#define F_BLANK (1<<2)
-#define F_ALT	(1<<3)
-#define F_ZERO	(1<<4)
-
 static Py_ssize_t
 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
 {
author	Guido van Rossum <guido@python.org>	2007-11-06 21:34:58 (GMT)
committer	Guido van Rossum <guido@python.org>	2007-11-06 21:34:58 (GMT)
commit	98297ee7815939b124156e438b22bd652d67b5db (patch)
tree	a9d239ebd87c73af2571ab48003984c4e18e27e5 /Objects/unicodeobject.c
parent	a19f80c6df2df5e8a5d0cff37131097835ef971e (diff)
download	cpython-98297ee7815939b124156e438b22bd652d67b5db.zip cpython-98297ee7815939b124156e438b22bd652d67b5db.tar.gz cpython-98297ee7815939b124156e438b22bd652d67b5db.tar.bz2