SF patch #470578: Fixes to synchronize unicode() and str()

This patch implements what we discussed on python-dev in late September: str(obj) and unicode(obj) should behave similarly, while the old behaviour is retained for unicode(obj, encoding, errors).

The patch also adds a new feature by which objects can provide unicode(obj) with input data: the __unicode__ method. Currently no new tp_unicode slot is implemented; this is left as an option for the future.

Note that PyUnicode_FromEncodedObject() no longer accepts Unicode objects as input. The API name already suggests that Unicode objects do not belong in the list of acceptable objects, and the functionality was only needed because PyUnicode_FromEncodedObject() was being used directly by unicode(). The latter was changed in the discussed way:

* unicode(obj) calls PyObject_Unicode()
* unicode(obj, encoding, errors) calls PyUnicode_FromEncodedObject()

One thing left open to discussion is whether to leave the PyUnicode_FromObject() API as a thin API extension on top of PyUnicode_FromEncodedObject() or to turn it into a (macro) alias for PyObject_Unicode() and deprecate it. Doing so would have some surprising consequences though, e.g. u"abc" + 123 would turn out as u"abc123"...

[Marc-Andre didn't have time to check this in before the deadline. I hope this is OK, Marc-Andre! You can still make changes and commit them on the trunk after the branch has been made, but then please mail Barry a context diff if you want the change to be merged into the 2.2b1 release branch. GvR]
author: Guido van Rossum <guido@python.org> 2001-10-19 02:01:31 (GMT)
committer: Guido van Rossum <guido@python.org> 2001-10-19 02:01:31 (GMT)
commit: b8c65bc27ffc61c659180c351d3cc283abd1be45 (patch)
tree: 45f9f5c11d6ea41fa7e78aaab2b9531ec1f9cf92
parent: 93505a2f2b468bed3ec2ba013f07ee063cf618e8 (diff)
download: cpython-b8c65bc27ffc61c659180c351d3cc283abd1be45.zip
cpython-b8c65bc27ffc61c659180c351d3cc283abd1be45.tar.gz
cpython-b8c65bc27ffc61c659180c351d3cc283abd1be45.tar.bz2
3 files changed, 76 insertions, 72 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 41feae2..368a212 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -454,14 +454,12 @@ extern DL_IMPORT(int) PyUnicode_Resize(
 
    Coercion is done in the following way:
 
-   1. Unicode objects are passed back as-is with incremented
-      refcount.
-
-   2. String and other char buffer compatible objects are decoded
+   1. String and other char buffer compatible objects are decoded
       under the assumptions that they contain data using the current
       default encoding. Decoding is done in "strict" mode.
 
-   3. All other objects raise an exception.
+   2. All other objects (including Unicode objects) raise an
+      exception.
 
    The API returns NULL in case of an error. The caller is responsible
    for decref'ing the returned objects.
@@ -474,12 +472,13 @@ extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
     const char *errors          /* error handling */
     );
 
-/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
-   which results in using the default encoding as basis for 
-   decoding the object.
-
-   Coerces obj to an Unicode object and return a reference with
+/* Coerce obj to an Unicode object and return a reference with
    *incremented* refcount.
+   
+   Unicode objects are passed back as-is (subclasses are converted to
+   true Unicode objects), all other objects are delegated to
+   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
+   using the default encoding as basis for decoding the object.
 
    The API returns NULL in case of an error. The caller is responsible
    for decref'ing the returned objects.
diff --git a/Objects/object.c b/Objects/object.c
index af0c0bb..aa5f87c 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -296,39 +296,50 @@ PyObject_Unicode(PyObject *v)
 	
 	if (v == NULL)
 		res = PyString_FromString("<NULL>");
-	else if (PyUnicode_Check(v)) {
+	if (PyUnicode_CheckExact(v)) {
 		Py_INCREF(v);
 		return v;
 	}
-	else if (PyString_Check(v)) {
+	if (PyUnicode_Check(v)) {
+		/* For a Unicode subtype that's not a Unicode object,
+		   return a true Unicode object with the same data. */
+		return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(v),
+					     PyUnicode_GET_SIZE(v));
+	}
+	if (PyString_Check(v)) {
 		Py_INCREF(v);
 	    	res = v;
     	}
-	else if (v->ob_type->tp_str != NULL)
-		res = (*v->ob_type->tp_str)(v);
 	else {
 		PyObject *func;
-		static PyObject *strstr;
-		if (strstr == NULL) {
-			strstr= PyString_InternFromString("__str__");
-			if (strstr == NULL)
+		static PyObject *unicodestr;
+		/* XXX As soon as we have a tp_unicode slot, we should
+		       check this before trying the __unicode__
+		       method. */
+		if (unicodestr == NULL) {
+			unicodestr= PyString_InternFromString(
+						       "__unicode__");
+			if (unicodestr == NULL)
 				return NULL;
 		}
-		if (!PyInstance_Check(v) ||
-		    (func = PyObject_GetAttr(v, strstr)) == NULL) {
-			PyErr_Clear();
-			res = PyObject_Repr(v);
-		}
-		else {
+		func = PyObject_GetAttr(v, unicodestr);
+		if (func != NULL) {
 		    	res = PyEval_CallObject(func, (PyObject *)NULL);
 			Py_DECREF(func);
 		}
+		else {
+			PyErr_Clear();
+			if (v->ob_type->tp_str != NULL)
+				res = (*v->ob_type->tp_str)(v);
+			else
+				res = PyObject_Repr(v);
+		}
 	}
 	if (res == NULL)
 		return NULL;
 	if (!PyUnicode_Check(res)) {
-		PyObject* str;
-		str = PyUnicode_FromObject(res);
+		PyObject *str;
+		str = PyUnicode_FromEncodedObject(res, NULL, "strict");
 		Py_DECREF(res);
 		if (str)
 			res = str;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index a252587..a29c75b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -395,6 +395,18 @@ int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 
 PyObject *PyUnicode_FromObject(register PyObject *obj)
 {
+    /* XXX Perhaps we should make this API an alias of
+           PyObject_Unicode() instead ?! */
+    if (PyUnicode_CheckExact(obj)) {
+	Py_INCREF(obj);
+	return obj;
+    }
+    if (PyUnicode_Check(obj)) {
+	/* For a Unicode subtype that's not a Unicode object,
+	   return a true Unicode object with the same data. */
+	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
+				     PyUnicode_GET_SIZE(obj));
+    }
     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 }
 
@@ -406,69 +418,49 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
     int len;
     int owned = 0;
     PyObject *v;
-    int reclevel;
     
     if (obj == NULL) {
 	PyErr_BadInternalCall();
 	return NULL;
     }
 
-    /* Coerce object */
-    for (reclevel = 0; reclevel < 2; reclevel++) {
+#if 0
+    /* For b/w compatibility we also accept Unicode objects provided
+       that no encodings is given and then redirect to PyObject_Unicode() 
+       which then applies the additional logic for Unicode subclasses.
+
+       NOTE: This API should really only be used for object which
+             represent *encoded* Unicode !
 
+    */
 	if (PyUnicode_Check(obj)) {
 	    if (encoding) {
 		PyErr_SetString(PyExc_TypeError,
 				"decoding Unicode is not supported");
-		goto onError;
-	    }
-	    if (PyUnicode_CheckExact(obj)) {
-		Py_INCREF(obj);
-		v = obj;
+	    return NULL;
 	    }
-	    else {
-		/* For a subclass of unicode, return a true unicode object
-		   with the same string value. */
-		v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
-					  PyUnicode_GET_SIZE(obj));
+	return PyObject_Unicode(obj);
 	    }
-	    goto done;
+#else
+    if (PyUnicode_Check(obj)) {
+	PyErr_SetString(PyExc_TypeError,
+			"decoding Unicode is not supported");
+	return NULL;
 	}
-	else if (PyString_Check(obj)) {
+#endif
+
+    /* Coerce object */
+    if (PyString_Check(obj)) {
 	    s = PyString_AS_STRING(obj);
 	    len = PyString_GET_SIZE(obj);
-	    break;
-	}
-	else {
-	    PyObject *w;
-
-	    /* Try char buffer interface */
-            if (PyObject_AsCharBuffer(obj, &s, &len))
-		PyErr_Clear();
-	    else
-		break;
-    
-	    /* Mimic the behaviour of str(object) if everything else
-    	       fails (see PyObject_Str()); this also covers instances
-    	       which implement __str__. */
-	    if (obj->ob_type->tp_str == NULL)
-		w = PyObject_Repr(obj);
-	    else
-		w = (*obj->ob_type->tp_str)(obj);
-	    if (w == NULL)
-		goto onError;
-	    if (owned) {
-		Py_DECREF(obj);
 	    }
-	    obj = w;
-	    owned = 1;
-	}
-    }
-
-    if (s == NULL) {
+    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
+	/* Overwrite the error message with something more useful in
+	   case of a TypeError. */
+	if (PyErr_ExceptionMatches(PyExc_TypeError))
 	PyErr_Format(PyExc_TypeError,
-		     "coercing to Unicode: __str__ recursion limit exceeded "
-		     "(last type: %.80s)",
+			 "coercing to Unicode: need string or buffer, "
+			 "%.80s found",
 		     obj->ob_type->tp_name);
 	goto onError;
     }
@@ -481,7 +473,6 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
     else 
 	v = PyUnicode_Decode(s, len, encoding, errors);
 
- done:
     if (owned) {
 	Py_DECREF(obj);
     }
@@ -5653,6 +5644,9 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 	    return NULL;
 	if (x == NULL)
 		return (PyObject *)_PyUnicode_New(0);
+	if (encoding == NULL && errors == NULL)
+	    return PyObject_Unicode(x);
+	else
 	return PyUnicode_FromEncodedObject(x, encoding, errors);
 }
author	Guido van Rossum <guido@python.org>	2001-10-19 02:01:31 (GMT)
committer	Guido van Rossum <guido@python.org>	2001-10-19 02:01:31 (GMT)
commit	b8c65bc27ffc61c659180c351d3cc283abd1be45 (patch)
tree	45f9f5c11d6ea41fa7e78aaab2b9531ec1f9cf92
parent	93505a2f2b468bed3ec2ba013f07ee063cf618e8 (diff)
download	cpython-b8c65bc27ffc61c659180c351d3cc283abd1be45.zip cpython-b8c65bc27ffc61c659180c351d3cc283abd1be45.tar.gz cpython-b8c65bc27ffc61c659180c351d3cc283abd1be45.tar.bz2