From db12d454e6176e9c933babe3ce40b225307c6305 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= <martin@v.loewis.de>
Date: Sat, 2 May 2009 18:52:14 +0000
Subject: Issue #3672: Reject surrogates in utf-8 codec; add surrogates error
 handler.

---
 Doc/library/codecs.rst       | 12 ++++++
 Lib/test/test_bytes.py       |  4 +-
 Lib/test/test_codecs.py      | 15 +++++++-
 Lib/test/test_unicode.py     |  6 +--
 Lib/test/test_unicodedata.py |  3 +-
 Misc/NEWS                    |  2 +
 Objects/unicodeobject.c      | 83 +++++++++++++++++++++++++++++++++------
 Python/codecs.c              | 92 ++++++++++++++++++++++++++++++++++++++++++++
 Python/marshal.c             |  6 ++-
 9 files changed, 202 insertions(+), 21 deletions(-)

diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 4b6c7e5..ab578ea 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -323,6 +323,18 @@ and implemented by all standard Python codecs:
 |                         | (only for encoding).                          |
 +-------------------------+-----------------------------------------------+
 
+In addition, the following error handlers are specific to a single codec:
+
++------------------+---------+--------------------------------------------+
+| Value            | Codec   | Meaning                                    |
++==================+=========+============================================+
+| ``'surrogates'`` | utf-8   | Allow encoding and decoding of surrogate   |
+|                  |         | codes in UTF-8.                            |
++------------------+---------+--------------------------------------------+
+
+.. versionadded:: 3.1
+   The ``'surrogates'`` error handler.
+
 The set of allowed values can be extended via :meth:`register_error`.
 
 
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index a3ea40a..992f3d2 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -169,13 +169,13 @@ class BaseBytesTest(unittest.TestCase):
                     self.assertEqual(b[start:stop:step], self.type2test(L[start:stop:step]))
 
     def test_encoding(self):
-        sample = "Hello world\n\u1234\u5678\u9abc\udef0"
+        sample = "Hello world\n\u1234\u5678\u9abc"
         for enc in ("utf8", "utf16"):
             b = self.type2test(sample, enc)
             self.assertEqual(b, self.type2test(sample.encode(enc)))
         self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1")
         b = self.type2test(sample, "latin1", "ignore")
-        self.assertEqual(b, self.type2test(sample[:-4], "utf-8"))
+        self.assertEqual(b, self.type2test(sample[:-3], "utf-8"))
 
     def test_decode(self):
         sample = "Hello world\n\u1234\u5678\u9abc\def0\def0"
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 1730dbe..6706507 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -541,6 +541,17 @@ class UTF8Test(ReadTest):
         self.check_state_handling_decode(self.encoding,
                                          u, u.encode(self.encoding))
 
+    def test_lone_surrogates(self):  # utf-8 must reject lone surrogates
+        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
+        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
+
+    def test_surrogates_handler(self):
+        # The 'surrogates' handler must round-trip a lone surrogate via UTF-8.
+        text, raw = "abc\ud800def", b"abc\xed\xa0\x80def"
+        self.assertEqual(text.encode("utf-8", "surrogates"), raw)
+        self.assertEqual(raw.decode("utf-8", "surrogates"), text)
+        self.assertTrue(codecs.lookup_error("surrogates"))
+
 class UTF7Test(ReadTest):
     encoding = "utf-7"
 
@@ -1023,12 +1034,12 @@ class NameprepTest(unittest.TestCase):
                 # Skipped
                 continue
             # The Unicode strings are given in UTF-8
-            orig = str(orig, "utf-8")
+            orig = str(orig, "utf-8", "surrogates")
             if prepped is None:
                 # Input contains prohibited characters
                 self.assertRaises(UnicodeError, nameprep, orig)
             else:
-                prepped = str(prepped, "utf-8")
+                prepped = str(prepped, "utf-8", "surrogates")
                 try:
                     self.assertEquals(nameprep(orig), prepped)
                 except Exception as e:
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 1fddc06..220a8eb 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -886,10 +886,10 @@ class UnicodeTest(
         self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
         self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
         self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
-        self.assertEqual('\ud800'.encode('utf-8'), b'\xed\xa0\x80')
-        self.assertEqual('\udc00'.encode('utf-8'), b'\xed\xb0\x80')
+        self.assertEqual('\ud800'.encode('utf-8', 'surrogates'), b'\xed\xa0\x80')
+        self.assertEqual('\udc00'.encode('utf-8', 'surrogates'), b'\xed\xb0\x80')
         self.assertEqual(
-            ('\ud800\udc02'*1000).encode('utf-8'),
+            ('\ud800\udc02'*1000).encode('utf-8', 'surrogates'),
             b'\xf0\x90\x80\x82'*1000
         )
         self.assertEqual(
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index aed8eaa..b84aaaf 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -13,6 +13,7 @@ import subprocess
 import test.support
 
 encoding = 'utf-8'
+errors = 'surrogates'
 
 
 ### Run tests
@@ -61,7 +62,7 @@ class UnicodeMethodsTest(unittest.TestCase):
                 (char + 'ABC').title(),
 
                 ]
-            h.update(''.join(data).encode(encoding))
+            h.update(''.join(data).encode(encoding, errors))
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
diff --git a/Misc/NEWS b/Misc/NEWS
index 7f22b0d..f4116ad 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
 Core and Builtins
 -----------------
 
+- Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
+
 - Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have
   received a new method, detach().  detach() disconnects the underlying stream
   from the buffer or text IO and returns it.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 68d4fc4..cc70bad 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -154,6 +154,11 @@ const unsigned char _Py_ascii_whitespace[] = {
     0, 0, 0, 0, 0, 0, 0, 0
 };
 
+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+       PyObject **errorHandler,const char *encoding, const char *reason,
+       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
+
 /* Same for linebreaks */
 static unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
@@ -2214,14 +2219,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                 goto utf8Error;
             }
             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x0800) {
-                /* Note: UTF-8 encodings of surrogates are considered
-                   legal UTF-8 sequences;
-
-                   XXX For wide builds (UCS-4) we should probably try
-                   to recombine the surrogates into a single code
-                   unit.
-                */
+            if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
                 errmsg = "illegal encoding";
                 startinpos = s-starts;
                 endinpos = startinpos+3;
@@ -2328,6 +2326,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
     Py_ssize_t nallocated;      /* number of result bytes allocated */
     Py_ssize_t nneeded;            /* number of result bytes needed */
     char stackbuf[MAX_SHORT_UNICHARS * 4];
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
 
     assert(s != NULL);
     assert(size >= 0);
@@ -2367,6 +2367,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
         else {
             /* Encode UCS2 Unicode ordinals */
             if (ch < 0x10000) {
+#ifndef Py_UNICODE_WIDE
                 /* Special case: check for high surrogate */
                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                     Py_UCS4 ch2 = s[i];
@@ -2379,6 +2380,36 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                     }
                     /* Fall through: handles isolated high surrogates */
                 }
+#endif
+                if (ch >= 0xd800 && ch <= 0xdfff) {
+                    Py_ssize_t newpos;
+                    PyObject *rep;
+                    char *prep;
+                    int k;
+                    rep = unicode_encode_call_errorhandler
+                        (errors, &errorHandler, "utf-8", "surrogates not allowed", 
+                         s, size, &exc, i-1, i, &newpos);
+                    if (!rep)
+                        goto error;
+                    /* Implementation limitations: only support error handler that return
+                       bytes, and only support up to four replacement bytes. */
+                    if (!PyBytes_Check(rep)) {
+                        PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
+                        Py_DECREF(rep);
+                        goto error;
+                    }
+                    if (PyBytes_Size(rep) > 4) {
+                        PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
+                        Py_DECREF(rep);
+                        goto error;
+                    }
+                    prep = PyBytes_AsString(rep);
+                    for(k = PyBytes_Size(rep); k > 0; k--)
+                        *p++ = *prep++;
+                    Py_DECREF(rep);
+                    continue;
+                    
+                }
                 *p++ = (char)(0xe0 | (ch >> 12));
                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                 *p++ = (char)(0x80 | (ch & 0x3f));
@@ -2405,7 +2436,14 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
         assert(nneeded <= nallocated);
         _PyBytes_Resize(&result, nneeded);
     }
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return result;
+ error:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    Py_XDECREF(result);
+    return NULL;
 
 #undef MAX_SHORT_UNICHARS
 }
@@ -3897,7 +3935,7 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
                                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                                   Py_ssize_t *newpos)
 {
-    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
+    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
 
     PyObject *restuple;
     PyObject *resunicode;
@@ -3918,15 +3956,20 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
     if (restuple == NULL)
         return NULL;
     if (!PyTuple_Check(restuple)) {
-        PyErr_SetString(PyExc_TypeError, &argparse[4]);
+        PyErr_SetString(PyExc_TypeError, &argparse[3]);
         Py_DECREF(restuple);
         return NULL;
     }
-    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+    if (!PyArg_ParseTuple(restuple, argparse,
                           &resunicode, newpos)) {
         Py_DECREF(restuple);
         return NULL;
     }
+    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
+        PyErr_SetString(PyExc_TypeError, &argparse[3]);
+        Py_DECREF(restuple);
+        return NULL;
+    }
     if (*newpos<0)
         *newpos = size+*newpos;
     if (*newpos<0 || *newpos>size) {
@@ -4064,6 +4107,12 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                                               collstart-startp, collend-startp, &newpos);
                 if (repunicode == NULL)
                     goto onError;
+                if (!PyUnicode_Check(repunicode)) {
+                    /* Implementation limitation: byte results not supported yet. */
+                    PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                    Py_DECREF(repunicode);
+                    goto onError;
+                }
                 /* need more space? (at least enough for what we
                    have+the replacement+the rest of the string, so
                    we won't have to check space for encodable characters) */
@@ -5027,6 +5076,12 @@ int charmap_encoding_error(
                                                       collstartpos, collendpos, &newpos);
         if (repunicode == NULL)
             return -1;
+        if (!PyUnicode_Check(repunicode)) {
+            /* Implementation limitation: byte results not supported yet. */
+            PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+            Py_DECREF(repunicode);
+            return -1;
+        }
         /* generate replacement  */
         repsize = PyUnicode_GET_SIZE(repunicode);
         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
@@ -5588,6 +5643,12 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                                                           collstart-s, collend-s, &newpos);
             if (repunicode == NULL)
                 goto onError;
+            if (!PyUnicode_Check(repunicode)) {
+                /* Implementation limitation: byte results not supported yet. */
+                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                Py_DECREF(repunicode);
+                goto onError;
+            }
             /* generate replacement  */
             repsize = PyUnicode_GET_SIZE(repunicode);
             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
diff --git a/Python/codecs.c b/Python/codecs.c
index ebddc09..3f1412d 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
     }
 }
 
+/* "surrogates" error handler.  Encoding: emit every surrogate in
+   exc.object[start:end] as its three-byte UTF-8 form.  Decoding: turn the
+   three-byte surrogate sequence at exc.start into one code unit.  Anything
+   else re-raises exc.  Returns a new (replacement, newpos) tuple or NULL. */
+PyObject *PyCodec_SurrogateErrors(PyObject *exc)
+{
+    PyObject *restuple, *object, *res;
+    Py_ssize_t start, end;
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+	Py_UNICODE *p, *startp;
+	char *outp;
+	if (PyUnicodeEncodeError_GetStart(exc, &start) ||
+	    PyUnicodeEncodeError_GetEnd(exc, &end))
+	    return NULL;
+	if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+	    return NULL;
+	startp = PyUnicode_AS_UNICODE(object);
+	res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+	if (!res) {
+	    Py_DECREF(object);
+	    return NULL;
+	}
+	outp = PyBytes_AsString(res);
+	for (p = startp+start; p < startp+end; p++) {
+	    Py_UNICODE ch = *p;
+	    if (ch < 0xd800 || ch > 0xdfff) {
+		/* Not a surrogate, fail with original exception */
+		PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+		Py_DECREF(res);
+		Py_DECREF(object);
+		return NULL;
+	    }
+	    *outp++ = (char)(0xe0 | (ch >> 12));
+	    *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+	    *outp++ = (char)(0x80 | (ch & 0x3f));
+	}
+	restuple = Py_BuildValue("(On)", res, end);
+	Py_DECREF(res);
+	Py_DECREF(object);
+	return restuple;
+    }
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+	unsigned char *p;
+	Py_UNICODE ch = 0;
+	if (PyUnicodeDecodeError_GetStart(exc, &start))
+	    return NULL;
+	if (!(object = PyUnicodeDecodeError_GetObject(exc)))
+	    return NULL;
+	if (!(p = (unsigned char*)PyBytes_AsString(object))) {
+	    Py_DECREF(object);
+	    return NULL;
+	}
+	/* Try decoding a single surrogate character (needs three bytes and
+	   all three bit patterns). If there are more, the codec calls again. */
+	p += start;
+	if (PyBytes_GET_SIZE(object) - start >= 3 &&
+	    (p[0] & 0xf0) == 0xe0 &&
+	    (p[1] & 0xc0) == 0x80 &&
+	    (p[2] & 0xc0) == 0x80) {
+	    /* it's a three-byte code */
+	    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+	    if (ch < 0xd800 || ch > 0xdfff)
+		/* it's not a surrogate - fail */
+		ch = 0;
+	}
+	Py_DECREF(object);
+	if (ch == 0) {
+	    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+	    return NULL;
+	}
+	return Py_BuildValue("(u#n)", &ch, 1, start+3);
+    }
+    else {
+	wrong_exception_type(exc);
+	return NULL;
+    }
+}
+
+	
 static PyObject *strict_errors(PyObject *self, PyObject *exc)
 {
     return PyCodec_StrictErrors(exc);
@@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
     return PyCodec_BackslashReplaceErrors(exc);
 }
 
+static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
+{
+    return PyCodec_SurrogateErrors(exc);  /* METH_O adapter; self is unused */
+}
+
 static int _PyCodecRegistry_Init(void)
 {
     static struct {
@@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
 		backslashreplace_errors,
 		METH_O
 	    }
+	},
+	{
+	    "surrogates",
+	    {
+		"surrogates",
+		surrogates_errors,
+		METH_O
+	    }
 	}
     };
 
diff --git a/Python/marshal.c b/Python/marshal.c
index bf7a26b..4ad873e 100644
--- a/Python/marshal.c
+++ b/Python/marshal.c
@@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p)
 	}
 	else if (PyUnicode_CheckExact(v)) {
 	        PyObject *utf8;
-		utf8 = PyUnicode_AsUTF8String(v);
+		utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v),
+					    PyUnicode_GET_SIZE(v),
+					    "surrogates");
 		if (utf8 == NULL) {
 			p->depth--;
 			p->error = WFERR_UNMARSHALLABLE;
@@ -810,7 +812,7 @@ r_object(RFILE *p)
 			retval = NULL;
 			break;
 		}
-		v = PyUnicode_DecodeUTF8(buffer, n, NULL);
+		v = PyUnicode_DecodeUTF8(buffer, n, "surrogates");
 		PyMem_DEL(buffer);
 		retval = v;
 		break;
-- 
cgit v0.12