diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2009-05-02 18:52:14 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2009-05-02 18:52:14 (GMT) |
commit | db12d454e6176e9c933babe3ce40b225307c6305 (patch) | |
tree | 28b09c64e9dfd797da58a98725bfb93b4dae7077 /Python | |
parent | 02953d244fdb2fe99853d2fe5db905df53c6596f (diff) | |
download | cpython-db12d454e6176e9c933babe3ce40b225307c6305.zip cpython-db12d454e6176e9c933babe3ce40b225307c6305.tar.gz cpython-db12d454e6176e9c933babe3ce40b225307c6305.tar.bz2 |
Issue #3672: Reject surrogates in utf-8 codec; add surrogates error
handler.
Diffstat (limited to 'Python')
-rw-r--r-- | Python/codecs.c | 92 | ||||
-rw-r--r-- | Python/marshal.c | 6 |
2 files changed, 96 insertions, 2 deletions
```diff
diff --git a/Python/codecs.c b/Python/codecs.c
index ebddc09..3f1412d 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
     }
 }
 
+PyObject *PyCodec_SurrogateErrors(PyObject *exc)
+{
+    PyObject *restuple;
+    PyObject *object;
+    Py_ssize_t start;
+    Py_ssize_t end;
+    PyObject *res;
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+        Py_UNICODE *p;
+        Py_UNICODE *startp;
+        char *outp;
+        if (PyUnicodeEncodeError_GetStart(exc, &start))
+            return NULL;
+        if (PyUnicodeEncodeError_GetEnd(exc, &end))
+            return NULL;
+        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+            return NULL;
+        startp = PyUnicode_AS_UNICODE(object);
+        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!res) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        outp = PyBytes_AsString(res);
+        for (p = startp+start; p < startp+end; p++) {
+            Py_UNICODE ch = *p;
+            if (ch < 0xd800 || ch > 0xdfff) {
+                /* Not a surrogate, fail with original exception */
+                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+                Py_DECREF(res);
+                Py_DECREF(object);
+                return NULL;
+            }
+            *outp++ = (char)(0xe0 | (ch >> 12));
+            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *outp++ = (char)(0x80 | (ch & 0x3f));
+        }
+        restuple = Py_BuildValue("(On)", res, end);
+        Py_DECREF(res);
+        Py_DECREF(object);
+        return restuple;
+    }
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+        unsigned char *p;
+        Py_UNICODE ch = 0;
+        if (PyUnicodeDecodeError_GetStart(exc, &start))
+            return NULL;
+        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
+            return NULL;
+        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        /* Try decoding a single surrogate character. If
+           there are more, let the codec call us again. */
+        p += start;
+        if ((p[0] & 0xf0) == 0xe0 ||
+            (p[1] & 0xc0) == 0x80 ||
+            (p[2] & 0xc0) == 0x80) {
+            /* it's a three-byte code */
+            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+            if (ch < 0xd800 || ch > 0xdfff)
+                /* it's not a surrogate - fail */
+                ch = 0;
+        }
+        Py_DECREF(object);
+        if (ch == 0) {
+            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+            return NULL;
+        }
+        return Py_BuildValue("(u#n)", &ch, 1, start+3);
+    }
+    else {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+}
+
+
 static PyObject *strict_errors(PyObject *self, PyObject *exc)
 {
     return PyCodec_StrictErrors(exc);
@@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
     return PyCodec_BackslashReplaceErrors(exc);
 }
 
+static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
+{
+    return PyCodec_SurrogateErrors(exc);
+}
+
 static int _PyCodecRegistry_Init(void)
 {
     static struct {
@@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
             backslashreplace_errors,
             METH_O
         }
+    },
+    {
+        "surrogates",
+        {
+            "surrogates",
+            surrogates_errors,
+            METH_O
+        }
     }
 };
 
diff --git a/Python/marshal.c b/Python/marshal.c
index bf7a26b..4ad873e 100644
--- a/Python/marshal.c
+++ b/Python/marshal.c
@@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p)
     }
     else if (PyUnicode_CheckExact(v)) {
         PyObject *utf8;
-        utf8 = PyUnicode_AsUTF8String(v);
+        utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v),
+                                    PyUnicode_GET_SIZE(v),
+                                    "surrogates");
         if (utf8 == NULL) {
             p->depth--;
             p->error = WFERR_UNMARSHALLABLE;
@@ -810,7 +812,7 @@ r_object(RFILE *p)
             retval = NULL;
             break;
         }
-        v = PyUnicode_DecodeUTF8(buffer, n, NULL);
+        v = PyUnicode_DecodeUTF8(buffer, n, "surrogates");
         PyMem_DEL(buffer);
         retval = v;
         break;
```