From 9f4b1e9c50da83b51a4b0c7ee7d7dc3ef94a0cf6 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 10 Nov 2011 20:56:30 +0100 Subject: Fix and deprecate the unicode_internal codec unicode_internal codec uses Py_UNICODE instead of the real internal representation (PEP 393: Py_UCS1, Py_UCS2 or Py_UCS4) for backward compatibility. --- Doc/library/codecs.rst | 2 ++ Doc/whatsnew/3.3.rst | 2 ++ Modules/_codecsmodule.c | 20 ++++++++++++++++---- Objects/unicodeobject.c | 28 ++++++++++++++++++++++------ 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 4523c7f..a9fae95 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1173,6 +1173,8 @@ particular, the following variants typically exist: | unicode_internal | | Return the internal | | | | representation of the | | | | operand | +| | | | +| | | .. deprecated:: 3.3 | +--------------------+---------+---------------------------+ The following codecs provide bytes-to-bytes mappings. diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index 911d8d9..7f4517f 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -250,6 +250,8 @@ versions. (:issue:`12100`) +The ``unicode_internal`` codec has been deprecated. 
+ crypt ----- diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 727cf5e..93cb1b7 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -675,18 +675,30 @@ unicode_internal_encode(PyObject *self, PyObject *obj; const char *errors = NULL; const char *data; - Py_ssize_t size; + Py_ssize_t len, size; if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode", &obj, &errors)) return NULL; if (PyUnicode_Check(obj)) { + Py_UNICODE *u; + if (PyUnicode_READY(obj) < 0) return NULL; - data = PyUnicode_AS_DATA(obj); - size = PyUnicode_GET_DATA_SIZE(obj); - return codec_tuple(PyBytes_FromStringAndSize(data, size), + + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "unicode_internal codecs has been deprecated", + 1)) + return NULL; + + u = PyUnicode_AsUnicodeAndSize(obj, &len); + if (u == NULL) + return NULL; + if (len > PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) + return PyErr_NoMemory(); + size = len * sizeof(Py_UNICODE); + return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size), PyUnicode_GET_LENGTH(obj)); } else { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 61534b4..3f580b5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6237,6 +6237,11 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, PyObject *errorHandler = NULL; PyObject *exc = NULL; + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "unicode_internal codecs has been deprecated", + 1)) + return NULL; + /* XXX overflow detection missing */ v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); if (v == NULL) @@ -6270,15 +6275,26 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, errors, &errorHandler, "unicode_internal", reason, &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos)) { + &v, &outpos)) goto onError; - } + continue; } - else { - if (unicode_putchar(&v, &outpos, ch) < 0) - goto onError; - s += Py_UNICODE_SIZE; + + s += Py_UNICODE_SIZE; +#ifndef Py_UNICODE_WIDE + if (ch >= 0xD800 && ch <= 0xDBFF && s < end) + 
{ + Py_UCS4 ch2 = *(Py_UNICODE*)s; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) + { + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; + s += Py_UNICODE_SIZE; + } } +#endif + + if (unicode_putchar(&v, &outpos, ch) < 0) + goto onError; } if (PyUnicode_Resize(&v, outpos) < 0) -- cgit v0.12