diff options
author | Walter Dörwald <walter@livinglogic.de> | 2007-08-16 21:55:45 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2007-08-16 21:55:45 (GMT) |
commit | 41980caf644163f1ff74a793b30f1c424eeede82 (patch) | |
tree | dba1c68090fce4379eced5a27a5b8d4b4f55340c /Modules | |
parent | 066100909ae45e7acd59b2ac81338d3cfcf44384 (diff) | |
download | cpython-41980caf644163f1ff74a793b30f1c424eeede82.zip cpython-41980caf644163f1ff74a793b30f1c424eeede82.tar.gz cpython-41980caf644163f1ff74a793b30f1c424eeede82.tar.bz2 |
Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and
utf-32-be). On narrow builds the codecs combine surrogate pairs in the unicode
object into one codepoint on encoding and create surrogate pairs for
codepoints outside the BMP on decoding. Lone surrogates are passed through
unchanged in all cases.
Backport to the trunk will follow.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_codecsmodule.c | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index c500073..e3933e7 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -413,6 +413,126 @@ utf_16_ex_decode(PyObject *self, } static PyObject * +utf_32_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 0; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode", + &data, &size, &errors, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, + final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +static PyObject * +utf_32_le_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = -1; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode", + &data, &size, &errors, &final)) + return NULL; + + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, + &byteorder, final ? 
NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); + +} + +static PyObject * +utf_32_be_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 1; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode", + &data, &size, &errors, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, + &byteorder, final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +/* This non-standard version also provides access to the byteorder + parameter of the builtin UTF-32 codec. + + It returns a tuple (unicode, bytesread, byteorder) with byteorder + being the value in effect at the end of data. + +*/ + +static PyObject * +utf_32_ex_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 0; + PyObject *unicode, *tuple; + int final = 0; + Py_ssize_t consumed; + + if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode", + &data, &size, &errors, &byteorder, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, + final ? 
NULL : &consumed); + if (unicode == NULL) + return NULL; + tuple = Py_BuildValue("Oni", unicode, consumed, byteorder); + Py_DECREF(unicode); + return tuple; +} + +static PyObject * unicode_escape_decode(PyObject *self, PyObject *args) { @@ -700,6 +820,83 @@ utf_16_be_encode(PyObject *self, return v; } +/* This version provides access to the byteorder parameter of the + builtin UTF-32 codecs as optional third argument. It defaults to 0 + which means: use the native byte order and prepend the data with a + BOM mark. + +*/ + +static PyObject * +utf_32_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int byteorder = 0; + + if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode", + &str, &errors, &byteorder)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + byteorder), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * +utf_32_le_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + -1), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * +utf_32_be_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + +1), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + static PyObject * unicode_escape_encode(PyObject *self, 
PyObject *args) @@ -916,6 +1113,13 @@ static PyMethodDef _codecs_functions[] = { {"utf_16_le_decode", utf_16_le_decode, METH_VARARGS}, {"utf_16_be_decode", utf_16_be_decode, METH_VARARGS}, {"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS}, + {"utf_32_encode", utf_32_encode, METH_VARARGS}, + {"utf_32_le_encode", utf_32_le_encode, METH_VARARGS}, + {"utf_32_be_encode", utf_32_be_encode, METH_VARARGS}, + {"utf_32_decode", utf_32_decode, METH_VARARGS}, + {"utf_32_le_decode", utf_32_le_decode, METH_VARARGS}, + {"utf_32_be_decode", utf_32_be_decode, METH_VARARGS}, + {"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS}, {"unicode_escape_encode", unicode_escape_encode, METH_VARARGS}, {"unicode_escape_decode", unicode_escape_decode, METH_VARARGS}, {"unicode_internal_encode", unicode_internal_encode, METH_VARARGS}, |