diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2003-01-26 11:30:36 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2003-01-26 11:30:36 (GMT) |
commit | 9789aefa616c14599af600bdea925299517da730 (patch) | |
tree | 801c3f6ed8aafb3aafeae179737c9aea5e185bb9 /Modules | |
parent | afef4eefa8b07287abe1c59b9ef918e91386f454 (diff) | |
download | cpython-9789aefa616c14599af600bdea925299517da730.zip cpython-9789aefa616c14599af600bdea925299517da730.tar.gz cpython-9789aefa616c14599af600bdea925299517da730.tar.bz2 |
Patch #670715: Universal Unicode Codec for POSIX iconv.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_iconv_codec.c | 626 |
1 files changed, 626 insertions, 0 deletions
diff --git a/Modules/_iconv_codec.c b/Modules/_iconv_codec.c new file mode 100644 index 0000000..9f08102 --- /dev/null +++ b/Modules/_iconv_codec.c @@ -0,0 +1,626 @@ +/* + * _iconv_codec.c + * + * libiconv adaptor for Python iconvcodec + * + * Author : Hye-Shik Chang <perky@FreeBSD.org> + * Created : 17 January 2003 + */ + +#include "Python.h" +#include <string.h> +#include <iconv.h> + +static const char *__version__ = "$Revision$"; + +#if Py_USING_UNICODE +# if Py_UNICODE_SIZE == 2 +# ifdef __GNU_LIBRARY__ +# define UNICODE_ENCODING "ucs-2" +# else +# define UNICODE_ENCODING "ucs-2-internal" +# endif +# define MBENCODED_LENGTH_MAX 4 +# elif Py_UNICODE_SIZE == 4 +# ifdef __GNU_LIBRARY__ +# define UNICODE_ENCODING "ucs-4" +# else +# define UNICODE_ENCODING "ucs-4-internal" +# endif +# define MBENCODED_LENGTH_MAX 6 +# endif +#else +# error "Unicode is not available" +#endif + +typedef struct { + PyObject_HEAD + iconv_t enchdl, dechdl; + char *encoding; +} iconvcodecObject; +PyDoc_STRVAR(iconvcodec_doc, "iconvcodec object"); + +staticforward PyTypeObject iconvcodec_Type; + + +#define ERROR_STRICT (PyObject *)(1) +#define ERROR_IGNORE (PyObject *)(2) +#define ERROR_REPLACE (PyObject *)(3) +#define ERROR_MAX ERROR_REPLACE + +#define REPLACEMENT_CHAR_DECODE 0xFFFD +#define REPLACEMENT_CHAR_ENCODE '?' + +#define DEFAULT_ENCODING "utf-8" + + +static PyObject * +get_errorcallback(const char *errors) +{ + if (errors == NULL || strcmp(errors, "strict") == 0) + return ERROR_STRICT; + else if (strcmp(errors, "ignore") == 0) + return ERROR_IGNORE; + else if (strcmp(errors, "replace") == 0) + return ERROR_REPLACE; + else + return PyCodec_LookupError(errors); +} + + +PyDoc_STRVAR(iconvcodec_encode__doc__, +"I.encode(unicode, [,errors]) -> (string, length consumed)\n\ +\n\ +Return an encoded string version of `unicode'. errors may be given to\n\ +set a different error handling scheme. Default is 'strict' meaning that\n\ +encoding errors raise a UnicodeEncodeError. Other possible values are\n\ +'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\ +registered with codecs.register_error that can handle UnicodeEncodeErrors."); + +static PyObject * +iconvcodec_encode(iconvcodecObject *self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "input", "errors", NULL }; + Py_UNICODE *input; + int inputlen; + char *errors = NULL/*strict*/, *out, *out_top; + const char *inp, *inp_top; + size_t inplen, inplen_total, outlen, outlen_total, estep; + PyObject *outputobj = NULL, *errorcb = NULL, + *exceptionobj = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|s:encode", + kwlist, &input, &inputlen, &errors)) + return NULL; /* TypeError */ + + errorcb = get_errorcallback(errors); + if (errorcb == NULL) + return NULL; /* LookupError or something else from error handler */ + + inp = inp_top = (char *)input; + inplen = inplen_total = (size_t)(inputlen * Py_UNICODE_SIZE); + + outlen = inputlen * MBENCODED_LENGTH_MAX; + if (outlen < 16) + outlen = 16; /* for iso-2022 codecs */ + + outputobj = PyString_FromStringAndSize(NULL, outlen); + if (outputobj == NULL) + return NULL; + out = out_top = PyString_AS_STRING(outputobj); + outlen_total = outlen; + + estep = inputlen * Py_UNICODE_SIZE / 2; + +#define RESIZE_OUTBUFFER(size) { \ + size_t toadd = (size); \ + outlen_total += toadd; \ + outlen += toadd; \ + if (_PyString_Resize(&outputobj, outlen_total) == -1) \ + goto errorexit; \ + out = PyString_AS_STRING(outputobj) + (out - out_top); \ + out_top = PyString_AS_STRING(outputobj); \ +} + while (inplen > 0) { + if (iconv(self->enchdl, &inp, &inplen, &out, &outlen) == -1) { + char reason[128]; + int errpos; + + if (errno == E2BIG) { + RESIZE_OUTBUFFER(estep); + continue; + } + + if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) { + inplen -= Py_UNICODE_SIZE; + inp += Py_UNICODE_SIZE; + if (errorcb == ERROR_REPLACE) { + if (outlen < 1) + RESIZE_OUTBUFFER(errno == EINVAL ? 1 : estep); + outlen--; + *out++ = REPLACEMENT_CHAR_ENCODE; + } + if (errno == EINVAL) break; + else continue; + } + + errpos = (int)(inp - inp_top) / Py_UNICODE_SIZE; + sprintf(reason, "Undefined character map from " +#if Py_UNICODE_SIZE == 2 + "\\u%04x" +#elif Py_UNICODE_SIZE == 4 + "\\u%08x" +#endif + , *(Py_UNICODE *)inp); + + if (exceptionobj == NULL) { + if ((exceptionobj = PyUnicodeEncodeError_Create( + self->encoding, input, inputlen, + errpos, errpos + 1, reason)) == NULL) + goto errorexit; + } else { + if (PyUnicodeEncodeError_SetStart(exceptionobj, errpos) != 0) + goto errorexit; + if (PyUnicodeEncodeError_SetEnd(exceptionobj, errpos + 1) != 0) + goto errorexit; + if (PyUnicodeEncodeError_SetReason(exceptionobj, reason) != 0) + goto errorexit; + } + + if (errorcb == ERROR_STRICT) { + PyCodec_StrictErrors(exceptionobj); + goto errorexit; + } else { + PyObject *argsobj, *retobj, *retuni; + long newpos; + + argsobj = PyTuple_New(1); + if (argsobj == NULL) + goto errorexit; + PyTuple_SET_ITEM(argsobj, 0, exceptionobj); + Py_INCREF(exceptionobj); + retobj = PyObject_CallObject(errorcb, argsobj); + Py_DECREF(argsobj); + if (retobj == NULL) + goto errorexit; + + if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || + !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) || + !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) { + Py_DECREF(retobj); + PyErr_SetString(PyExc_ValueError, "encoding error handler " + "must return (unicode, int) tuple"); + goto errorexit; + } + if (PyUnicode_GET_SIZE(retuni) > 0) { +#define errorexit errorexit_cbpad + PyObject *retstr = NULL; + int retstrsize; + + retstr = PyUnicode_AsEncodedString( + retuni, self->encoding, NULL); + if (retstr == NULL || !PyString_Check(retstr)) + goto errorexit; + + retstrsize = PyString_GET_SIZE(retstr); + if (outlen < retstrsize) + RESIZE_OUTBUFFER(errno == EINVAL || retstrsize > estep + ? retstrsize - outlen : estep); + + memcpy(out, PyString_AS_STRING(retstr), retstrsize); + out += retstrsize; + outlen -= retstrsize; +#undef errorexit + if (0) { +errorexit_cbpad: Py_XDECREF(retobj); + Py_XDECREF(retstr); + goto errorexit; + } + Py_DECREF(retstr); + } + + newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1)); + Py_DECREF(retobj); + + if (newpos < 0) + newpos = inputlen - newpos; + if (newpos < 0 || newpos >= inputlen) + break; + inp = inp_top + Py_UNICODE_SIZE * newpos; + inplen = inplen_total - Py_UNICODE_SIZE * newpos; + } + } else + break; + } +#undef RESIZE_OUTBUFFER + + { + PyObject *rettup; + int finalsize; + + finalsize = (int)(out - out_top); + + if (finalsize != outlen_total) { + if (_PyString_Resize(&outputobj, finalsize) == -1) + goto errorexit; + } + + if (errorcb > ERROR_MAX) { + Py_DECREF(errorcb); + } + Py_XDECREF(exceptionobj); + + rettup = PyTuple_New(2); + if (rettup == NULL) { + Py_DECREF(outputobj); + return NULL; + } + PyTuple_SET_ITEM(rettup, 0, outputobj); + PyTuple_SET_ITEM(rettup, 1, PyInt_FromLong(inputlen)); + return rettup; + } + +errorexit: + Py_XDECREF(outputobj); + if (errorcb > ERROR_MAX) { + Py_DECREF(errorcb); + } + Py_XDECREF(exceptionobj); + + return NULL; +} + +PyDoc_STRVAR(iconvcodec_decode__doc__, +"I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\ +\n\ +Decodes `string' using I, an iconvcodec instance. errors may be given\n\ +to set a different error handling scheme. Default is 'strict' meaning\n\ +that encoding errors raise a UnicodeDecodeError. Other possible values\n\ +are 'ignore' and 'replace' as well as any other name registerd with\n\ +codecs.register_error that is able to handle UnicodeDecodeErrors."); + +static PyObject * +iconvcodec_decode(iconvcodecObject *self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "input", "errors", NULL }; + char *errors = NULL/*strict*/, *out, *out_top; + const char *inp, *inp_top; + int inplen_int; + size_t inplen, inplen_total, outlen, outlen_total, estep; + PyObject *outputobj = NULL, *errorcb = NULL, + *exceptionobj = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|s:decode", + kwlist, &inp, &inplen_int, &errors)) + return NULL; /* TypeError */ + + errorcb = get_errorcallback(errors); + if (errorcb == NULL) + return NULL; /* LookupError or something else from error handler */ + + inp_top = inp; + inplen_total = inplen = (size_t)inplen_int; + + outputobj = PyUnicode_FromUnicode(NULL, inplen); + if (outputobj == NULL) + return NULL; + outlen_total = outlen = PyUnicode_GET_DATA_SIZE(outputobj); + out = out_top = (char *)PyUnicode_AS_UNICODE(outputobj); + + estep = outlen / 2; + +#define RESIZE_OUTBUFFER(size) { \ + size_t toadd = (size); \ + outlen_total += toadd; \ + outlen += toadd; \ + if (PyUnicode_Resize(&outputobj, outlen_total/Py_UNICODE_SIZE) == -1) \ + goto errorexit; \ + out = (char *)PyUnicode_AS_UNICODE(outputobj) + (out - out_top); \ + out_top = (char *)PyUnicode_AS_UNICODE(outputobj); \ +} + while (inplen > 0) { + if (iconv(self->dechdl, &inp, &inplen, &out, &outlen) == -1) { + char reason[128], *reasonpos = (char *)reason; + int errpos; + + if (errno == E2BIG) { + RESIZE_OUTBUFFER(estep); + continue; + } + + if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) { + inplen--; inp++; + if (errorcb == ERROR_REPLACE) { + Py_UNICODE *replp; + + if (outlen < Py_UNICODE_SIZE) + RESIZE_OUTBUFFER( + errno == EINVAL || Py_UNICODE_SIZE > estep + ? Py_UNICODE_SIZE : estep); + + /* some compilers hate casted lvalue */ + replp = (Py_UNICODE *)out; + assert((long)replp % Py_UNICODE_SIZE == 0);/* aligned? */ + *replp = REPLACEMENT_CHAR_DECODE; + + out += Py_UNICODE_SIZE; + outlen -= Py_UNICODE_SIZE; + } + if (errno == EINVAL) break; + else continue; + } + + errpos = (int)(inp - inp_top); + reasonpos += sprintf(reason, "Invalid multibyte sequence \\x%02x", + (unsigned char)*inp); + if (inplen > 1) { + reasonpos += sprintf(reasonpos, + "\\x%02x", (unsigned char)*(inp+1)); + if (inplen > 2) + sprintf(reasonpos, "\\x%02x", (unsigned char)*(inp+2)); + } + + if (exceptionobj == NULL) { + exceptionobj = PyUnicodeDecodeError_Create( + self->encoding, inp_top, inplen_total, + errpos, errpos + 1, reason); + if (exceptionobj == NULL) + goto errorexit; + } else { + if (PyUnicodeDecodeError_SetStart(exceptionobj, errpos) != 0) + goto errorexit; + if (PyUnicodeDecodeError_SetEnd(exceptionobj, errpos + 1) != 0) + goto errorexit; + if (PyUnicodeDecodeError_SetReason(exceptionobj, reason) != 0) + goto errorexit; + } + + if (errorcb == ERROR_STRICT) { + PyCodec_StrictErrors(exceptionobj); + goto errorexit; + } else { + PyObject *argsobj, *retobj, *retuni; + long newpos; + + argsobj = PyTuple_New(1); + if (argsobj == NULL) + goto errorexit; + PyTuple_SET_ITEM(argsobj, 0, exceptionobj); + Py_INCREF(exceptionobj); + retobj = PyObject_CallObject(errorcb, argsobj); + Py_DECREF(argsobj); + if (retobj == NULL) + goto errorexit; + + if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || + !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) || + !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) { + Py_DECREF(retobj); + PyErr_SetString(PyExc_ValueError, "decoding error handler " + "must return (unicode, int) tuple"); + goto errorexit; + } + if (PyUnicode_GET_SIZE(retuni) > 0) { +#define errorexit errorexit_cbpad + size_t retunisize; + + retunisize = PyUnicode_GET_DATA_SIZE(retuni); + if (outlen < retunisize) + RESIZE_OUTBUFFER(errno == EINVAL || retunisize > estep + ? retunisize - outlen : estep); + + memcpy(out, PyUnicode_AS_DATA(retuni), retunisize); + out += retunisize; + outlen -= retunisize; +#undef errorexit + if (0) { +errorexit_cbpad: Py_DECREF(retobj); + goto errorexit; + } + } + + newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1)); + Py_DECREF(retobj); + + if (newpos < 0) + newpos = inplen_total - newpos; + if (newpos < 0 || newpos >= inplen_total) + break; + inp = inp_top + newpos; + inplen = inplen_total - newpos; + } + } else + break; + } +#undef RESIZE_OUTBUFFER + + { + PyObject *rettup; + int finalsize; + + finalsize = (int)(out - out_top); + if (finalsize != outlen_total) { + if (PyUnicode_Resize(&outputobj, finalsize / Py_UNICODE_SIZE) == -1) + goto errorexit; + } + + if (errorcb > ERROR_MAX) { + Py_DECREF(errorcb); + } + Py_XDECREF(exceptionobj); + + rettup = PyTuple_New(2); + if (rettup == NULL) { + Py_DECREF(outputobj); + return NULL; + } + PyTuple_SET_ITEM(rettup, 0, outputobj); + PyTuple_SET_ITEM(rettup, 1, PyInt_FromLong(inplen_total)); + return rettup; + } + +errorexit: + Py_XDECREF(outputobj); + if (errorcb > ERROR_MAX) { + Py_DECREF(errorcb); + } + Py_XDECREF(exceptionobj); + + return NULL; +} + +static struct PyMethodDef iconvcodec_methods[] = { + {"encode", (PyCFunction)iconvcodec_encode, + METH_VARARGS | METH_KEYWORDS, + iconvcodec_encode__doc__}, + {"decode", (PyCFunction)iconvcodec_decode, + METH_VARARGS | METH_KEYWORDS, + iconvcodec_decode__doc__}, + {NULL, NULL}, +}; + +static PyObject * +iconvcodec_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +{ + PyObject *encobj = NULL; + iconvcodecObject *new = NULL; + + new = (iconvcodecObject *)type->tp_alloc(type, 0); + if (new == NULL) + return NULL; + + new->encoding = NULL; + new->enchdl = new->dechdl = (iconv_t)(-1); + + encobj = PyObject_GetAttrString((PyObject *)new, "encoding"); + if (encobj == NULL) { + PyErr_Clear(); + new->encoding = PyMem_Malloc(sizeof(DEFAULT_ENCODING)); + strcpy(new->encoding, DEFAULT_ENCODING); + } else if (!PyString_Check(encobj)) { + Py_DECREF(encobj); + PyErr_SetString(PyExc_TypeError, + "`encoding' attribute must be a string."); + goto errorexit; + } else { + new->encoding = PyMem_Malloc(PyString_GET_SIZE(encobj) + 1); + strcpy(new->encoding, PyString_AS_STRING(encobj)); + Py_DECREF(encobj); + } + + new->dechdl = iconv_open(UNICODE_ENCODING, new->encoding); + if (new->dechdl == (iconv_t)(-1)) { + PyErr_SetString(PyExc_ValueError, "unsupported decoding"); + goto errorexit; + } + + new->enchdl = iconv_open(new->encoding, UNICODE_ENCODING); + if (new->enchdl == (iconv_t)(-1)) { + PyErr_SetString(PyExc_ValueError, "unsupported encoding"); + iconv_close(new->dechdl); + new->dechdl = (iconv_t)(-1); + goto errorexit; + } + + return (PyObject *)new; + +errorexit: + Py_XDECREF(new); + + return NULL; +} + +static void +iconvcodec_dealloc(iconvcodecObject *self) +{ + _PyObject_GC_UNTRACK(self); + + if (self->enchdl != (iconv_t)-1) + iconv_close(self->enchdl); + if (self->dechdl != (iconv_t)-1) + iconv_close(self->dechdl); + if (self->encoding != NULL) + PyMem_Free(self->encoding); + + PyObject_GC_Del(self); +} + +static PyObject * +iconvcodec_repr(PyObject *self) +{ + return PyString_FromFormat("<iconvcodec encoding='%s'>", + ((iconvcodecObject *)self)->encoding); +} + +statichere PyTypeObject iconvcodec_Type = { + PyObject_HEAD_INIT(&PyType_Type) + 0, /* Number of items for varobject */ + "iconvcodec", /* Name of this type */ + sizeof(iconvcodecObject), /* Basic object size */ + 0, /* Item size for varobject */ + (destructor)iconvcodec_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + iconvcodec_repr, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | + Py_TPFLAGS_HAVE_GC, /* tp_flags */ + iconvcodec_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iterext */ + iconvcodec_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + PyType_GenericAlloc, /* tp_alloc */ + iconvcodec_new, /* tp_new */ + PyObject_GC_Del, /* tp_free */ +}; + +static struct PyMethodDef _iconv_codec_methods[] = { + {NULL, NULL}, +}; + +void +init_iconv_codec(void) +{ + PyObject *m, *d, *t; + + m = Py_InitModule("_iconv_codec", _iconv_codec_methods); + + d = PyModule_GetDict(m); + + t = PyString_FromString(__version__); + PyDict_SetItemString(d, "__version__", t); + Py_DECREF(t); + + PyDict_SetItemString(d, "iconvcodec", (PyObject *)(&iconvcodec_Type)); + PyModule_AddStringConstant(m, "internal_encoding", UNICODE_ENCODING); + + if (PyErr_Occurred()) + Py_FatalError("can't initialize the _iconv_codec module"); +} + +/* + * ex: ts=8 sts=4 et + * $Id$ + */ |