summaryrefslogtreecommitdiffstats
path: root/Modules
diff options
context:
space:
mode:
authorMartin v. Löwis <martin@v.loewis.de>2003-01-26 11:30:36 (GMT)
committerMartin v. Löwis <martin@v.loewis.de>2003-01-26 11:30:36 (GMT)
commit9789aefa616c14599af600bdea925299517da730 (patch)
tree801c3f6ed8aafb3aafeae179737c9aea5e185bb9 /Modules
parentafef4eefa8b07287abe1c59b9ef918e91386f454 (diff)
downloadcpython-9789aefa616c14599af600bdea925299517da730.zip
cpython-9789aefa616c14599af600bdea925299517da730.tar.gz
cpython-9789aefa616c14599af600bdea925299517da730.tar.bz2
Patch #670715: Universal Unicode Codec for POSIX iconv.
Diffstat (limited to 'Modules')
-rw-r--r--Modules/_iconv_codec.c626
1 files changed, 626 insertions, 0 deletions
diff --git a/Modules/_iconv_codec.c b/Modules/_iconv_codec.c
new file mode 100644
index 0000000..9f08102
--- /dev/null
+++ b/Modules/_iconv_codec.c
@@ -0,0 +1,626 @@
+/*
+ * _iconv_codec.c
+ *
+ * libiconv adaptor for Python iconvcodec
+ *
+ * Author : Hye-Shik Chang <perky@FreeBSD.org>
+ * Created : 17 January 2003
+ */
+
+#include "Python.h"
+#include <string.h>
+#include <iconv.h>
+
+static const char *__version__ = "$Revision$";
+
+#if Py_USING_UNICODE
+# if Py_UNICODE_SIZE == 2
+# ifdef __GNU_LIBRARY__
+# define UNICODE_ENCODING "ucs-2"
+# else
+# define UNICODE_ENCODING "ucs-2-internal"
+# endif
+# define MBENCODED_LENGTH_MAX 4
+# elif Py_UNICODE_SIZE == 4
+# ifdef __GNU_LIBRARY__
+# define UNICODE_ENCODING "ucs-4"
+# else
+# define UNICODE_ENCODING "ucs-4-internal"
+# endif
+# define MBENCODED_LENGTH_MAX 6
+# endif
+#else
+# error "Unicode is not available"
+#endif
+
+typedef struct {
+ PyObject_HEAD
+ iconv_t enchdl, dechdl;
+ char *encoding;
+} iconvcodecObject;
+PyDoc_STRVAR(iconvcodec_doc, "iconvcodec object");
+
+staticforward PyTypeObject iconvcodec_Type;
+
+
+#define ERROR_STRICT (PyObject *)(1)
+#define ERROR_IGNORE (PyObject *)(2)
+#define ERROR_REPLACE (PyObject *)(3)
+#define ERROR_MAX ERROR_REPLACE
+
+#define REPLACEMENT_CHAR_DECODE 0xFFFD
+#define REPLACEMENT_CHAR_ENCODE '?'
+
+#define DEFAULT_ENCODING "utf-8"
+
+
+static PyObject *
+get_errorcallback(const char *errors)
+{
+ if (errors == NULL || strcmp(errors, "strict") == 0)
+ return ERROR_STRICT;
+ else if (strcmp(errors, "ignore") == 0)
+ return ERROR_IGNORE;
+ else if (strcmp(errors, "replace") == 0)
+ return ERROR_REPLACE;
+ else
+ return PyCodec_LookupError(errors);
+}
+
+
+PyDoc_STRVAR(iconvcodec_encode__doc__,
+"I.encode(unicode, [,errors]) -> (string, length consumed)\n\
+\n\
+Return an encoded string version of `unicode'. errors may be given to\n\
+set a different error handling scheme. Default is 'strict' meaning that\n\
+encoding errors raise a UnicodeEncodeError. Other possible values are\n\
+'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\
+registered with codecs.register_error that can handle UnicodeEncodeErrors.");
+
+static PyObject *
+iconvcodec_encode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
+{
+ static char *kwlist[] = { "input", "errors", NULL };
+ Py_UNICODE *input;
+ int inputlen;
+ char *errors = NULL/*strict*/, *out, *out_top;
+ const char *inp, *inp_top;
+ size_t inplen, inplen_total, outlen, outlen_total, estep;
+ PyObject *outputobj = NULL, *errorcb = NULL,
+ *exceptionobj = NULL;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|s:encode",
+ kwlist, &input, &inputlen, &errors))
+ return NULL; /* TypeError */
+
+ errorcb = get_errorcallback(errors);
+ if (errorcb == NULL)
+ return NULL; /* LookupError or something else from error handler */
+
+ inp = inp_top = (char *)input;
+ inplen = inplen_total = (size_t)(inputlen * Py_UNICODE_SIZE);
+
+ outlen = inputlen * MBENCODED_LENGTH_MAX;
+ if (outlen < 16)
+ outlen = 16; /* for iso-2022 codecs */
+
+ outputobj = PyString_FromStringAndSize(NULL, outlen);
+ if (outputobj == NULL)
+ return NULL;
+ out = out_top = PyString_AS_STRING(outputobj);
+ outlen_total = outlen;
+
+ estep = inputlen * Py_UNICODE_SIZE / 2;
+
+#define RESIZE_OUTBUFFER(size) { \
+ size_t toadd = (size); \
+ outlen_total += toadd; \
+ outlen += toadd; \
+ if (_PyString_Resize(&outputobj, outlen_total) == -1) \
+ goto errorexit; \
+ out = PyString_AS_STRING(outputobj) + (out - out_top); \
+ out_top = PyString_AS_STRING(outputobj); \
+}
+ while (inplen > 0) {
+ if (iconv(self->enchdl, &inp, &inplen, &out, &outlen) == -1) {
+ char reason[128];
+ int errpos;
+
+ if (errno == E2BIG) {
+ RESIZE_OUTBUFFER(estep);
+ continue;
+ }
+
+ if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
+ inplen -= Py_UNICODE_SIZE;
+ inp += Py_UNICODE_SIZE;
+ if (errorcb == ERROR_REPLACE) {
+ if (outlen < 1)
+ RESIZE_OUTBUFFER(errno == EINVAL ? 1 : estep);
+ outlen--;
+ *out++ = REPLACEMENT_CHAR_ENCODE;
+ }
+ if (errno == EINVAL) break;
+ else continue;
+ }
+
+ errpos = (int)(inp - inp_top) / Py_UNICODE_SIZE;
+ sprintf(reason, "Undefined character map from "
+#if Py_UNICODE_SIZE == 2
+ "\\u%04x"
+#elif Py_UNICODE_SIZE == 4
+ "\\u%08x"
+#endif
+ , *(Py_UNICODE *)inp);
+
+ if (exceptionobj == NULL) {
+ if ((exceptionobj = PyUnicodeEncodeError_Create(
+ self->encoding, input, inputlen,
+ errpos, errpos + 1, reason)) == NULL)
+ goto errorexit;
+ } else {
+ if (PyUnicodeEncodeError_SetStart(exceptionobj, errpos) != 0)
+ goto errorexit;
+ if (PyUnicodeEncodeError_SetEnd(exceptionobj, errpos + 1) != 0)
+ goto errorexit;
+ if (PyUnicodeEncodeError_SetReason(exceptionobj, reason) != 0)
+ goto errorexit;
+ }
+
+ if (errorcb == ERROR_STRICT) {
+ PyCodec_StrictErrors(exceptionobj);
+ goto errorexit;
+ } else {
+ PyObject *argsobj, *retobj, *retuni;
+ long newpos;
+
+ argsobj = PyTuple_New(1);
+ if (argsobj == NULL)
+ goto errorexit;
+ PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
+ Py_INCREF(exceptionobj);
+ retobj = PyObject_CallObject(errorcb, argsobj);
+ Py_DECREF(argsobj);
+ if (retobj == NULL)
+ goto errorexit;
+
+ if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
+ !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
+ !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
+ Py_DECREF(retobj);
+ PyErr_SetString(PyExc_ValueError, "encoding error handler "
+ "must return (unicode, int) tuple");
+ goto errorexit;
+ }
+ if (PyUnicode_GET_SIZE(retuni) > 0) {
+#define errorexit errorexit_cbpad
+ PyObject *retstr = NULL;
+ int retstrsize;
+
+ retstr = PyUnicode_AsEncodedString(
+ retuni, self->encoding, NULL);
+ if (retstr == NULL || !PyString_Check(retstr))
+ goto errorexit;
+
+ retstrsize = PyString_GET_SIZE(retstr);
+ if (outlen < retstrsize)
+ RESIZE_OUTBUFFER(errno == EINVAL || retstrsize > estep
+ ? retstrsize - outlen : estep);
+
+ memcpy(out, PyString_AS_STRING(retstr), retstrsize);
+ out += retstrsize;
+ outlen -= retstrsize;
+#undef errorexit
+ if (0) {
+errorexit_cbpad: Py_XDECREF(retobj);
+ Py_XDECREF(retstr);
+ goto errorexit;
+ }
+ Py_DECREF(retstr);
+ }
+
+ newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
+ Py_DECREF(retobj);
+
+ if (newpos < 0)
+ newpos = inputlen - newpos;
+ if (newpos < 0 || newpos >= inputlen)
+ break;
+ inp = inp_top + Py_UNICODE_SIZE * newpos;
+ inplen = inplen_total - Py_UNICODE_SIZE * newpos;
+ }
+ } else
+ break;
+ }
+#undef RESIZE_OUTBUFFER
+
+ {
+ PyObject *rettup;
+ int finalsize;
+
+ finalsize = (int)(out - out_top);
+
+ if (finalsize != outlen_total) {
+ if (_PyString_Resize(&outputobj, finalsize) == -1)
+ goto errorexit;
+ }
+
+ if (errorcb > ERROR_MAX) {
+ Py_DECREF(errorcb);
+ }
+ Py_XDECREF(exceptionobj);
+
+ rettup = PyTuple_New(2);
+ if (rettup == NULL) {
+ Py_DECREF(outputobj);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(rettup, 0, outputobj);
+ PyTuple_SET_ITEM(rettup, 1, PyInt_FromLong(inputlen));
+ return rettup;
+ }
+
+errorexit:
+ Py_XDECREF(outputobj);
+ if (errorcb > ERROR_MAX) {
+ Py_DECREF(errorcb);
+ }
+ Py_XDECREF(exceptionobj);
+
+ return NULL;
+}
+
+PyDoc_STRVAR(iconvcodec_decode__doc__,
+"I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\
+\n\
+Decodes `string' using I, an iconvcodec instance. errors may be given\n\
+to set a different error handling scheme. Default is 'strict' meaning\n\
+that encoding errors raise a UnicodeDecodeError. Other possible values\n\
+are 'ignore' and 'replace' as well as any other name registerd with\n\
+codecs.register_error that is able to handle UnicodeDecodeErrors.");
+
+static PyObject *
+iconvcodec_decode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
+{
+ static char *kwlist[] = { "input", "errors", NULL };
+ char *errors = NULL/*strict*/, *out, *out_top;
+ const char *inp, *inp_top;
+ int inplen_int;
+ size_t inplen, inplen_total, outlen, outlen_total, estep;
+ PyObject *outputobj = NULL, *errorcb = NULL,
+ *exceptionobj = NULL;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|s:decode",
+ kwlist, &inp, &inplen_int, &errors))
+ return NULL; /* TypeError */
+
+ errorcb = get_errorcallback(errors);
+ if (errorcb == NULL)
+ return NULL; /* LookupError or something else from error handler */
+
+ inp_top = inp;
+ inplen_total = inplen = (size_t)inplen_int;
+
+ outputobj = PyUnicode_FromUnicode(NULL, inplen);
+ if (outputobj == NULL)
+ return NULL;
+ outlen_total = outlen = PyUnicode_GET_DATA_SIZE(outputobj);
+ out = out_top = (char *)PyUnicode_AS_UNICODE(outputobj);
+
+ estep = outlen / 2;
+
+#define RESIZE_OUTBUFFER(size) { \
+ size_t toadd = (size); \
+ outlen_total += toadd; \
+ outlen += toadd; \
+ if (PyUnicode_Resize(&outputobj, outlen_total/Py_UNICODE_SIZE) == -1) \
+ goto errorexit; \
+ out = (char *)PyUnicode_AS_UNICODE(outputobj) + (out - out_top); \
+ out_top = (char *)PyUnicode_AS_UNICODE(outputobj); \
+}
+ while (inplen > 0) {
+ if (iconv(self->dechdl, &inp, &inplen, &out, &outlen) == -1) {
+ char reason[128], *reasonpos = (char *)reason;
+ int errpos;
+
+ if (errno == E2BIG) {
+ RESIZE_OUTBUFFER(estep);
+ continue;
+ }
+
+ if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
+ inplen--; inp++;
+ if (errorcb == ERROR_REPLACE) {
+ Py_UNICODE *replp;
+
+ if (outlen < Py_UNICODE_SIZE)
+ RESIZE_OUTBUFFER(
+ errno == EINVAL || Py_UNICODE_SIZE > estep
+ ? Py_UNICODE_SIZE : estep);
+
+ /* some compilers hate casted lvalue */
+ replp = (Py_UNICODE *)out;
+ assert((long)replp % Py_UNICODE_SIZE == 0);/* aligned? */
+ *replp = REPLACEMENT_CHAR_DECODE;
+
+ out += Py_UNICODE_SIZE;
+ outlen -= Py_UNICODE_SIZE;
+ }
+ if (errno == EINVAL) break;
+ else continue;
+ }
+
+ errpos = (int)(inp - inp_top);
+ reasonpos += sprintf(reason, "Invalid multibyte sequence \\x%02x",
+ (unsigned char)*inp);
+ if (inplen > 1) {
+ reasonpos += sprintf(reasonpos,
+ "\\x%02x", (unsigned char)*(inp+1));
+ if (inplen > 2)
+ sprintf(reasonpos, "\\x%02x", (unsigned char)*(inp+2));
+ }
+
+ if (exceptionobj == NULL) {
+ exceptionobj = PyUnicodeDecodeError_Create(
+ self->encoding, inp_top, inplen_total,
+ errpos, errpos + 1, reason);
+ if (exceptionobj == NULL)
+ goto errorexit;
+ } else {
+ if (PyUnicodeDecodeError_SetStart(exceptionobj, errpos) != 0)
+ goto errorexit;
+ if (PyUnicodeDecodeError_SetEnd(exceptionobj, errpos + 1) != 0)
+ goto errorexit;
+ if (PyUnicodeDecodeError_SetReason(exceptionobj, reason) != 0)
+ goto errorexit;
+ }
+
+ if (errorcb == ERROR_STRICT) {
+ PyCodec_StrictErrors(exceptionobj);
+ goto errorexit;
+ } else {
+ PyObject *argsobj, *retobj, *retuni;
+ long newpos;
+
+ argsobj = PyTuple_New(1);
+ if (argsobj == NULL)
+ goto errorexit;
+ PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
+ Py_INCREF(exceptionobj);
+ retobj = PyObject_CallObject(errorcb, argsobj);
+ Py_DECREF(argsobj);
+ if (retobj == NULL)
+ goto errorexit;
+
+ if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
+ !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
+ !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
+ Py_DECREF(retobj);
+ PyErr_SetString(PyExc_ValueError, "decoding error handler "
+ "must return (unicode, int) tuple");
+ goto errorexit;
+ }
+ if (PyUnicode_GET_SIZE(retuni) > 0) {
+#define errorexit errorexit_cbpad
+ size_t retunisize;
+
+ retunisize = PyUnicode_GET_DATA_SIZE(retuni);
+ if (outlen < retunisize)
+ RESIZE_OUTBUFFER(errno == EINVAL || retunisize > estep
+ ? retunisize - outlen : estep);
+
+ memcpy(out, PyUnicode_AS_DATA(retuni), retunisize);
+ out += retunisize;
+ outlen -= retunisize;
+#undef errorexit
+ if (0) {
+errorexit_cbpad: Py_DECREF(retobj);
+ goto errorexit;
+ }
+ }
+
+ newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
+ Py_DECREF(retobj);
+
+ if (newpos < 0)
+ newpos = inplen_total - newpos;
+ if (newpos < 0 || newpos >= inplen_total)
+ break;
+ inp = inp_top + newpos;
+ inplen = inplen_total - newpos;
+ }
+ } else
+ break;
+ }
+#undef RESIZE_OUTBUFFER
+
+ {
+ PyObject *rettup;
+ int finalsize;
+
+ finalsize = (int)(out - out_top);
+ if (finalsize != outlen_total) {
+ if (PyUnicode_Resize(&outputobj, finalsize / Py_UNICODE_SIZE) == -1)
+ goto errorexit;
+ }
+
+ if (errorcb > ERROR_MAX) {
+ Py_DECREF(errorcb);
+ }
+ Py_XDECREF(exceptionobj);
+
+ rettup = PyTuple_New(2);
+ if (rettup == NULL) {
+ Py_DECREF(outputobj);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(rettup, 0, outputobj);
+ PyTuple_SET_ITEM(rettup, 1, PyInt_FromLong(inplen_total));
+ return rettup;
+ }
+
+errorexit:
+ Py_XDECREF(outputobj);
+ if (errorcb > ERROR_MAX) {
+ Py_DECREF(errorcb);
+ }
+ Py_XDECREF(exceptionobj);
+
+ return NULL;
+}
+
+static struct PyMethodDef iconvcodec_methods[] = {
+ {"encode", (PyCFunction)iconvcodec_encode,
+ METH_VARARGS | METH_KEYWORDS,
+ iconvcodec_encode__doc__},
+ {"decode", (PyCFunction)iconvcodec_decode,
+ METH_VARARGS | METH_KEYWORDS,
+ iconvcodec_decode__doc__},
+ {NULL, NULL},
+};
+
+static PyObject *
+iconvcodec_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+ PyObject *encobj = NULL;
+ iconvcodecObject *new = NULL;
+
+ new = (iconvcodecObject *)type->tp_alloc(type, 0);
+ if (new == NULL)
+ return NULL;
+
+ new->encoding = NULL;
+ new->enchdl = new->dechdl = (iconv_t)(-1);
+
+ encobj = PyObject_GetAttrString((PyObject *)new, "encoding");
+ if (encobj == NULL) {
+ PyErr_Clear();
+ new->encoding = PyMem_Malloc(sizeof(DEFAULT_ENCODING));
+ strcpy(new->encoding, DEFAULT_ENCODING);
+ } else if (!PyString_Check(encobj)) {
+ Py_DECREF(encobj);
+ PyErr_SetString(PyExc_TypeError,
+ "`encoding' attribute must be a string.");
+ goto errorexit;
+ } else {
+ new->encoding = PyMem_Malloc(PyString_GET_SIZE(encobj) + 1);
+ strcpy(new->encoding, PyString_AS_STRING(encobj));
+ Py_DECREF(encobj);
+ }
+
+ new->dechdl = iconv_open(UNICODE_ENCODING, new->encoding);
+ if (new->dechdl == (iconv_t)(-1)) {
+ PyErr_SetString(PyExc_ValueError, "unsupported decoding");
+ goto errorexit;
+ }
+
+ new->enchdl = iconv_open(new->encoding, UNICODE_ENCODING);
+ if (new->enchdl == (iconv_t)(-1)) {
+ PyErr_SetString(PyExc_ValueError, "unsupported encoding");
+ iconv_close(new->dechdl);
+ new->dechdl = (iconv_t)(-1);
+ goto errorexit;
+ }
+
+ return (PyObject *)new;
+
+errorexit:
+ Py_XDECREF(new);
+
+ return NULL;
+}
+
+static void
+iconvcodec_dealloc(iconvcodecObject *self)
+{
+ _PyObject_GC_UNTRACK(self);
+
+ if (self->enchdl != (iconv_t)-1)
+ iconv_close(self->enchdl);
+ if (self->dechdl != (iconv_t)-1)
+ iconv_close(self->dechdl);
+ if (self->encoding != NULL)
+ PyMem_Free(self->encoding);
+
+ PyObject_GC_Del(self);
+}
+
+static PyObject *
+iconvcodec_repr(PyObject *self)
+{
+ return PyString_FromFormat("<iconvcodec encoding='%s'>",
+ ((iconvcodecObject *)self)->encoding);
+}
+
+statichere PyTypeObject iconvcodec_Type = {
+ PyObject_HEAD_INIT(&PyType_Type)
+ 0, /* Number of items for varobject */
+ "iconvcodec", /* Name of this type */
+ sizeof(iconvcodecObject), /* Basic object size */
+ 0, /* Item size for varobject */
+ (destructor)iconvcodec_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ iconvcodec_repr, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ PyObject_GenericGetAttr, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
+ Py_TPFLAGS_HAVE_GC, /* tp_flags */
+ iconvcodec_doc, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iterext */
+ iconvcodec_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ PyType_GenericAlloc, /* tp_alloc */
+ iconvcodec_new, /* tp_new */
+ PyObject_GC_Del, /* tp_free */
+};
+
+static struct PyMethodDef _iconv_codec_methods[] = {
+ {NULL, NULL},
+};
+
+void
+init_iconv_codec(void)
+{
+ PyObject *m, *d, *t;
+
+ m = Py_InitModule("_iconv_codec", _iconv_codec_methods);
+
+ d = PyModule_GetDict(m);
+
+ t = PyString_FromString(__version__);
+ PyDict_SetItemString(d, "__version__", t);
+ Py_DECREF(t);
+
+ PyDict_SetItemString(d, "iconvcodec", (PyObject *)(&iconvcodec_Type));
+ PyModule_AddStringConstant(m, "internal_encoding", UNICODE_ENCODING);
+
+ if (PyErr_Occurred())
+ Py_FatalError("can't initialize the _iconv_codec module");
+}
+
+/*
+ * ex: ts=8 sts=4 et
+ * $Id$
+ */