diff options
author | Thomas Wouters <thomas@python.org> | 2006-04-21 10:40:58 (GMT) |
---|---|---|
committer | Thomas Wouters <thomas@python.org> | 2006-04-21 10:40:58 (GMT) |
commit | 49fd7fa4431da299196d74087df4a04f99f9c46f (patch) | |
tree | 35ace5fe78d3d52c7a9ab356ab9f6dbf8d4b71f4 /Modules/cjkcodecs/multibytecodec.c | |
parent | 9ada3d6e29d5165dadacbe6be07bcd35cfbef59d (diff) | |
download | cpython-49fd7fa4431da299196d74087df4a04f99f9c46f.zip cpython-49fd7fa4431da299196d74087df4a04f99f9c46f.tar.gz cpython-49fd7fa4431da299196d74087df4a04f99f9c46f.tar.bz2 |
Merge p3yk branch with the trunk up to revision 45595. This breaks a fair
number of tests, all because of the codecs/_multibytecodecs issue described
here (it's not a Py3K issue, just something Py3K discovers):
http://mail.python.org/pipermail/python-dev/2006-April/064051.html
Hye-Shik Chang promised to look for a fix, so no need to fix it here. The
tests that are expected to break are:
test_codecencodings_cn
test_codecencodings_hk
test_codecencodings_jp
test_codecencodings_kr
test_codecencodings_tw
test_codecs
test_multibytecodec
This merge fixes an actual test failure (test_weakref) in this branch,
though, so I believe merging is the right thing to do anyway.
Diffstat (limited to 'Modules/cjkcodecs/multibytecodec.c')
-rw-r--r-- | Modules/cjkcodecs/multibytecodec.c | 1106 |
1 files changed, 801 insertions, 305 deletions
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index f51b6f2..340de18 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -6,6 +6,7 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" +#include "structmember.h" #include "multibytecodec.h" typedef struct { @@ -38,22 +39,14 @@ that encoding errors raise a UnicodeDecodeError. Other possible values\n\ are 'ignore' and 'replace' as well as any other name registerd with\n\ codecs.register_error that is able to handle UnicodeDecodeErrors."); -PyDoc_STRVAR(MultibyteCodec_StreamReader__doc__, -"I.StreamReader(stream[, errors]) -> StreamReader instance"); - -PyDoc_STRVAR(MultibyteCodec_StreamWriter__doc__, -"I.StreamWriter(stream[, errors]) -> StreamWriter instance"); - static char *codeckwarglist[] = {"input", "errors", NULL}; +static char *incnewkwarglist[] = {"errors", NULL}; +static char *incrementalkwarglist[] = {"input", "final", NULL}; static char *streamkwarglist[] = {"stream", "errors", NULL}; static PyObject *multibytecodec_encode(MultibyteCodec *, MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t, PyObject *, int); -static PyObject *mbstreamreader_create(MultibyteCodec *, - PyObject *, const char *); -static PyObject *mbstreamwriter_create(MultibyteCodec *, - PyObject *, const char *); #define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */ @@ -83,7 +76,7 @@ make_tuple(PyObject *object, Py_ssize_t len) } static PyObject * -get_errorcallback(const char *errors) +internal_error_callback(const char *errors) { if (errors == NULL || strcmp(errors, "strict") == 0) return ERROR_STRICT; @@ -91,17 +84,88 @@ get_errorcallback(const char *errors) return ERROR_IGNORE; else if (strcmp(errors, "replace") == 0) return ERROR_REPLACE; + else + return PyString_FromString(errors); +} + +static PyObject * +call_error_callback(PyObject *errors, PyObject *exc) +{ + PyObject *args, *cb, *r; + + assert(PyString_Check(errors)); + cb = PyCodec_LookupError(PyString_AS_STRING(errors)); + if (cb == NULL) + return NULL; + + args = PyTuple_New(1); + if (args == NULL) { + Py_DECREF(cb); + return NULL; + } + + PyTuple_SET_ITEM(args, 0, exc); + Py_INCREF(exc); + + r = PyObject_CallObject(cb, args); + Py_DECREF(args); + Py_DECREF(cb); + return r; +} + +static PyObject * +codecctx_errors_get(MultibyteStatefulCodecContext *self) +{ + const char *errors; + + if (self->errors == ERROR_STRICT) + errors = "strict"; + else if (self->errors == ERROR_IGNORE) + errors = "ignore"; + else if (self->errors == ERROR_REPLACE) + errors = "replace"; else { - return PyCodec_LookupError(errors); + Py_INCREF(self->errors); + return self->errors; } + + return PyString_FromString(errors); +} + +static int +codecctx_errors_set(MultibyteStatefulCodecContext *self, PyObject *value, + void *closure) +{ + PyObject *cb; + + if (!PyString_Check(value)) { + PyErr_SetString(PyExc_TypeError, "errors must be a string"); + return -1; + } + + cb = internal_error_callback(PyString_AS_STRING(value)); + if (cb == NULL) + return -1; + + ERROR_DECREF(self->errors); + self->errors = cb; + return 0; } +/* This getset handlers list is used by all the stateful codec objects */ +static PyGetSetDef codecctx_getsets[] = { + {"errors", (getter)codecctx_errors_get, + (setter)codecctx_errors_set, + PyDoc_STR("how to treat errors")}, + {NULL,} +}; + static int expand_encodebuffer(MultibyteEncodeBuffer *buf, Py_ssize_t esize) { Py_ssize_t orgpos, orgsize; - orgpos = (Py_ssize_t)((char*)buf->outbuf - + orgpos = (Py_ssize_t)((char *)buf->outbuf - PyString_AS_STRING(buf->outobj)); orgsize = PyString_GET_SIZE(buf->outobj); if (_PyString_Resize(&buf->outobj, orgsize + ( @@ -125,8 +189,7 @@ expand_decodebuffer(MultibyteDecodeBuffer *buf, Py_ssize_t esize) { Py_ssize_t orgpos, orgsize; - orgpos = (Py_ssize_t)(buf->outbuf - - PyUnicode_AS_UNICODE(buf->outobj)); + orgpos = (Py_ssize_t)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj)); orgsize = PyUnicode_GET_SIZE(buf->outobj); if (PyUnicode_Resize(&buf->outobj, orgsize + ( esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1) @@ -144,16 +207,21 @@ expand_decodebuffer(MultibyteDecodeBuffer *buf, Py_ssize_t esize) goto errorexit; \ } + +/** + * MultibyteCodec object + */ + static int multibytecodec_encerror(MultibyteCodec *codec, MultibyteCodec_State *state, MultibyteEncodeBuffer *buf, PyObject *errors, Py_ssize_t e) { - PyObject *retobj = NULL, *retstr = NULL, *argsobj, *tobj; + PyObject *retobj = NULL, *retstr = NULL, *tobj; Py_ssize_t retstrsize, newpos; - const char *reason; Py_ssize_t esize, start, end; + const char *reason; if (e > 0) { reason = "illegal multibyte sequence"; @@ -166,7 +234,7 @@ multibytecodec_encerror(MultibyteCodec *codec, return 0; /* retry it */ case MBERR_TOOFEW: reason = "incomplete multibyte sequence"; - esize = (size_t)(buf->inbuf_end - buf->inbuf); + esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); break; case MBERR_INTERNAL: PyErr_SetString(PyExc_RuntimeError, @@ -230,21 +298,15 @@ multibytecodec_encerror(MultibyteCodec *codec, goto errorexit; } - argsobj = PyTuple_New(1); - if (argsobj == NULL) - goto errorexit; - - PyTuple_SET_ITEM(argsobj, 0, buf->excobj); - Py_INCREF(buf->excobj); - retobj = PyObject_CallObject(errors, argsobj); - Py_DECREF(argsobj); + retobj = call_error_callback(errors, buf->excobj); if (retobj == NULL) goto errorexit; if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || !PyUnicode_Check((tobj = PyTuple_GET_ITEM(retobj, 0))) || - !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) { - PyErr_SetString(PyExc_ValueError, + !(PyInt_Check(PyTuple_GET_ITEM(retobj, 1)) || + PyLong_Check(PyTuple_GET_ITEM(retobj, 1)))) { + PyErr_SetString(PyExc_TypeError, "encoding error handler must return " "(unicode, int) tuple"); goto errorexit; @@ -267,12 +329,13 @@ multibytecodec_encerror(MultibyteCodec *codec, buf->outbuf += retstrsize; newpos = PyInt_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); - if (newpos < 0) + if (newpos < 0 && !PyErr_Occurred()) newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { + PyErr_Clear(); PyErr_Format(PyExc_IndexError, - "position %d from error handler out of bounds", - (int)newpos); + "position %zd from error handler out of bounds", + newpos); goto errorexit; } buf->inbuf = buf->inbuf_top + newpos; @@ -293,7 +356,7 @@ multibytecodec_decerror(MultibyteCodec *codec, MultibyteDecodeBuffer *buf, PyObject *errors, Py_ssize_t e) { - PyObject *argsobj, *retobj = NULL, *retuni = NULL; + PyObject *retobj = NULL, *retuni = NULL; Py_ssize_t retunisize, newpos; const char *reason; Py_ssize_t esize, start, end; @@ -309,7 +372,7 @@ multibytecodec_decerror(MultibyteCodec *codec, return 0; /* retry it */ case MBERR_TOOFEW: reason = "incomplete multibyte sequence"; - esize = (size_t)(buf->inbuf_end - buf->inbuf); + esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); break; case MBERR_INTERNAL: PyErr_SetString(PyExc_RuntimeError, @@ -354,21 +417,15 @@ multibytecodec_decerror(MultibyteCodec *codec, goto errorexit; } - argsobj = PyTuple_New(1); - if (argsobj == NULL) - goto errorexit; - - PyTuple_SET_ITEM(argsobj, 0, buf->excobj); - Py_INCREF(buf->excobj); - retobj = PyObject_CallObject(errors, argsobj); - Py_DECREF(argsobj); + retobj = call_error_callback(errors, buf->excobj); if (retobj == NULL) goto errorexit; if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) || - !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) { - PyErr_SetString(PyExc_ValueError, + !(PyInt_Check(PyTuple_GET_ITEM(retobj, 1)) || + PyLong_Check(PyTuple_GET_ITEM(retobj, 1)))) { + PyErr_SetString(PyExc_TypeError, "decoding error handler must return " "(unicode, int) tuple"); goto errorexit; @@ -383,12 +440,13 @@ multibytecodec_decerror(MultibyteCodec *codec, } newpos = PyInt_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); - if (newpos < 0) + if (newpos < 0 && !PyErr_Occurred()) newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { + PyErr_Clear(); PyErr_Format(PyExc_IndexError, - "position %d from error handler out of bounds", - (int)newpos); + "position %zd from error handler out of bounds", + newpos); goto errorexit; } buf->inbuf = buf->inbuf_top + newpos; @@ -453,7 +511,7 @@ multibytecodec_encode(MultibyteCodec *codec, goto errorexit; } - finalsize = (Py_ssize_t)((char*)buf.outbuf - + finalsize = (Py_ssize_t)((char *)buf.outbuf - PyString_AS_STRING(buf.outobj)); if (finalsize != PyString_GET_SIZE(buf.outobj)) @@ -500,7 +558,7 @@ MultibyteCodec_Encode(MultibyteCodecObject *self, data = PyUnicode_AS_UNICODE(arg); datalen = PyUnicode_GET_SIZE(arg); - errorcb = get_errorcallback(errors); + errorcb = internal_error_callback(errors); if (errorcb == NULL) { Py_XDECREF(ucvt); return NULL; @@ -515,16 +573,12 @@ MultibyteCodec_Encode(MultibyteCodecObject *self, if (r == NULL) goto errorexit; - if (errorcb > ERROR_MAX) { - Py_DECREF(errorcb); - } + ERROR_DECREF(errorcb); Py_XDECREF(ucvt); return make_tuple(r, datalen); errorexit: - if (errorcb > ERROR_MAX) { - Py_DECREF(errorcb); - } + ERROR_DECREF(errorcb); Py_XDECREF(ucvt); return NULL; } @@ -543,18 +597,16 @@ MultibyteCodec_Decode(MultibyteCodecObject *self, codeckwarglist, &data, &datalen, &errors)) return NULL; - errorcb = get_errorcallback(errors); + errorcb = internal_error_callback(errors); if (errorcb == NULL) return NULL; if (datalen == 0) { - if (errorcb > ERROR_MAX) { - Py_DECREF(errorcb); - } + ERROR_DECREF(errorcb); return make_tuple(PyUnicode_FromUnicode(NULL, 0), 0); } - buf.outobj = buf.excobj = NULL; + buf.excobj = NULL; buf.inbuf = buf.inbuf_top = (unsigned char *)data; buf.inbuf_end = buf.inbuf_top + datalen; buf.outobj = PyUnicode_FromUnicode(NULL, datalen); @@ -590,49 +642,17 @@ MultibyteCodec_Decode(MultibyteCodecObject *self, goto errorexit; Py_XDECREF(buf.excobj); - if (errorcb > ERROR_MAX) { - Py_DECREF(errorcb); - } + ERROR_DECREF(errorcb); return make_tuple(buf.outobj, datalen); errorexit: - if (errorcb > ERROR_MAX) { - Py_DECREF(errorcb); - } + ERROR_DECREF(errorcb); Py_XDECREF(buf.excobj); Py_XDECREF(buf.outobj); return NULL; } -static PyObject * -MultibyteCodec_StreamReader(MultibyteCodecObject *self, - PyObject *args, PyObject *kwargs) -{ - PyObject *stream; - char *errors = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s:StreamReader", - streamkwarglist, &stream, &errors)) - return NULL; - - return mbstreamreader_create(self->codec, stream, errors); -} - -static PyObject * -MultibyteCodec_StreamWriter(MultibyteCodecObject *self, - PyObject *args, PyObject *kwargs) -{ - PyObject *stream; - char *errors = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s:StreamWriter", - streamkwarglist, &stream, &errors)) - return NULL; - - return mbstreamwriter_create(self->codec, stream, errors); -} - static struct PyMethodDef multibytecodec_methods[] = { {"encode", (PyCFunction)MultibyteCodec_Encode, METH_VARARGS | METH_KEYWORDS, @@ -640,12 +660,6 @@ static struct PyMethodDef multibytecodec_methods[] = { {"decode", (PyCFunction)MultibyteCodec_Decode, METH_VARARGS | METH_KEYWORDS, MultibyteCodec_Decode__doc__}, - {"StreamReader",(PyCFunction)MultibyteCodec_StreamReader, - METH_VARARGS | METH_KEYWORDS, - MultibyteCodec_StreamReader__doc__}, - {"StreamWriter",(PyCFunction)MultibyteCodec_StreamWriter, - METH_VARARGS | METH_KEYWORDS, - MultibyteCodec_StreamWriter__doc__}, {NULL, NULL}, }; @@ -655,8 +669,6 @@ multibytecodec_dealloc(MultibyteCodecObject *self) PyObject_Del(self); } - - static PyTypeObject MultibyteCodec_Type = { PyObject_HEAD_INIT(NULL) 0, /* ob_size */ @@ -690,13 +702,498 @@ static PyTypeObject MultibyteCodec_Type = { multibytecodec_methods, /* tp_methods */ }; + +/** + * Utility functions for stateful codec mechanism + */ + +#define STATEFUL_DCTX(o) ((MultibyteStatefulDecoderContext *)(o)) +#define STATEFUL_ECTX(o) ((MultibyteStatefulEncoderContext *)(o)) + +static PyObject * +encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, + PyObject *unistr, int final) +{ + PyObject *ucvt, *r = NULL; + Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; + Py_ssize_t datalen, origpending; + + if (PyUnicode_Check(unistr)) + ucvt = NULL; + else { + unistr = ucvt = PyObject_Unicode(unistr); + if (unistr == NULL) + return NULL; + else if (!PyUnicode_Check(unistr)) { + PyErr_SetString(PyExc_TypeError, + "couldn't convert the object to unicode."); + Py_DECREF(ucvt); + return NULL; + } + } + + datalen = PyUnicode_GET_SIZE(unistr); + origpending = ctx->pendingsize; + + if (origpending > 0) { + inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize); + if (inbuf_tmp == NULL) + goto errorexit; + memcpy(inbuf_tmp, ctx->pending, + Py_UNICODE_SIZE * ctx->pendingsize); + memcpy(inbuf_tmp + ctx->pendingsize, + PyUnicode_AS_UNICODE(unistr), + Py_UNICODE_SIZE * datalen); + datalen += ctx->pendingsize; + ctx->pendingsize = 0; + inbuf = inbuf_tmp; + } + else + inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); + + inbuf_end = inbuf + datalen; + + r = multibytecodec_encode(ctx->codec, &ctx->state, + (const Py_UNICODE **)&inbuf, + datalen, ctx->errors, final ? MBENC_FLUSH : 0); + if (r == NULL) { + /* recover the original pending buffer */ + if (origpending > 0) + memcpy(ctx->pending, inbuf_tmp, + Py_UNICODE_SIZE * origpending); + ctx->pendingsize = origpending; + goto errorexit; + } + + if (inbuf < inbuf_end) { + ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf); + if (ctx->pendingsize > MAXENCPENDING) { + /* normal codecs can't reach here */ + ctx->pendingsize = 0; + PyErr_SetString(PyExc_UnicodeError, + "pending buffer overflow"); + goto errorexit; + } + memcpy(ctx->pending, inbuf, + ctx->pendingsize * Py_UNICODE_SIZE); + } + + if (inbuf_tmp != NULL) + PyMem_Del(inbuf_tmp); + Py_XDECREF(ucvt); + return r; + +errorexit: + if (inbuf_tmp != NULL) + PyMem_Del(inbuf_tmp); + Py_XDECREF(r); + Py_XDECREF(ucvt); + return NULL; +} + +static int +decoder_append_pending(MultibyteStatefulDecoderContext *ctx, + MultibyteDecodeBuffer *buf) +{ + Py_ssize_t npendings; + + npendings = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); + if (npendings + ctx->pendingsize > MAXDECPENDING) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer overflow"); + return -1; + } + memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings); + ctx->pendingsize += npendings; + return 0; +} + +static int +decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data, + Py_ssize_t size) +{ + buf->inbuf = buf->inbuf_top = (const unsigned char *)data; + buf->inbuf_end = buf->inbuf_top + size; + if (buf->outobj == NULL) { /* only if outobj is not allocated yet */ + buf->outobj = PyUnicode_FromUnicode(NULL, size); + if (buf->outobj == NULL) + return -1; + buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj); + buf->outbuf_end = buf->outbuf + + PyUnicode_GET_SIZE(buf->outobj); + } + + return 0; +} + +static int +decoder_feed_buffer(MultibyteStatefulDecoderContext *ctx, + MultibyteDecodeBuffer *buf) +{ + while (buf->inbuf < buf->inbuf_end) { + Py_ssize_t inleft, outleft; + int r; + + inleft = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); + outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); + + r = ctx->codec->decode(&ctx->state, ctx->codec->config, + &buf->inbuf, inleft, &buf->outbuf, outleft); + if (r == 0 || r == MBERR_TOOFEW) + break; + else if (multibytecodec_decerror(ctx->codec, &ctx->state, + buf, ctx->errors, r)) + return -1; + } + return 0; +} + + +/** + * MultibyteIncrementalEncoder object + */ + +static PyObject * +mbiencoder_encode(MultibyteIncrementalEncoderObject *self, + PyObject *args, PyObject *kwargs) +{ + PyObject *data; + int final = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:encode", + incrementalkwarglist, &data, &final)) + return NULL; + + return encoder_encode_stateful(STATEFUL_ECTX(self), data, final); +} + +static PyObject * +mbiencoder_reset(MultibyteIncrementalEncoderObject *self) +{ + if (self->codec->decreset != NULL && + self->codec->decreset(&self->state, self->codec->config) != 0) + return NULL; + self->pendingsize = 0; + + Py_RETURN_NONE; +} + +static struct PyMethodDef mbiencoder_methods[] = { + {"encode", (PyCFunction)mbiencoder_encode, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"reset", (PyCFunction)mbiencoder_reset, + METH_NOARGS, NULL}, + {NULL, NULL}, +}; + +static PyObject * +mbiencoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + MultibyteIncrementalEncoderObject *self; + PyObject *codec = NULL; + char *errors = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s:IncrementalEncoder", + incnewkwarglist, &errors)) + return NULL; + + self = (MultibyteIncrementalEncoderObject *)type->tp_alloc(type, 0); + if (self == NULL) + return NULL; + + codec = PyObject_GetAttrString((PyObject *)type, "codec"); + if (codec == NULL) + goto errorexit; + if (!MultibyteCodec_Check(codec)) { + PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); + goto errorexit; + } + + self->codec = ((MultibyteCodecObject *)codec)->codec; + self->pendingsize = 0; + self->errors = internal_error_callback(errors); + if (self->errors == NULL) + goto errorexit; + if (self->codec->encinit != NULL && + self->codec->encinit(&self->state, self->codec->config) != 0) + goto errorexit; + + Py_DECREF(codec); + return (PyObject *)self; + +errorexit: + Py_XDECREF(self); + Py_XDECREF(codec); + return NULL; +} + +static int +mbiencoder_traverse(MultibyteIncrementalEncoderObject *self, + visitproc visit, void *arg) +{ + if (ERROR_ISCUSTOM(self->errors)) + Py_VISIT(self->errors); + return 0; +} + +static void +mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self) +{ + PyObject_GC_UnTrack(self); + ERROR_DECREF(self->errors); + self->ob_type->tp_free(self); +} + +static PyTypeObject MultibyteIncrementalEncoder_Type = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "MultibyteIncrementalEncoder", /* tp_name */ + sizeof(MultibyteIncrementalEncoderObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)mbiencoder_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC + | Py_TPFLAGS_BASETYPE, /* tp_flags */ + 0, /* tp_doc */ + (traverseproc)mbiencoder_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iterext */ + mbiencoder_methods, /* tp_methods */ + 0, /* tp_members */ + codecctx_getsets, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + mbiencoder_new, /* tp_new */ +}; + + +/** + * MultibyteIncrementalDecoder object + */ + +static PyObject * +mbidecoder_decode(MultibyteIncrementalDecoderObject *self, + PyObject *args, PyObject *kwargs) +{ + MultibyteDecodeBuffer buf; + char *data, *wdata; + Py_ssize_t wsize, finalsize = 0, size, origpending; + int final = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "t#|i:decode", + incrementalkwarglist, &data, &size, &final)) + return NULL; + + buf.outobj = buf.excobj = NULL; + origpending = self->pendingsize; + + if (self->pendingsize == 0) { + wsize = size; + wdata = data; + } + else { + wsize = size + self->pendingsize; + wdata = PyMem_Malloc(wsize); + if (wdata == NULL) + goto errorexit; + memcpy(wdata, self->pending, self->pendingsize); + memcpy(wdata + self->pendingsize, data, size); + self->pendingsize = 0; + } + + if (decoder_prepare_buffer(&buf, wdata, wsize) != 0) + goto errorexit; + + if (decoder_feed_buffer(STATEFUL_DCTX(self), &buf)) + goto errorexit; + + if (final && buf.inbuf < buf.inbuf_end) { + if (multibytecodec_decerror(self->codec, &self->state, + &buf, self->errors, MBERR_TOOFEW)) { + /* recover the original pending buffer */ + memcpy(self->pending, wdata, origpending); + self->pendingsize = origpending; + goto errorexit; + } + } + + if (buf.inbuf < buf.inbuf_end) { /* pending sequence still exists */ + if (decoder_append_pending(STATEFUL_DCTX(self), &buf) != 0) + goto errorexit; + } + + finalsize = (Py_ssize_t)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj)); + if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) + if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) + goto errorexit; + + if (wdata != data) + PyMem_Del(wdata); + Py_XDECREF(buf.excobj); + return buf.outobj; + +errorexit: + if (wdata != NULL && wdata != data) + PyMem_Del(wdata); + Py_XDECREF(buf.excobj); + Py_XDECREF(buf.outobj); + return NULL; +} + +static PyObject * +mbidecoder_reset(MultibyteIncrementalDecoderObject *self) +{ + if (self->codec->decreset != NULL && + self->codec->decreset(&self->state, self->codec->config) != 0) + return NULL; + self->pendingsize = 0; + + Py_RETURN_NONE; +} + +static struct PyMethodDef mbidecoder_methods[] = { + {"decode", (PyCFunction)mbidecoder_decode, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"reset", (PyCFunction)mbidecoder_reset, + METH_NOARGS, NULL}, + {NULL, NULL}, +}; + +static PyObject * +mbidecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + MultibyteIncrementalDecoderObject *self; + PyObject *codec = NULL; + char *errors = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s:IncrementalDecoder", + incnewkwarglist, &errors)) + return NULL; + + self = (MultibyteIncrementalDecoderObject *)type->tp_alloc(type, 0); + if (self == NULL) + return NULL; + + codec = PyObject_GetAttrString((PyObject *)type, "codec"); + if (codec == NULL) + goto errorexit; + if (!MultibyteCodec_Check(codec)) { + PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); + goto errorexit; + } + + self->codec = ((MultibyteCodecObject *)codec)->codec; + self->pendingsize = 0; + self->errors = internal_error_callback(errors); + if (self->errors == NULL) + goto errorexit; + if (self->codec->decinit != NULL && + self->codec->decinit(&self->state, self->codec->config) != 0) + goto errorexit; + + Py_DECREF(codec); + return (PyObject *)self; + +errorexit: + Py_XDECREF(self); + Py_XDECREF(codec); + return NULL; +} + +static int +mbidecoder_traverse(MultibyteIncrementalDecoderObject *self, + visitproc visit, void *arg) +{ + if (ERROR_ISCUSTOM(self->errors)) + Py_VISIT(self->errors); + return 0; +} + +static void +mbidecoder_dealloc(MultibyteIncrementalDecoderObject *self) +{ + PyObject_GC_UnTrack(self); + ERROR_DECREF(self->errors); + self->ob_type->tp_free(self); +} + +static PyTypeObject MultibyteIncrementalDecoder_Type = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "MultibyteIncrementalDecoder", /* tp_name */ + sizeof(MultibyteIncrementalDecoderObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)mbidecoder_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC + | Py_TPFLAGS_BASETYPE, /* tp_flags */ + 0, /* tp_doc */ + (traverseproc)mbidecoder_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iterext */ + mbidecoder_methods, /* tp_methods */ + 0, /* tp_members */ + codecctx_getsets, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + mbidecoder_new, /* tp_new */ +}; + + +/** + * MultibyteStreamReader object + */ + static PyObject * mbstreamreader_iread(MultibyteStreamReaderObject *self, const char *method, Py_ssize_t sizehint) { MultibyteDecodeBuffer buf; PyObject *cres; - Py_ssize_t rsize, r, finalsize = 0; + Py_ssize_t rsize, finalsize = 0; if (sizehint == 0) return PyUnicode_FromUnicode(NULL, 0); @@ -740,39 +1237,13 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self, } rsize = PyString_GET_SIZE(cres); - buf.inbuf = buf.inbuf_top = - (unsigned char *)PyString_AS_STRING(cres); - buf.inbuf_end = buf.inbuf_top + rsize; - if (buf.outobj == NULL) { - buf.outobj = PyUnicode_FromUnicode(NULL, rsize); - if (buf.outobj == NULL) - goto errorexit; - buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj); - buf.outbuf_end = buf.outbuf + - PyUnicode_GET_SIZE(buf.outobj); - } + if (decoder_prepare_buffer(&buf, PyString_AS_STRING(cres), + rsize) != 0) + goto errorexit; - r = 0; - if (rsize > 0) - while (buf.inbuf < buf.inbuf_end) { - Py_ssize_t inleft, outleft; - - inleft = (Py_ssize_t)(buf.inbuf_end - - buf.inbuf); - outleft = (Py_ssize_t)(buf.outbuf_end - - buf.outbuf); - - r = self->codec->decode(&self->state, - self->codec->config, - &buf.inbuf, inleft, - &buf.outbuf, outleft); - if (r == 0 || r == MBERR_TOOFEW) - break; - else if (multibytecodec_decerror(self->codec, - &self->state, &buf, - self->errors, r)) - goto errorexit; - } + if (rsize > 0 && decoder_feed_buffer( + (MultibyteStatefulDecoderContext *)self, &buf)) + goto errorexit; if (rsize == 0 || sizehint < 0) { /* end of file */ if (buf.inbuf < buf.inbuf_end && @@ -782,20 +1253,9 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self, } if (buf.inbuf < buf.inbuf_end) { /* pending sequence exists */ - Py_ssize_t npendings; - - /* we can't assume that pendingsize is still 0 here. - * because this function can be called recursively - * from error callback */ - npendings = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); - if (npendings + self->pendingsize > MAXDECPENDING) { - PyErr_SetString(PyExc_RuntimeError, - "pending buffer overflow"); + if (decoder_append_pending(STATEFUL_DCTX(self), + &buf) != 0) goto errorexit; - } - memcpy(self->pending + self->pendingsize, buf.inbuf, - npendings); - self->pendingsize += npendings; } finalsize = (Py_ssize_t)(buf.outbuf - @@ -901,8 +1361,7 @@ mbstreamreader_reset(MultibyteStreamReaderObject *self) return NULL; self->pendingsize = 0; - Py_INCREF(Py_None); - return Py_None; + Py_RETURN_NONE; } static struct PyMethodDef mbstreamreader_methods[] = { @@ -917,17 +1376,74 @@ static struct PyMethodDef mbstreamreader_methods[] = { {NULL, NULL}, }; -static void -mbstreamreader_dealloc(MultibyteStreamReaderObject *self) +static PyMemberDef mbstreamreader_members[] = { + {"stream", T_OBJECT, + offsetof(MultibyteStreamReaderObject, stream), + READONLY, NULL}, + {NULL,} +}; + +static PyObject * +mbstreamreader_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - if (self->errors > ERROR_MAX) { - Py_DECREF(self->errors); + MultibyteStreamReaderObject *self; + PyObject *stream, *codec = NULL; + char *errors = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|s:StreamReader", + streamkwarglist, &stream, &errors)) + return NULL; + + self = (MultibyteStreamReaderObject *)type->tp_alloc(type, 0); + if (self == NULL) + return NULL; + + codec = PyObject_GetAttrString((PyObject *)type, "codec"); + if (codec == NULL) + goto errorexit; + if (!MultibyteCodec_Check(codec)) { + PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); + goto errorexit; } - Py_DECREF(self->stream); - PyObject_Del(self); + + self->codec = ((MultibyteCodecObject *)codec)->codec; + self->stream = stream; + Py_INCREF(stream); + self->pendingsize = 0; + self->errors = internal_error_callback(errors); + if (self->errors == NULL) + goto errorexit; + if (self->codec->decinit != NULL && + self->codec->decinit(&self->state, self->codec->config) != 0) + goto errorexit; + + Py_DECREF(codec); + return (PyObject *)self; + +errorexit: + Py_XDECREF(self); + Py_XDECREF(codec); + return NULL; } +static int +mbstreamreader_traverse(MultibyteStreamReaderObject *self, + visitproc visit, void *arg) +{ + if (ERROR_ISCUSTOM(self->errors)) + Py_VISIT(self->errors); + Py_VISIT(self->stream); + return 0; +} +static void +mbstreamreader_dealloc(MultibyteStreamReaderObject *self) +{ + PyObject_GC_UnTrack(self); + ERROR_DECREF(self->errors); + Py_DECREF(self->stream); + self->ob_type->tp_free(self); +} static PyTypeObject MultibyteStreamReader_Type = { PyObject_HEAD_INIT(NULL) @@ -951,97 +1467,50 @@ static PyTypeObject MultibyteStreamReader_Type = { PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC + | Py_TPFLAGS_BASETYPE, /* tp_flags */ 0, /* tp_doc */ - 0, /* tp_traverse */ + (traverseproc)mbstreamreader_traverse, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iterext */ mbstreamreader_methods, /* tp_methods */ + mbstreamreader_members, /* tp_members */ + codecctx_getsets, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + mbstreamreader_new, /* tp_new */ }; + +/** + * MultibyteStreamWriter object + */ + static int mbstreamwriter_iwrite(MultibyteStreamWriterObject *self, PyObject *unistr) { - PyObject *wr, *ucvt, *r = NULL; - Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; - Py_ssize_t datalen; - - if (PyUnicode_Check(unistr)) - ucvt = NULL; - else { - unistr = ucvt = PyObject_Unicode(unistr); - if (unistr == NULL) - return -1; - else if (!PyUnicode_Check(unistr)) { - PyErr_SetString(PyExc_TypeError, - "couldn't convert the object to unicode."); - Py_DECREF(ucvt); - return -1; - } - } + PyObject *str, *wr; - datalen = PyUnicode_GET_SIZE(unistr); - if (datalen == 0) { - Py_XDECREF(ucvt); - return 0; - } - - if (self->pendingsize > 0) { - inbuf_tmp = PyMem_New(Py_UNICODE, datalen + self->pendingsize); - if (inbuf_tmp == NULL) - goto errorexit; - memcpy(inbuf_tmp, self->pending, - Py_UNICODE_SIZE * self->pendingsize); - memcpy(inbuf_tmp + self->pendingsize, - PyUnicode_AS_UNICODE(unistr), - Py_UNICODE_SIZE * datalen); - datalen += self->pendingsize; - self->pendingsize = 0; - inbuf = inbuf_tmp; - } - else - inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); - - inbuf_end = inbuf + datalen; - - r = multibytecodec_encode(self->codec, &self->state, - (const Py_UNICODE **)&inbuf, datalen, self->errors, 0); - if (r == NULL) - goto errorexit; - - if (inbuf < inbuf_end) { - self->pendingsize = (Py_ssize_t)(inbuf_end - inbuf); - if (self->pendingsize > MAXENCPENDING) { - self->pendingsize = 0; - PyErr_SetString(PyExc_RuntimeError, - "pending buffer overflow"); - goto errorexit; - } - memcpy(self->pending, inbuf, - self->pendingsize * Py_UNICODE_SIZE); - } + str = encoder_encode_stateful(STATEFUL_ECTX(self), unistr, 0); + if (str == NULL) + return -1; - wr = PyObject_CallMethod(self->stream, "write", "O", r); + wr = PyObject_CallMethod(self->stream, "write", "O", str); + Py_DECREF(str); if (wr == NULL) - goto errorexit; + return -1; - if (inbuf_tmp != NULL) - PyMem_Del(inbuf_tmp); - Py_DECREF(r); Py_DECREF(wr); - Py_XDECREF(ucvt); return 0; - -errorexit: - if (inbuf_tmp != NULL) - PyMem_Del(inbuf_tmp); - Py_XDECREF(r); - Py_XDECREF(ucvt); - return -1; } static PyObject * @@ -1054,10 +1523,8 @@ mbstreamwriter_write(MultibyteStreamWriterObject *self, PyObject *args) if (mbstreamwriter_iwrite(self, strobj)) return NULL; - else { - Py_INCREF(Py_None); - return Py_None; - } + else + Py_RETURN_NONE; } static PyObject * @@ -1087,8 +1554,7 @@ mbstreamwriter_writelines(MultibyteStreamWriterObject *self, PyObject *args) return NULL; } - Py_INCREF(Py_None); - return Py_None; + Py_RETURN_NONE; } static PyObject * @@ -1119,18 +1585,69 @@ mbstreamwriter_reset(MultibyteStreamWriterObject *self) } Py_DECREF(pwrt); - Py_INCREF(Py_None); - return Py_None; + Py_RETURN_NONE; +} + +static PyObject * +mbstreamwriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + MultibyteStreamWriterObject *self; + PyObject *stream, *codec = NULL; + char *errors = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|s:StreamWriter", + streamkwarglist, &stream, &errors)) + return NULL; + + self = (MultibyteStreamWriterObject *)type->tp_alloc(type, 0); + if (self == NULL) + return NULL; + + codec = PyObject_GetAttrString((PyObject *)type, "codec"); + if (codec == NULL) + goto errorexit; + if (!MultibyteCodec_Check(codec)) { + PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); + goto errorexit; + } + + self->codec = ((MultibyteCodecObject *)codec)->codec; + self->stream = stream; + Py_INCREF(stream); + self->pendingsize = 0; + self->errors = internal_error_callback(errors); + if (self->errors == NULL) + goto errorexit; + if (self->codec->encinit != NULL && + self->codec->encinit(&self->state, self->codec->config) != 0) + goto errorexit; + + Py_DECREF(codec); + return (PyObject *)self; + +errorexit: + Py_XDECREF(self); + Py_XDECREF(codec); + return NULL; +} + +static int +mbstreamwriter_traverse(MultibyteStreamWriterObject *self, + visitproc visit, void *arg) +{ + if (ERROR_ISCUSTOM(self->errors)) + Py_VISIT(self->errors); + Py_VISIT(self->stream); + return 0; } static void mbstreamwriter_dealloc(MultibyteStreamWriterObject *self) { - if (self->errors > ERROR_MAX) { - Py_DECREF(self->errors); - } + PyObject_GC_UnTrack(self); + ERROR_DECREF(self->errors); Py_DECREF(self->stream); - PyObject_Del(self); + self->ob_type->tp_free(self); } static struct PyMethodDef mbstreamwriter_methods[] = { @@ -1143,7 +1660,12 @@ static struct PyMethodDef mbstreamwriter_methods[] = { {NULL, NULL}, }; - +static PyMemberDef mbstreamwriter_members[] = { + {"stream", T_OBJECT, + offsetof(MultibyteStreamWriterObject, stream), + READONLY, NULL}, + {NULL,} +}; static PyTypeObject MultibyteStreamWriter_Type = { PyObject_HEAD_INIT(NULL) @@ -1167,17 +1689,33 @@ static PyTypeObject MultibyteStreamWriter_Type = { PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC + | Py_TPFLAGS_BASETYPE, /* tp_flags */ 0, /* tp_doc */ - 0, /* tp_traverse */ + (traverseproc)mbstreamwriter_traverse, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iterext */ mbstreamwriter_methods, /* tp_methods */ + mbstreamwriter_members, /* tp_members */ + codecctx_getsets, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + mbstreamwriter_new, /* tp_new */ }; + +/** + * Exposed factory function + */ + static PyObject * __create_codec(PyObject *ignore, PyObject *arg) { @@ -1201,80 +1739,38 @@ __create_codec(PyObject *ignore, PyObject *arg) return (PyObject *)self; } -static PyObject * -mbstreamreader_create(MultibyteCodec *codec, - PyObject *stream, const char *errors) -{ - MultibyteStreamReaderObject *self; - - self = PyObject_New(MultibyteStreamReaderObject, - &MultibyteStreamReader_Type); - if (self == NULL) - return NULL; - - self->codec = codec; - self->stream = stream; - Py_INCREF(stream); - self->pendingsize = 0; - self->errors = get_errorcallback(errors); - if (self->errors == NULL) - goto errorexit; - if (self->codec->decinit != NULL && - self->codec->decinit(&self->state, self->codec->config) != 0) - goto errorexit; - - return (PyObject *)self; - -errorexit: - Py_XDECREF(self); - return NULL; -} - -static PyObject * -mbstreamwriter_create(MultibyteCodec *codec, - PyObject *stream, const char *errors) -{ - MultibyteStreamWriterObject *self; - - self = PyObject_New(MultibyteStreamWriterObject, - &MultibyteStreamWriter_Type); - if (self == NULL) - return NULL; - - self->codec = codec; - self->stream = stream; - Py_INCREF(stream); - self->pendingsize = 0; - self->errors = get_errorcallback(errors); - if (self->errors == NULL) - goto errorexit; - if (self->codec->encinit != NULL && - self->codec->encinit(&self->state, self->codec->config) != 0) - goto errorexit; - - return (PyObject *)self; - -errorexit: - Py_XDECREF(self); - return NULL; -} - static struct PyMethodDef __methods[] = { {"__create_codec", (PyCFunction)__create_codec, METH_O}, {NULL, NULL}, }; -void +PyMODINIT_FUNC init_multibytecodec(void) { + int i; + PyObject *m; + PyTypeObject *typelist[] = { + &MultibyteIncrementalEncoder_Type, + &MultibyteIncrementalDecoder_Type, + &MultibyteStreamReader_Type, + &MultibyteStreamWriter_Type, + NULL + }; + if (PyType_Ready(&MultibyteCodec_Type) < 0) return; - if (PyType_Ready(&MultibyteStreamReader_Type) < 0) - return; - if (PyType_Ready(&MultibyteStreamWriter_Type) < 0) + + m = Py_InitModule("_multibytecodec", __methods); + if (m == NULL) return; - Py_InitModule("_multibytecodec", __methods); + for (i = 0; typelist[i] != NULL; i++) { + if (PyType_Ready(typelist[i]) < 0) + return; + Py_INCREF(typelist[i]); + PyModule_AddObject(m, typelist[i]->tp_name, + (PyObject *)typelist[i]); + } if (PyErr_Occurred()) Py_FatalError("can't initialize the _multibytecodec module"); |