diff options
author | Christopher Thorne <libcthorne@users.noreply.github.com> | 2018-11-01 10:48:49 (GMT) |
---|---|---|
committer | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2018-11-01 10:48:49 (GMT) |
commit | ac22f6aa989f18c33c12615af1c66c73cf75d5e7 (patch) | |
tree | bb21f3018f9b5e4b40ede33ce78bba1b13980f86 /Modules/cjkcodecs/multibytecodec.c | |
parent | 4b5e62dbb22a3593e0db266c12f805b727a42b00 (diff) | |
download | cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.zip cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.tar.gz cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.tar.bz2 |
bpo-33578: Add getstate/setstate for CJK codec (GH-6984)
This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell.
The encoder getstate/setstate is slightly tricky as the "state" is pending bytes + MultibyteCodec_State but only an integer can be returned. The approach I've taken is to encode this data into a long, similar to how .tell() encodes a "cookie_type" as a long.
https://bugs.python.org/issue33578
Diffstat (limited to 'Modules/cjkcodecs/multibytecodec.c')
-rw-r--r-- | Modules/cjkcodecs/multibytecodec.c | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 22172b0..4633499 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -896,6 +896,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco } /*[clinic input] +_multibytecodec.MultibyteIncrementalEncoder.getstate +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self) +/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/ +{ + /* state made up of 1 byte for buffer size, up to MAXENCPENDING*4 bytes + for UTF-8 encoded buffer (each character can use up to 4 + bytes), and required bytes for MultibyteCodec_State.c. A byte + array is used to avoid different compilers generating different + values for the same state, e.g. as a result of struct padding. + */ + unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + Py_ssize_t statesize; + const char *pendingbuffer = NULL; + Py_ssize_t pendingsize; + + if (self->pending != NULL) { + pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize); + if (pendingbuffer == NULL) { + return NULL; + } + if (pendingsize > MAXENCPENDING*4) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + statebytes[0] = pendingsize; + memcpy(statebytes+1, pendingbuffer, pendingsize); + statesize = 1 + pendingsize; + } else { + statebytes[0] = 0; + statesize = 1; + } + memcpy(statebytes+statesize, self->state.c, + sizeof(self->state.c)); + statesize += sizeof(self->state.c); + + return (PyObject *)_PyLong_FromByteArray(statebytes, statesize, + 1 /* little-endian */ , + 0 /* unsigned */ ); +} + +/*[clinic input] +_multibytecodec.MultibyteIncrementalEncoder.setstate + state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type') + / +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self, + PyLongObject *statelong) +/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/ +{ + PyObject *pending = NULL; + unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + + if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes), + 1 /* little-endian */ , + 0 /* unsigned */ ) < 0) { + goto errorexit; + } + + if (statebytes[0] > MAXENCPENDING*4) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + + pending = PyUnicode_DecodeUTF8((const char *)statebytes+1, + statebytes[0], "strict"); + if (pending == NULL) { + goto errorexit; + } + + Py_CLEAR(self->pending); + self->pending = pending; + memcpy(self->state.c, statebytes+1+statebytes[0], + sizeof(self->state.c)); + + Py_RETURN_NONE; + +errorexit: + Py_XDECREF(pending); + return NULL; +} + +/*[clinic input] _multibytecodec.MultibyteIncrementalEncoder.reset [clinic start generated code]*/ @@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod static struct PyMethodDef mbiencoder_methods[] = { _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF {NULL, NULL}, }; @@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self) { PyObject_GC_UnTrack(self); ERROR_DECREF(self->errors); + Py_CLEAR(self->pending); Py_TYPE(self)->tp_free(self); } @@ -1120,6 +1210,68 @@ errorexit: } /*[clinic input] +_multibytecodec.MultibyteIncrementalDecoder.getstate +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self) +/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/ +{ + PyObject *buffer; + + buffer = PyBytes_FromStringAndSize((const char *)self->pending, + self->pendingsize); + if (buffer == NULL) { + return NULL; + } + + return make_tuple(buffer, (Py_ssize_t)*self->state.c); +} + +/*[clinic input] +_multibytecodec.MultibyteIncrementalDecoder.setstate + state: object(subclass_of='&PyTuple_Type') + / +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self, + PyObject *state) +/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/ +{ + PyObject *buffer; + Py_ssize_t buffersize; + char *bufferstr; + unsigned long long flag; + + if (!PyArg_ParseTuple(state, "SK;setstate(): illegal state argument", + &buffer, &flag)) + { + return NULL; + } + + buffersize = PyBytes_Size(buffer); + if (buffersize == -1) { + return NULL; + } + + if (buffersize > MAXDECPENDING) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + + bufferstr = PyBytes_AsString(buffer); + if (bufferstr == NULL) { + return NULL; + } + self->pendingsize = buffersize; + memcpy(self->pending, bufferstr, self->pendingsize); + memcpy(self->state.c, (unsigned char *)&flag, sizeof(flag)); + + Py_RETURN_NONE; +} + +/*[clinic input] _multibytecodec.MultibyteIncrementalDecoder.reset [clinic start generated code]*/ @@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod static struct PyMethodDef mbidecoder_methods[] = { _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF {NULL, NULL}, }; |