diff options
author | Christopher Thorne <libcthorne@users.noreply.github.com> | 2018-11-01 10:48:49 (GMT) |
---|---|---|
committer | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2018-11-01 10:48:49 (GMT) |
commit | ac22f6aa989f18c33c12615af1c66c73cf75d5e7 (patch) | |
tree | bb21f3018f9b5e4b40ede33ce78bba1b13980f86 /Modules/cjkcodecs | |
parent | 4b5e62dbb22a3593e0db266c12f805b727a42b00 (diff) | |
download | cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.zip cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.tar.gz cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.tar.bz2 |
bpo-33578: Add getstate/setstate for CJK codec (GH-6984)
This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell.
The encoder getstate/setstate is slightly tricky: the encoder's "state" consists of the pending buffer (stored as UTF-8 bytes) plus the MultibyteCodec_State bytes, but only an integer can be returned. The approach I've taken is to pack this data into a single integer (a PyLong), similar to how .tell() encodes a "cookie_type" as a long.
https://bugs.python.org/issue33578
Diffstat (limited to 'Modules/cjkcodecs')
-rw-r--r-- | Modules/cjkcodecs/_codecs_cn.c | 38 | ||||
-rw-r--r-- | Modules/cjkcodecs/clinic/multibytecodec.c.h | 90 | ||||
-rw-r--r-- | Modules/cjkcodecs/multibytecodec.c | 154 | ||||
-rw-r--r-- | Modules/cjkcodecs/multibytecodec.h | 13 |
4 files changed, 273 insertions, 22 deletions
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index 1fcc220..8a62f7e 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -52,6 +52,12 @@ } /* + * codecs in this file use the first byte of MultibyteCodec_State.c[8] + * to store a 0 or 1 state value + */ +#define CN_STATE_OFFSET 0 + +/* * GB2312 codec */ @@ -329,15 +335,15 @@ DECODER(gb18030) ENCODER_INIT(hz) { - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; return 0; } ENCODER_RESET(hz) { - if (state->i != 0) { + if (state->c[CN_STATE_OFFSET] != 0) { WRITEBYTE2('~', '}'); - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; NEXT_OUT(2); } return 0; @@ -350,10 +356,10 @@ ENCODER(hz) DBCHAR code; if (c < 0x80) { - if (state->i) { + if (state->c[CN_STATE_OFFSET]) { WRITEBYTE2('~', '}'); NEXT_OUT(2); - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; } WRITEBYTE1((unsigned char)c); NEXT(1, 1); @@ -375,10 +381,10 @@ ENCODER(hz) if (code & 0x8000) /* MSB set: GBK */ return 1; - if (state->i == 0) { + if (state->c[CN_STATE_OFFSET] == 0) { WRITEBYTE4('~', '{', code >> 8, code & 0xff); NEXT(1, 4); - state->i = 1; + state->c[CN_STATE_OFFSET] = 1; } else { WRITEBYTE2(code >> 8, code & 0xff); @@ -391,13 +397,13 @@ ENCODER(hz) DECODER_INIT(hz) { - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; return 0; } DECODER_RESET(hz) { - state->i = 0; + state->c[CN_STATE_OFFSET] = 0; return 0; } @@ -411,14 +417,14 @@ DECODER(hz) unsigned char c2 = INBYTE2; REQUIRE_INBUF(2); - if (c2 == '~' && state->i == 0) + if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0) OUTCHAR('~'); - else if (c2 == '{' && state->i == 0) - state->i = 1; /* set GB */ - else if (c2 == '\n' && state->i == 0) + else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0) + state->c[CN_STATE_OFFSET] = 1; /* set GB */ + else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0) ; /* line-continuation */ - else if (c2 == '}' && state->i == 1) - state->i = 0; /* set ASCII */ + else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1) 
+ state->c[CN_STATE_OFFSET] = 0; /* set ASCII */ else return 1; NEXT_IN(2); @@ -428,7 +434,7 @@ DECODER(hz) if (c & 0x80) return 1; - if (state->i == 0) { /* ASCII mode */ + if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */ OUTCHAR(c); NEXT_IN(1); } diff --git a/Modules/cjkcodecs/clinic/multibytecodec.c.h b/Modules/cjkcodecs/clinic/multibytecodec.c.h index 25857fc..a58bb64 100644 --- a/Modules/cjkcodecs/clinic/multibytecodec.c.h +++ b/Modules/cjkcodecs/clinic/multibytecodec.c.h @@ -115,6 +115,50 @@ exit: return return_value; } +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_getstate__doc__, +"getstate($self, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF \ + {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalEncoder_getstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self); + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate(MultibyteIncrementalEncoderObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(self); +} + +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_setstate__doc__, +"setstate($self, state, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF \ + {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalEncoder_setstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self, + PyLongObject *statelong); + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate(MultibyteIncrementalEncoderObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + PyLongObject *statelong; + + if (!PyArg_Parse(arg, "O!:setstate", &PyLong_Type, &statelong)) { 
+ goto exit; + } + return_value = _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(self, statelong); + +exit: + return return_value; +} + PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_reset__doc__, "reset($self, /)\n" "--\n" @@ -169,6 +213,50 @@ exit: return return_value; } +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_getstate__doc__, +"getstate($self, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF \ + {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalDecoder_getstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self); + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate(MultibyteIncrementalDecoderObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _multibytecodec_MultibyteIncrementalDecoder_getstate_impl(self); +} + +PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_setstate__doc__, +"setstate($self, state, /)\n" +"--\n" +"\n"); + +#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF \ + {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalDecoder_setstate__doc__}, + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self, + PyObject *state); + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_setstate(MultibyteIncrementalDecoderObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + PyObject *state; + + if (!PyArg_Parse(arg, "O!:setstate", &PyTuple_Type, &state)) { + goto exit; + } + return_value = _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(self, state); + +exit: + return return_value; +} + PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_reset__doc__, "reset($self, /)\n" "--\n" @@ -330,4 +418,4 @@ 
PyDoc_STRVAR(_multibytecodec___create_codec__doc__, #define _MULTIBYTECODEC___CREATE_CODEC_METHODDEF \ {"__create_codec", (PyCFunction)_multibytecodec___create_codec, METH_O, _multibytecodec___create_codec__doc__}, -/*[clinic end generated code: output=680f59f4cfe63c25 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=2fa0a38494716b97 input=a9049054013a1b77]*/ diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 22172b0..4633499 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -896,6 +896,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco } /*[clinic input] +_multibytecodec.MultibyteIncrementalEncoder.getstate +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self) +/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/ +{ + /* state made up of 1 byte for buffer size, up to MAXENCPENDING*4 bytes + for UTF-8 encoded buffer (each character can use up to 4 + bytes), and required bytes for MultibyteCodec_State.c. A byte + array is used to avoid different compilers generating different + values for the same state, e.g. as a result of struct padding. 
+ */ + unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + Py_ssize_t statesize; + const char *pendingbuffer = NULL; + Py_ssize_t pendingsize; + + if (self->pending != NULL) { + pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize); + if (pendingbuffer == NULL) { + return NULL; + } + if (pendingsize > MAXENCPENDING*4) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + statebytes[0] = pendingsize; + memcpy(statebytes+1, pendingbuffer, pendingsize); + statesize = 1 + pendingsize; + } else { + statebytes[0] = 0; + statesize = 1; + } + memcpy(statebytes+statesize, self->state.c, + sizeof(self->state.c)); + statesize += sizeof(self->state.c); + + return (PyObject *)_PyLong_FromByteArray(statebytes, statesize, + 1 /* little-endian */ , + 0 /* unsigned */ ); +} + +/*[clinic input] +_multibytecodec.MultibyteIncrementalEncoder.setstate + state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type') + / +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self, + PyLongObject *statelong) +/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/ +{ + PyObject *pending = NULL; + unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + + if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes), + 1 /* little-endian */ , + 0 /* unsigned */ ) < 0) { + goto errorexit; + } + + if (statebytes[0] > MAXENCPENDING*4) { + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + + pending = PyUnicode_DecodeUTF8((const char *)statebytes+1, + statebytes[0], "strict"); + if (pending == NULL) { + goto errorexit; + } + + Py_CLEAR(self->pending); + self->pending = pending; + memcpy(self->state.c, statebytes+1+statebytes[0], + sizeof(self->state.c)); + + Py_RETURN_NONE; + +errorexit: + Py_XDECREF(pending); + return NULL; +} + +/*[clinic 
input] _multibytecodec.MultibyteIncrementalEncoder.reset [clinic start generated code]*/ @@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod static struct PyMethodDef mbiencoder_methods[] = { _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF {NULL, NULL}, }; @@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self) { PyObject_GC_UnTrack(self); ERROR_DECREF(self->errors); + Py_CLEAR(self->pending); Py_TYPE(self)->tp_free(self); } @@ -1120,6 +1210,68 @@ errorexit: } /*[clinic input] +_multibytecodec.MultibyteIncrementalDecoder.getstate +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self) +/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/ +{ + PyObject *buffer; + + buffer = PyBytes_FromStringAndSize((const char *)self->pending, + self->pendingsize); + if (buffer == NULL) { + return NULL; + } + + return make_tuple(buffer, (Py_ssize_t)*self->state.c); +} + +/*[clinic input] +_multibytecodec.MultibyteIncrementalDecoder.setstate + state: object(subclass_of='&PyTuple_Type') + / +[clinic start generated code]*/ + +static PyObject * +_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self, + PyObject *state) +/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/ +{ + PyObject *buffer; + Py_ssize_t buffersize; + char *bufferstr; + unsigned long long flag; + + if (!PyArg_ParseTuple(state, "SK;setstate(): illegal state argument", + &buffer, &flag)) + { + return NULL; + } + + buffersize = PyBytes_Size(buffer); + if (buffersize == -1) { + return NULL; + } + + if (buffersize > MAXDECPENDING) { + 
PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + return NULL; + } + + bufferstr = PyBytes_AsString(buffer); + if (bufferstr == NULL) { + return NULL; + } + self->pendingsize = buffersize; + memcpy(self->pending, bufferstr, self->pendingsize); + memcpy(self->state.c, (unsigned char *)&flag, sizeof(flag)); + + Py_RETURN_NONE; +} + +/*[clinic input] _multibytecodec.MultibyteIncrementalDecoder.reset [clinic start generated code]*/ @@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod static struct PyMethodDef mbidecoder_methods[] = { _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF + _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF {NULL, NULL}, }; diff --git a/Modules/cjkcodecs/multibytecodec.h b/Modules/cjkcodecs/multibytecodec.h index 5b8c222..6d34534 100644 --- a/Modules/cjkcodecs/multibytecodec.h +++ b/Modules/cjkcodecs/multibytecodec.h @@ -16,12 +16,15 @@ typedef uint16_t ucs2_t, DBCHAR; typedef unsigned short ucs2_t, DBCHAR; #endif -typedef union { - void *p; - int i; +/* + * A struct that provides 8 bytes of state for multibyte + * codecs. Codecs are free to use this how they want. Note: if you + * need to add a new field to this struct, ensure that its byte order + * is independent of CPU endianness so that the return value of + * getstate doesn't differ between little and big endian CPUs. + */ +typedef struct { unsigned char c[8]; - ucs2_t u2[4]; - Py_UCS4 u4[2]; } MultibyteCodec_State; typedef int (*mbcodec_init)(const void *config); |