summaryrefslogtreecommitdiffstats
path: root/Modules/cjkcodecs
diff options
context:
space:
mode:
authorChristopher Thorne <libcthorne@users.noreply.github.com>2018-11-01 10:48:49 (GMT)
committerMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>2018-11-01 10:48:49 (GMT)
commitac22f6aa989f18c33c12615af1c66c73cf75d5e7 (patch)
treebb21f3018f9b5e4b40ede33ce78bba1b13980f86 /Modules/cjkcodecs
parent4b5e62dbb22a3593e0db266c12f805b727a42b00 (diff)
downloadcpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.zip
cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.tar.gz
cpython-ac22f6aa989f18c33c12615af1c66c73cf75d5e7.tar.bz2
bpo-33578: Add getstate/setstate for CJK codec (GH-6984)
This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell. The encoder getstate/setstate is slightly tricky as the "state" is pending bytes + MultibyteCodec_State but only an integer can be returned. The approach I've taken is to encode this data into a long, similar to how .tell() encodes a "cookie_type" as a long. https://bugs.python.org/issue33578
Diffstat (limited to 'Modules/cjkcodecs')
-rw-r--r--Modules/cjkcodecs/_codecs_cn.c38
-rw-r--r--Modules/cjkcodecs/clinic/multibytecodec.c.h90
-rw-r--r--Modules/cjkcodecs/multibytecodec.c154
-rw-r--r--Modules/cjkcodecs/multibytecodec.h13
4 files changed, 273 insertions, 22 deletions
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
index 1fcc220..8a62f7e 100644
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -52,6 +52,12 @@
}
/*
+ * codecs in this file use the first byte of MultibyteCodec_State.c[8]
+ * to store a 0 or 1 state value
+ */
+#define CN_STATE_OFFSET 0
+
+/*
* GB2312 codec
*/
@@ -329,15 +335,15 @@ DECODER(gb18030)
ENCODER_INIT(hz)
{
- state->i = 0;
+ state->c[CN_STATE_OFFSET] = 0;
return 0;
}
ENCODER_RESET(hz)
{
- if (state->i != 0) {
+ if (state->c[CN_STATE_OFFSET] != 0) {
WRITEBYTE2('~', '}');
- state->i = 0;
+ state->c[CN_STATE_OFFSET] = 0;
NEXT_OUT(2);
}
return 0;
@@ -350,10 +356,10 @@ ENCODER(hz)
DBCHAR code;
if (c < 0x80) {
- if (state->i) {
+ if (state->c[CN_STATE_OFFSET]) {
WRITEBYTE2('~', '}');
NEXT_OUT(2);
- state->i = 0;
+ state->c[CN_STATE_OFFSET] = 0;
}
WRITEBYTE1((unsigned char)c);
NEXT(1, 1);
@@ -375,10 +381,10 @@ ENCODER(hz)
if (code & 0x8000) /* MSB set: GBK */
return 1;
- if (state->i == 0) {
+ if (state->c[CN_STATE_OFFSET] == 0) {
WRITEBYTE4('~', '{', code >> 8, code & 0xff);
NEXT(1, 4);
- state->i = 1;
+ state->c[CN_STATE_OFFSET] = 1;
}
else {
WRITEBYTE2(code >> 8, code & 0xff);
@@ -391,13 +397,13 @@ ENCODER(hz)
DECODER_INIT(hz)
{
- state->i = 0;
+ state->c[CN_STATE_OFFSET] = 0;
return 0;
}
DECODER_RESET(hz)
{
- state->i = 0;
+ state->c[CN_STATE_OFFSET] = 0;
return 0;
}
@@ -411,14 +417,14 @@ DECODER(hz)
unsigned char c2 = INBYTE2;
REQUIRE_INBUF(2);
- if (c2 == '~' && state->i == 0)
+ if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
OUTCHAR('~');
- else if (c2 == '{' && state->i == 0)
- state->i = 1; /* set GB */
- else if (c2 == '\n' && state->i == 0)
+ else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
+ state->c[CN_STATE_OFFSET] = 1; /* set GB */
+ else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
; /* line-continuation */
- else if (c2 == '}' && state->i == 1)
- state->i = 0; /* set ASCII */
+ else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
+ state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
else
return 1;
NEXT_IN(2);
@@ -428,7 +434,7 @@ DECODER(hz)
if (c & 0x80)
return 1;
- if (state->i == 0) { /* ASCII mode */
+ if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
OUTCHAR(c);
NEXT_IN(1);
}
diff --git a/Modules/cjkcodecs/clinic/multibytecodec.c.h b/Modules/cjkcodecs/clinic/multibytecodec.c.h
index 25857fc..a58bb64 100644
--- a/Modules/cjkcodecs/clinic/multibytecodec.c.h
+++ b/Modules/cjkcodecs/clinic/multibytecodec.c.h
@@ -115,6 +115,50 @@ exit:
return return_value;
}
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_getstate__doc__,
+"getstate($self, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF \
+ {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalEncoder_getstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate(MultibyteIncrementalEncoderObject *self, PyObject *Py_UNUSED(ignored))
+{
+ return _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(self);
+}
+
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_setstate__doc__,
+"setstate($self, state, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF \
+ {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalEncoder_setstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
+ PyLongObject *statelong);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate(MultibyteIncrementalEncoderObject *self, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ PyLongObject *statelong;
+
+ if (!PyArg_Parse(arg, "O!:setstate", &PyLong_Type, &statelong)) {
+ goto exit;
+ }
+ return_value = _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(self, statelong);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_reset__doc__,
"reset($self, /)\n"
"--\n"
@@ -169,6 +213,50 @@ exit:
return return_value;
}
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_getstate__doc__,
+"getstate($self, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF \
+ {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalDecoder_getstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate(MultibyteIncrementalDecoderObject *self, PyObject *Py_UNUSED(ignored))
+{
+ return _multibytecodec_MultibyteIncrementalDecoder_getstate_impl(self);
+}
+
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_setstate__doc__,
+"setstate($self, state, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF \
+ {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalDecoder_setstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
+ PyObject *state);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate(MultibyteIncrementalDecoderObject *self, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ PyObject *state;
+
+ if (!PyArg_Parse(arg, "O!:setstate", &PyTuple_Type, &state)) {
+ goto exit;
+ }
+ return_value = _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(self, state);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_reset__doc__,
"reset($self, /)\n"
"--\n"
@@ -330,4 +418,4 @@ PyDoc_STRVAR(_multibytecodec___create_codec__doc__,
#define _MULTIBYTECODEC___CREATE_CODEC_METHODDEF \
{"__create_codec", (PyCFunction)_multibytecodec___create_codec, METH_O, _multibytecodec___create_codec__doc__},
-/*[clinic end generated code: output=680f59f4cfe63c25 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2fa0a38494716b97 input=a9049054013a1b77]*/
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c
index 22172b0..4633499 100644
--- a/Modules/cjkcodecs/multibytecodec.c
+++ b/Modules/cjkcodecs/multibytecodec.c
@@ -896,6 +896,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco
}
/*[clinic input]
+_multibytecodec.MultibyteIncrementalEncoder.getstate
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self)
+/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/
+{
+ /* state made up of 1 byte for buffer size, up to MAXENCPENDING*4 bytes
+ for UTF-8 encoded buffer (each character can use up to 4
+ bytes), and required bytes for MultibyteCodec_State.c. A byte
+ array is used to avoid different compilers generating different
+ values for the same state, e.g. as a result of struct padding.
+ */
+ unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
+ Py_ssize_t statesize;
+ const char *pendingbuffer = NULL;
+ Py_ssize_t pendingsize;
+
+ if (self->pending != NULL) {
+ pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize);
+ if (pendingbuffer == NULL) {
+ return NULL;
+ }
+ if (pendingsize > MAXENCPENDING*4) {
+ PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+ return NULL;
+ }
+ statebytes[0] = pendingsize;
+ memcpy(statebytes+1, pendingbuffer, pendingsize);
+ statesize = 1 + pendingsize;
+ } else {
+ statebytes[0] = 0;
+ statesize = 1;
+ }
+ memcpy(statebytes+statesize, self->state.c,
+ sizeof(self->state.c));
+ statesize += sizeof(self->state.c);
+
+ return (PyObject *)_PyLong_FromByteArray(statebytes, statesize,
+ 1 /* little-endian */ ,
+ 0 /* unsigned */ );
+}
+
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalEncoder.setstate
+ state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type')
+ /
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
+ PyLongObject *statelong)
+/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/
+{
+ PyObject *pending = NULL;
+ unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
+
+ if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes),
+ 1 /* little-endian */ ,
+ 0 /* unsigned */ ) < 0) {
+ goto errorexit;
+ }
+
+ if (statebytes[0] > MAXENCPENDING*4) {
+ PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+ return NULL;
+ }
+
+ pending = PyUnicode_DecodeUTF8((const char *)statebytes+1,
+ statebytes[0], "strict");
+ if (pending == NULL) {
+ goto errorexit;
+ }
+
+ Py_CLEAR(self->pending);
+ self->pending = pending;
+ memcpy(self->state.c, statebytes+1+statebytes[0],
+ sizeof(self->state.c));
+
+ Py_RETURN_NONE;
+
+errorexit:
+ Py_XDECREF(pending);
+ return NULL;
+}
+
+/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.reset
[clinic start generated code]*/
@@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod
static struct PyMethodDef mbiencoder_methods[] = {
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF
+ _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF
+ _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF
{NULL, NULL},
};
@@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self)
{
PyObject_GC_UnTrack(self);
ERROR_DECREF(self->errors);
+ Py_CLEAR(self->pending);
Py_TYPE(self)->tp_free(self);
}
@@ -1120,6 +1210,68 @@ errorexit:
}
/*[clinic input]
+_multibytecodec.MultibyteIncrementalDecoder.getstate
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self)
+/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/
+{
+ PyObject *buffer;
+
+ buffer = PyBytes_FromStringAndSize((const char *)self->pending,
+ self->pendingsize);
+ if (buffer == NULL) {
+ return NULL;
+ }
+
+ return make_tuple(buffer, (Py_ssize_t)*self->state.c);
+}
+
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalDecoder.setstate
+ state: object(subclass_of='&PyTuple_Type')
+ /
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
+ PyObject *state)
+/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/
+{
+ PyObject *buffer;
+ Py_ssize_t buffersize;
+ char *bufferstr;
+ unsigned long long flag;
+
+ if (!PyArg_ParseTuple(state, "SK;setstate(): illegal state argument",
+ &buffer, &flag))
+ {
+ return NULL;
+ }
+
+ buffersize = PyBytes_Size(buffer);
+ if (buffersize == -1) {
+ return NULL;
+ }
+
+ if (buffersize > MAXDECPENDING) {
+ PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+ return NULL;
+ }
+
+ bufferstr = PyBytes_AsString(buffer);
+ if (bufferstr == NULL) {
+ return NULL;
+ }
+ self->pendingsize = buffersize;
+ memcpy(self->pending, bufferstr, self->pendingsize);
+ memcpy(self->state.c, (unsigned char *)&flag, sizeof(flag));
+
+ Py_RETURN_NONE;
+}
+
+/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.reset
[clinic start generated code]*/
@@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod
static struct PyMethodDef mbidecoder_methods[] = {
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF
+ _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF
+ _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF
{NULL, NULL},
};
diff --git a/Modules/cjkcodecs/multibytecodec.h b/Modules/cjkcodecs/multibytecodec.h
index 5b8c222..6d34534 100644
--- a/Modules/cjkcodecs/multibytecodec.h
+++ b/Modules/cjkcodecs/multibytecodec.h
@@ -16,12 +16,15 @@ typedef uint16_t ucs2_t, DBCHAR;
typedef unsigned short ucs2_t, DBCHAR;
#endif
-typedef union {
- void *p;
- int i;
+/*
+ * A struct that provides 8 bytes of state for multibyte
+ * codecs. Codecs are free to use this how they want. Note: if you
+ * need to add a new field to this struct, ensure that its byte order
+ * is independent of CPU endianness so that the return value of
+ * getstate doesn't differ between little and big endian CPUs.
+ */
+typedef struct {
unsigned char c[8];
- ucs2_t u2[4];
- Py_UCS4 u4[2];
} MultibyteCodec_State;
typedef int (*mbcodec_init)(const void *config);