summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2006-06-14 05:21:04 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2006-06-14 05:21:04 (GMT)
commit: d825143be1118ba7e320661b3a71d8822ae5d600 (patch)
tree: a74764df14558614917b7f4d9abd56c44ffd5623
parent: 6ce9fe880be7416d88e6d800528db4079db6d6b1 (diff)
download: cpython-d825143be1118ba7e320661b3a71d8822ae5d600.zip
cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.gz
cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.bz2
Patch #1455898: Incremental mode for "mbcs" codec.
-rw-r--r-- Doc/api/concrete.tex 12
-rw-r--r-- Include/unicodeobject.h 7
-rw-r--r-- Lib/encodings/mbcs.py 7
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Modules/_codecsmodule.c 15
-rw-r--r-- Objects/unicodeobject.c 214
6 files changed, 211 insertions, 47 deletions
diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex
index 10247ab..40b178f 100644
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@@ -1431,6 +1431,18 @@ machine running the codec.
raised by the codec.
\end{cfuncdesc}
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed}
+ If \var{consumed} is \NULL{}, behave like
+ \cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
+ \cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing lead
+ byte, and the number of bytes that have been decoded will be stored in
+ \var{consumed}.
+ \versionadded{2.5}
+\end{cfuncdesc}
+
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
Py_ssize_t size,
const char *errors}
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 3177051..c7e07a8 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
const char *errors /* error handling */
);
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
+ const char *string, /* MBCS encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ Py_ssize_t *consumed /* bytes consumed */
+ );
+
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
PyObject *unicode /* Unicode object */
);
diff --git a/Lib/encodings/mbcs.py b/Lib/encodings/mbcs.py
index ff77fde..a44ee7b 100644
--- a/Lib/encodings/mbcs.py
+++ b/Lib/encodings/mbcs.py
@@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.mbcs_encode(input,self.errors)[0]
-class IncrementalDecoder(codecs.IncrementalDecoder):
- def decode(self, input, final=False):
- return codecs.mbcs_decode(input,self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ def _buffer_decode(self, input, errors, final):
+ return codecs.mbcs_decode(input,self.errors,final)
+
class StreamWriter(Codec,codecs.StreamWriter):
pass
diff --git a/Misc/NEWS b/Misc/NEWS
index a0d7182..3dbecd7 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -156,6 +156,9 @@ Extension Modules
Library
-------
+- Patch #1455898: The MBCS codec now supports the incremental mode for
+ double-byte encodings.
+
- ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
guarantee that adjacent triples in the return list always describe
non-adjacent blocks. Previously, a pair of matching blocks could end
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index 32fa82f..6d384b7 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -479,15 +479,20 @@ mbcs_decode(PyObject *self,
PyObject *args)
{
const char *data;
- Py_ssize_t size;
+ Py_ssize_t size, consumed;
const char *errors = NULL;
+ int final = 1;
+ PyObject *decoded;
- if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
- &data, &size, &errors))
+ if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
+ &data, &size, &errors, &final))
return NULL;
- return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
- size);
+ decoded = PyUnicode_DecodeMBCSStateful(
+ data, size, errors, final ? NULL : &consumed);
+ if (!decoded)
+ return NULL;
+ return codec_tuple(decoded, final ? size : consumed);
}
#endif /* MS_WINDOWS */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3c06997..08fdb3f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
/* --- MBCS codecs for Windows -------------------------------------------- */
-PyObject *PyUnicode_DecodeMBCS(const char *s,
- Py_ssize_t size,
- const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+ a) it assumes an incomplete character consists of a single byte, and
+ b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+ encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+ const char *curr = s + offset;
+
+ if (IsDBCSLeadByte(*curr)) {
+ const char *prev = CharPrev(s, curr);
+ return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+ }
+ return 0;
+}
+
+/*
+ * Decode an MBCS string into a unicode object. If 'final' is set, a trailing
+ * lead byte is converted too. Returns the consumed size on success, -1 otherwise.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+ const char *s, /* MBCS string */
+ int size, /* sizeof MBCS string */
+ int final)
{
- PyUnicodeObject *v;
Py_UNICODE *p;
- DWORD usize;
+ Py_ssize_t n = 0;
+ int usize = 0;
+
+ assert(size >= 0);
+
+ /* Skip trailing lead-byte unless 'final' is set */
+ if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+ --size;
/* First get the size of the result */
- assert(size < INT_MAX);
- usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
- if (size > 0 && usize==0)
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+ if (usize == 0) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
- v = _PyUnicode_New(usize);
- if (v == NULL)
- return NULL;
- if (usize == 0)
- return (PyObject *)v;
- p = PyUnicode_AS_UNICODE(v);
- if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
- Py_DECREF(v);
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (*v == NULL) {
+ /* Create unicode object */
+ *v = _PyUnicode_New(usize);
+ if (*v == NULL)
+ return -1;
+ }
+ else {
+ /* Extend unicode object */
+ n = PyUnicode_GET_SIZE(*v);
+ if (_PyUnicode_Resize(v, n + usize) < 0)
+ return -1;
+ }
+
+ /* Do the conversion */
+ if (size > 0) {
+ p = PyUnicode_AS_UNICODE(*v) + n;
+ if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
+
+ return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ PyUnicodeObject *v = NULL;
+ int done;
+
+ if (consumed)
+ *consumed = 0;
+
+#ifdef NEED_RETRY
+ retry:
+ if (size > INT_MAX)
+ done = decode_mbcs(&v, s, INT_MAX, 0);
+ else
+#endif
+ done = decode_mbcs(&v, s, (int)size, !consumed);
+
+ if (done < 0) {
+ Py_XDECREF(v);
+ return NULL;
+ }
+
+ if (consumed)
+ *consumed += done;
+
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ s += done;
+ size -= done;
+ goto retry;
}
+#endif
return (PyObject *)v;
}
-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
{
- PyObject *repr;
- char *s;
- DWORD mbcssize;
+ return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
- /* If there are no characters, bail now! */
- if (size==0)
- return PyString_FromString("");
+/*
+ * Convert unicode into a string object (MBCS).
+ * Returns 0 on success, -1 otherwise.
+ */
+static int encode_mbcs(PyObject **repr,
+ const Py_UNICODE *p, /* unicode */
+ int size) /* size of unicode */
+{
+ int mbcssize = 0;
+ Py_ssize_t n = 0;
+
+ assert(size >= 0);
/* First get the size of the result */
- assert(size<INT_MAX);
- mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
- if (mbcssize==0)
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+ if (mbcssize == 0) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
- repr = PyString_FromStringAndSize(NULL, mbcssize);
- if (repr == NULL)
- return NULL;
- if (mbcssize == 0)
- return repr;
+ if (*repr == NULL) {
+ /* Create string object */
+ *repr = PyString_FromStringAndSize(NULL, mbcssize);
+ if (*repr == NULL)
+ return -1;
+ }
+ else {
+ /* Extend string object */
+ n = PyString_Size(*repr);
+ if (_PyString_Resize(repr, n + mbcssize) < 0)
+ return -1;
+ }
/* Do the conversion */
- s = PyString_AS_STRING(repr);
- assert(size < INT_MAX);
- if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
- Py_DECREF(repr);
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ char *s = PyString_AS_STRING(*repr) + n;
+ if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
}
+
+ return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+ Py_ssize_t size,
+ const char *errors)
+{
+ PyObject *repr = NULL;
+ int ret;
+
+#ifdef NEED_RETRY
+ retry:
+ if (size > INT_MAX)
+ ret = encode_mbcs(&repr, p, INT_MAX);
+ else
+#endif
+ ret = encode_mbcs(&repr, p, (int)size);
+
+ if (ret < 0) {
+ Py_XDECREF(repr);
+ return NULL;
+ }
+
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ p += INT_MAX;
+ size -= INT_MAX;
+ goto retry;
+ }
+#endif
+
return repr;
}
@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
NULL);
}
+#undef NEED_RETRY
+
#endif /* MS_WINDOWS */
/* --- Character Mapping Codec -------------------------------------------- */