diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2006-06-14 05:21:04 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2006-06-14 05:21:04 (GMT) |
commit | d825143be1118ba7e320661b3a71d8822ae5d600 (patch) | |
tree | a74764df14558614917b7f4d9abd56c44ffd5623 | |
parent | 6ce9fe880be7416d88e6d800528db4079db6d6b1 (diff) | |
download | cpython-d825143be1118ba7e320661b3a71d8822ae5d600.zip cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.gz cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.bz2 |
Patch #1455898: Incremental mode for "mbcs" codec.
-rw-r--r-- | Doc/api/concrete.tex | 12 |
-rw-r--r-- | Include/unicodeobject.h | 7 |
-rw-r--r-- | Lib/encodings/mbcs.py | 7 |
-rw-r--r-- | Misc/NEWS | 3 |
-rw-r--r-- | Modules/_codecsmodule.c | 15 |
-rw-r--r-- | Objects/unicodeobject.c | 214 |
6 files changed, 211 insertions, 47 deletions
diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex index 10247ab..40b178f 100644 --- a/Doc/api/concrete.tex +++ b/Doc/api/concrete.tex @@ -1431,6 +1431,18 @@ machine running the codec. raised by the codec. \end{cfuncdesc} +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s, + int size, + const char *errors, + int *consumed} + If \var{consumed} is \NULL{}, behave like + \cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{}, + \cfunction{PyUnicode_DecodeMBCSStateful()} will not decode trailing lead + byte and the number of bytes that have been decoded will be stored in + \var{consumed}. + \versionadded{2.5} +\end{cfuncdesc} + \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s, Py_ssize_t size, const char *errors} diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 3177051..c7e07a8 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( const char *errors /* error handling */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( + const char *string, /* MBCS encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyObject *unicode /* Unicode object */ ); diff --git a/Lib/encodings/mbcs.py b/Lib/encodings/mbcs.py index ff77fde..a44ee7b 100644 --- a/Lib/encodings/mbcs.py +++ b/Lib/encodings/mbcs.py @@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return codecs.mbcs_encode(input,self.errors)[0] -class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return codecs.mbcs_decode(input,self.errors)[0] +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return codecs.mbcs_decode(input,self.errors,final) + class 
StreamWriter(Codec,codecs.StreamWriter): pass @@ -156,6 +156,9 @@ Extension Modules Library ------- +- Patch #1455898: The MBCS codec now supports the incremental mode for + double-byte encodings. + - ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to guarantee that adjacent triples in the return list always describe non-adjacent blocks. Previously, a pair of matching blocks could end diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 32fa82f..6d384b7 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -479,15 +479,20 @@ mbcs_decode(PyObject *self, PyObject *args) { const char *data; - Py_ssize_t size; + Py_ssize_t size, consumed; const char *errors = NULL; + int final = 1; + PyObject *decoded; - if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode", + &data, &size, &errors, &final)) return NULL; - return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors), - size); + decoded = PyUnicode_DecodeMBCSStateful( + data, size, errors, final ? NULL : &consumed); + if (!decoded) + return NULL; + return codec_tuple(decoded, final ? size : consumed); } #endif /* MS_WINDOWS */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3c06997..08fdb3f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode) /* --- MBCS codecs for Windows -------------------------------------------- */ -PyObject *PyUnicode_DecodeMBCS(const char *s, - Py_ssize_t size, - const char *errors) +#if SIZEOF_INT < SIZEOF_SSIZE_T +#define NEED_RETRY +#endif + +/* XXX This code is limited to "true" double-byte encodings, as + a) it assumes an incomplete character consists of a single byte, and + b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte + encodings, see IsDBCSLeadByteEx documentation. 
*/ + +static int is_dbcs_lead_byte(const char *s, int offset) +{ + const char *curr = s + offset; + + if (IsDBCSLeadByte(*curr)) { + const char *prev = CharPrev(s, curr); + return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); + } + return 0; +} + +/* + * Decode MBCS string into unicode object. If 'final' is set, converts + * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. + */ +static int decode_mbcs(PyUnicodeObject **v, + const char *s, /* MBCS string */ + int size, /* sizeof MBCS string */ + int final) { - PyUnicodeObject *v; Py_UNICODE *p; - DWORD usize; + Py_ssize_t n = 0; + int usize = 0; + + assert(size >= 0); + + /* Skip trailing lead-byte unless 'final' is set */ + if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) + --size; /* First get the size of the result */ - assert(size < INT_MAX); - usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0); - if (size > 0 && usize==0) - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); + if (usize == 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } - v = _PyUnicode_New(usize); - if (v == NULL) - return NULL; - if (usize == 0) - return (PyObject *)v; - p = PyUnicode_AS_UNICODE(v); - if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) { - Py_DECREF(v); - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (*v == NULL) { + /* Create unicode object */ + *v = _PyUnicode_New(usize); + if (*v == NULL) + return -1; + } + else { + /* Extend unicode object */ + n = PyUnicode_GET_SIZE(*v); + if (_PyUnicode_Resize(v, n + usize) < 0) + return -1; + } + + /* Do the conversion */ + if (size > 0) { + p = PyUnicode_AS_UNICODE(*v) + n; + if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } + + return size; +} + +PyObject *PyUnicode_DecodeMBCSStateful(const char *s, + Py_ssize_t 
size, + const char *errors, + Py_ssize_t *consumed) +{ + PyUnicodeObject *v = NULL; + int done; + + if (consumed) + *consumed = 0; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + done = decode_mbcs(&v, s, INT_MAX, 0); + else +#endif + done = decode_mbcs(&v, s, (int)size, !consumed); + + if (done < 0) { + Py_XDECREF(v); + return NULL; + } + + if (consumed) + *consumed += done; + +#ifdef NEED_RETRY + if (size > INT_MAX) { + s += done; + size -= done; + goto retry; } +#endif return (PyObject *)v; } -PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, +PyObject *PyUnicode_DecodeMBCS(const char *s, Py_ssize_t size, const char *errors) { - PyObject *repr; - char *s; - DWORD mbcssize; + return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); +} - /* If there are no characters, bail now! */ - if (size==0) - return PyString_FromString(""); +/* + * Convert unicode into string object (MBCS). + * Returns 0 if succeed, -1 otherwise. + */ +static int encode_mbcs(PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size) /* size of unicode */ +{ + int mbcssize = 0; + Py_ssize_t n = 0; + + assert(size >= 0); /* First get the size of the result */ - assert(size<INT_MAX); - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL); - if (mbcssize==0) - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + if (mbcssize == 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } - repr = PyString_FromStringAndSize(NULL, mbcssize); - if (repr == NULL) - return NULL; - if (mbcssize == 0) - return repr; + if (*repr == NULL) { + /* Create string object */ + *repr = PyString_FromStringAndSize(NULL, mbcssize); + if (*repr == NULL) + return -1; + } + else { + /* Extend string object */ + n = PyString_Size(*repr); + if (_PyString_Resize(repr, n + mbcssize) < 0) + return -1; + } /* Do the conversion */ - s = PyString_AS_STRING(repr); - assert(size < 
INT_MAX); - if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) { - Py_DECREF(repr); - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + char *s = PyString_AS_STRING(*repr) + n; + if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } } + + return 0; +} + +PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + PyObject *repr = NULL; + int ret; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + ret = encode_mbcs(&repr, p, INT_MAX); + else +#endif + ret = encode_mbcs(&repr, p, (int)size); + + if (ret < 0) { + Py_XDECREF(repr); + return NULL; + } + +#ifdef NEED_RETRY + if (size > INT_MAX) { + p += INT_MAX; + size -= INT_MAX; + goto retry; + } +#endif + return repr; } @@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode) NULL); } +#undef NEED_RETRY + #endif /* MS_WINDOWS */ /* --- Character Mapping Codec -------------------------------------------- */ |