diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2006-06-14 05:21:04 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2006-06-14 05:21:04 (GMT) |
commit | d825143be1118ba7e320661b3a71d8822ae5d600 (patch) | |
tree | a74764df14558614917b7f4d9abd56c44ffd5623 /Objects | |
parent | 6ce9fe880be7416d88e6d800528db4079db6d6b1 (diff) | |
download | cpython-d825143be1118ba7e320661b3a71d8822ae5d600.zip cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.gz cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.bz2 |
Patch #1455898: Incremental mode for "mbcs" codec.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 214 |
1 files changed, 175 insertions, 39 deletions
```diff
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3c06997..08fdb3f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
 
 /* --- MBCS codecs for Windows -------------------------------------------- */
 
-PyObject *PyUnicode_DecodeMBCS(const char *s,
-                               Py_ssize_t size,
-                               const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+   a) it assumes an incomplete character consists of a single byte, and
+   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+      encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+    const char *curr = s + offset;
+
+    if (IsDBCSLeadByte(*curr)) {
+        const char *prev = CharPrev(s, curr);
+        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+    }
+    return 0;
+}
+
+/*
+ * Decode MBCS string into unicode object. If 'final' is set, converts
+ * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+                       const char *s, /* MBCS string */
+                       int size, /* sizeof MBCS string */
+                       int final)
 {
-    PyUnicodeObject *v;
     Py_UNICODE *p;
-    DWORD usize;
+    Py_ssize_t n = 0;
+    int usize = 0;
+
+    assert(size >= 0);
+
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+        --size;
 
     /* First get the size of the result */
-    assert(size < INT_MAX);
-    usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
-    if (size > 0 && usize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+        if (usize == 0) {
+            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+            return -1;
+        }
+    }
 
-    v = _PyUnicode_New(usize);
-    if (v == NULL)
-        return NULL;
-    if (usize == 0)
-        return (PyObject *)v;
-    p = PyUnicode_AS_UNICODE(v);
-    if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
-        Py_DECREF(v);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (*v == NULL) {
+        /* Create unicode object */
+        *v = _PyUnicode_New(usize);
+        if (*v == NULL)
+            return -1;
+    }
+    else {
+        /* Extend unicode object */
+        n = PyUnicode_GET_SIZE(*v);
+        if (_PyUnicode_Resize(v, n + usize) < 0)
+            return -1;
+    }
+
+    /* Do the conversion */
+    if (size > 0) {
+        p = PyUnicode_AS_UNICODE(*v) + n;
+        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+            return -1;
+        }
+    }
+
+    return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+                                       Py_ssize_t size,
+                                       const char *errors,
+                                       Py_ssize_t *consumed)
+{
+    PyUnicodeObject *v = NULL;
+    int done;
+
+    if (consumed)
+        *consumed = 0;
+
+#ifdef NEED_RETRY
+  retry:
+    if (size > INT_MAX)
+        done = decode_mbcs(&v, s, INT_MAX, 0);
+    else
+#endif
+        done = decode_mbcs(&v, s, (int)size, !consumed);
+
+    if (done < 0) {
+        Py_XDECREF(v);
+        return NULL;
+    }
+
+    if (consumed)
+        *consumed += done;
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+        s += done;
+        size -= done;
+        goto retry;
     }
+#endif
 
     return (PyObject *)v;
 }
 
-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
                                Py_ssize_t size,
                                const char *errors)
 {
-    PyObject *repr;
-    char *s;
-    DWORD mbcssize;
+    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
 
-    /* If there are no characters, bail now! */
-    if (size==0)
-        return PyString_FromString("");
+/*
+ * Convert unicode into string object (MBCS).
+ * Returns 0 if succeed, -1 otherwise.
+ */
+static int encode_mbcs(PyObject **repr,
+                       const Py_UNICODE *p, /* unicode */
+                       int size) /* size of unicode */
+{
+    int mbcssize = 0;
+    Py_ssize_t n = 0;
+
+    assert(size >= 0);
 
     /* First get the size of the result */
-    assert(size<INT_MAX);
-    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
-    if (mbcssize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+        if (mbcssize == 0) {
+            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+            return -1;
+        }
+    }
 
-    repr = PyString_FromStringAndSize(NULL, mbcssize);
-    if (repr == NULL)
-        return NULL;
-    if (mbcssize == 0)
-        return repr;
+    if (*repr == NULL) {
+        /* Create string object */
+        *repr = PyString_FromStringAndSize(NULL, mbcssize);
+        if (*repr == NULL)
+            return -1;
+    }
+    else {
+        /* Extend string object */
+        n = PyString_Size(*repr);
+        if (_PyString_Resize(repr, n + mbcssize) < 0)
+            return -1;
+    }
 
     /* Do the conversion */
-    s = PyString_AS_STRING(repr);
-    assert(size < INT_MAX);
-    if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
-        Py_DECREF(repr);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+        char *s = PyString_AS_STRING(*repr) + n;
+        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+            PyErr_SetFromWindowsErrWithFilename(0, NULL);
+            return -1;
+        }
     }
+
+    return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+                               Py_ssize_t size,
+                               const char *errors)
+{
+    PyObject *repr = NULL;
+    int ret;
+
+#ifdef NEED_RETRY
+  retry:
+    if (size > INT_MAX)
+        ret = encode_mbcs(&repr, p, INT_MAX);
+    else
+#endif
+        ret = encode_mbcs(&repr, p, (int)size);
+
+    if (ret < 0) {
+        Py_XDECREF(repr);
+        return NULL;
+    }
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+        p += INT_MAX;
+        size -= INT_MAX;
+        goto retry;
+    }
+#endif
+
     return repr;
 }
 
@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
                                 NULL);
 }
 
+#undef NEED_RETRY
+
 #endif /* MS_WINDOWS */
 
 /* --- Character Mapping Codec -------------------------------------------- */
```