summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2006-06-14 05:21:04 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2006-06-14 05:21:04 (GMT)
commit: d825143be1118ba7e320661b3a71d8822ae5d600 (patch)
tree: a74764df14558614917b7f4d9abd56c44ffd5623 /Objects
parent: 6ce9fe880be7416d88e6d800528db4079db6d6b1 (diff)
download: cpython-d825143be1118ba7e320661b3a71d8822ae5d600.zip
cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.gz
cpython-d825143be1118ba7e320661b3a71d8822ae5d600.tar.bz2
Patch #1455898: Incremental mode for "mbcs" codec.
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/unicodeobject.c 214
1 files changed, 175 insertions, 39 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3c06997..08fdb3f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
/* --- MBCS codecs for Windows -------------------------------------------- */
-PyObject *PyUnicode_DecodeMBCS(const char *s,
- Py_ssize_t size,
- const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+ a) it assumes an incomplete character consists of a single byte, and
+ b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+ encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+ const char *curr = s + offset;
+
+ if (IsDBCSLeadByte(*curr)) {
+ const char *prev = CharPrev(s, curr);
+ return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+ }
+ return 0;
+}
+
+/*
+ * Decode MBCS string into unicode object. If 'final' is set, converts
+ * trailing lead-byte too. Returns consumed size on success, -1 otherwise.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+ const char *s, /* MBCS string */
+ int size, /* size of MBCS string */
+ int final)
{
- PyUnicodeObject *v;
Py_UNICODE *p;
- DWORD usize;
+ Py_ssize_t n = 0;
+ int usize = 0;
+
+ assert(size >= 0);
+
+ /* Skip trailing lead-byte unless 'final' is set */
+ if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+ --size;
/* First get the size of the result */
- assert(size < INT_MAX);
- usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
- if (size > 0 && usize==0)
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+ if (usize == 0) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
- v = _PyUnicode_New(usize);
- if (v == NULL)
- return NULL;
- if (usize == 0)
- return (PyObject *)v;
- p = PyUnicode_AS_UNICODE(v);
- if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
- Py_DECREF(v);
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (*v == NULL) {
+ /* Create unicode object */
+ *v = _PyUnicode_New(usize);
+ if (*v == NULL)
+ return -1;
+ }
+ else {
+ /* Extend unicode object */
+ n = PyUnicode_GET_SIZE(*v);
+ if (_PyUnicode_Resize(v, n + usize) < 0)
+ return -1;
+ }
+
+ /* Do the conversion */
+ if (size > 0) {
+ p = PyUnicode_AS_UNICODE(*v) + n;
+ if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
+
+ return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ PyUnicodeObject *v = NULL;
+ int done;
+
+ if (consumed)
+ *consumed = 0;
+
+#ifdef NEED_RETRY
+ retry:
+ if (size > INT_MAX)
+ done = decode_mbcs(&v, s, INT_MAX, 0);
+ else
+#endif
+ done = decode_mbcs(&v, s, (int)size, !consumed);
+
+ if (done < 0) {
+ Py_XDECREF(v);
+ return NULL;
+ }
+
+ if (consumed)
+ *consumed += done;
+
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ s += done;
+ size -= done;
+ goto retry;
}
+#endif
return (PyObject *)v;
}
-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
{
- PyObject *repr;
- char *s;
- DWORD mbcssize;
+ return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
- /* If there are no characters, bail now! */
- if (size==0)
- return PyString_FromString("");
+/*
+ * Convert unicode into string object (MBCS).
+ * Returns 0 on success, -1 otherwise.
+ */
+static int encode_mbcs(PyObject **repr,
+ const Py_UNICODE *p, /* unicode */
+ int size) /* size of unicode */
+{
+ int mbcssize = 0;
+ Py_ssize_t n = 0;
+
+ assert(size >= 0);
/* First get the size of the result */
- assert(size<INT_MAX);
- mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
- if (mbcssize==0)
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+ if (mbcssize == 0) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
- repr = PyString_FromStringAndSize(NULL, mbcssize);
- if (repr == NULL)
- return NULL;
- if (mbcssize == 0)
- return repr;
+ if (*repr == NULL) {
+ /* Create string object */
+ *repr = PyString_FromStringAndSize(NULL, mbcssize);
+ if (*repr == NULL)
+ return -1;
+ }
+ else {
+ /* Extend string object */
+ n = PyString_Size(*repr);
+ if (_PyString_Resize(repr, n + mbcssize) < 0)
+ return -1;
+ }
/* Do the conversion */
- s = PyString_AS_STRING(repr);
- assert(size < INT_MAX);
- if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
- Py_DECREF(repr);
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ char *s = PyString_AS_STRING(*repr) + n;
+ if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
}
+
+ return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+ Py_ssize_t size,
+ const char *errors)
+{
+ PyObject *repr = NULL;
+ int ret;
+
+#ifdef NEED_RETRY
+ retry:
+ if (size > INT_MAX)
+ ret = encode_mbcs(&repr, p, INT_MAX);
+ else
+#endif
+ ret = encode_mbcs(&repr, p, (int)size);
+
+ if (ret < 0) {
+ Py_XDECREF(repr);
+ return NULL;
+ }
+
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ p += INT_MAX;
+ size -= INT_MAX;
+ goto retry;
+ }
+#endif
+
return repr;
}
@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
NULL);
}
+#undef NEED_RETRY
+
#endif /* MS_WINDOWS */
/* --- Character Mapping Codec -------------------------------------------- */