diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 295 |
1 files changed, 233 insertions, 62 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c3ab2d8..9e35b61 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode) /* --- MBCS codecs for Windows -------------------------------------------- */ -PyObject *PyUnicode_DecodeMBCS(const char *s, - Py_ssize_t size, - const char *errors) +#if SIZEOF_INT < SIZEOF_SSIZE_T +#define NEED_RETRY +#endif + +/* XXX This code is limited to "true" double-byte encodings, as + a) it assumes an incomplete character consists of a single byte, and + b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte + encodings, see IsDBCSLeadByteEx documentation. */ + +static int is_dbcs_lead_byte(const char *s, int offset) +{ + const char *curr = s + offset; + + if (IsDBCSLeadByte(*curr)) { + const char *prev = CharPrev(s, curr); + return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); + } + return 0; +} + +/* + * Decode MBCS string into unicode object. If 'final' is set, converts + * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. + */ +static int decode_mbcs(PyUnicodeObject **v, + const char *s, /* MBCS string */ + int size, /* sizeof MBCS string */ + int final) { - PyUnicodeObject *v; Py_UNICODE *p; - DWORD usize; + Py_ssize_t n = 0; + int usize = 0; + + assert(size >= 0); + + /* Skip trailing lead-byte unless 'final' is set */ + if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) + --size; /* First get the size of the result */ - assert(size < INT_MAX); - usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0); - if (size > 0 && usize==0) - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); + if (usize == 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } - v = _PyUnicode_New(usize); - if (v == NULL) - return NULL; - if (usize == 0) - return (PyObject *)v; - p = PyUnicode_AS_UNICODE(v); - if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) { - Py_DECREF(v); - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (*v == NULL) { + /* Create unicode object */ + *v = _PyUnicode_New(usize); + if (*v == NULL) + return -1; + } + else { + /* Extend unicode object */ + n = PyUnicode_GET_SIZE(*v); + if (_PyUnicode_Resize(v, n + usize) < 0) + return -1; + } + + /* Do the conversion */ + if (size > 0) { + p = PyUnicode_AS_UNICODE(*v) + n; + if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } + + return size; +} + +PyObject *PyUnicode_DecodeMBCSStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + PyUnicodeObject *v = NULL; + int done; + + if (consumed) + *consumed = 0; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + done = decode_mbcs(&v, s, INT_MAX, 0); + else +#endif + done = decode_mbcs(&v, s, (int)size, !consumed); + + if (done < 0) { + Py_XDECREF(v); + return NULL; + } + + if (consumed) + *consumed += done; + +#ifdef NEED_RETRY + if (size > INT_MAX) { + s += done; + size -= done; + goto retry; } +#endif return (PyObject *)v; } -PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, +PyObject *PyUnicode_DecodeMBCS(const char *s, Py_ssize_t size, const char *errors) { - PyObject *repr; - char *s; - DWORD mbcssize; + return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); +} + +/* + * Convert unicode into string object (MBCS). + * Returns 0 if succeed, -1 otherwise. + */ +static int encode_mbcs(PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size) /* size of unicode */ +{ + int mbcssize = 0; + Py_ssize_t n = 0; - /* If there are no characters, bail now! */ - if (size==0) - return PyString_FromString(""); + assert(size >= 0); /* First get the size of the result */ - assert(size<INT_MAX); - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL); - if (mbcssize==0) - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + if (mbcssize == 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } - repr = PyString_FromStringAndSize(NULL, mbcssize); - if (repr == NULL) - return NULL; - if (mbcssize == 0) - return repr; + if (*repr == NULL) { + /* Create string object */ + *repr = PyString_FromStringAndSize(NULL, mbcssize); + if (*repr == NULL) + return -1; + } + else { + /* Extend string object */ + n = PyString_Size(*repr); + if (_PyString_Resize(repr, n + mbcssize) < 0) + return -1; + } /* Do the conversion */ - s = PyString_AS_STRING(repr); - assert(size < INT_MAX); - if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) { - Py_DECREF(repr); - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + char *s = PyString_AS_STRING(*repr) + n; + if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } + + return 0; +} + +PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + PyObject *repr = NULL; + int ret; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + ret = encode_mbcs(&repr, p, INT_MAX); + else +#endif + ret = encode_mbcs(&repr, p, (int)size); + + if (ret < 0) { + Py_XDECREF(repr); + return NULL; } + +#ifdef NEED_RETRY + if (size > INT_MAX) { + p += INT_MAX; + size -= INT_MAX; + goto retry; + } +#endif + return repr; } @@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode) NULL); } +#undef NEED_RETRY + #endif /* MS_WINDOWS */ /* --- Character Mapping Codec -------------------------------------------- */ @@ -4491,11 +4627,11 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) /* Make sure we have enough space for the separator and the item. */ itemlen = PyUnicode_GET_SIZE(item); new_res_used = res_used + itemlen; - if (new_res_used <= 0) + if (new_res_used < 0) goto Overflow; if (i < seqlen - 1) { new_res_used += seplen; - if (new_res_used <= 0) + if (new_res_used < 0) goto Overflow; } if (new_res_used > res_alloc) { @@ -4536,7 +4672,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) Overflow: PyErr_SetString(PyExc_OverflowError, - "join() is too long for a Python string"); + "join() result is too long for a Python string"); Py_DECREF(item); /* fall through */ @@ -6667,29 +6803,44 @@ PyDoc_STRVAR(startswith__doc__, \n\ Return True if S starts with the specified prefix, False otherwise.\n\ With optional start, test S beginning at that position.\n\ -With optional end, stop comparing S at that position."); +With optional end, stop comparing S at that position.\n\ +prefix can also be a tuple of strings to try."); static PyObject * unicode_startswith(PyUnicodeObject *self, PyObject *args) { + PyObject *subobj; PyUnicodeObject *substring; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *result; + int result; - if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, + if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); + if (PyTuple_Check(subobj)) { + Py_ssize_t i; + for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + substring = (PyUnicodeObject *)PyUnicode_FromObject( + PyTuple_GET_ITEM(subobj, i)); + if (substring == NULL) + return NULL; + result = tailmatch(self, substring, start, end, -1); + Py_DECREF(substring); + if (result) { + Py_RETURN_TRUE; + } + } + /* nothing matched */ + Py_RETURN_FALSE; + } + substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); if (substring == NULL) - return NULL; - - result = PyBool_FromLong(tailmatch(self, substring, start, end, -1)); - + return NULL; + result = tailmatch(self, substring, start, end, -1); Py_DECREF(substring); - return result; + return PyBool_FromLong(result); } @@ -6698,29 +6849,44 @@ PyDoc_STRVAR(endswith__doc__, \n\ Return True if S ends with the specified suffix, False otherwise.\n\ With optional start, test S beginning at that position.\n\ -With optional end, stop comparing S at that position."); +With optional end, stop comparing S at that position.\n\ +suffix can also be a tuple of strings to try."); static PyObject * unicode_endswith(PyUnicodeObject *self, PyObject *args) { + PyObject *subobj; PyUnicodeObject *substring; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *result; + int result; - if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) + if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, + _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); + if (PyTuple_Check(subobj)) { + Py_ssize_t i; + for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + substring = (PyUnicodeObject *)PyUnicode_FromObject( + PyTuple_GET_ITEM(subobj, i)); + if (substring == NULL) + return NULL; + result = tailmatch(self, substring, start, end, +1); + Py_DECREF(substring); + if (result) { + Py_RETURN_TRUE; + } + } + Py_RETURN_FALSE; + } + substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); if (substring == NULL) - return NULL; - - result = PyBool_FromLong(tailmatch(self, substring, start, end, +1)); + return NULL; + result = tailmatch(self, substring, start, end, +1); Py_DECREF(substring); - return result; + return PyBool_FromLong(result); } @@ -7748,6 +7914,9 @@ void _PyUnicode_Init(void) unicode_freelist = NULL; unicode_freelist_size = 0; unicode_empty = _PyUnicode_New(0); + if (!unicode_empty) + return; + strcpy(unicode_default_encoding, "ascii"); for (i = 0; i < 256; i++) unicode_latin1[i] = NULL; @@ -7758,6 +7927,8 @@ void _PyUnicode_Init(void) bloom_linebreak = make_bloom_mask( linebreak, sizeof(linebreak) / sizeof(linebreak[0]) ); + + PyType_Ready(&EncodingMapType); } /* Finalize the Unicode implementation */ |