diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 225 |
1 files changed, 94 insertions, 131 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4491167..3fdce82 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1029,8 +1029,7 @@ resize_copy(PyObject *unicode, Py_ssize_t length) if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { PyObject *copy; - if (PyUnicode_READY(unicode) == -1) - return NULL; + assert(PyUnicode_IS_READY(unicode)); copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); if (copy == NULL) @@ -1974,14 +1973,11 @@ unicode_char(Py_UCS4 ch) unicode = PyUnicode_New(1, ch); if (unicode == NULL) return NULL; - switch (PyUnicode_KIND(unicode)) { - case PyUnicode_1BYTE_KIND: - PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; - break; - case PyUnicode_2BYTE_KIND: + + assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); + if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; - break; - default: + } else { assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); PyUnicode_4BYTE_DATA(unicode)[0] = ch; } @@ -1992,12 +1988,32 @@ unicode_char(Py_UCS4 ch) PyObject * PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) { + if (u == NULL) + return (PyObject*)_PyUnicode_New(size); + + if (size < 0) { + PyErr_BadInternalCall(); + return NULL; + } + + return PyUnicode_FromWideChar(u, size); +} + +PyObject * +PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) +{ PyObject *unicode; Py_UCS4 maxchar = 0; Py_ssize_t num_surrogates; - if (u == NULL) - return (PyObject*)_PyUnicode_New(size); + if (u == NULL && size != 0) { + PyErr_BadInternalCall(); + return NULL; + } + + if (size == -1) { + size = wcslen(u); + } /* If the Unicode data is known at construction time, we can apply some optimizations which share commonly used objects. */ @@ -2482,27 +2498,6 @@ PyUnicode_AsUCS4Copy(PyObject *string) return as_ucs4(string, NULL, 0, 1); } -#ifdef HAVE_WCHAR_H - -PyObject * -PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) -{ - if (w == NULL) { - if (size == 0) - _Py_RETURN_UNICODE_EMPTY(); - PyErr_BadInternalCall(); - return NULL; - } - - if (size == -1) { - size = wcslen(w); - } - - return PyUnicode_FromUnicode(w, size); -} - -#endif /* HAVE_WCHAR_H */ - /* maximum number of characters required for output of %lld or %p. We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the sign. 53/22 is an upper bound for log10(256). */ @@ -3300,7 +3295,7 @@ PyUnicode_Encode(const Py_UNICODE *s, { PyObject *v, *unicode; - unicode = PyUnicode_FromUnicode(s, size); + unicode = PyUnicode_FromWideChar(s, size); if (unicode == NULL) return NULL; v = PyUnicode_AsEncodedString(unicode, encoding, errors); @@ -3412,11 +3407,9 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) { Py_ssize_t wlen, wlen2; wchar_t *wstr; - PyObject *bytes = NULL; char *errmsg; - PyObject *reason = NULL; - PyObject *exc; - size_t error_pos; + PyObject *bytes, *reason, *exc; + size_t error_pos, errlen; int surrogateescape; if (locale_error_handler(errors, &surrogateescape) < 0) @@ -3471,6 +3464,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); if (len2 == (size_t)-1 || len2 > len) { + Py_DECREF(bytes); error_pos = (size_t)-1; goto encode_error; } @@ -3486,17 +3480,15 @@ encode_error: error_pos = wcstombs_errorpos(wstr); PyMem_Free(wstr); - Py_XDECREF(bytes); - - if (errmsg != NULL) { - size_t errlen; - wstr = Py_DecodeLocale(errmsg, &errlen); - if (wstr != NULL) { - reason = PyUnicode_FromWideChar(wstr, errlen); - PyMem_RawFree(wstr); - } else - errmsg = NULL; + + wstr = Py_DecodeLocale(errmsg, &errlen); + if (wstr != NULL) { + reason = PyUnicode_FromWideChar(wstr, errlen); + PyMem_RawFree(wstr); + } else { + errmsg = NULL; } + if (errmsg == NULL) reason = PyUnicode_FromString( "wcstombs() encountered an unencodable " @@ -3512,7 +3504,7 @@ encode_error: Py_DECREF(reason); if (exc != NULL) { PyCodec_StrictErrors(exc); - Py_XDECREF(exc); + Py_DECREF(exc); } return NULL; } @@ -3719,10 +3711,9 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, size_t wlen, wlen2; PyObject *unicode; int surrogateescape; - size_t error_pos; + size_t error_pos, errlen; char *errmsg; - PyObject *reason = NULL; /* initialize to prevent gcc warning */ - PyObject *exc; + PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */ if (locale_error_handler(errors, &surrogateescape) < 0) return NULL; @@ -3780,19 +3771,16 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, return unicode; decode_error: - reason = NULL; errmsg = strerror(errno); assert(errmsg != NULL); error_pos = mbstowcs_errorpos(str, len); - if (errmsg != NULL) { - size_t errlen; - wstr = Py_DecodeLocale(errmsg, &errlen); - if (wstr != NULL) { - reason = PyUnicode_FromWideChar(wstr, errlen); - PyMem_RawFree(wstr); - } + wstr = Py_DecodeLocale(errmsg, &errlen); + if (wstr != NULL) { + reason = PyUnicode_FromWideChar(wstr, errlen); + PyMem_RawFree(wstr); } + if (reason == NULL) reason = PyUnicode_FromString( "mbstowcs() encountered an invalid multibyte sequence"); @@ -3807,7 +3795,7 @@ decode_error: Py_DECREF(reason); if (exc != NULL) { PyCodec_StrictErrors(exc); - Py_XDECREF(exc); + Py_DECREF(exc); } return NULL; } @@ -4140,7 +4128,11 @@ PyUnicode_GetSize(PyObject *unicode) PyErr_BadArgument(); goto onError; } - return PyUnicode_GET_SIZE(unicode); + if (_PyUnicode_WSTR(unicode) == NULL) { + if (PyUnicode_AsUnicode(unicode) == NULL) + goto onError; + } + return PyUnicode_WSTR_LENGTH(unicode); onError: return -1; @@ -4248,7 +4240,7 @@ unicode_decode_call_errorhandler_wchar( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, PyObject **output, Py_ssize_t *outpos) { - static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4281,10 +4273,10 @@ unicode_decode_call_errorhandler_wchar( if (restuple == NULL) goto onError; if (!PyTuple_Check(restuple)) { - PyErr_SetString(PyExc_TypeError, &argparse[4]); + PyErr_SetString(PyExc_TypeError, &argparse[3]); goto onError; } - if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) + if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) goto onError; /* Copy back the bytes variables, which might have been modified by the @@ -4292,9 +4284,6 @@ unicode_decode_call_errorhandler_wchar( inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); if (!inputobj) goto onError; - if (!PyBytes_Check(inputobj)) { - PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); - } *input = PyBytes_AS_STRING(inputobj); insize = PyBytes_GET_SIZE(inputobj); *inend = *input + insize; @@ -4335,7 +4324,7 @@ unicode_decode_call_errorhandler_wchar( *inptr = *input + newpos; /* we made it! */ - Py_XDECREF(restuple); + Py_DECREF(restuple); return 0; overflow: @@ -4356,7 +4345,7 @@ unicode_decode_call_errorhandler_writer( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) { - static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4383,10 +4372,10 @@ unicode_decode_call_errorhandler_writer( if (restuple == NULL) goto onError; if (!PyTuple_Check(restuple)) { - PyErr_SetString(PyExc_TypeError, &argparse[4]); + PyErr_SetString(PyExc_TypeError, &argparse[3]); goto onError; } - if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) + if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) goto onError; /* Copy back the bytes variables, which might have been modified by the @@ -4394,9 +4383,6 @@ unicode_decode_call_errorhandler_writer( inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); if (!inputobj) goto onError; - if (!PyBytes_Check(inputobj)) { - PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); - } *input = PyBytes_AS_STRING(inputobj); insize = PyBytes_GET_SIZE(inputobj); *inend = *input + insize; @@ -4411,8 +4397,6 @@ unicode_decode_call_errorhandler_writer( goto onError; } - if (PyUnicode_READY(repunicode) < 0) - goto onError; replen = PyUnicode_GET_LENGTH(repunicode); if (replen > 1) { writer->min_length += replen - 1; @@ -4428,7 +4412,7 @@ unicode_decode_call_errorhandler_writer( *inptr = *input + newpos; /* we made it! */ - Py_XDECREF(restuple); + Py_DECREF(restuple); return 0; onError: @@ -4834,7 +4818,7 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s, const char *errors) { PyObject *result; - PyObject *tmp = PyUnicode_FromUnicode(s, size); + PyObject *tmp = PyUnicode_FromWideChar(s, size); if (tmp == NULL) return NULL; result = _PyUnicode_EncodeUTF7(tmp, base64SetO, @@ -5190,7 +5174,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, { PyObject *v, *unicode; - unicode = PyUnicode_FromUnicode(s, size); + unicode = PyUnicode_FromWideChar(s, size); if (unicode == NULL) return NULL; v = _PyUnicode_AsUTF8String(unicode, errors); @@ -5515,7 +5499,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s, int byteorder) { PyObject *result; - PyObject *tmp = PyUnicode_FromUnicode(s, size); + PyObject *tmp = PyUnicode_FromWideChar(s, size); if (tmp == NULL) return NULL; result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); @@ -5868,7 +5852,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s, int byteorder) { PyObject *result; - PyObject *tmp = PyUnicode_FromUnicode(s, size); + PyObject *tmp = PyUnicode_FromWideChar(s, size); if (tmp == NULL) return NULL; result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); @@ -6259,7 +6243,7 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size) { PyObject *result; - PyObject *tmp = PyUnicode_FromUnicode(s, size); + PyObject *tmp = PyUnicode_FromWideChar(s, size); if (tmp == NULL) { return NULL; } @@ -6476,7 +6460,7 @@ PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size) { PyObject *result; - PyObject *tmp = PyUnicode_FromUnicode(s, size); + PyObject *tmp = PyUnicode_FromWideChar(s, size); if (tmp == NULL) return NULL; result = PyUnicode_AsRawUnicodeEscapeString(tmp); @@ -6814,7 +6798,7 @@ unicode_encode_ucs1(PyObject *unicode, goto onError; /* subtract preallocated bytes */ - writer.min_size -= 1; + writer.min_size -= newpos - collstart; if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ @@ -6830,33 +6814,19 @@ unicode_encode_ucs1(PyObject *unicode, if (PyUnicode_READY(rep) < 0) goto onError; - if (PyUnicode_IS_ASCII(rep)) { - /* Fast path: all characters are smaller than limit */ - assert(limit >= 128); - assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); - str = _PyBytesWriter_WriteBytes(&writer, str, - PyUnicode_DATA(rep), - PyUnicode_GET_LENGTH(rep)); - } - else { - Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); - - str = _PyBytesWriter_Prepare(&writer, str, repsize); - if (str == NULL) - goto onError; - - /* check if there is anything unencodable in the - replacement and copy it to the output */ - for (i = 0; repsize-->0; ++i, ++str) { - ch = PyUnicode_READ_CHAR(rep, i); - if (ch >= limit) { - raise_encode_exception(&exc, encoding, unicode, - pos, pos+1, reason); - goto onError; - } - *str = (char)ch; - } + if (limit == 256 ? + PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND : + !PyUnicode_IS_ASCII(rep)) + { + /* Not all characters are smaller than limit */ + raise_encode_exception(&exc, encoding, unicode, + collstart, collend, reason); + goto onError; } + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); } pos = newpos; Py_CLEAR(rep); @@ -6887,7 +6857,7 @@ PyUnicode_EncodeLatin1(const Py_UNICODE *p, const char *errors) { PyObject *result; - PyObject *unicode = PyUnicode_FromUnicode(p, size); + PyObject *unicode = PyUnicode_FromWideChar(p, size); if (unicode == NULL) return NULL; result = unicode_encode_ucs1(unicode, errors, 256); @@ -7028,7 +6998,7 @@ PyUnicode_EncodeASCII(const Py_UNICODE *p, const char *errors) { PyObject *result; - PyObject *unicode = PyUnicode_FromUnicode(p, size); + PyObject *unicode = PyUnicode_FromWideChar(p, size); if (unicode == NULL) return NULL; result = unicode_encode_ucs1(unicode, errors, 128); @@ -7754,7 +7724,7 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p, const char *errors) { PyObject *unicode, *res; - unicode = PyUnicode_FromUnicode(p, size); + unicode = PyUnicode_FromWideChar(p, size); if (unicode == NULL) return NULL; res = encode_code_page(CP_ACP, unicode, errors); @@ -8602,7 +8572,7 @@ PyUnicode_EncodeCharmap(const Py_UNICODE *p, const char *errors) { PyObject *result; - PyObject *unicode = PyUnicode_FromUnicode(p, size); + PyObject *unicode = PyUnicode_FromWideChar(p, size); if (unicode == NULL) return NULL; result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); @@ -8657,7 +8627,7 @@ unicode_translate_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static const char *argparse = "O!n;translating error handler must return (str, int) tuple"; + static const char *argparse = "Un;translating error handler must return (str, int) tuple"; Py_ssize_t i_newpos; PyObject *restuple; @@ -8679,11 +8649,11 @@ unicode_translate_call_errorhandler(const char *errors, if (restuple == NULL) return NULL; if (!PyTuple_Check(restuple)) { - PyErr_SetString(PyExc_TypeError, &argparse[4]); + PyErr_SetString(PyExc_TypeError, &argparse[3]); Py_DECREF(restuple); return NULL; } - if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, + if (!PyArg_ParseTuple(restuple, argparse, &resunicode, &i_newpos)) { Py_DECREF(restuple); return NULL; @@ -9042,7 +9012,7 @@ PyUnicode_TranslateCharmap(const Py_UNICODE *p, const char *errors) { PyObject *result; - PyObject *unicode = PyUnicode_FromUnicode(p, size); + PyObject *unicode = PyUnicode_FromWideChar(p, size); if (!unicode) return NULL; result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); @@ -9170,14 +9140,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, return -1; } - unicode = PyUnicode_FromUnicode(s, length); + unicode = PyUnicode_FromWideChar(s, length); if (unicode == NULL) return -1; - if (PyUnicode_READY(unicode) == -1) { - Py_DECREF(unicode); - return -1; - } kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); @@ -15359,7 +15325,7 @@ unicodeiter_reduce(unicodeiterobject *it) return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), it->it_seq, it->it_index); } else { - PyObject *u = PyUnicode_FromUnicode(NULL, 0); + PyObject *u = (PyObject *)_PyUnicode_New(0); if (u == NULL) return NULL; return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); @@ -15454,10 +15420,7 @@ unicode_iter(PyObject *seq) size_t Py_UNICODE_strlen(const Py_UNICODE *u) { - int res = 0; - while(*u++) - res++; - return res; + return wcslen(u); } Py_UNICODE* @@ -15482,8 +15445,8 @@ Py_UNICODE* Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) { Py_UNICODE *u1 = s1; - u1 += Py_UNICODE_strlen(u1); - Py_UNICODE_strcpy(u1, s2); + u1 += wcslen(u1); + while ((*u1++ = *s2++)); return s1; } @@ -15532,7 +15495,7 @@ Py_UNICODE* Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) { const Py_UNICODE *p; - p = s + Py_UNICODE_strlen(s); + p = s + wcslen(s); while (p != s) { p--; if (*p == c) |