diff options
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 475 |
1 files changed, 144 insertions, 331 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a6e02f4..0733011 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } -static size_t -wcstombs_errorpos(const wchar_t *wstr) -{ - size_t len; -#if SIZEOF_WCHAR_T == 2 - wchar_t buf[3]; -#else - wchar_t buf[2]; -#endif - char outbuf[MB_LEN_MAX]; - const wchar_t *start, *previous; - -#if SIZEOF_WCHAR_T == 2 - buf[2] = 0; -#else - buf[1] = 0; -#endif - start = wstr; - while (*wstr != L'\0') - { - previous = wstr; -#if SIZEOF_WCHAR_T == 2 - if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) - && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) - { - buf[0] = wstr[0]; - buf[1] = wstr[1]; - wstr += 2; - } - else { - buf[0] = *wstr; - buf[1] = 0; - wstr++; - } -#else - buf[0] = *wstr; - wstr++; -#endif - len = wcstombs(outbuf, buf, sizeof(outbuf)); - if (len == (size_t)-1) - return previous - start; - } - - /* failed to find the unencodable character */ - return 0; -} - static int locale_error_handler(const char *errors, int *surrogateescape) { @@ -3396,131 +3349,61 @@ locale_error_handler(const char *errors, int *surrogateescape) } static PyObject * -unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) +unicode_encode_locale(PyObject *unicode, const char *errors, + int current_locale) { - Py_ssize_t wlen, wlen2; - wchar_t *wstr; - char *errmsg; - PyObject *bytes, *reason, *exc; - size_t error_pos, errlen; int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) return NULL; - wstr = PyUnicode_AsWideCharString(unicode, &wlen); - if (wstr == NULL) + Py_ssize_t wlen; + wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); + if (wstr == NULL) { return NULL; + } - wlen2 = wcslen(wstr); + Py_ssize_t wlen2 = wcslen(wstr); if (wlen2 != wlen) { PyMem_Free(wstr); PyErr_SetString(PyExc_ValueError, "embedded null character"); return NULL; } - if (surrogateescape) { - /* "surrogateescape" error handler */ - char *str; - - if (current_locale) { - str = _Py_EncodeCurrentLocale(wstr, &error_pos); - } - else { - str = Py_EncodeLocale(wstr, &error_pos); - } - if (str == NULL) { - if (error_pos == (size_t)-1) { - PyErr_NoMemory(); - PyMem_Free(wstr); - return NULL; - } - else { - goto encode_error; + char *str; + size_t error_pos; + const char *reason; + int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, + current_locale, surrogateescape); + if (res != 0) { + if (res == -2) { + PyObject *exc; + exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", + "locale", unicode, + (Py_ssize_t)error_pos, + (Py_ssize_t)(error_pos+1), + reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); } - } - PyMem_Free(wstr); - - bytes = PyBytes_FromString(str); - if (current_locale) { - PyMem_RawFree(str); + return NULL; } else { - PyMem_Free(str); - } - } - else { - /* strict mode */ - size_t len, len2; - - len = wcstombs(NULL, wstr, 0); - if (len == (size_t)-1) { - error_pos = (size_t)-1; - goto encode_error; - } - - bytes = PyBytes_FromStringAndSize(NULL, len); - if (bytes == NULL) { + PyErr_NoMemory(); PyMem_Free(wstr); return NULL; } - - len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); - if (len2 == (size_t)-1 || len2 > len) { - Py_DECREF(bytes); - error_pos = (size_t)-1; - goto encode_error; - } - PyMem_Free(wstr); } - return bytes; - -encode_error: - errmsg = strerror(errno); - assert(errmsg != NULL); - - if (error_pos == (size_t)-1) - error_pos = wcstombs_errorpos(wstr); - PyMem_Free(wstr); - wstr = Py_DecodeLocale(errmsg, &errlen); - if (wstr != NULL) { - reason = PyUnicode_FromWideChar(wstr, errlen); - PyMem_RawFree(wstr); - } else { - errmsg = NULL; - } - - if (errmsg == NULL) - reason = PyUnicode_FromString( - "wcstombs() encountered an unencodable " - "wide character"); - if (reason == NULL) - return NULL; - - exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", - "locale", unicode, - (Py_ssize_t)error_pos, - (Py_ssize_t)(error_pos+1), - reason); - Py_DECREF(reason); - if (exc != NULL) { - PyCodec_StrictErrors(exc); - Py_DECREF(exc); - } - return NULL; + PyObject *bytes = PyBytes_FromString(str); + PyMem_RawFree(str); + return bytes; } PyObject * PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) { - return unicode_encode_locale(unicode, errors, 0); -} - -PyObject * -_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors) -{ return unicode_encode_locale(unicode, errors, 1); } @@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode, return NULL; } -static size_t -mbstowcs_errorpos(const char *str, size_t len) -{ -#ifdef HAVE_MBRTOWC - const char *start = str; - mbstate_t mbs; - size_t converted; - wchar_t ch; - - memset(&mbs, 0, sizeof mbs); - while (len) - { - converted = mbrtowc(&ch, str, len, &mbs); - if (converted == 0) - /* Reached end of string */ - break; - if (converted == (size_t)-1 || converted == (size_t)-2) { - /* Conversion error or incomplete character */ - return str - start; - } - else { - str += converted; - len -= converted; - } - } - /* failed to find the undecodable byte sequence */ - return 0; -#endif - return 0; -} - static PyObject* unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, int current_locale) { - wchar_t smallbuf[256]; - size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); - wchar_t *wstr; - size_t wlen, wlen2; - PyObject *unicode; int surrogateescape; - size_t error_pos, errlen; - char *errmsg; - PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */ - if (locale_error_handler(errors, &surrogateescape) < 0) return NULL; @@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, return NULL; } - if (surrogateescape) { - /* "surrogateescape" error handler */ - if (current_locale) { - wstr = _Py_DecodeCurrentLocale(str, &wlen); + wchar_t *wstr; + size_t wlen; + const char *reason; + int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, + current_locale, surrogateescape); + if (res != 0) { + if (res == -2) { + PyObject *exc; + exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", + "locale", str, len, + (Py_ssize_t)wlen, + (Py_ssize_t)(wlen + 1), + reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } } else { - wstr = Py_DecodeLocale(str, &wlen); - } - if (wstr == NULL) { - if (wlen == (size_t)-1) - PyErr_NoMemory(); - else - PyErr_SetFromErrno(PyExc_OSError); - return NULL; + PyErr_NoMemory(); } - - unicode = PyUnicode_FromWideChar(wstr, wlen); - PyMem_RawFree(wstr); + return NULL; } - else { - /* strict mode */ -#ifndef HAVE_BROKEN_MBSTOWCS - wlen = mbstowcs(NULL, str, 0); -#else - wlen = len; -#endif - if (wlen == (size_t)-1) - goto decode_error; - if (wlen+1 <= smallbuf_len) { - wstr = smallbuf; - } - else { - wstr = PyMem_New(wchar_t, wlen+1); - if (!wstr) - return PyErr_NoMemory(); - } - wlen2 = mbstowcs(wstr, str, wlen+1); - if (wlen2 == (size_t)-1) { - if (wstr != smallbuf) - PyMem_Free(wstr); - goto decode_error; - } -#ifdef HAVE_BROKEN_MBSTOWCS - assert(wlen2 == wlen); -#endif - unicode = PyUnicode_FromWideChar(wstr, wlen2); - if (wstr != smallbuf) - PyMem_Free(wstr); - } + PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen); + PyMem_RawFree(wstr); return unicode; - -decode_error: - errmsg = strerror(errno); - assert(errmsg != NULL); - - error_pos = mbstowcs_errorpos(str, len); - wstr = Py_DecodeLocale(errmsg, &errlen); - if (wstr != NULL) { - reason = PyUnicode_FromWideChar(wstr, errlen); - PyMem_RawFree(wstr); - } - - if (reason == NULL) - reason = PyUnicode_FromString( - "mbstowcs() encountered an invalid multibyte sequence"); - if (reason == NULL) - return NULL; - - exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", - "locale", str, len, - (Py_ssize_t)error_pos, - (Py_ssize_t)(error_pos+1), - reason); - Py_DECREF(reason); - if (exc != NULL) { - PyCodec_StrictErrors(exc); - Py_DECREF(exc); - } - return NULL; } PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, const char *errors) { - return unicode_decode_locale(str, len, errors, 0); -} - -PyObject* -_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len, - const char *errors) -{ return unicode_decode_locale(str, len, errors, 1); } PyObject* -_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors) -{ - return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1); -} - -PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) { Py_ssize_t size = (Py_ssize_t)strlen(str); - return unicode_decode_locale(str, size, errors, 0); + return unicode_decode_locale(str, size, errors, 1); } @@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) Py_FileSystemDefaultEncodeErrors); } else { - return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); + return unicode_decode_locale(s, size, + Py_FileSystemDefaultEncodeErrors, 0); } #endif } @@ -5128,17 +4906,23 @@ onError: } -/* UTF-8 decoder using the surrogateescape error handler . +/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is + non-zero, use strict error handler otherwise. - On success, return a pointer to a newly allocated wide character string (use - PyMem_RawFree() to free the memory) and write the output length (in number - of wchar_t units) into *p_wlen (if p_wlen is set). + On success, write a pointer to a newly allocated wide character string into + *wstr (use PyMem_RawFree() to free the memory) and write the output length + (in number of wchar_t units) into *wlen (if wlen is set). - On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen - (if p_wlen is set). */ -wchar_t* -_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) + On memory allocation failure, return -1. + + On decoding error (if surrogateescape is zero), return -2. If wlen is + non-NULL, write the start of the illegal byte sequence into *wlen. If reason + is not NULL, write the decoding error message into *reason. */ +int +_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, + const char **reason, int surrogateescape) { + const char *orig_s = s; const char *e; wchar_t *unicode; Py_ssize_t outpos; @@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) /* Note: size will always be longer than the resulting Unicode character count */ if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { - if (p_wlen) { - *p_wlen = (size_t)-1; - } - return NULL; + return -1; } unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); if (!unicode) { - if (p_wlen) { - *p_wlen = (size_t)-1; - } - return NULL; + return -1; } /* Unpack UTF-8 encoded data */ @@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) Py_UNREACHABLE(); #else assert(ch > 0xFFFF && ch <= MAX_UNICODE); - /* compute and append the two surrogates: */ + /* write a surrogate pair */ unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); #endif @@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) else { if (!ch && s == e) break; + if (!surrogateescape) { + PyMem_RawFree(unicode ); + if (reason != NULL) { + switch (ch) { + case 0: + *reason = "unexpected end of data"; + break; + case 1: + *reason = "invalid start byte"; + break; + /* 2, 3, 4 */ + default: + *reason = "invalid continuation byte"; + break; + } + } + if (wlen != NULL) { + *wlen = s - orig_s; + } + return -2; + } /* surrogateescape */ unicode[outpos++] = 0xDC00 + (unsigned char)*s++; } } unicode[outpos] = L'\0'; - if (p_wlen) { - *p_wlen = outpos; + if (wlen) { + *wlen = outpos; } - return unicode; + *wstr = unicode; + return 0; +} + +wchar_t* +_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen) +{ + wchar_t *wstr; + int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1); + if (res != 0) { + return NULL; + } + return wstr; } /* UTF-8 encoder using the surrogateescape error handler . - On success, return a pointer to a newly allocated character string (use - PyMem_Free() to free the memory). + On success, return 0 and write the newly allocated character string (use + PyMem_Free() to free the memory) into *str. - On encoding failure, return NULL and write the position of the invalid - surrogate character into *error_pos (if error_pos is set). + On encoding failure, return -2 and write the position of the invalid + surrogate character into *error_pos (if error_pos is set) and the decoding + error message into *reason (if reason is set). - On memory allocation failure, return NULL and write (size_t)-1 into - *error_pos (if error_pos is set). */ -char* -_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, - int raw_malloc) + On memory allocation failure, return -1. */ +int +_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, + const char **reason, int raw_malloc, int surrogateescape) { const Py_ssize_t max_char_size = 4; Py_ssize_t len = wcslen(text); assert(len >= 0); + if (len > PY_SSIZE_T_MAX / max_char_size - 1) { + return -1; + } char *bytes; - if (len <= PY_SSIZE_T_MAX / max_char_size - 1) { - if (raw_malloc) { - bytes = PyMem_RawMalloc((len + 1) * max_char_size); - } - else { - bytes = PyMem_Malloc((len + 1) * max_char_size); - } + if (raw_malloc) { + bytes = PyMem_RawMalloc((len + 1) * max_char_size); } else { - bytes = NULL; + bytes = PyMem_Malloc((len + 1) * max_char_size); } if (bytes == NULL) { - if (error_pos != NULL) { - *error_pos = (size_t)-1; - } - return NULL; + return -1; } char *p = bytes; Py_ssize_t i; - for (i = 0; i < len;) { - Py_UCS4 ch = text[i++]; + for (i = 0; i < len; i++) { + Py_UCS4 ch = text[i]; if (ch < 0x80) { /* Encode ASCII */ @@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, } else if (Py_UNICODE_IS_SURROGATE(ch)) { /* surrogateescape error handler */ - if (!(0xDC80 <= ch && ch <= 0xDCFF)) { + if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { if (error_pos != NULL) { - *error_pos = (size_t)i - 1; + *error_pos = (size_t)i; } - goto error; + if (reason != NULL) { + *reason = "encoding error"; + } + if (raw_malloc) { + PyMem_RawFree(bytes); + } + else { + PyMem_Free(bytes); + } + return -2; } *p++ = (char)(ch & 0xff); } @@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, if (error_pos != NULL) { *error_pos = (size_t)-1; } - goto error; - } - return bytes2; - - error: - if (raw_malloc) { - PyMem_RawFree(bytes); - } - else { - PyMem_Free(bytes); + if (raw_malloc) { + PyMem_RawFree(bytes); + } + else { + PyMem_Free(bytes); + } + return -1; } - return NULL; + *str = bytes2; + return 0; } |