From 7ed7aead9503102d2ed316175f198104e0cd674c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 15 Jan 2018 10:45:49 +0100 Subject: bpo-29240: Fix locale encodings in UTF-8 Mode (#5170) Modify locale.localeconv(), time.tzname, os.strerror() and other functions to ignore the UTF-8 Mode: always use the current locale encoding. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or encoding error, they return the position of the error and an error message which are used to raise Unicode errors in PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale(). * Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx(). * PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all cases, especially for the strict error handler. * Add _Py_DecodeUTF8Ex(): returns more information on decoding errors and supports the strict error handler. * Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex(). * Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx(). * Ignore the UTF-8 mode to encode/decode localeconv(), strerror() and time zone name. * PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now ignore the UTF-8 mode: always use the "current" locale. * Remove _PyUnicode_DecodeCurrentLocale(), _PyUnicode_DecodeCurrentLocaleAndSize() and _PyUnicode_EncodeCurrentLocale(). 
--- Doc/c-api/sys.rst | 22 +++ Doc/c-api/unicode.rst | 16 ++ Include/fileutils.h | 37 +++- Include/unicodeobject.h | 14 -- Modules/_datetimemodule.c | 2 +- Modules/_localemodule.c | 3 +- Modules/getpath.c | 4 +- Modules/readline.c | 4 +- Modules/timemodule.c | 11 +- Objects/unicodeobject.c | 475 ++++++++++++++-------------------------------- Python/fileutils.c | 385 +++++++++++++++++++++++-------------- Python/pathconfig.c | 4 +- 12 files changed, 472 insertions(+), 505 deletions(-) diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst index 20bc7bd..e4da96c 100644 --- a/Doc/c-api/sys.rst +++ b/Doc/c-api/sys.rst @@ -106,6 +106,16 @@ Operating System Utilities surrogate character, escape the bytes using the surrogateescape error handler instead of decoding them. + Encoding, highest priority to lowest priority: + + * ``UTF-8`` on macOS and Android; + * ``UTF-8`` if the Python UTF-8 mode is enabled; + * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``, + ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias), + and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the + ``ISO-8859-1`` encoding. + * the current locale encoding. + Return a pointer to a newly allocated wide character string, use :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write the number of wide characters excluding the null character into ``*size`` @@ -137,6 +147,18 @@ Operating System Utilities :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + Encoding, highest priority to lowest priority: + + * ``UTF-8`` on macOS and Android; + * ``UTF-8`` if the Python UTF-8 mode is enabled; + * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``, + ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias), + and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the + ``ISO-8859-1`` encoding. + * the current locale encoding. 
+ + The function uses the UTF-8 encoding in the Python UTF-8 mode. + Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free` to free the memory. Return ``NULL`` on encoding error or memory allocation error diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 45aff1b..3f6c055 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -770,12 +770,20 @@ system. :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at Python startup). + This function ignores the Python UTF-8 mode. + .. seealso:: The :c:func:`Py_DecodeLocale` function. .. versionadded:: 3.3 + .. versionchanged:: 3.7 + The function now also uses the current locale encoding for the + ``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale` + was used for the ``surrogateescape``, and the current locale encoding was + used for ``strict``. + .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) @@ -797,12 +805,20 @@ system. :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at Python startup). + This function ignores the Python UTF-8 mode. + .. seealso:: The :c:func:`Py_EncodeLocale` function. .. versionadded:: 3.3 + .. versionchanged:: 3.7 + The function now also uses the current locale encoding for the + ``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale` + was used for the ``surrogateescape``, and the current locale encoding was + used for ``strict``. 
+ File System Encoding """""""""""""""""""" diff --git a/Include/fileutils.h b/Include/fileutils.h index 2527d84..b4f8b11 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -20,18 +20,41 @@ PyAPI_FUNC(char*) _Py_EncodeLocaleRaw( #endif #ifdef Py_BUILD_CORE +PyAPI_FUNC(int) _Py_DecodeUTF8Ex( + const char *arg, + Py_ssize_t arglen, + wchar_t **wstr, + size_t *wlen, + const char **reason, + int surrogateescape); + +PyAPI_FUNC(int) _Py_EncodeUTF8Ex( + const wchar_t *text, + char **str, + size_t *error_pos, + const char **reason, + int raw_malloc, + int surrogateescape); + PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape( - const char *s, - Py_ssize_t size, - size_t *p_wlen); + const char *arg, + Py_ssize_t arglen); -PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale( +PyAPI_FUNC(int) _Py_DecodeLocaleEx( const char *arg, - size_t *size); + wchar_t **wstr, + size_t *wlen, + const char **reason, + int current_locale, + int surrogateescape); -PyAPI_FUNC(char*) _Py_EncodeCurrentLocale( +PyAPI_FUNC(int) _Py_EncodeLocaleEx( const wchar_t *text, - size_t *error_pos); + char **str, + size_t *error_pos, + const char **reason, + int current_locale, + int surrogateescape); #endif #ifndef Py_LIMITED_API diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index d263026..0274de6 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1810,20 +1810,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( PyObject *unicode, const char *errors ); - -PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocale( - const char *str, - const char *errors); - -PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize( - const char *str, - Py_ssize_t len, - const char *errors); - -PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale( - PyObject *unicode, - const char *errors - ); #endif /* --- File system encoding ---------------------------------------------- */ diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 6241967..e68c7c0 100644 --- 
a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -696,7 +696,7 @@ static int parse_isoformat_date(const char *dtstr, if (NULL == p) { return -1; } - + if (*(p++) != '-') { return -2; } diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index e364668..324b694 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -572,8 +572,9 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args) if (!PyArg_ParseTuple(args, "sz", &domain, &codeset)) return NULL; codeset = bind_textdomain_codeset(domain, codeset); - if (codeset) + if (codeset) { return PyUnicode_DecodeLocale(codeset, NULL); + } Py_RETURN_NONE; } #endif diff --git a/Modules/getpath.c b/Modules/getpath.c index 85e737b..e6a3e8e 100644 --- a/Modules/getpath.c +++ b/Modules/getpath.c @@ -449,8 +449,8 @@ search_for_exec_prefix(const _PyCoreConfig *core_config, n = fread(buf, 1, MAXPATHLEN, f); buf[n] = '\0'; fclose(f); - rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n, NULL); - if (rel_builddir_path != NULL) { + rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n); + if (rel_builddir_path) { wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN); exec_prefix[MAXPATHLEN] = L'\0'; joinpath(exec_prefix, rel_builddir_path); diff --git a/Modules/readline.c b/Modules/readline.c index caf661c..811fca8 100644 --- a/Modules/readline.c +++ b/Modules/readline.c @@ -132,13 +132,13 @@ static PyModuleDef readlinemodule; static PyObject * encode(PyObject *b) { - return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape"); + return PyUnicode_EncodeLocale(b, "surrogateescape"); } static PyObject * decode(const char *s) { - return _PyUnicode_DecodeCurrentLocale(s, "surrogateescape"); + return PyUnicode_DecodeLocale(s, "surrogateescape"); } diff --git a/Modules/timemodule.c b/Modules/timemodule.c index 4e7f9d9..b17ab5a 100644 --- a/Modules/timemodule.c +++ b/Modules/timemodule.c @@ -418,11 +418,11 @@ tmtotuple(struct tm *p SET(8, p->tm_isdst); #ifdef HAVE_STRUCT_TM_TM_ZONE 
PyStructSequence_SET_ITEM(v, 9, - _PyUnicode_DecodeCurrentLocale(p->tm_zone, "surrogateescape")); + PyUnicode_DecodeLocale(p->tm_zone, "surrogateescape")); SET(10, p->tm_gmtoff); #else PyStructSequence_SET_ITEM(v, 9, - _PyUnicode_DecodeCurrentLocale(zone, "surrogateescape")); + PyUnicode_DecodeLocale(zone, "surrogateescape")); PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff)); #endif /* HAVE_STRUCT_TM_TM_ZONE */ #undef SET @@ -809,8 +809,7 @@ time_strftime(PyObject *self, PyObject *args) #ifdef HAVE_WCSFTIME ret = PyUnicode_FromWideChar(outbuf, buflen); #else - ret = _PyUnicode_DecodeCurrentLocaleAndSize(outbuf, buflen, - "surrogateescape"); + ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape"); #endif PyMem_Free(outbuf); break; @@ -1541,8 +1540,8 @@ PyInit_timezone(PyObject *m) { PyModule_AddIntConstant(m, "altzone", timezone-3600); #endif PyModule_AddIntConstant(m, "daylight", daylight); - otz0 = _PyUnicode_DecodeCurrentLocale(tzname[0], "surrogateescape"); - otz1 = _PyUnicode_DecodeCurrentLocale(tzname[1], "surrogateescape"); + otz0 = PyUnicode_DecodeLocale(tzname[0], "surrogateescape"); + otz1 = PyUnicode_DecodeLocale(tzname[1], "surrogateescape"); PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1)); #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/ { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a6e02f4..0733011 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } -static size_t -wcstombs_errorpos(const wchar_t *wstr) -{ - size_t len; -#if SIZEOF_WCHAR_T == 2 - wchar_t buf[3]; -#else - wchar_t buf[2]; -#endif - char outbuf[MB_LEN_MAX]; - const wchar_t *start, *previous; - -#if SIZEOF_WCHAR_T == 2 - buf[2] = 0; -#else - buf[1] = 0; -#endif - start = wstr; - while (*wstr != L'\0') - { - previous = wstr; -#if SIZEOF_WCHAR_T == 2 - if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) - && 
Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) - { - buf[0] = wstr[0]; - buf[1] = wstr[1]; - wstr += 2; - } - else { - buf[0] = *wstr; - buf[1] = 0; - wstr++; - } -#else - buf[0] = *wstr; - wstr++; -#endif - len = wcstombs(outbuf, buf, sizeof(outbuf)); - if (len == (size_t)-1) - return previous - start; - } - - /* failed to find the unencodable character */ - return 0; -} - static int locale_error_handler(const char *errors, int *surrogateescape) { @@ -3396,131 +3349,61 @@ locale_error_handler(const char *errors, int *surrogateescape) } static PyObject * -unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) +unicode_encode_locale(PyObject *unicode, const char *errors, + int current_locale) { - Py_ssize_t wlen, wlen2; - wchar_t *wstr; - char *errmsg; - PyObject *bytes, *reason, *exc; - size_t error_pos, errlen; int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) return NULL; - wstr = PyUnicode_AsWideCharString(unicode, &wlen); - if (wstr == NULL) + Py_ssize_t wlen; + wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); + if (wstr == NULL) { return NULL; + } - wlen2 = wcslen(wstr); + Py_ssize_t wlen2 = wcslen(wstr); if (wlen2 != wlen) { PyMem_Free(wstr); PyErr_SetString(PyExc_ValueError, "embedded null character"); return NULL; } - if (surrogateescape) { - /* "surrogateescape" error handler */ - char *str; - - if (current_locale) { - str = _Py_EncodeCurrentLocale(wstr, &error_pos); - } - else { - str = Py_EncodeLocale(wstr, &error_pos); - } - if (str == NULL) { - if (error_pos == (size_t)-1) { - PyErr_NoMemory(); - PyMem_Free(wstr); - return NULL; - } - else { - goto encode_error; + char *str; + size_t error_pos; + const char *reason; + int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, + current_locale, surrogateescape); + if (res != 0) { + if (res == -2) { + PyObject *exc; + exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", + "locale", unicode, + (Py_ssize_t)error_pos, + 
(Py_ssize_t)(error_pos+1), + reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); } - } - PyMem_Free(wstr); - - bytes = PyBytes_FromString(str); - if (current_locale) { - PyMem_RawFree(str); + return NULL; } else { - PyMem_Free(str); - } - } - else { - /* strict mode */ - size_t len, len2; - - len = wcstombs(NULL, wstr, 0); - if (len == (size_t)-1) { - error_pos = (size_t)-1; - goto encode_error; - } - - bytes = PyBytes_FromStringAndSize(NULL, len); - if (bytes == NULL) { + PyErr_NoMemory(); PyMem_Free(wstr); return NULL; } - - len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); - if (len2 == (size_t)-1 || len2 > len) { - Py_DECREF(bytes); - error_pos = (size_t)-1; - goto encode_error; - } - PyMem_Free(wstr); } - return bytes; - -encode_error: - errmsg = strerror(errno); - assert(errmsg != NULL); - - if (error_pos == (size_t)-1) - error_pos = wcstombs_errorpos(wstr); - PyMem_Free(wstr); - wstr = Py_DecodeLocale(errmsg, &errlen); - if (wstr != NULL) { - reason = PyUnicode_FromWideChar(wstr, errlen); - PyMem_RawFree(wstr); - } else { - errmsg = NULL; - } - - if (errmsg == NULL) - reason = PyUnicode_FromString( - "wcstombs() encountered an unencodable " - "wide character"); - if (reason == NULL) - return NULL; - - exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", - "locale", unicode, - (Py_ssize_t)error_pos, - (Py_ssize_t)(error_pos+1), - reason); - Py_DECREF(reason); - if (exc != NULL) { - PyCodec_StrictErrors(exc); - Py_DECREF(exc); - } - return NULL; + PyObject *bytes = PyBytes_FromString(str); + PyMem_RawFree(str); + return bytes; } PyObject * PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) { - return unicode_encode_locale(unicode, errors, 0); -} - -PyObject * -_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors) -{ return unicode_encode_locale(unicode, errors, 1); } @@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode, return NULL; } -static size_t -mbstowcs_errorpos(const 
char *str, size_t len) -{ -#ifdef HAVE_MBRTOWC - const char *start = str; - mbstate_t mbs; - size_t converted; - wchar_t ch; - - memset(&mbs, 0, sizeof mbs); - while (len) - { - converted = mbrtowc(&ch, str, len, &mbs); - if (converted == 0) - /* Reached end of string */ - break; - if (converted == (size_t)-1 || converted == (size_t)-2) { - /* Conversion error or incomplete character */ - return str - start; - } - else { - str += converted; - len -= converted; - } - } - /* failed to find the undecodable byte sequence */ - return 0; -#endif - return 0; -} - static PyObject* unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, int current_locale) { - wchar_t smallbuf[256]; - size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); - wchar_t *wstr; - size_t wlen, wlen2; - PyObject *unicode; int surrogateescape; - size_t error_pos, errlen; - char *errmsg; - PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */ - if (locale_error_handler(errors, &surrogateescape) < 0) return NULL; @@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, return NULL; } - if (surrogateescape) { - /* "surrogateescape" error handler */ - if (current_locale) { - wstr = _Py_DecodeCurrentLocale(str, &wlen); + wchar_t *wstr; + size_t wlen; + const char *reason; + int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, + current_locale, surrogateescape); + if (res != 0) { + if (res == -2) { + PyObject *exc; + exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", + "locale", str, len, + (Py_ssize_t)wlen, + (Py_ssize_t)(wlen + 1), + reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } } else { - wstr = Py_DecodeLocale(str, &wlen); - } - if (wstr == NULL) { - if (wlen == (size_t)-1) - PyErr_NoMemory(); - else - PyErr_SetFromErrno(PyExc_OSError); - return NULL; + PyErr_NoMemory(); } - - unicode = PyUnicode_FromWideChar(wstr, wlen); - PyMem_RawFree(wstr); + return NULL; } - else { - /* 
strict mode */ -#ifndef HAVE_BROKEN_MBSTOWCS - wlen = mbstowcs(NULL, str, 0); -#else - wlen = len; -#endif - if (wlen == (size_t)-1) - goto decode_error; - if (wlen+1 <= smallbuf_len) { - wstr = smallbuf; - } - else { - wstr = PyMem_New(wchar_t, wlen+1); - if (!wstr) - return PyErr_NoMemory(); - } - wlen2 = mbstowcs(wstr, str, wlen+1); - if (wlen2 == (size_t)-1) { - if (wstr != smallbuf) - PyMem_Free(wstr); - goto decode_error; - } -#ifdef HAVE_BROKEN_MBSTOWCS - assert(wlen2 == wlen); -#endif - unicode = PyUnicode_FromWideChar(wstr, wlen2); - if (wstr != smallbuf) - PyMem_Free(wstr); - } + PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen); + PyMem_RawFree(wstr); return unicode; - -decode_error: - errmsg = strerror(errno); - assert(errmsg != NULL); - - error_pos = mbstowcs_errorpos(str, len); - wstr = Py_DecodeLocale(errmsg, &errlen); - if (wstr != NULL) { - reason = PyUnicode_FromWideChar(wstr, errlen); - PyMem_RawFree(wstr); - } - - if (reason == NULL) - reason = PyUnicode_FromString( - "mbstowcs() encountered an invalid multibyte sequence"); - if (reason == NULL) - return NULL; - - exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", - "locale", str, len, - (Py_ssize_t)error_pos, - (Py_ssize_t)(error_pos+1), - reason); - Py_DECREF(reason); - if (exc != NULL) { - PyCodec_StrictErrors(exc); - Py_DECREF(exc); - } - return NULL; } PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, const char *errors) { - return unicode_decode_locale(str, len, errors, 0); -} - -PyObject* -_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len, - const char *errors) -{ return unicode_decode_locale(str, len, errors, 1); } PyObject* -_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors) -{ - return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1); -} - -PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) { Py_ssize_t size = (Py_ssize_t)strlen(str); - return unicode_decode_locale(str, size, 
errors, 0); + return unicode_decode_locale(str, size, errors, 1); } @@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) Py_FileSystemDefaultEncodeErrors); } else { - return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); + return unicode_decode_locale(s, size, + Py_FileSystemDefaultEncodeErrors, 0); } #endif } @@ -5128,17 +4906,23 @@ onError: } -/* UTF-8 decoder using the surrogateescape error handler . +/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is + non-zero, use strict error handler otherwise. - On success, return a pointer to a newly allocated wide character string (use - PyMem_RawFree() to free the memory) and write the output length (in number - of wchar_t units) into *p_wlen (if p_wlen is set). + On success, write a pointer to a newly allocated wide character string into + *wstr (use PyMem_RawFree() to free the memory) and write the output length + (in number of wchar_t units) into *wlen (if wlen is set). - On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen - (if p_wlen is set). */ -wchar_t* -_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) + On memory allocation failure, return -1. + + On decoding error (if surrogateescape is zero), return -2. If wlen is + non-NULL, write the start of the illegal byte sequence into *wlen. If reason + is not NULL, write the decoding error message into *reason. 
*/ +int +_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, + const char **reason, int surrogateescape) { + const char *orig_s = s; const char *e; wchar_t *unicode; Py_ssize_t outpos; @@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) /* Note: size will always be longer than the resulting Unicode character count */ if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { - if (p_wlen) { - *p_wlen = (size_t)-1; - } - return NULL; + return -1; } unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); if (!unicode) { - if (p_wlen) { - *p_wlen = (size_t)-1; - } - return NULL; + return -1; } /* Unpack UTF-8 encoded data */ @@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) Py_UNREACHABLE(); #else assert(ch > 0xFFFF && ch <= MAX_UNICODE); - /* compute and append the two surrogates: */ + /* write a surrogate pair */ unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); #endif @@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) else { if (!ch && s == e) break; + if (!surrogateescape) { + PyMem_RawFree(unicode ); + if (reason != NULL) { + switch (ch) { + case 0: + *reason = "unexpected end of data"; + break; + case 1: + *reason = "invalid start byte"; + break; + /* 2, 3, 4 */ + default: + *reason = "invalid continuation byte"; + break; + } + } + if (wlen != NULL) { + *wlen = s - orig_s; + } + return -2; + } /* surrogateescape */ unicode[outpos++] = 0xDC00 + (unsigned char)*s++; } } unicode[outpos] = L'\0'; - if (p_wlen) { - *p_wlen = outpos; + if (wlen) { + *wlen = outpos; } - return unicode; + *wstr = unicode; + return 0; +} + +wchar_t* +_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen) +{ + wchar_t *wstr; + int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1); + if (res != 0) { + return NULL; + } 
+ return wstr; } /* UTF-8 encoder using the surrogateescape error handler . - On success, return a pointer to a newly allocated character string (use - PyMem_Free() to free the memory). + On success, return 0 and write the newly allocated character string (use + PyMem_Free() to free the memory) into *str. - On encoding failure, return NULL and write the position of the invalid - surrogate character into *error_pos (if error_pos is set). + On encoding failure, return -2 and write the position of the invalid + surrogate character into *error_pos (if error_pos is set) and the decoding + error message into *reason (if reason is set). - On memory allocation failure, return NULL and write (size_t)-1 into - *error_pos (if error_pos is set). */ -char* -_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, - int raw_malloc) + On memory allocation failure, return -1. */ +int +_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, + const char **reason, int raw_malloc, int surrogateescape) { const Py_ssize_t max_char_size = 4; Py_ssize_t len = wcslen(text); assert(len >= 0); + if (len > PY_SSIZE_T_MAX / max_char_size - 1) { + return -1; + } char *bytes; - if (len <= PY_SSIZE_T_MAX / max_char_size - 1) { - if (raw_malloc) { - bytes = PyMem_RawMalloc((len + 1) * max_char_size); - } - else { - bytes = PyMem_Malloc((len + 1) * max_char_size); - } + if (raw_malloc) { + bytes = PyMem_RawMalloc((len + 1) * max_char_size); } else { - bytes = NULL; + bytes = PyMem_Malloc((len + 1) * max_char_size); } if (bytes == NULL) { - if (error_pos != NULL) { - *error_pos = (size_t)-1; - } - return NULL; + return -1; } char *p = bytes; Py_ssize_t i; - for (i = 0; i < len;) { - Py_UCS4 ch = text[i++]; + for (i = 0; i < len; i++) { + Py_UCS4 ch = text[i]; if (ch < 0x80) { /* Encode ASCII */ @@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, } else if (Py_UNICODE_IS_SURROGATE(ch)) { /* surrogateescape error handler */ - if 
(!(0xDC80 <= ch && ch <= 0xDCFF)) { + if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { if (error_pos != NULL) { - *error_pos = (size_t)i - 1; + *error_pos = (size_t)i; } - goto error; + if (reason != NULL) { + *reason = "encoding error"; + } + if (raw_malloc) { + PyMem_RawFree(bytes); + } + else { + PyMem_Free(bytes); + } + return -2; } *p++ = (char)(ch & 0xff); } @@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, if (error_pos != NULL) { *error_pos = (size_t)-1; } - goto error; - } - return bytes2; - - error: - if (raw_malloc) { - PyMem_RawFree(bytes); - } - else { - PyMem_Free(bytes); + if (raw_malloc) { + PyMem_RawFree(bytes); + } + else { + PyMem_Free(bytes); + } + return -1; } - return NULL; + *str = bytes2; + return 0; } diff --git a/Python/fileutils.c b/Python/fileutils.c index 9275494..a50075e 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -20,9 +20,6 @@ extern int winerror_to_errno(int); #include #endif /* HAVE_FCNTL_H */ -extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text, - size_t *error_pos, int raw_malloc); - #ifdef O_CLOEXEC /* Does open() support the O_CLOEXEC flag? Possible values: @@ -69,7 +66,10 @@ _Py_device_encoding(int fd) Py_RETURN_NONE; } -#if !defined(__APPLE__) && !defined(MS_WINDOWS) +#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) + +#define USE_FORCE_ASCII + extern int _Py_normalize_encoding(const char *, char *, size_t); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. 
@@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t); 1: the workaround is used: Py_EncodeLocale() uses encode_ascii_surrogateescape() and Py_DecodeLocale() uses - decode_ascii_surrogateescape() + decode_ascii() 0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and Py_DecodeLocale() uses mbstowcs() -1: unknown, need to call check_force_ascii() to get the value @@ -180,16 +180,15 @@ error: return 1; } -static char* -encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc) +static int +encode_ascii(const wchar_t *text, char **str, + size_t *error_pos, const char **reason, + int raw_malloc, int surrogateescape) { char *result = NULL, *out; size_t len, i; wchar_t ch; - if (error_pos != NULL) - *error_pos = (size_t)-1; - len = wcslen(text); /* +1 for NULL byte */ @@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal else { result = PyMem_Malloc(len + 1); } - if (result == NULL) - return NULL; + if (result == NULL) { + return -1; + } out = result; for (i=0; i PY_SSIZE_T_MAX/sizeof(wchar_t)) - return NULL; - res = PyMem_RawMalloc(argsize*sizeof(wchar_t)); - if (!res) - return NULL; + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { + return -1; + } + res = PyMem_RawMalloc(argsize * sizeof(wchar_t)); + if (!res) { + return -1; + } - in = (unsigned char*)arg; out = res; - while(*in) - if(*in < 128) - *out++ = *in++; - else - *out++ = 0xdc00 + *in++; + for (in = (unsigned char*)arg; *in; in++) { + unsigned char ch = *in; + if (ch < 128) { + *out++ = ch; + } + else { + if (!surrogateescape) { + PyMem_RawFree(res); + if (wlen) { + *wlen = in - (unsigned char*)arg; + } + if (reason) { + *reason = "decoding error"; + } + return -2; + } + *out++ = 0xdc00 + ch; + } + } *out = 0; - if (size != NULL) - *size = out - res; - return res; + + if (wlen != NULL) { + *wlen = out - res; + } + *wstr = res; + return 0; } -#endif +#endif /* !HAVE_MBRTOWC */ -#if 
!defined(__APPLE__) && !defined(__ANDROID__) -static wchar_t* -decode_current_locale(const char* arg, size_t *size) +static int +decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, + const char **reason, int surrogateescape) { wchar_t *res; size_t argsize; @@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size) argsize = mbstowcs(NULL, arg, 0); #endif if (argsize != (size_t)-1) { - if (argsize == PY_SSIZE_T_MAX) - goto oom; - argsize += 1; - if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) - goto oom; - res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t)); - if (!res) - goto oom; - count = mbstowcs(res, arg, argsize); + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { + return -1; + } + res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t)); + if (!res) { + return -1; + } + + count = mbstowcs(res, arg, argsize + 1); if (count != (size_t)-1) { wchar_t *tmp; /* Only use the result if it contains no @@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size) !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) ; if (*tmp == 0) { - if (size != NULL) - *size = count; - return res; + if (wlen != NULL) { + *wlen = count; + } + *wstr = res; + return 0; } } PyMem_RawFree(res); } + /* Conversion failed. Fall back to escaping with surrogateescape. */ #ifdef HAVE_MBRTOWC /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */ @@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size) /* Overallocate; as multi-byte characters are in the argument, the actual output could use less memory. 
*/ argsize = strlen(arg) + 1; - if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) - goto oom; - res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t)); - if (!res) - goto oom; + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { + return -1; + } + res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t)); + if (!res) { + return -1; + } + in = (unsigned char*)arg; out = res; memset(&mbs, 0, sizeof mbs); while (argsize) { size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); - if (converted == 0) + if (converted == 0) { /* Reached end of string; null char stored. */ break; + } + if (converted == (size_t)-2) { /* Incomplete character. This should never happen, since we provide everything that we have - unless there is a bug in the C library, or I misunderstood how mbrtowc works. */ - PyMem_RawFree(res); - if (size != NULL) - *size = (size_t)-2; - return NULL; + goto decode_error; } + if (converted == (size_t)-1) { + if (!surrogateescape) { + goto decode_error; + } + /* Conversion error. Escape as UTF-8b, and start over in the initial shift state. */ *out++ = 0xdc00 + *in++; @@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size) memset(&mbs, 0, sizeof mbs); continue; } + if (Py_UNICODE_IS_SURROGATE(*out)) { + if (!surrogateescape) { + goto decode_error; + } + /* Surrogate character. Escape the original byte sequence with surrogateescape. 
*/ argsize -= converted; - while (converted--) + while (converted--) { *out++ = 0xdc00 + *in++; + } continue; } /* successfully converted some bytes */ @@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size) argsize -= converted; out++; } - if (size != NULL) - *size = out - res; + if (wlen != NULL) { + *wlen = out - res; + } + *wstr = res; + return 0; + +decode_error: + PyMem_RawFree(res); + if (wlen) { + *wlen = in - (unsigned char*)arg; + } + if (reason) { + *reason = "decoding error"; + } + return -2; #else /* HAVE_MBRTOWC */ /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ - res = decode_ascii_surrogateescape(arg, size); - if (res == NULL) - goto oom; + return decode_ascii(arg, wstr, wlen, reason, surrogateescape); #endif /* HAVE_MBRTOWC */ - return res; - -oom: - if (size != NULL) { - *size = (size_t)-1; - } - return NULL; } -#endif -static wchar_t* -decode_locale(const char* arg, size_t *size, int ignore_utf8_mode) +/* Decode a byte string from the locale encoding. + + Use the strict error handler if 'surrogateescape' is zero. Use the + surrogateescape error handler if 'surrogateescape' is non-zero: undecodable + bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence + can be decoded as a surrogate character, escape the bytes using the + surrogateescape error handler instead of decoding them. + + On success, return 0 and write the newly allocated wide character string into + *wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write + the number of wide characters excluding the null character into *wlen. + + On memory allocation failure, return -1. + + On decoding error, return -2. If wlen is not NULL, write the start of + invalid byte sequence in the input string into *wlen. If reason is not NULL, + write the decoding error message into *reason. 
+ + Use the _Py_EncodeLocaleEx() function to encode the character string back to + a byte string. */ +int +_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, + const char **reason, + int current_locale, int surrogateescape) { + if (current_locale) { + return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + } + #if defined(__APPLE__) || defined(__ANDROID__) - return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + surrogateescape); #else - if (!ignore_utf8_mode && Py_UTF8Mode == 1) { - return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + if (Py_UTF8Mode == 1) { + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + surrogateescape); } -#ifndef MS_WINDOWS - if (force_ascii == -1) +#ifdef USE_FORCE_ASCII + if (force_ascii == -1) { force_ascii = check_force_ascii(); + } if (force_ascii) { /* force ASCII encoding to workaround mbstowcs() issue */ - wchar_t *wstr = decode_ascii_surrogateescape(arg, size); - if (wstr == NULL) { - if (size != NULL) { - *size = (size_t)-1; - } - return NULL; - } - return wstr; + return decode_ascii(arg, wstr, wlen, reason, surrogateescape); } #endif - return decode_current_locale(arg, size); + return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); #endif /* __APPLE__ or __ANDROID__ */ } @@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode) Use the Py_EncodeLocale() function to encode the character string back to a byte string. 
*/ wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +Py_DecodeLocale(const char* arg, size_t *wlen) { - return decode_locale(arg, size, 0); -} - - -/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */ -wchar_t* -_Py_DecodeCurrentLocale(const char* arg, size_t *size) -{ - return decode_locale(arg, size, 1); + wchar_t *wstr; + int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1); + if (res != 0) { + if (wlen != NULL) { + *wlen = (size_t)res; + } + return NULL; + } + return wstr; } -#if !defined(__APPLE__) && !defined(__ANDROID__) -static char* -encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) +static int +encode_current_locale(const wchar_t *text, char **str, + size_t *error_pos, const char **reason, + int raw_malloc, int surrogateescape) { const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; @@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) for (i=0; i < len; i++) { c = text[i]; if (c >= 0xdc80 && c <= 0xdcff) { + if (!surrogateescape) { + goto encode_error; + } /* UTF-8b surrogate */ if (bytes != NULL) { *bytes++ = c - 0xdc00; size--; } - else + else { size++; + } continue; } else { buf[0] = c; - if (bytes != NULL) + if (bytes != NULL) { converted = wcstombs(bytes, buf, size); - else + } + else { converted = wcstombs(NULL, buf, 0); + } if (converted == (size_t)-1) { - if (raw_malloc) { - PyMem_RawFree(result); - } - else { - PyMem_Free(result); - } - if (error_pos != NULL) - *error_pos = i; - return NULL; + goto encode_error; } if (bytes != NULL) { bytes += converted; size -= converted; } - else + else { size += converted; + } } } if (result != NULL) { @@ -511,40 +576,80 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) result = PyMem_Malloc(size); } if (result == NULL) { - if (error_pos != NULL) { - *error_pos = (size_t)-1; - } - return NULL; + return -1; } bytes = result; } - return result; + *str = result; + return 
0; + +encode_error: + if (raw_malloc) { + PyMem_RawFree(result); + } + else { + PyMem_Free(result); + } + if (error_pos != NULL) { + *error_pos = i; + } + if (reason) { + *reason = "encoding error"; + } + return -2; } -#endif -static char* -encode_locale(const wchar_t *text, size_t *error_pos, - int raw_malloc, int ignore_utf8_mode) +static int +encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, + const char **reason, + int raw_malloc, int current_locale, int surrogateescape) { + if (current_locale) { + return encode_current_locale(text, str, error_pos, reason, + raw_malloc, surrogateescape); + } + #if defined(__APPLE__) || defined(__ANDROID__) - return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); + return _Py_EncodeUTF8Ex(text, str, error_pos, reason, + raw_malloc, surrogateescape); #else /* __APPLE__ */ - if (!ignore_utf8_mode && Py_UTF8Mode == 1) { - return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); + if (Py_UTF8Mode == 1) { + return _Py_EncodeUTF8Ex(text, str, error_pos, reason, + raw_malloc, surrogateescape); } -#ifndef MS_WINDOWS - if (force_ascii == -1) +#ifdef USE_FORCE_ASCII + if (force_ascii == -1) { force_ascii = check_force_ascii(); + } - if (force_ascii) - return encode_ascii_surrogateescape(text, error_pos, raw_malloc); + if (force_ascii) { + return encode_ascii(text, str, error_pos, reason, + raw_malloc, surrogateescape); + } #endif - return encode_current_locale(text, error_pos, raw_malloc); + return encode_current_locale(text, str, error_pos, reason, + raw_malloc, surrogateescape); #endif /* __APPLE__ or __ANDROID__ */ } +static char* +encode_locale(const wchar_t *text, size_t *error_pos, + int raw_malloc, int current_locale) +{ + char *str; + int res = encode_locale_ex(text, &str, error_pos, NULL, + raw_malloc, current_locale, 1); + if (res != -2 && error_pos) { + *error_pos = (size_t)-1; + } + if (res != 0) { + return NULL; + } + return str; +} + /* Encode a wide character string to the locale 
encoding with the surrogateescape error handler: surrogate characters in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. @@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) } -/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */ -char* -_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos) +int +_Py_EncodeLocaleEx(const wchar_t *text, char **str, + size_t *error_pos, const char **reason, + int current_locale, int surrogateescape) { - return encode_locale(text, error_pos, 1, 1); + return encode_locale_ex(text, str, error_pos, reason, 1, + current_locale, surrogateescape); } diff --git a/Python/pathconfig.c b/Python/pathconfig.c index 9591fcc..7ebd69b 100644 --- a/Python/pathconfig.c +++ b/Python/pathconfig.c @@ -382,8 +382,8 @@ _Py_FindEnvConfigValue(FILE *env_file, const wchar_t *key, /* Comment - skip */ continue; } - tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n, NULL); - if (tmpbuffer != NULL) { + tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n); + if (tmpbuffer) { wchar_t * state; wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state); if ((tok != NULL) && !wcscmp(tok, key)) { -- cgit v0.12