summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2018-01-15 09:45:49 (GMT)
committerGitHub <noreply@github.com>2018-01-15 09:45:49 (GMT)
commit7ed7aead9503102d2ed316175f198104e0cd674c (patch)
tree0b70b3b7d2eed5ea92552c1b93953d0333f5a869
parentee3b83547c6b0cac1da2cb44aaaea533a1d1bbc8 (diff)
downloadcpython-7ed7aead9503102d2ed316175f198104e0cd674c.zip
cpython-7ed7aead9503102d2ed316175f198104e0cd674c.tar.gz
cpython-7ed7aead9503102d2ed316175f198104e0cd674c.tar.bz2
bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)
Modify locale.localeconv(), time.tzname, os.strerror() and other functions to ignore the UTF-8 Mode: always use the current locale encoding. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or encoding error, they return the position of the error and an error message which are used to raise Unicode errors in PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale(). * Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx(). * PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all cases, especially for the strict error handler. * Add _Py_DecodeUTF8Ex(): returns more information on decoding errors and supports the strict error handler. * Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex(). * Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx(). * Ignore the UTF-8 mode to encode/decode localeconv(), strerror() and time zone name. * PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now ignore the UTF-8 mode: always use the "current" locale. * Remove _PyUnicode_DecodeCurrentLocale(), _PyUnicode_DecodeCurrentLocaleAndSize() and _PyUnicode_EncodeCurrentLocale().
-rw-r--r--Doc/c-api/sys.rst22
-rw-r--r--Doc/c-api/unicode.rst16
-rw-r--r--Include/fileutils.h37
-rw-r--r--Include/unicodeobject.h14
-rw-r--r--Modules/_datetimemodule.c2
-rw-r--r--Modules/_localemodule.c3
-rw-r--r--Modules/getpath.c4
-rw-r--r--Modules/readline.c4
-rw-r--r--Modules/timemodule.c11
-rw-r--r--Objects/unicodeobject.c475
-rw-r--r--Python/fileutils.c385
-rw-r--r--Python/pathconfig.c4
12 files changed, 472 insertions, 505 deletions
diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst
index 20bc7bd..e4da96c 100644
--- a/Doc/c-api/sys.rst
+++ b/Doc/c-api/sys.rst
@@ -106,6 +106,16 @@ Operating System Utilities
surrogate character, escape the bytes using the surrogateescape error
handler instead of decoding them.
+ Encoding, highest priority to lowest priority:
+
+ * ``UTF-8`` on macOS and Android;
+ * ``UTF-8`` if the Python UTF-8 mode is enabled;
+ * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+ ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+ and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+ ``ISO-8859-1`` encoding.
+ * the current locale encoding.
+
Return a pointer to a newly allocated wide character string, use
:c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
the number of wide characters excluding the null character into ``*size``
@@ -137,6 +147,18 @@ Operating System Utilities
:ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
+ Encoding, highest priority to lowest priority:
+
+ * ``UTF-8`` on macOS and Android;
+ * ``UTF-8`` if the Python UTF-8 mode is enabled;
+ * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+ ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+ and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+ ``ISO-8859-1`` encoding.
+ * the current locale encoding.
+
+ The function uses the UTF-8 encoding in the Python UTF-8 mode.
+
Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
to free the memory. Return ``NULL`` on encoding error or memory allocation
error
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 45aff1b..3f6c055 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -770,12 +770,20 @@ system.
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup).
+ This function ignores the Python UTF-8 mode.
+
.. seealso::
The :c:func:`Py_DecodeLocale` function.
.. versionadded:: 3.3
+ .. versionchanged:: 3.7
+ The function now also uses the current locale encoding for the
+ ``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
+ was used for the ``surrogateescape``, and the current locale encoding was
+ used for ``strict``.
+
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
@@ -797,12 +805,20 @@ system.
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup).
+ This function ignores the Python UTF-8 mode.
+
.. seealso::
The :c:func:`Py_EncodeLocale` function.
.. versionadded:: 3.3
+ .. versionchanged:: 3.7
+ The function now also uses the current locale encoding for the
+ ``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
+ was used for the ``surrogateescape``, and the current locale encoding was
+ used for ``strict``.
+
File System Encoding
""""""""""""""""""""
diff --git a/Include/fileutils.h b/Include/fileutils.h
index 2527d84..b4f8b11 100644
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@@ -20,18 +20,41 @@ PyAPI_FUNC(char*) _Py_EncodeLocaleRaw(
#endif
#ifdef Py_BUILD_CORE
+PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
+ const char *arg,
+ Py_ssize_t arglen,
+ wchar_t **wstr,
+ size_t *wlen,
+ const char **reason,
+ int surrogateescape);
+
+PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
+ const wchar_t *text,
+ char **str,
+ size_t *error_pos,
+ const char **reason,
+ int raw_malloc,
+ int surrogateescape);
+
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
- const char *s,
- Py_ssize_t size,
- size_t *p_wlen);
+ const char *arg,
+ Py_ssize_t arglen);
-PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale(
+PyAPI_FUNC(int) _Py_DecodeLocaleEx(
const char *arg,
- size_t *size);
+ wchar_t **wstr,
+ size_t *wlen,
+ const char **reason,
+ int current_locale,
+ int surrogateescape);
-PyAPI_FUNC(char*) _Py_EncodeCurrentLocale(
+PyAPI_FUNC(int) _Py_EncodeLocaleEx(
const wchar_t *text,
- size_t *error_pos);
+ char **str,
+ size_t *error_pos,
+ const char **reason,
+ int current_locale,
+ int surrogateescape);
#endif
#ifndef Py_LIMITED_API
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index d263026..0274de6 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1810,20 +1810,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
PyObject *unicode,
const char *errors
);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocale(
- const char *str,
- const char *errors);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize(
- const char *str,
- Py_ssize_t len,
- const char *errors);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale(
- PyObject *unicode,
- const char *errors
- );
#endif
/* --- File system encoding ---------------------------------------------- */
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 6241967..e68c7c0 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -696,7 +696,7 @@ static int parse_isoformat_date(const char *dtstr,
if (NULL == p) {
return -1;
}
-
+
if (*(p++) != '-') {
return -2;
}
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c
index e364668..324b694 100644
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -572,8 +572,9 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
if (!PyArg_ParseTuple(args, "sz", &domain, &codeset))
return NULL;
codeset = bind_textdomain_codeset(domain, codeset);
- if (codeset)
+ if (codeset) {
return PyUnicode_DecodeLocale(codeset, NULL);
+ }
Py_RETURN_NONE;
}
#endif
diff --git a/Modules/getpath.c b/Modules/getpath.c
index 85e737b..e6a3e8e 100644
--- a/Modules/getpath.c
+++ b/Modules/getpath.c
@@ -449,8 +449,8 @@ search_for_exec_prefix(const _PyCoreConfig *core_config,
n = fread(buf, 1, MAXPATHLEN, f);
buf[n] = '\0';
fclose(f);
- rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n, NULL);
- if (rel_builddir_path != NULL) {
+ rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n);
+ if (rel_builddir_path) {
wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN);
exec_prefix[MAXPATHLEN] = L'\0';
joinpath(exec_prefix, rel_builddir_path);
diff --git a/Modules/readline.c b/Modules/readline.c
index caf661c..811fca8 100644
--- a/Modules/readline.c
+++ b/Modules/readline.c
@@ -132,13 +132,13 @@ static PyModuleDef readlinemodule;
static PyObject *
encode(PyObject *b)
{
- return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape");
+ return PyUnicode_EncodeLocale(b, "surrogateescape");
}
static PyObject *
decode(const char *s)
{
- return _PyUnicode_DecodeCurrentLocale(s, "surrogateescape");
+ return PyUnicode_DecodeLocale(s, "surrogateescape");
}
diff --git a/Modules/timemodule.c b/Modules/timemodule.c
index 4e7f9d9..b17ab5a 100644
--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@@ -418,11 +418,11 @@ tmtotuple(struct tm *p
SET(8, p->tm_isdst);
#ifdef HAVE_STRUCT_TM_TM_ZONE
PyStructSequence_SET_ITEM(v, 9,
- _PyUnicode_DecodeCurrentLocale(p->tm_zone, "surrogateescape"));
+ PyUnicode_DecodeLocale(p->tm_zone, "surrogateescape"));
SET(10, p->tm_gmtoff);
#else
PyStructSequence_SET_ITEM(v, 9,
- _PyUnicode_DecodeCurrentLocale(zone, "surrogateescape"));
+ PyUnicode_DecodeLocale(zone, "surrogateescape"));
PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff));
#endif /* HAVE_STRUCT_TM_TM_ZONE */
#undef SET
@@ -809,8 +809,7 @@ time_strftime(PyObject *self, PyObject *args)
#ifdef HAVE_WCSFTIME
ret = PyUnicode_FromWideChar(outbuf, buflen);
#else
- ret = _PyUnicode_DecodeCurrentLocaleAndSize(outbuf, buflen,
- "surrogateescape");
+ ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape");
#endif
PyMem_Free(outbuf);
break;
@@ -1541,8 +1540,8 @@ PyInit_timezone(PyObject *m) {
PyModule_AddIntConstant(m, "altzone", timezone-3600);
#endif
PyModule_AddIntConstant(m, "daylight", daylight);
- otz0 = _PyUnicode_DecodeCurrentLocale(tzname[0], "surrogateescape");
- otz1 = _PyUnicode_DecodeCurrentLocale(tzname[1], "surrogateescape");
+ otz0 = PyUnicode_DecodeLocale(tzname[0], "surrogateescape");
+ otz1 = PyUnicode_DecodeLocale(tzname[1], "surrogateescape");
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
{
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index a6e02f4..0733011 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
return NULL;
}
-static size_t
-wcstombs_errorpos(const wchar_t *wstr)
-{
- size_t len;
-#if SIZEOF_WCHAR_T == 2
- wchar_t buf[3];
-#else
- wchar_t buf[2];
-#endif
- char outbuf[MB_LEN_MAX];
- const wchar_t *start, *previous;
-
-#if SIZEOF_WCHAR_T == 2
- buf[2] = 0;
-#else
- buf[1] = 0;
-#endif
- start = wstr;
- while (*wstr != L'\0')
- {
- previous = wstr;
-#if SIZEOF_WCHAR_T == 2
- if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
- && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
- {
- buf[0] = wstr[0];
- buf[1] = wstr[1];
- wstr += 2;
- }
- else {
- buf[0] = *wstr;
- buf[1] = 0;
- wstr++;
- }
-#else
- buf[0] = *wstr;
- wstr++;
-#endif
- len = wcstombs(outbuf, buf, sizeof(outbuf));
- if (len == (size_t)-1)
- return previous - start;
- }
-
- /* failed to find the unencodable character */
- return 0;
-}
-
static int
locale_error_handler(const char *errors, int *surrogateescape)
{
@@ -3396,131 +3349,61 @@ locale_error_handler(const char *errors, int *surrogateescape)
}
static PyObject *
-unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale)
+unicode_encode_locale(PyObject *unicode, const char *errors,
+ int current_locale)
{
- Py_ssize_t wlen, wlen2;
- wchar_t *wstr;
- char *errmsg;
- PyObject *bytes, *reason, *exc;
- size_t error_pos, errlen;
int surrogateescape;
-
if (locale_error_handler(errors, &surrogateescape) < 0)
return NULL;
- wstr = PyUnicode_AsWideCharString(unicode, &wlen);
- if (wstr == NULL)
+ Py_ssize_t wlen;
+ wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+ if (wstr == NULL) {
return NULL;
+ }
- wlen2 = wcslen(wstr);
+ Py_ssize_t wlen2 = wcslen(wstr);
if (wlen2 != wlen) {
PyMem_Free(wstr);
PyErr_SetString(PyExc_ValueError, "embedded null character");
return NULL;
}
- if (surrogateescape) {
- /* "surrogateescape" error handler */
- char *str;
-
- if (current_locale) {
- str = _Py_EncodeCurrentLocale(wstr, &error_pos);
- }
- else {
- str = Py_EncodeLocale(wstr, &error_pos);
- }
- if (str == NULL) {
- if (error_pos == (size_t)-1) {
- PyErr_NoMemory();
- PyMem_Free(wstr);
- return NULL;
- }
- else {
- goto encode_error;
+ char *str;
+ size_t error_pos;
+ const char *reason;
+ int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
+ current_locale, surrogateescape);
+ if (res != 0) {
+ if (res == -2) {
+ PyObject *exc;
+ exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
+ "locale", unicode,
+ (Py_ssize_t)error_pos,
+ (Py_ssize_t)(error_pos+1),
+ reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_DECREF(exc);
}
- }
- PyMem_Free(wstr);
-
- bytes = PyBytes_FromString(str);
- if (current_locale) {
- PyMem_RawFree(str);
+ return NULL;
}
else {
- PyMem_Free(str);
- }
- }
- else {
- /* strict mode */
- size_t len, len2;
-
- len = wcstombs(NULL, wstr, 0);
- if (len == (size_t)-1) {
- error_pos = (size_t)-1;
- goto encode_error;
- }
-
- bytes = PyBytes_FromStringAndSize(NULL, len);
- if (bytes == NULL) {
+ PyErr_NoMemory();
PyMem_Free(wstr);
return NULL;
}
-
- len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
- if (len2 == (size_t)-1 || len2 > len) {
- Py_DECREF(bytes);
- error_pos = (size_t)-1;
- goto encode_error;
- }
- PyMem_Free(wstr);
}
- return bytes;
-
-encode_error:
- errmsg = strerror(errno);
- assert(errmsg != NULL);
-
- if (error_pos == (size_t)-1)
- error_pos = wcstombs_errorpos(wstr);
-
PyMem_Free(wstr);
- wstr = Py_DecodeLocale(errmsg, &errlen);
- if (wstr != NULL) {
- reason = PyUnicode_FromWideChar(wstr, errlen);
- PyMem_RawFree(wstr);
- } else {
- errmsg = NULL;
- }
-
- if (errmsg == NULL)
- reason = PyUnicode_FromString(
- "wcstombs() encountered an unencodable "
- "wide character");
- if (reason == NULL)
- return NULL;
-
- exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
- "locale", unicode,
- (Py_ssize_t)error_pos,
- (Py_ssize_t)(error_pos+1),
- reason);
- Py_DECREF(reason);
- if (exc != NULL) {
- PyCodec_StrictErrors(exc);
- Py_DECREF(exc);
- }
- return NULL;
+ PyObject *bytes = PyBytes_FromString(str);
+ PyMem_RawFree(str);
+ return bytes;
}
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
- return unicode_encode_locale(unicode, errors, 0);
-}
-
-PyObject *
-_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
-{
return unicode_encode_locale(unicode, errors, 1);
}
@@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
return NULL;
}
-static size_t
-mbstowcs_errorpos(const char *str, size_t len)
-{
-#ifdef HAVE_MBRTOWC
- const char *start = str;
- mbstate_t mbs;
- size_t converted;
- wchar_t ch;
-
- memset(&mbs, 0, sizeof mbs);
- while (len)
- {
- converted = mbrtowc(&ch, str, len, &mbs);
- if (converted == 0)
- /* Reached end of string */
- break;
- if (converted == (size_t)-1 || converted == (size_t)-2) {
- /* Conversion error or incomplete character */
- return str - start;
- }
- else {
- str += converted;
- len -= converted;
- }
- }
- /* failed to find the undecodable byte sequence */
- return 0;
-#endif
- return 0;
-}
-
static PyObject*
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
int current_locale)
{
- wchar_t smallbuf[256];
- size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
- wchar_t *wstr;
- size_t wlen, wlen2;
- PyObject *unicode;
int surrogateescape;
- size_t error_pos, errlen;
- char *errmsg;
- PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
-
if (locale_error_handler(errors, &surrogateescape) < 0)
return NULL;
@@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
return NULL;
}
- if (surrogateescape) {
- /* "surrogateescape" error handler */
- if (current_locale) {
- wstr = _Py_DecodeCurrentLocale(str, &wlen);
+ wchar_t *wstr;
+ size_t wlen;
+ const char *reason;
+ int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
+ current_locale, surrogateescape);
+ if (res != 0) {
+ if (res == -2) {
+ PyObject *exc;
+ exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
+ "locale", str, len,
+ (Py_ssize_t)wlen,
+ (Py_ssize_t)(wlen + 1),
+ reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_DECREF(exc);
+ }
}
else {
- wstr = Py_DecodeLocale(str, &wlen);
- }
- if (wstr == NULL) {
- if (wlen == (size_t)-1)
- PyErr_NoMemory();
- else
- PyErr_SetFromErrno(PyExc_OSError);
- return NULL;
+ PyErr_NoMemory();
}
-
- unicode = PyUnicode_FromWideChar(wstr, wlen);
- PyMem_RawFree(wstr);
+ return NULL;
}
- else {
- /* strict mode */
-#ifndef HAVE_BROKEN_MBSTOWCS
- wlen = mbstowcs(NULL, str, 0);
-#else
- wlen = len;
-#endif
- if (wlen == (size_t)-1)
- goto decode_error;
- if (wlen+1 <= smallbuf_len) {
- wstr = smallbuf;
- }
- else {
- wstr = PyMem_New(wchar_t, wlen+1);
- if (!wstr)
- return PyErr_NoMemory();
- }
- wlen2 = mbstowcs(wstr, str, wlen+1);
- if (wlen2 == (size_t)-1) {
- if (wstr != smallbuf)
- PyMem_Free(wstr);
- goto decode_error;
- }
-#ifdef HAVE_BROKEN_MBSTOWCS
- assert(wlen2 == wlen);
-#endif
- unicode = PyUnicode_FromWideChar(wstr, wlen2);
- if (wstr != smallbuf)
- PyMem_Free(wstr);
- }
+ PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
+ PyMem_RawFree(wstr);
return unicode;
-
-decode_error:
- errmsg = strerror(errno);
- assert(errmsg != NULL);
-
- error_pos = mbstowcs_errorpos(str, len);
- wstr = Py_DecodeLocale(errmsg, &errlen);
- if (wstr != NULL) {
- reason = PyUnicode_FromWideChar(wstr, errlen);
- PyMem_RawFree(wstr);
- }
-
- if (reason == NULL)
- reason = PyUnicode_FromString(
- "mbstowcs() encountered an invalid multibyte sequence");
- if (reason == NULL)
- return NULL;
-
- exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
- "locale", str, len,
- (Py_ssize_t)error_pos,
- (Py_ssize_t)(error_pos+1),
- reason);
- Py_DECREF(reason);
- if (exc != NULL) {
- PyCodec_StrictErrors(exc);
- Py_DECREF(exc);
- }
- return NULL;
}
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
const char *errors)
{
- return unicode_decode_locale(str, len, errors, 0);
-}
-
-PyObject*
-_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
- const char *errors)
-{
return unicode_decode_locale(str, len, errors, 1);
}
PyObject*
-_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors)
-{
- return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1);
-}
-
-PyObject*
PyUnicode_DecodeLocale(const char *str, const char *errors)
{
Py_ssize_t size = (Py_ssize_t)strlen(str);
- return unicode_decode_locale(str, size, errors, 0);
+ return unicode_decode_locale(str, size, errors, 1);
}
@@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
Py_FileSystemDefaultEncodeErrors);
}
else {
- return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
+ return unicode_decode_locale(s, size,
+ Py_FileSystemDefaultEncodeErrors, 0);
}
#endif
}
@@ -5128,17 +4906,23 @@ onError:
}
-/* UTF-8 decoder using the surrogateescape error handler .
+/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
+ non-zero, use strict error handler otherwise.
- On success, return a pointer to a newly allocated wide character string (use
- PyMem_RawFree() to free the memory) and write the output length (in number
- of wchar_t units) into *p_wlen (if p_wlen is set).
+ On success, write a pointer to a newly allocated wide character string into
+ *wstr (use PyMem_RawFree() to free the memory) and write the output length
+ (in number of wchar_t units) into *wlen (if wlen is set).
- On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen
- (if p_wlen is set). */
-wchar_t*
-_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
+ On memory allocation failure, return -1.
+
+ On decoding error (if surrogateescape is zero), return -2. If wlen is
+ non-NULL, write the start of the illegal byte sequence into *wlen. If reason
+ is not NULL, write the decoding error message into *reason. */
+int
+_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
+ const char **reason, int surrogateescape)
{
+ const char *orig_s = s;
const char *e;
wchar_t *unicode;
Py_ssize_t outpos;
@@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
/* Note: size will always be longer than the resulting Unicode
character count */
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
- if (p_wlen) {
- *p_wlen = (size_t)-1;
- }
- return NULL;
+ return -1;
}
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
if (!unicode) {
- if (p_wlen) {
- *p_wlen = (size_t)-1;
- }
- return NULL;
+ return -1;
}
/* Unpack UTF-8 encoded data */
@@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
Py_UNREACHABLE();
#else
assert(ch > 0xFFFF && ch <= MAX_UNICODE);
- /* compute and append the two surrogates: */
+ /* write a surrogate pair */
unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
@@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
else {
if (!ch && s == e)
break;
+ if (!surrogateescape) {
+ PyMem_RawFree(unicode );
+ if (reason != NULL) {
+ switch (ch) {
+ case 0:
+ *reason = "unexpected end of data";
+ break;
+ case 1:
+ *reason = "invalid start byte";
+ break;
+ /* 2, 3, 4 */
+ default:
+ *reason = "invalid continuation byte";
+ break;
+ }
+ }
+ if (wlen != NULL) {
+ *wlen = s - orig_s;
+ }
+ return -2;
+ }
/* surrogateescape */
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
}
}
unicode[outpos] = L'\0';
- if (p_wlen) {
- *p_wlen = outpos;
+ if (wlen) {
+ *wlen = outpos;
}
- return unicode;
+ *wstr = unicode;
+ return 0;
+}
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
+{
+ wchar_t *wstr;
+ int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
+ if (res != 0) {
+ return NULL;
+ }
+ return wstr;
}
/* UTF-8 encoder using the surrogateescape error handler .
- On success, return a pointer to a newly allocated character string (use
- PyMem_Free() to free the memory).
+ On success, return 0 and write the newly allocated character string (use
+ PyMem_Free() to free the memory) into *str.
- On encoding failure, return NULL and write the position of the invalid
- surrogate character into *error_pos (if error_pos is set).
+ On encoding failure, return -2 and write the position of the invalid
+ surrogate character into *error_pos (if error_pos is set) and the encoding
+ error message into *reason (if reason is set).
- On memory allocation failure, return NULL and write (size_t)-1 into
- *error_pos (if error_pos is set). */
-char*
-_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
- int raw_malloc)
+ On memory allocation failure, return -1. */
+int
+_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
+ const char **reason, int raw_malloc, int surrogateescape)
{
const Py_ssize_t max_char_size = 4;
Py_ssize_t len = wcslen(text);
assert(len >= 0);
+ if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
+ return -1;
+ }
char *bytes;
- if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
- if (raw_malloc) {
- bytes = PyMem_RawMalloc((len + 1) * max_char_size);
- }
- else {
- bytes = PyMem_Malloc((len + 1) * max_char_size);
- }
+ if (raw_malloc) {
+ bytes = PyMem_RawMalloc((len + 1) * max_char_size);
}
else {
- bytes = NULL;
+ bytes = PyMem_Malloc((len + 1) * max_char_size);
}
if (bytes == NULL) {
- if (error_pos != NULL) {
- *error_pos = (size_t)-1;
- }
- return NULL;
+ return -1;
}
char *p = bytes;
Py_ssize_t i;
- for (i = 0; i < len;) {
- Py_UCS4 ch = text[i++];
+ for (i = 0; i < len; i++) {
+ Py_UCS4 ch = text[i];
if (ch < 0x80) {
/* Encode ASCII */
@@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
}
else if (Py_UNICODE_IS_SURROGATE(ch)) {
/* surrogateescape error handler */
- if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+ if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
if (error_pos != NULL) {
- *error_pos = (size_t)i - 1;
+ *error_pos = (size_t)i;
}
- goto error;
+ if (reason != NULL) {
+ *reason = "encoding error";
+ }
+ if (raw_malloc) {
+ PyMem_RawFree(bytes);
+ }
+ else {
+ PyMem_Free(bytes);
+ }
+ return -2;
}
*p++ = (char)(ch & 0xff);
}
@@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
- goto error;
- }
- return bytes2;
-
- error:
- if (raw_malloc) {
- PyMem_RawFree(bytes);
- }
- else {
- PyMem_Free(bytes);
+ if (raw_malloc) {
+ PyMem_RawFree(bytes);
+ }
+ else {
+ PyMem_Free(bytes);
+ }
+ return -1;
}
- return NULL;
+ *str = bytes2;
+ return 0;
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 9275494..a50075e 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -20,9 +20,6 @@ extern int winerror_to_errno(int);
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
-extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
- size_t *error_pos, int raw_malloc);
-
#ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values:
@@ -69,7 +66,10 @@ _Py_device_encoding(int fd)
Py_RETURN_NONE;
}
-#if !defined(__APPLE__) && !defined(MS_WINDOWS)
+#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
+
+#define USE_FORCE_ASCII
+
extern int _Py_normalize_encoding(const char *, char *, size_t);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
@@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
1: the workaround is used: Py_EncodeLocale() uses
encode_ascii_surrogateescape() and Py_DecodeLocale() uses
- decode_ascii_surrogateescape()
+ decode_ascii()
0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and
Py_DecodeLocale() uses mbstowcs()
-1: unknown, need to call check_force_ascii() to get the value
@@ -180,16 +180,15 @@ error:
return 1;
}
-static char*
-encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc)
+static int
+encode_ascii(const wchar_t *text, char **str,
+ size_t *error_pos, const char **reason,
+ int raw_malloc, int surrogateescape)
{
char *result = NULL, *out;
size_t len, i;
wchar_t ch;
- if (error_pos != NULL)
- *error_pos = (size_t)-1;
-
len = wcslen(text);
/* +1 for NULL byte */
@@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
else {
result = PyMem_Malloc(len + 1);
}
- if (result == NULL)
- return NULL;
+ if (result == NULL) {
+ return -1;
+ }
out = result;
for (i=0; i<len; i++) {
@@ -210,60 +210,84 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
/* ASCII character */
*out++ = (char)ch;
}
- else if (0xdc80 <= ch && ch <= 0xdcff) {
+ else if (surrogateescape && 0xdc80 <= ch && ch <= 0xdcff) {
/* UTF-8b surrogate */
*out++ = (char)(ch - 0xdc00);
}
else {
- if (error_pos != NULL) {
- *error_pos = i;
- }
if (raw_malloc) {
PyMem_RawFree(result);
}
else {
PyMem_Free(result);
}
- return NULL;
+ if (error_pos != NULL) {
+ *error_pos = i;
+ }
+ if (reason) {
+ *reason = "encoding error";
+ }
+ return -2;
}
}
*out = '\0';
- return result;
+ *str = result;
+ return 0;
}
-#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
+#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
+
-#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
-static wchar_t*
-decode_ascii_surrogateescape(const char *arg, size_t *size)
+#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
+static int
+decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
+ const char **reason, int surrogateescape)
{
wchar_t *res;
unsigned char *in;
wchar_t *out;
size_t argsize = strlen(arg) + 1;
- if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
- return NULL;
- res = PyMem_RawMalloc(argsize*sizeof(wchar_t));
- if (!res)
- return NULL;
+ if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+ return -1;
+ }
+ res = PyMem_RawMalloc(argsize * sizeof(wchar_t));
+ if (!res) {
+ return -1;
+ }
- in = (unsigned char*)arg;
out = res;
- while(*in)
- if(*in < 128)
- *out++ = *in++;
- else
- *out++ = 0xdc00 + *in++;
+ for (in = (unsigned char*)arg; *in; in++) {
+ unsigned char ch = *in;
+ if (ch < 128) {
+ *out++ = ch;
+ }
+ else {
+ if (!surrogateescape) {
+ PyMem_RawFree(res);
+ if (wlen) {
+ *wlen = in - (unsigned char*)arg;
+ }
+ if (reason) {
+ *reason = "decoding error";
+ }
+ return -2;
+ }
+ *out++ = 0xdc00 + ch;
+ }
+ }
*out = 0;
- if (size != NULL)
- *size = out - res;
- return res;
+
+ if (wlen != NULL) {
+ *wlen = out - res;
+ }
+ *wstr = res;
+ return 0;
}
-#endif
+#endif /* !HAVE_MBRTOWC */
-#if !defined(__APPLE__) && !defined(__ANDROID__)
-static wchar_t*
-decode_current_locale(const char* arg, size_t *size)
+static int
+decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
+ const char **reason, int surrogateescape)
{
wchar_t *res;
size_t argsize;
@@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size)
argsize = mbstowcs(NULL, arg, 0);
#endif
if (argsize != (size_t)-1) {
- if (argsize == PY_SSIZE_T_MAX)
- goto oom;
- argsize += 1;
- if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
- goto oom;
- res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
- if (!res)
- goto oom;
- count = mbstowcs(res, arg, argsize);
+ if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
+ return -1;
+ }
+ res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
+ if (!res) {
+ return -1;
+ }
+
+ count = mbstowcs(res, arg, argsize + 1);
if (count != (size_t)-1) {
wchar_t *tmp;
/* Only use the result if it contains no
@@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size)
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
;
if (*tmp == 0) {
- if (size != NULL)
- *size = count;
- return res;
+ if (wlen != NULL) {
+ *wlen = count;
+ }
+ *wstr = res;
+ return 0;
}
}
PyMem_RawFree(res);
}
+
/* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
@@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size)
/* Overallocate; as multi-byte characters are in the argument, the
actual output could use less memory. */
argsize = strlen(arg) + 1;
- if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
- goto oom;
- res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
- if (!res)
- goto oom;
+ if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+ return -1;
+ }
+ res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
+ if (!res) {
+ return -1;
+ }
+
in = (unsigned char*)arg;
out = res;
memset(&mbs, 0, sizeof mbs);
while (argsize) {
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
- if (converted == 0)
+ if (converted == 0) {
/* Reached end of string; null char stored. */
break;
+ }
+
if (converted == (size_t)-2) {
/* Incomplete character. This should never happen,
since we provide everything that we have -
unless there is a bug in the C library, or I
misunderstood how mbrtowc works. */
- PyMem_RawFree(res);
- if (size != NULL)
- *size = (size_t)-2;
- return NULL;
+ goto decode_error;
}
+
if (converted == (size_t)-1) {
+ if (!surrogateescape) {
+ goto decode_error;
+ }
+
/* Conversion error. Escape as UTF-8b, and start over
in the initial shift state. */
*out++ = 0xdc00 + *in++;
@@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size)
memset(&mbs, 0, sizeof mbs);
continue;
}
+
if (Py_UNICODE_IS_SURROGATE(*out)) {
+ if (!surrogateescape) {
+ goto decode_error;
+ }
+
/* Surrogate character. Escape the original
byte sequence with surrogateescape. */
argsize -= converted;
- while (converted--)
+ while (converted--) {
*out++ = 0xdc00 + *in++;
+ }
continue;
}
/* successfully converted some bytes */
@@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size)
argsize -= converted;
out++;
}
- if (size != NULL)
- *size = out - res;
+ if (wlen != NULL) {
+ *wlen = out - res;
+ }
+ *wstr = res;
+ return 0;
+
+decode_error:
+ PyMem_RawFree(res);
+ if (wlen) {
+ *wlen = in - (unsigned char*)arg;
+ }
+ if (reason) {
+ *reason = "decoding error";
+ }
+ return -2;
#else /* HAVE_MBRTOWC */
/* Cannot use C locale for escaping; manually escape as if charset
is ASCII (i.e. escape all bytes >= 128). This will still round-trip
correctly in the locale's charset, which must be an ASCII superset. */
- res = decode_ascii_surrogateescape(arg, size);
- if (res == NULL)
- goto oom;
+ return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
#endif /* HAVE_MBRTOWC */
- return res;
-
-oom:
- if (size != NULL) {
- *size = (size_t)-1;
- }
- return NULL;
}
-#endif
-static wchar_t*
-decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
+/* Decode a byte string from the locale encoding.
+
+ Use the strict error handler if 'surrogateescape' is zero. Use the
+ surrogateescape error handler if 'surrogateescape' is non-zero: undecodable
+ bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence
+ can be decoded as a surrogate character, escape the bytes using the
+ surrogateescape error handler instead of decoding them.
+
+ On success, return 0 and write the newly allocated wide character string into
+ *wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write
+ the number of wide characters excluding the null character into *wlen.
+
+ On memory allocation failure, return -1.
+
+ On decoding error, return -2. If wlen is not NULL, write the index of the
+ start of the invalid byte sequence in the input string into *wlen. If reason
+ is not NULL, write the decoding error message into *reason.
+
+ Use the _Py_EncodeLocaleEx() function to encode the character string back to
+ a byte string. */
+int
+_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
+ const char **reason,
+ int current_locale, int surrogateescape)
{
+ if (current_locale) {
+ return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+ }
+
#if defined(__APPLE__) || defined(__ANDROID__)
- return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+ surrogateescape);
#else
- if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
- return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+ if (Py_UTF8Mode == 1) {
+ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+ surrogateescape);
}
-#ifndef MS_WINDOWS
- if (force_ascii == -1)
+#ifdef USE_FORCE_ASCII
+ if (force_ascii == -1) {
force_ascii = check_force_ascii();
+ }
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
- wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
- if (wstr == NULL) {
- if (size != NULL) {
- *size = (size_t)-1;
- }
- return NULL;
- }
- return wstr;
+ return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
}
#endif
- return decode_current_locale(arg, size);
+ return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
#endif /* __APPLE__ or __ANDROID__ */
}
@@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+Py_DecodeLocale(const char* arg, size_t *wlen)
{
- return decode_locale(arg, size, 0);
-}
-
-
-/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */
-wchar_t*
-_Py_DecodeCurrentLocale(const char* arg, size_t *size)
-{
- return decode_locale(arg, size, 1);
+ wchar_t *wstr;
+ int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
+ if (res != 0) {
+ if (wlen != NULL) {
+ *wlen = (size_t)res;
+ }
+ return NULL;
+ }
+ return wstr;
}
-#if !defined(__APPLE__) && !defined(__ANDROID__)
-static char*
-encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
+static int
+encode_current_locale(const wchar_t *text, char **str,
+ size_t *error_pos, const char **reason,
+ int raw_malloc, int surrogateescape)
{
const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL;
@@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
for (i=0; i < len; i++) {
c = text[i];
if (c >= 0xdc80 && c <= 0xdcff) {
+ if (!surrogateescape) {
+ goto encode_error;
+ }
/* UTF-8b surrogate */
if (bytes != NULL) {
*bytes++ = c - 0xdc00;
size--;
}
- else
+ else {
size++;
+ }
continue;
}
else {
buf[0] = c;
- if (bytes != NULL)
+ if (bytes != NULL) {
converted = wcstombs(bytes, buf, size);
- else
+ }
+ else {
converted = wcstombs(NULL, buf, 0);
+ }
if (converted == (size_t)-1) {
- if (raw_malloc) {
- PyMem_RawFree(result);
- }
- else {
- PyMem_Free(result);
- }
- if (error_pos != NULL)
- *error_pos = i;
- return NULL;
+ goto encode_error;
}
if (bytes != NULL) {
bytes += converted;
size -= converted;
}
- else
+ else {
size += converted;
+ }
}
}
if (result != NULL) {
@@ -511,40 +576,80 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
result = PyMem_Malloc(size);
}
if (result == NULL) {
- if (error_pos != NULL) {
- *error_pos = (size_t)-1;
- }
- return NULL;
+ return -1;
}
bytes = result;
}
- return result;
+ *str = result;
+ return 0;
+
+encode_error:
+ if (raw_malloc) {
+ PyMem_RawFree(result);
+ }
+ else {
+ PyMem_Free(result);
+ }
+ if (error_pos != NULL) {
+ *error_pos = i;
+ }
+ if (reason) {
+ *reason = "encoding error";
+ }
+ return -2;
}
-#endif
-static char*
-encode_locale(const wchar_t *text, size_t *error_pos,
- int raw_malloc, int ignore_utf8_mode)
+static int
+encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
+ const char **reason,
+ int raw_malloc, int current_locale, int surrogateescape)
{
+ if (current_locale) {
+ return encode_current_locale(text, str, error_pos, reason,
+ raw_malloc, surrogateescape);
+ }
+
#if defined(__APPLE__) || defined(__ANDROID__)
- return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
+ return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
+ raw_malloc, surrogateescape);
#else /* __APPLE__ */
- if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
- return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
+ if (Py_UTF8Mode == 1) {
+ return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
+ raw_malloc, surrogateescape);
}
-#ifndef MS_WINDOWS
- if (force_ascii == -1)
+#ifdef USE_FORCE_ASCII
+ if (force_ascii == -1) {
force_ascii = check_force_ascii();
+ }
- if (force_ascii)
- return encode_ascii_surrogateescape(text, error_pos, raw_malloc);
+ if (force_ascii) {
+ return encode_ascii(text, str, error_pos, reason,
+ raw_malloc, surrogateescape);
+ }
#endif
- return encode_current_locale(text, error_pos, raw_malloc);
+ return encode_current_locale(text, str, error_pos, reason,
+ raw_malloc, surrogateescape);
#endif /* __APPLE__ or __ANDROID__ */
}
+static char*
+encode_locale(const wchar_t *text, size_t *error_pos,
+ int raw_malloc, int current_locale)
+{
+ char *str;
+ int res = encode_locale_ex(text, &str, error_pos, NULL,
+ raw_malloc, current_locale, 1);
+ if (res != -2 && error_pos) {
+ *error_pos = (size_t)-1;
+ }
+ if (res != 0) {
+ return NULL;
+ }
+ return str;
+}
+
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
@@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
}
-/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */
-char*
-_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos)
+int
+_Py_EncodeLocaleEx(const wchar_t *text, char **str,
+ size_t *error_pos, const char **reason,
+ int current_locale, int surrogateescape)
{
- return encode_locale(text, error_pos, 1, 1);
+ return encode_locale_ex(text, str, error_pos, reason, 1,
+ current_locale, surrogateescape);
}
diff --git a/Python/pathconfig.c b/Python/pathconfig.c
index 9591fcc..7ebd69b 100644
--- a/Python/pathconfig.c
+++ b/Python/pathconfig.c
@@ -382,8 +382,8 @@ _Py_FindEnvConfigValue(FILE *env_file, const wchar_t *key,
/* Comment - skip */
continue;
}
- tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n, NULL);
- if (tmpbuffer != NULL) {
+ tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n);
+ if (tmpbuffer) {
wchar_t * state;
wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state);
if ((tok != NULL) && !wcscmp(tok, key)) {