From e662c398d87f136497f8ec672e83657ae3a599e0 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sun, 1 Nov 2020 23:07:23 +0100 Subject: bpo-42236: Use UTF-8 encoding if nl_langinfo(CODESET) fails (GH-23086) If the nl_langinfo(CODESET) function returns an empty string, Python now uses UTF-8 as the filesystem encoding. In May 2010 (commit b744ba1d14c5487576c95d0311e357b707600b47), I modified Python to log a warning and use UTF-8 as the filesystem encoding (instead of None) if nl_langinfo(CODESET) returns an empty string. In August 2020 (commit 94908bbc1503df830d1d615e7b57744ae1b41079), I modified Python startup to fail with a fatal error and a specific error message if nl_langinfo(CODESET) returns an empty string. The intent was to prevent guessing the encoding and also investigate user configuration where this case happens. In 10 years (2010 to 2020), I saw zero user reports about the error message related to nl_langinfo(CODESET) returning an empty string. Today, UTF-8 has become the de facto standard and it's safe to make the assumption that the user expects UTF-8. For example, nl_langinfo(CODESET) can return an empty string on macOS if the LC_CTYPE locale is not supported, and UTF-8 is the default encoding on macOS. While this change is likely to not affect anyone in practice, it should make UTF-8 lovers happy ;-) Also rewrite the documentation explaining how Python selects the filesystem encoding and error handler. 
--- Doc/c-api/init_config.rst | 52 +++++++++++++++++++--- Doc/library/sys.rst | 31 ++++++------- Include/cpython/initconfig.h | 37 +++------------ Include/internal/pycore_fileutils.h | 2 +- Include/pyport.h | 8 +++- .../2020-11-01-21-21-38.bpo-42236.MPx-NK.rst | 2 + Python/fileutils.c | 34 ++++---------- Python/initconfig.c | 12 ++--- 8 files changed, 88 insertions(+), 90 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst index 37f5b9f..92a6c3a 100644 --- a/Doc/c-api/init_config.rst +++ b/Doc/c-api/init_config.rst @@ -253,10 +253,16 @@ PyPreConfig See :c:member:`PyConfig.isolated`. - .. c:member:: int legacy_windows_fs_encoding (Windows only) + .. c:member:: int legacy_windows_fs_encoding - If non-zero, disable UTF-8 Mode, set the Python filesystem encoding to - ``mbcs``, set the filesystem error handler to ``replace``. + If non-zero: + + * Set :c:member:`PyPreConfig.utf8_mode` to ``0``, + * Set :c:member:`PyConfig.filesystem_encoding` to ``"mbcs"``, + * Set :c:member:`PyConfig.filesystem_errors` to ``"replace"``. + + Initialized from the :envvar:`PYTHONLEGACYWINDOWSFSENCODING` environment + variable value. Only available on Windows. ``#ifdef MS_WINDOWS`` macro can be used for Windows specific code. @@ -499,11 +505,47 @@ PyConfig .. c:member:: wchar_t* filesystem_encoding - Filesystem encoding, :func:`sys.getfilesystemencoding`. + Filesystem encoding: :func:`sys.getfilesystemencoding`. + + On macOS, Android and VxWorks: use ``"utf-8"`` by default. + + On Windows: use ``"utf-8"`` by default, or ``"mbcs"`` if + :c:member:`~PyPreConfig.legacy_windows_fs_encoding` of + :c:type:`PyPreConfig` is non-zero. + + Default encoding on other platforms: + + * ``"utf-8"`` if :c:member:`PyPreConfig.utf8_mode` is non-zero. 
+ * ``"ascii"`` if Python detects that ``nl_langinfo(CODESET)`` announces + the ASCII encoding (or Roman8 encoding on HP-UX), whereas the + ``mbstowcs()`` function decodes from a different encoding (usually + Latin1). + * ``"utf-8"`` if ``nl_langinfo(CODESET)`` returns an empty string. + * Otherwise, use the LC_CTYPE locale encoding: + ``nl_langinfo(CODESET)`` result. + + At Python startup, the encoding name is normalized to the Python codec + name. For example, ``"ANSI_X3.4-1968"`` is replaced with ``"ascii"``. + + See also the :c:member:`~PyConfig.filesystem_errors` member. .. c:member:: wchar_t* filesystem_errors - Filesystem encoding errors, :func:`sys.getfilesystemencodeerrors`. + Filesystem error handler: :func:`sys.getfilesystemencodeerrors`. + + On Windows: use ``"surrogatepass"`` by default, or ``"replace"`` if + :c:member:`~PyPreConfig.legacy_windows_fs_encoding` of + :c:type:`PyPreConfig` is non-zero. + + On other platforms: use ``"surrogateescape"`` by default. + + Supported error handlers: + + * ``"strict"`` + * ``"surrogateescape"`` + * ``"surrogatepass"`` (only supported with the UTF-8 encoding) + + See also the :c:member:`~PyConfig.filesystem_encoding` member. .. c:member:: unsigned long hash_seed .. c:member:: int use_hash_seed diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst index 468a30d..2f0840e 100644 --- a/Doc/library/sys.rst +++ b/Doc/library/sys.rst @@ -616,29 +616,20 @@ always available. .. function:: getfilesystemencoding() Return the name of the encoding used to convert between Unicode - filenames and bytes filenames. For best compatibility, str should be - used for filenames in all cases, although representing filenames as bytes - is also supported. Functions accepting or returning filenames should support - either str or bytes and internally convert to the system's preferred - representation. 
+ + For best compatibility, str should be used for filenames in all cases, + although representing filenames as bytes is also supported. Functions + accepting or returning filenames should support either str or bytes and + internally convert to the system's preferred representation. This encoding is always ASCII-compatible. :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that the correct encoding and errors mode are used. - * In the UTF-8 mode, the encoding is ``utf-8`` on any platform. - - * On macOS, the encoding is ``'utf-8'``. - - * On Unix, the encoding is the locale encoding. - - * On Windows, the encoding may be ``'utf-8'`` or ``'mbcs'``, depending - on user configuration. - - * On Android, the encoding is ``'utf-8'``. - - * On VxWorks, the encoding is ``'utf-8'``. + The filesystem encoding is initialized from + :c:member:`PyConfig.filesystem_encoding`. .. versionchanged:: 3.2 :func:`getfilesystemencoding` result cannot be ``None`` anymore. @@ -660,6 +651,9 @@ always available. :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that the correct encoding and errors mode are used. + The filesystem error handler is initialized from + :c:member:`PyConfig.filesystem_errors`. + .. versionadded:: 3.6 .. function:: getrefcount(object) @@ -1457,6 +1451,9 @@ always available. This is equivalent to defining the :envvar:`PYTHONLEGACYWINDOWSFSENCODING` environment variable before launching Python. + See also :func:`sys.getfilesystemencoding` and + :func:`sys.getfilesystemencodeerrors`. + .. availability:: Windows. .. versionadded:: 3.6 diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index bbe8387..dd5ca61 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -156,36 +156,13 @@ typedef struct { /* Python filesystem encoding and error handler: sys.getfilesystemencoding() and sys.getfilesystemencodeerrors(). 
- Default encoding and error handler: - - * if Py_SetStandardStreamEncoding() has been called: they have the - highest priority; - * PYTHONIOENCODING environment variable; - * The UTF-8 Mode uses UTF-8/surrogateescape; - * If Python forces the usage of the ASCII encoding (ex: C locale - or POSIX locale on FreeBSD or HP-UX), use ASCII/surrogateescape; - * locale encoding: ANSI code page on Windows, UTF-8 on Android and - VxWorks, LC_CTYPE locale encoding on other platforms; - * On Windows, "surrogateescape" error handler; - * "surrogateescape" error handler if the LC_CTYPE locale is "C" or "POSIX"; - * "surrogateescape" error handler if the LC_CTYPE locale has been coerced - (PEP 538); - * "strict" error handler. - - Supported error handlers: "strict", "surrogateescape" and - "surrogatepass". The surrogatepass error handler is only supported - if Py_DecodeLocale() and Py_EncodeLocale() use directly the UTF-8 codec; - it's only used on Windows. - - initfsencoding() updates the encoding to the Python codec name. - For example, "ANSI_X3.4-1968" is replaced with "ascii". - - On Windows, sys._enablelegacywindowsfsencoding() sets the - encoding/errors to mbcs/replace at runtime. - - - See Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors. - */ + The Doc/c-api/init_config.rst documentation explains how Python selects + the filesystem encoding and error handler. + + _PyUnicode_InitEncodings() updates the encoding name to the Python codec + name. For example, "ANSI_X3.4-1968" is replaced with "ascii". It also + sets Py_FileSystemDefaultEncoding to filesystem_encoding and + sets Py_FileSystemDefaultEncodeErrors to filesystem_errors. 
*/ wchar_t *filesystem_encoding; wchar_t *filesystem_errors; diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h index 1ab554f..9281f4e 100644 --- a/Include/internal/pycore_fileutils.h +++ b/Include/internal/pycore_fileutils.h @@ -50,7 +50,7 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric( PyAPI_FUNC(void) _Py_closerange(int first, int last); -PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg); +PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void); PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void); #ifdef __cplusplus diff --git a/Include/pyport.h b/Include/pyport.h index 7137006..79fc7c4 100644 --- a/Include/pyport.h +++ b/Include/pyport.h @@ -841,12 +841,16 @@ extern _invalid_parameter_handler _Py_silent_invalid_parameter_handler; #endif #if defined(__ANDROID__) || defined(__VXWORKS__) - /* Ignore the locale encoding: force UTF-8 */ + // Use UTF-8 as the locale encoding, ignore the LC_CTYPE locale. + // See _Py_GetLocaleEncoding(), PyUnicode_DecodeLocale() + // and PyUnicode_EncodeLocale(). # define _Py_FORCE_UTF8_LOCALE #endif #if defined(_Py_FORCE_UTF8_LOCALE) || defined(__APPLE__) - /* Use UTF-8 as filesystem encoding */ + // Use UTF-8 as the filesystem encoding. + // See PyUnicode_DecodeFSDefaultAndSize(), PyUnicode_EncodeFSDefault(), + // Py_DecodeLocale() and Py_EncodeLocale(). # define _Py_FORCE_UTF8_FS_ENCODING #endif diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst b/Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst new file mode 100644 index 0000000..22e8c53 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst @@ -0,0 +1,2 @@ +If the ``nl_langinfo(CODESET)`` function returns an empty string, Python now +uses UTF-8 as the filesystem encoding. Patch by Victor Stinner. 
diff --git a/Python/fileutils.c b/Python/fileutils.c index 72cdee2..5177b37 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -826,20 +826,15 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str, // - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android) // - Return "UTF-8" if the UTF-8 Mode is enabled // - On Windows, return the ANSI code page (ex: "cp1250") -// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string -// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS). +// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string. // - Otherwise, return nl_langinfo(CODESET). // -// Return NULL and set errmsg to an error message -// if nl_langinfo(CODESET) fails. -// -// Return NULL and set errmsg to NULL on memory allocation failure. +// Return NULL on memory allocation failure. // // See also config_get_locale_encoding() wchar_t* -_Py_GetLocaleEncoding(const char **errmsg) +_Py_GetLocaleEncoding(void) { - *errmsg = NULL; #ifdef _Py_FORCE_UTF8_LOCALE // On Android langinfo.h and CODESET are missing, // and UTF-8 is always used in mbstowcs() and wcstombs(). @@ -859,21 +854,14 @@ _Py_GetLocaleEncoding(const char **errmsg) #else const char *encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { -#ifdef _Py_FORCE_UTF8_FS_ENCODING - // nl_langinfo() can return an empty string when the LC_CTYPE locale is - // not supported. Default to UTF-8 in that case, because UTF-8 is the - // default charset on macOS. + // Use UTF-8 if nl_langinfo() returns an empty string. It can happen on + // macOS if the LC_CTYPE locale is not supported. 
return _PyMem_RawWcsdup(L"UTF-8"); -#else - *errmsg = "failed to get the locale encoding: " - "nl_langinfo(CODESET) returns an empty string"; - return NULL; -#endif } wchar_t *wstr; int res = decode_current_locale(encoding, &wstr, NULL, - errmsg, _Py_ERROR_SURROGATEESCAPE); + NULL, _Py_ERROR_SURROGATEESCAPE); if (res < 0) { return NULL; } @@ -887,15 +875,9 @@ _Py_GetLocaleEncoding(const char **errmsg) PyObject * _Py_GetLocaleEncodingObject(void) { - const char *errmsg; - wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + wchar_t *encoding = _Py_GetLocaleEncoding(); if (encoding == NULL) { - if (errmsg != NULL) { - PyErr_SetString(PyExc_ValueError, errmsg); - } - else { - PyErr_NoMemory(); - } + PyErr_NoMemory(); return NULL; } diff --git a/Python/initconfig.c b/Python/initconfig.c index 56f4297..d0ff888 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -1318,7 +1318,7 @@ config_read_env_vars(PyConfig *config) #ifdef MS_WINDOWS _Py_get_env_flag(use_env, &config->legacy_windows_stdio, - "PYTHONLEGACYWINDOWSSTDIO"); + "PYTHONLEGACYWINDOWSSTDIO"); #endif if (config_get_env(config, "PYTHONDUMPREFS")) { @@ -1498,15 +1498,9 @@ static PyStatus config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig, wchar_t **locale_encoding) { - const char *errmsg; - wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + wchar_t *encoding = _Py_GetLocaleEncoding(); if (encoding == NULL) { - if (errmsg != NULL) { - return _PyStatus_ERR(errmsg); - } - else { - return _PyStatus_NO_MEMORY(); - } + return _PyStatus_NO_MEMORY(); } PyStatus status = PyConfig_SetString(config, locale_encoding, encoding); PyMem_RawFree(encoding); -- cgit v0.12