diff options
author | Victor Stinner <vstinner@python.org> | 2020-11-01 22:07:23 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-01 22:07:23 (GMT) |
commit | e662c398d87f136497f8ec672e83657ae3a599e0 (patch) | |
tree | cc9383c30557769a096be580b7f8f1b936565ea9 /Python/fileutils.c | |
parent | 82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4 (diff) | |
download | cpython-e662c398d87f136497f8ec672e83657ae3a599e0.zip cpython-e662c398d87f136497f8ec672e83657ae3a599e0.tar.gz cpython-e662c398d87f136497f8ec672e83657ae3a599e0.tar.bz2 |
bpo-42236: Use UTF-8 encoding if nl_langinfo(CODESET) fails (GH-23086)
If the nl_langinfo(CODESET) function returns an empty string, Python
now uses UTF-8 as the filesystem encoding.
In May 2010 (commit b744ba1d14c5487576c95d0311e357b707600b47), I
modified Python to log a warning and use UTF-8 as the filesystem
encoding (instead of None) if nl_langinfo(CODESET) returns an empty
string.
In August 2020 (commit 94908bbc1503df830d1d615e7b57744ae1b41079), I
modified Python startup to fail with a fatal error and a specific
error message if nl_langinfo(CODESET) returns an empty string. The
intent was to prevent guessing the encoding and also investigate user
configuration where this case happens.
In 10 years (2010 to 2020), I saw zero user reports about the error
message related to nl_langinfo(CODESET) returning an empty string.
Today, UTF-8 has become the de facto standard and it's safe to make the
assumption that the user expects UTF-8. For example,
nl_langinfo(CODESET) can return an empty string on macOS if the
LC_CTYPE locale is not supported, and UTF-8 is the default encoding
on macOS.
While this change is unlikely to affect anyone in practice, it
should make UTF-8 lovers happy ;-)
Also rewrite the documentation explaining how Python selects the
filesystem encoding and error handler.
Diffstat (limited to 'Python/fileutils.c')
-rw-r--r-- | Python/fileutils.c | 34 |
1 files changed, 8 insertions, 26 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c index 72cdee2..5177b37 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -826,20 +826,15 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str, // - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android) // - Return "UTF-8" if the UTF-8 Mode is enabled // - On Windows, return the ANSI code page (ex: "cp1250") -// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string -// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS). +// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string. // - Otherwise, return nl_langinfo(CODESET). // -// Return NULL and set errmsg to an error message -// if nl_langinfo(CODESET) fails. -// -// Return NULL and set errmsg to NULL on memory allocation failure. +// Return NULL on memory allocation failure. // // See also config_get_locale_encoding() wchar_t* -_Py_GetLocaleEncoding(const char **errmsg) +_Py_GetLocaleEncoding(void) { - *errmsg = NULL; #ifdef _Py_FORCE_UTF8_LOCALE // On Android langinfo.h and CODESET are missing, // and UTF-8 is always used in mbstowcs() and wcstombs(). @@ -859,21 +854,14 @@ _Py_GetLocaleEncoding(const char **errmsg) #else const char *encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { -#ifdef _Py_FORCE_UTF8_FS_ENCODING - // nl_langinfo() can return an empty string when the LC_CTYPE locale is - // not supported. Default to UTF-8 in that case, because UTF-8 is the - // default charset on macOS. + // Use UTF-8 if nl_langinfo() returns an empty string. It can happen on + // macOS if the LC_CTYPE locale is not supported. 
return _PyMem_RawWcsdup(L"UTF-8"); -#else - *errmsg = "failed to get the locale encoding: " - "nl_langinfo(CODESET) returns an empty string"; - return NULL; -#endif } wchar_t *wstr; int res = decode_current_locale(encoding, &wstr, NULL, - errmsg, _Py_ERROR_SURROGATEESCAPE); + NULL, _Py_ERROR_SURROGATEESCAPE); if (res < 0) { return NULL; } @@ -887,15 +875,9 @@ _Py_GetLocaleEncoding(const char **errmsg) PyObject * _Py_GetLocaleEncodingObject(void) { - const char *errmsg; - wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + wchar_t *encoding = _Py_GetLocaleEncoding(); if (encoding == NULL) { - if (errmsg != NULL) { - PyErr_SetString(PyExc_ValueError, errmsg); - } - else { - PyErr_NoMemory(); - } + PyErr_NoMemory(); return NULL; } |