summaryrefslogtreecommitdiffstats
path: root/Python/fileutils.c
diff options
context:
space:
mode:
author: Victor Stinner <vstinner@python.org> 2020-11-01 22:07:23 (GMT)
committer: GitHub <noreply@github.com> 2020-11-01 22:07:23 (GMT)
commit: e662c398d87f136497f8ec672e83657ae3a599e0 (patch)
tree: cc9383c30557769a096be580b7f8f1b936565ea9 /Python/fileutils.c
parent: 82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4 (diff)
downloadcpython-e662c398d87f136497f8ec672e83657ae3a599e0.zip
cpython-e662c398d87f136497f8ec672e83657ae3a599e0.tar.gz
cpython-e662c398d87f136497f8ec672e83657ae3a599e0.tar.bz2
bpo-42236: Use UTF-8 encoding if nl_langinfo(CODESET) fails (GH-23086)
If the nl_langinfo(CODESET) function returns an empty string, Python now uses UTF-8 as the filesystem encoding. In May 2010 (commit b744ba1d14c5487576c95d0311e357b707600b47), I modified Python to log a warning and use UTF-8 as the filesystem encoding (instead of None) if nl_langinfo(CODESET) returns an empty string. In August 2020 (commit 94908bbc1503df830d1d615e7b57744ae1b41079), I modified Python startup to fail with a fatal error and a specific error message if nl_langinfo(CODESET) returns an empty string. The intent was to prevent guessing the encoding and also to investigate user configurations where this case happens. In 10 years (2010 to 2020), I saw zero user reports about the error message related to nl_langinfo(CODESET) returning an empty string. Today, UTF-8 has become the de facto standard and it's safe to assume that the user expects UTF-8. For example, nl_langinfo(CODESET) can return an empty string on macOS if the LC_CTYPE locale is not supported, and UTF-8 is the default encoding on macOS. While this change is unlikely to affect anyone in practice, it should make UTF-8 lovers happy ;-) Also rewrite the documentation explaining how Python selects the filesystem encoding and error handler.
Diffstat (limited to 'Python/fileutils.c')
-rw-r--r--Python/fileutils.c34
1 files changed, 8 insertions, 26 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 72cdee2..5177b37 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -826,20 +826,15 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android)
// - Return "UTF-8" if the UTF-8 Mode is enabled
// - On Windows, return the ANSI code page (ex: "cp1250")
-// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string
-// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS).
+// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string.
// - Otherwise, return nl_langinfo(CODESET).
//
-// Return NULL and set errmsg to an error message
-// if nl_langinfo(CODESET) fails.
-//
-// Return NULL and set errmsg to NULL on memory allocation failure.
+// Return NULL on memory allocation failure.
//
// See also config_get_locale_encoding()
wchar_t*
-_Py_GetLocaleEncoding(const char **errmsg)
+_Py_GetLocaleEncoding(void)
{
- *errmsg = NULL;
#ifdef _Py_FORCE_UTF8_LOCALE
// On Android langinfo.h and CODESET are missing,
// and UTF-8 is always used in mbstowcs() and wcstombs().
@@ -859,21 +854,14 @@ _Py_GetLocaleEncoding(const char **errmsg)
#else
const char *encoding = nl_langinfo(CODESET);
if (!encoding || encoding[0] == '\0') {
-#ifdef _Py_FORCE_UTF8_FS_ENCODING
- // nl_langinfo() can return an empty string when the LC_CTYPE locale is
- // not supported. Default to UTF-8 in that case, because UTF-8 is the
- // default charset on macOS.
+ // Use UTF-8 if nl_langinfo() returns an empty string. It can happen on
+ // macOS if the LC_CTYPE locale is not supported.
return _PyMem_RawWcsdup(L"UTF-8");
-#else
- *errmsg = "failed to get the locale encoding: "
- "nl_langinfo(CODESET) returns an empty string";
- return NULL;
-#endif
}
wchar_t *wstr;
int res = decode_current_locale(encoding, &wstr, NULL,
- errmsg, _Py_ERROR_SURROGATEESCAPE);
+ NULL, _Py_ERROR_SURROGATEESCAPE);
if (res < 0) {
return NULL;
}
@@ -887,15 +875,9 @@ _Py_GetLocaleEncoding(const char **errmsg)
PyObject *
_Py_GetLocaleEncodingObject(void)
{
- const char *errmsg;
- wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
+ wchar_t *encoding = _Py_GetLocaleEncoding();
if (encoding == NULL) {
- if (errmsg != NULL) {
- PyErr_SetString(PyExc_ValueError, errmsg);
- }
- else {
- PyErr_NoMemory();
- }
+ PyErr_NoMemory();
return NULL;
}