From 82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sun, 1 Nov 2020 20:59:35 +0100 Subject: bpo-42236: Enhance _locale._get_locale_encoding() (GH-23083) * Rename _Py_GetLocaleEncoding() to _Py_GetLocaleEncodingObject() * Add _Py_GetLocaleEncoding() which returns a wchar_t* string to share code between _Py_GetLocaleEncodingObject() and config_get_locale_encoding(). * _Py_GetLocaleEncodingObject() now decodes nl_langinfo(CODESET) from the current locale encoding with surrogateescape, rather than using UTF-8. --- Include/internal/pycore_fileutils.h | 3 +- Modules/_io/textio.c | 2 +- Modules/_localemodule.c | 2 +- Python/fileutils.c | 74 +++++++++++++++++++++++++++++-------- Python/initconfig.c | 47 +++++++---------------- 5 files changed, 76 insertions(+), 52 deletions(-) diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h index ff7bc48..1ab554f 100644 --- a/Include/internal/pycore_fileutils.h +++ b/Include/internal/pycore_fileutils.h @@ -50,7 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric( PyAPI_FUNC(void) _Py_closerange(int first, int last); -PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void); +PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg); +PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void); #ifdef __cplusplus } diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 2078bb3..f08d14e 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1155,7 +1155,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, } } if (encoding == NULL && self->encoding == NULL) { - self->encoding = _Py_GetLocaleEncoding(); + self->encoding = _Py_GetLocaleEncodingObject(); if (self->encoding == NULL) { goto error; } diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index 359deb7..7b3597e 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -783,7 +783,7 @@ static PyObject * _locale__get_locale_encoding_impl(PyObject *module) 
/*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/ { - return _Py_GetLocaleEncoding(); + return _Py_GetLocaleEncodingObject(); } diff --git a/Python/fileutils.c b/Python/fileutils.c index ba26904..72cdee2 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -821,23 +821,41 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str, } -// Get the current locale encoding: locale.getpreferredencoding(False). +// Get the current locale encoding name: +// +// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android) +// - Return "UTF-8" if the UTF-8 Mode is enabled +// - On Windows, return the ANSI code page (ex: "cp1250") +// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string +// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS). +// - Otherwise, return nl_langinfo(CODESET). +// +// Return NULL and set errmsg to an error message +// if nl_langinfo(CODESET) fails. +// +// Return NULL and set errmsg to NULL on memory allocation failure. +// // See also config_get_locale_encoding() -PyObject * -_Py_GetLocaleEncoding(void) +wchar_t* +_Py_GetLocaleEncoding(const char **errmsg) { + *errmsg = NULL; #ifdef _Py_FORCE_UTF8_LOCALE // On Android langinfo.h and CODESET are missing, // and UTF-8 is always used in mbstowcs() and wcstombs(). 
- return PyUnicode_FromString("UTF-8"); + return _PyMem_RawWcsdup(L"UTF-8"); #else const PyPreConfig *preconfig = &_PyRuntime.preconfig; if (preconfig->utf8_mode) { - return PyUnicode_FromString("UTF-8"); + return _PyMem_RawWcsdup(L"UTF-8"); } -#if defined(MS_WINDOWS) - return PyUnicode_FromFormat("cp%u", GetACP()); +#ifdef MS_WINDOWS + wchar_t encoding[23]; + unsigned int ansi_codepage = GetACP(); + swprintf(encoding, Py_ARRAY_LENGTH(encoding), L"cp%u", ansi_codepage); + encoding[Py_ARRAY_LENGTH(encoding) - 1] = 0; + return _PyMem_RawWcsdup(encoding); #else const char *encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { @@ -845,19 +863,45 @@ _Py_GetLocaleEncoding(void) // nl_langinfo() can return an empty string when the LC_CTYPE locale is // not supported. Default to UTF-8 in that case, because UTF-8 is the // default charset on macOS. - encoding = "UTF-8"; + return _PyMem_RawWcsdup(L"UTF-8"); #else - PyErr_SetString(PyExc_ValueError, - "failed to get the locale encoding: " - "nl_langinfo(CODESET) returns an empty string"); + *errmsg = "failed to get the locale encoding: " + "nl_langinfo(CODESET) returns an empty string"; return NULL; #endif } - // Decode from UTF-8 - return PyUnicode_FromString(encoding); -#endif // !CODESET -#endif + wchar_t *wstr; + int res = decode_current_locale(encoding, &wstr, NULL, + errmsg, _Py_ERROR_SURROGATEESCAPE); + if (res < 0) { + return NULL; + } + return wstr; +#endif // !MS_WINDOWS + +#endif // !_Py_FORCE_UTF8_LOCALE +} + + +PyObject * +_Py_GetLocaleEncodingObject(void) +{ + const char *errmsg; + wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + if (encoding == NULL) { + if (errmsg != NULL) { + PyErr_SetString(PyExc_ValueError, errmsg); + } + else { + PyErr_NoMemory(); + } + return NULL; + } + + PyObject *str = PyUnicode_FromWideChar(encoding, -1); + PyMem_RawFree(encoding); + return str; } diff --git a/Python/initconfig.c b/Python/initconfig.c index e129278..56f4297 100644 --- a/Python/initconfig.c +++ 
b/Python/initconfig.c @@ -11,11 +11,7 @@ #include "osdefs.h" // DELIM #include <locale.h> // setlocale() -#ifdef HAVE_LANGINFO_H -# include <langinfo.h> // nl_langinfo(CODESET) -#endif #if defined(MS_WINDOWS) || defined(__CYGWIN__) -# include <windows.h> // GetACP() # ifdef HAVE_IO_H # include <io.h> # endif @@ -1497,41 +1493,24 @@ config_get_stdio_errors(const PyPreConfig *preconfig) } -// See also _Py_GetLocaleEncoding() and config_get_fs_encoding() +// See also config_get_fs_encoding() static PyStatus config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig, wchar_t **locale_encoding) { -#ifdef _Py_FORCE_UTF8_LOCALE - return PyConfig_SetString(config, locale_encoding, L"utf-8"); -#else - if (preconfig->utf8_mode) { - return PyConfig_SetString(config, locale_encoding, L"utf-8"); - } - -#ifdef MS_WINDOWS - char encoding[20]; - PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP()); - return PyConfig_SetBytesString(config, locale_encoding, encoding); -#else - const char *encoding = nl_langinfo(CODESET); - if (!encoding || encoding[0] == '\0') { -#ifdef _Py_FORCE_UTF8_FS_ENCODING - // nl_langinfo() can return an empty string when the LC_CTYPE locale is - // not supported. Default to UTF-8 in that case, because UTF-8 is the - // default charset on macOS. - encoding = "UTF-8"; -#else - return _PyStatus_ERR("failed to get the locale encoding: " - "nl_langinfo(CODESET) returns an empty string"); -#endif + const char *errmsg; + wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + if (encoding == NULL) { + if (errmsg != NULL) { + return _PyStatus_ERR(errmsg); + } + else { + return _PyStatus_NO_MEMORY(); + } } - /* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */ - return CONFIG_SET_BYTES_STR(config, - locale_encoding, encoding, - "nl_langinfo(CODESET)"); -#endif // !MS_WINDOWS -#endif // !_Py_FORCE_UTF8_LOCALE + PyStatus status = PyConfig_SetString(config, locale_encoding, encoding); + PyMem_RawFree(encoding); + return status; } -- cgit v0.12