diff options
author | Victor Stinner <vstinner@python.org> | 2020-11-01 19:59:35 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-01 19:59:35 (GMT) |
commit | 82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4 (patch) | |
tree | f70b3003306fdc3165740eec39eadec9a2e82a9c | |
parent | 1f7dfb277e5b88cddc13e5024766be787a3e9127 (diff) | |
download | cpython-82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4.zip cpython-82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4.tar.gz cpython-82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4.tar.bz2 |
bpo-42236: Enhance _locale._get_locale_encoding() (GH-23083)

* Rename _Py_GetLocaleEncoding() to _Py_GetLocaleEncodingObject()
* Add _Py_GetLocaleEncoding() which returns a wchar_t* string to
  share code between _Py_GetLocaleEncodingObject()
  and config_get_locale_encoding().
* _Py_GetLocaleEncodingObject() now decodes nl_langinfo(CODESET)
  from the current locale encoding with surrogateescape,
  rather than using UTF-8.

-rw-r--r-- | Include/internal/pycore_fileutils.h | 3 | ||||
-rw-r--r-- | Modules/_io/textio.c | 2 | ||||
-rw-r--r-- | Modules/_localemodule.c | 2 | ||||
-rw-r--r-- | Python/fileutils.c | 74 | ||||
-rw-r--r-- | Python/initconfig.c | 47 |
5 files changed, 76 insertions, 52 deletions
```diff
diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
index ff7bc48..1ab554f 100644
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -50,7 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
 
 PyAPI_FUNC(void) _Py_closerange(int first, int last);
 
-PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void);
+PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg);
+PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
 
 #ifdef __cplusplus
 }
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index 2078bb3..f08d14e 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -1155,7 +1155,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
         }
     }
     if (encoding == NULL && self->encoding == NULL) {
-        self->encoding = _Py_GetLocaleEncoding();
+        self->encoding = _Py_GetLocaleEncodingObject();
         if (self->encoding == NULL) {
             goto error;
         }
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c
index 359deb7..7b3597e 100644
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -783,7 +783,7 @@ static PyObject *
 _locale__get_locale_encoding_impl(PyObject *module)
 /*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/
 {
-    return _Py_GetLocaleEncoding();
+    return _Py_GetLocaleEncodingObject();
 }
 
 
diff --git a/Python/fileutils.c b/Python/fileutils.c
index ba26904..72cdee2 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -821,23 +821,41 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
 }
 
 
-// Get the current locale encoding: locale.getpreferredencoding(False).
+// Get the current locale encoding name:
+//
+// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android)
+// - Return "UTF-8" if the UTF-8 Mode is enabled
+// - On Windows, return the ANSI code page (ex: "cp1250")
+// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string
+//   and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS).
+// - Otherwise, return nl_langinfo(CODESET).
+//
+// Return NULL and set errmsg to an error message
+// if nl_langinfo(CODESET) fails.
+//
+// Return NULL and set errmsg to NULL on memory allocation failure.
+//
 // See also config_get_locale_encoding()
-PyObject *
-_Py_GetLocaleEncoding(void)
+wchar_t*
+_Py_GetLocaleEncoding(const char **errmsg)
 {
+    *errmsg = NULL;
 #ifdef _Py_FORCE_UTF8_LOCALE
     // On Android langinfo.h and CODESET are missing,
     // and UTF-8 is always used in mbstowcs() and wcstombs().
-    return PyUnicode_FromString("UTF-8");
+    return _PyMem_RawWcsdup(L"UTF-8");
 #else
     const PyPreConfig *preconfig = &_PyRuntime.preconfig;
     if (preconfig->utf8_mode) {
-        return PyUnicode_FromString("UTF-8");
+        return _PyMem_RawWcsdup(L"UTF-8");
     }
 
-#if defined(MS_WINDOWS)
-    return PyUnicode_FromFormat("cp%u", GetACP());
+#ifdef MS_WINDOWS
+    wchar_t encoding[23];
+    unsigned int ansi_codepage = GetACP();
+    swprintf(encoding, Py_ARRAY_LENGTH(encoding), L"cp%u", ansi_codepage);
+    encoding[Py_ARRAY_LENGTH(encoding) - 1] = 0;
+    return _PyMem_RawWcsdup(encoding);
 #else
     const char *encoding = nl_langinfo(CODESET);
     if (!encoding || encoding[0] == '\0') {
@@ -845,19 +863,45 @@ _Py_GetLocaleEncoding(void)
         // nl_langinfo() can return an empty string when the LC_CTYPE locale is
         // not supported. Default to UTF-8 in that case, because UTF-8 is the
         // default charset on macOS.
-        encoding = "UTF-8";
+        return _PyMem_RawWcsdup(L"UTF-8");
 #else
-        PyErr_SetString(PyExc_ValueError,
-                        "failed to get the locale encoding: "
-                        "nl_langinfo(CODESET) returns an empty string");
+        *errmsg = "failed to get the locale encoding: "
+                  "nl_langinfo(CODESET) returns an empty string";
         return NULL;
 #endif
     }
-    // Decode from UTF-8
-    return PyUnicode_FromString(encoding);
-#endif  // !CODESET
 
-#endif
+    wchar_t *wstr;
+    int res = decode_current_locale(encoding, &wstr, NULL,
+                                    errmsg, _Py_ERROR_SURROGATEESCAPE);
+    if (res < 0) {
+        return NULL;
+    }
+    return wstr;
+#endif  // !MS_WINDOWS
+
+#endif  // !_Py_FORCE_UTF8_LOCALE
+}
+
+
+PyObject *
+_Py_GetLocaleEncodingObject(void)
+{
+    const char *errmsg;
+    wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
+    if (encoding == NULL) {
+        if (errmsg != NULL) {
+            PyErr_SetString(PyExc_ValueError, errmsg);
+        }
+        else {
+            PyErr_NoMemory();
+        }
+        return NULL;
+    }
+
+    PyObject *str = PyUnicode_FromWideChar(encoding, -1);
+    PyMem_RawFree(encoding);
+    return str;
 }
 
 
diff --git a/Python/initconfig.c b/Python/initconfig.c
index e129278..56f4297 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -11,11 +11,7 @@
 #include "osdefs.h"               // DELIM
 
 #include <locale.h>               // setlocale()
-#ifdef HAVE_LANGINFO_H
-#  include <langinfo.h>           // nl_langinfo(CODESET)
-#endif
 #if defined(MS_WINDOWS) || defined(__CYGWIN__)
-#  include <windows.h>            // GetACP()
 #  ifdef HAVE_IO_H
 #    include <io.h>
 #  endif
@@ -1497,41 +1493,24 @@ config_get_stdio_errors(const PyPreConfig *preconfig)
 }
 
 
-// See also _Py_GetLocaleEncoding() and config_get_fs_encoding()
+// See also config_get_fs_encoding()
 static PyStatus
 config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig,
                            wchar_t **locale_encoding)
 {
-#ifdef _Py_FORCE_UTF8_LOCALE
-    return PyConfig_SetString(config, locale_encoding, L"utf-8");
-#else
-    if (preconfig->utf8_mode) {
-        return PyConfig_SetString(config, locale_encoding, L"utf-8");
-    }
-
-#ifdef MS_WINDOWS
-    char encoding[20];
-    PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP());
-    return PyConfig_SetBytesString(config, locale_encoding, encoding);
-#else
-    const char *encoding = nl_langinfo(CODESET);
-    if (!encoding || encoding[0] == '\0') {
-#ifdef _Py_FORCE_UTF8_FS_ENCODING
-        // nl_langinfo() can return an empty string when the LC_CTYPE locale is
-        // not supported. Default to UTF-8 in that case, because UTF-8 is the
-        // default charset on macOS.
-        encoding = "UTF-8";
-#else
-        return _PyStatus_ERR("failed to get the locale encoding: "
-                             "nl_langinfo(CODESET) returns an empty string");
-#endif
+    const char *errmsg;
+    wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
+    if (encoding == NULL) {
+        if (errmsg != NULL) {
+            return _PyStatus_ERR(errmsg);
+        }
+        else {
+            return _PyStatus_NO_MEMORY();
+        }
     }
-    /* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */
-    return CONFIG_SET_BYTES_STR(config,
-                                locale_encoding, encoding,
-                                "nl_langinfo(CODESET)");
-#endif  // !MS_WINDOWS
-#endif  // !_Py_FORCE_UTF8_LOCALE
+    PyStatus status = PyConfig_SetString(config, locale_encoding, encoding);
+    PyMem_RawFree(encoding);
+    return status;
 }
 
 
```