summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Victor Stinner <vstinner@python.org> 2020-11-01 19:59:35 (GMT)
committer: GitHub <noreply@github.com> 2020-11-01 19:59:35 (GMT)
commit: 82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4 (patch)
tree: f70b3003306fdc3165740eec39eadec9a2e82a9c
parent: 1f7dfb277e5b88cddc13e5024766be787a3e9127 (diff)
download: cpython-82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4.zip
cpython-82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4.tar.gz
cpython-82458b6cdbae3b849dc11d0d7dc2ab06ef0451c4.tar.bz2
bpo-42236: Enhance _locale._get_locale_encoding() (GH-23083)
* Rename _Py_GetLocaleEncoding() to _Py_GetLocaleEncodingObject().
* Add _Py_GetLocaleEncoding() which returns a wchar_t* string to share
  code between _Py_GetLocaleEncodingObject() and
  config_get_locale_encoding().
* _Py_GetLocaleEncodingObject() now decodes nl_langinfo(CODESET) from
  the current locale encoding with surrogateescape, rather than using
  UTF-8.
-rw-r--r-- Include/internal/pycore_fileutils.h | 3
-rw-r--r-- Modules/_io/textio.c | 2
-rw-r--r-- Modules/_localemodule.c | 2
-rw-r--r-- Python/fileutils.c | 74
-rw-r--r-- Python/initconfig.c | 47
5 files changed, 76 insertions, 52 deletions
diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
index ff7bc48..1ab554f 100644
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -50,7 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
PyAPI_FUNC(void) _Py_closerange(int first, int last);
-PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void);
+PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg);
+PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
#ifdef __cplusplus
}
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index 2078bb3..f08d14e 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -1155,7 +1155,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
}
}
if (encoding == NULL && self->encoding == NULL) {
- self->encoding = _Py_GetLocaleEncoding();
+ self->encoding = _Py_GetLocaleEncodingObject();
if (self->encoding == NULL) {
goto error;
}
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c
index 359deb7..7b3597e 100644
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -783,7 +783,7 @@ static PyObject *
_locale__get_locale_encoding_impl(PyObject *module)
/*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/
{
- return _Py_GetLocaleEncoding();
+ return _Py_GetLocaleEncodingObject();
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index ba26904..72cdee2 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -821,23 +821,41 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
}
-// Get the current locale encoding: locale.getpreferredencoding(False).
+// Get the current locale encoding name:
+//
+// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android)
+// - Return "UTF-8" if the UTF-8 Mode is enabled
+// - On Windows, return the ANSI code page (ex: "cp1250")
+// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string
+// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS).
+// - Otherwise, return nl_langinfo(CODESET).
+//
+// Return NULL and set errmsg to an error message
+// if nl_langinfo(CODESET) fails.
+//
+// Return NULL and set errmsg to NULL on memory allocation failure.
+//
// See also config_get_locale_encoding()
-PyObject *
-_Py_GetLocaleEncoding(void)
+wchar_t*
+_Py_GetLocaleEncoding(const char **errmsg)
{
+ *errmsg = NULL;
#ifdef _Py_FORCE_UTF8_LOCALE
// On Android langinfo.h and CODESET are missing,
// and UTF-8 is always used in mbstowcs() and wcstombs().
- return PyUnicode_FromString("UTF-8");
+ return _PyMem_RawWcsdup(L"UTF-8");
#else
const PyPreConfig *preconfig = &_PyRuntime.preconfig;
if (preconfig->utf8_mode) {
- return PyUnicode_FromString("UTF-8");
+ return _PyMem_RawWcsdup(L"UTF-8");
}
-#if defined(MS_WINDOWS)
- return PyUnicode_FromFormat("cp%u", GetACP());
+#ifdef MS_WINDOWS
+ wchar_t encoding[23];
+ unsigned int ansi_codepage = GetACP();
+ swprintf(encoding, Py_ARRAY_LENGTH(encoding), L"cp%u", ansi_codepage);
+ encoding[Py_ARRAY_LENGTH(encoding) - 1] = 0;
+ return _PyMem_RawWcsdup(encoding);
#else
const char *encoding = nl_langinfo(CODESET);
if (!encoding || encoding[0] == '\0') {
@@ -845,19 +863,45 @@ _Py_GetLocaleEncoding(void)
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
// not supported. Default to UTF-8 in that case, because UTF-8 is the
// default charset on macOS.
- encoding = "UTF-8";
+ return _PyMem_RawWcsdup(L"UTF-8");
#else
- PyErr_SetString(PyExc_ValueError,
- "failed to get the locale encoding: "
- "nl_langinfo(CODESET) returns an empty string");
+ *errmsg = "failed to get the locale encoding: "
+ "nl_langinfo(CODESET) returns an empty string";
return NULL;
#endif
}
- // Decode from UTF-8
- return PyUnicode_FromString(encoding);
-#endif // !CODESET
-#endif
+ wchar_t *wstr;
+ int res = decode_current_locale(encoding, &wstr, NULL,
+ errmsg, _Py_ERROR_SURROGATEESCAPE);
+ if (res < 0) {
+ return NULL;
+ }
+ return wstr;
+#endif // !MS_WINDOWS
+
+#endif // !_Py_FORCE_UTF8_LOCALE
+}
+
+
+PyObject *
+_Py_GetLocaleEncodingObject(void)
+{
+ const char *errmsg;
+ wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
+ if (encoding == NULL) {
+ if (errmsg != NULL) {
+ PyErr_SetString(PyExc_ValueError, errmsg);
+ }
+ else {
+ PyErr_NoMemory();
+ }
+ return NULL;
+ }
+
+ PyObject *str = PyUnicode_FromWideChar(encoding, -1);
+ PyMem_RawFree(encoding);
+ return str;
}
diff --git a/Python/initconfig.c b/Python/initconfig.c
index e129278..56f4297 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -11,11 +11,7 @@
#include "osdefs.h" // DELIM
#include <locale.h> // setlocale()
-#ifdef HAVE_LANGINFO_H
-# include <langinfo.h> // nl_langinfo(CODESET)
-#endif
#if defined(MS_WINDOWS) || defined(__CYGWIN__)
-# include <windows.h> // GetACP()
# ifdef HAVE_IO_H
# include <io.h>
# endif
@@ -1497,41 +1493,24 @@ config_get_stdio_errors(const PyPreConfig *preconfig)
}
-// See also _Py_GetLocaleEncoding() and config_get_fs_encoding()
+// See also config_get_fs_encoding()
static PyStatus
config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig,
wchar_t **locale_encoding)
{
-#ifdef _Py_FORCE_UTF8_LOCALE
- return PyConfig_SetString(config, locale_encoding, L"utf-8");
-#else
- if (preconfig->utf8_mode) {
- return PyConfig_SetString(config, locale_encoding, L"utf-8");
- }
-
-#ifdef MS_WINDOWS
- char encoding[20];
- PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP());
- return PyConfig_SetBytesString(config, locale_encoding, encoding);
-#else
- const char *encoding = nl_langinfo(CODESET);
- if (!encoding || encoding[0] == '\0') {
-#ifdef _Py_FORCE_UTF8_FS_ENCODING
- // nl_langinfo() can return an empty string when the LC_CTYPE locale is
- // not supported. Default to UTF-8 in that case, because UTF-8 is the
- // default charset on macOS.
- encoding = "UTF-8";
-#else
- return _PyStatus_ERR("failed to get the locale encoding: "
- "nl_langinfo(CODESET) returns an empty string");
-#endif
+ const char *errmsg;
+ wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
+ if (encoding == NULL) {
+ if (errmsg != NULL) {
+ return _PyStatus_ERR(errmsg);
+ }
+ else {
+ return _PyStatus_NO_MEMORY();
+ }
}
- /* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */
- return CONFIG_SET_BYTES_STR(config,
- locale_encoding, encoding,
- "nl_langinfo(CODESET)");
-#endif // !MS_WINDOWS
-#endif // !_Py_FORCE_UTF8_LOCALE
+ PyStatus status = PyConfig_SetString(config, locale_encoding, encoding);
+ PyMem_RawFree(encoding);
+ return status;
}