summaryrefslogtreecommitdiffstats
path: root/Python/pylifecycle.c
diff options
context:
space:
mode:
Diffstat (limited to 'Python/pylifecycle.c')
-rw-r--r--Python/pylifecycle.c174
1 files changed, 167 insertions, 7 deletions
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index ec26824..b7c9822 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -178,6 +178,7 @@ Py_SetStandardStreamEncoding(const char *encoding, const char *errors)
return 0;
}
+
/* Global initializations. Can be undone by Py_FinalizeEx(). Don't
call this twice without an intervening Py_FinalizeEx() call. When
initializations fail, a fatal error is issued and the function does
@@ -330,6 +331,159 @@ initexternalimport(PyInterpreterState *interp)
Py_DECREF(value);
}
+/* Helper functions to better handle the legacy C locale
+ *
+ * The legacy C locale assumes ASCII as the default text encoding, which
+ * causes problems not only for the CPython runtime, but also other
+ * components like GNU readline.
+ *
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
+ * more capable UTF-8 based alternative as follows:
+ *
+ * if (_Py_LegacyLocaleDetected()) {
+ * _Py_CoerceLegacyLocale();
+ * }
+ *
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
+ *
+ * Locale coercion also impacts the default error handler for the standard
+ * streams: while the usual default is "strict", the default for the legacy
+ * C locale and for any of the coercion target locales is "surrogateescape".
+ */
+
+int
+_Py_LegacyLocaleDetected(void)
+{
+#ifndef MS_WINDOWS
+ /* On non-Windows systems, the C locale is considered a legacy locale */
+ const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+ return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0;
+#else
+ /* Windows uses code pages instead of locales, so no locale is legacy */
+ return 0;
+#endif
+}
+
+typedef struct _CandidateLocale {
+ const char *locale_name; /* The locale to try as a coercion target */
+} _LocaleCoercionTarget;
+
+static _LocaleCoercionTarget _TARGET_LOCALES[] = {
+ {"C.UTF-8"},
+ {"C.utf8"},
+ {"UTF-8"},
+ {NULL}
+};
+
+static char *
+get_default_standard_stream_error_handler(void)
+{
+ const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+ if (ctype_loc != NULL) {
+ /* "surrogateescape" is the default in the legacy C locale */
+ if (strcmp(ctype_loc, "C") == 0) {
+ return "surrogateescape";
+ }
+
+#ifdef PY_COERCE_C_LOCALE
+ /* "surrogateescape" is the default in locale coercion target locales */
+ const _LocaleCoercionTarget *target = NULL;
+ for (target = _TARGET_LOCALES; target->locale_name; target++) {
+ if (strcmp(ctype_loc, target->locale_name) == 0) {
+ return "surrogateescape";
+ }
+ }
+#endif
+ }
+
+ /* Otherwise return NULL to request the typical default error handler */
+ return NULL;
+}
+
+#ifdef PY_COERCE_C_LOCALE
+static const char *_C_LOCALE_COERCION_WARNING =
+ "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale "
+ "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n";
+
+static void
+_coerce_default_locale_settings(const _LocaleCoercionTarget *target)
+{
+ const char *newloc = target->locale_name;
+
+ /* Reset locale back to currently configured defaults */
+ setlocale(LC_ALL, "");
+
+ /* Set the relevant locale environment variable */
+ if (setenv("LC_CTYPE", newloc, 1)) {
+ fprintf(stderr,
+ "Error setting LC_CTYPE, skipping C locale coercion\n");
+ return;
+ }
+ fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
+
+ /* Reconfigure with the overridden environment variables */
+ setlocale(LC_ALL, "");
+}
+#endif
+
+void
+_Py_CoerceLegacyLocale(void)
+{
+#ifdef PY_COERCE_C_LOCALE
+ /* We ignore the Python -E and -I flags here, as the CLI needs to sort out
+ * the locale settings *before* we try to do anything with the command
+ * line arguments. For cross-platform debugging purposes, we also need
+ * to give end users a way to force even scripts that are otherwise
+ * isolated from their environment to use the legacy ASCII-centric C
+ * locale.
+ *
+ * Ignoring -E and -I is safe from a security perspective, as we only use
+ * the setting to turn *off* the implicit locale coercion, and anyone with
+ * access to the process environment already has the ability to set
+ * `LC_ALL=C` to override the C level locale settings anyway.
+ */
+ const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+ if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
+ /* PYTHONCOERCECLOCALE is not set, or is set to something other than "0" */
+ const char *locale_override = getenv("LC_ALL");
+ if (locale_override == NULL || *locale_override == '\0') {
+ /* LC_ALL is also not set (or is set to an empty string) */
+ const _LocaleCoercionTarget *target = NULL;
+ for (target = _TARGET_LOCALES; target->locale_name; target++) {
+ const char *new_locale = setlocale(LC_CTYPE,
+ target->locale_name);
+ if (new_locale != NULL) {
+ /* Successfully configured locale, so make it the default */
+ _coerce_default_locale_settings(target);
+ return;
+ }
+ }
+ }
+ }
+ /* No C locale warning here, as Py_Initialize will emit one later */
+#endif
+}
+
+
+#ifdef PY_WARN_ON_C_LOCALE
+static const char *_C_LOCALE_WARNING =
+ "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
+ "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
+ "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
+ "locales is recommended.\n";
+
+static void
+_emit_stderr_warning_for_c_locale(void)
+{
+ const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+ if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
+ if (_Py_LegacyLocaleDetected()) {
+ fprintf(stderr, "%s", _C_LOCALE_WARNING);
+ }
+ }
+}
+#endif
+
/* Global initializations. Can be undone by Py_Finalize(). Don't
call this twice without an intervening Py_Finalize() call.
@@ -396,11 +550,21 @@ void _Py_InitializeCore(const _PyCoreConfig *config)
*/
_Py_Finalizing = NULL;
-#ifdef HAVE_SETLOCALE
+#ifdef __ANDROID__
+ /* Passing "" to setlocale() on Android requests the C locale rather
+ * than checking environment variables, so request C.UTF-8 explicitly
+ */
+ setlocale(LC_CTYPE, "C.UTF-8");
+#else
+#ifndef MS_WINDOWS
/* Set up the LC_CTYPE locale, so we can obtain
the locale's charset without having to switch
locales. */
setlocale(LC_CTYPE, "");
+#ifdef PY_WARN_ON_C_LOCALE
+ _emit_stderr_warning_for_c_locale();
+#endif
+#endif
#endif
if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0')
@@ -1457,12 +1621,8 @@ initstdio(void)
}
}
if (!errors && !(pythonioencoding && *pythonioencoding)) {
- /* When the LC_CTYPE locale is the POSIX locale ("C locale"),
- stdin and stdout use the surrogateescape error handler by
- default, instead of the strict error handler. */
- char *loc = setlocale(LC_CTYPE, NULL);
- if (loc != NULL && strcmp(loc, "C") == 0)
- errors = "surrogateescape";
+ /* Choose the default error handler based on the current locale */
+ errors = get_default_standard_stream_error_handler();
}
}