summaryrefslogtreecommitdiffstats
path: root/Python
diff options
context:
space:
mode:
author: Victor Stinner <vstinner@redhat.com> 2018-08-28 15:27:36 (GMT)
committer: GitHub <noreply@github.com> 2018-08-28 15:27:36 (GMT)
commit: d500e5307aec9c5d535f66d567fadb9c587a9a36 (patch)
tree: 37f95130926a65be9419683aca896b3dcfcfceee /Python
parent: 5cb258950ce9b69b1f65646431c464c0c17b1510 (diff)
download: cpython-d500e5307aec9c5d535f66d567fadb9c587a9a36.zip
cpython-d500e5307aec9c5d535f66d567fadb9c587a9a36.tar.gz
cpython-d500e5307aec9c5d535f66d567fadb9c587a9a36.tar.bz2
bpo-34403: On HP-UX, force ASCII for C locale (GH-8969)
On HP-UX with the C or POSIX locale, sys.getfilesystemencoding() now returns "ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale is not coerced). nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses the Latin1 encoding in practice.
Diffstat (limited to 'Python')
-rw-r--r-- Python/coreconfig.c 15
-rw-r--r-- Python/fileutils.c 104
-rw-r--r-- Python/pylifecycle.c 30
3 files changed, 97 insertions, 52 deletions
diff --git a/Python/coreconfig.c b/Python/coreconfig.c
index acf4645..d08d75b 100644
--- a/Python/coreconfig.c
+++ b/Python/coreconfig.c
@@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)
static void
config_init_locale(_PyCoreConfig *config)
{
- if (_Py_LegacyLocaleDetected()) {
+ if (config->coerce_c_locale < 0) {
/* The C locale enables the C locale coercion (PEP 538) */
- if (config->coerce_c_locale < 0) {
+ if (_Py_LegacyLocaleDetected()) {
config->coerce_c_locale = 1;
}
}
+
#ifndef MS_WINDOWS
- const char *ctype_loc = setlocale(LC_CTYPE, NULL);
- if (ctype_loc != NULL
- && (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) {
+ if (config->utf8_mode < 0) {
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
- if (config->utf8_mode < 0) {
+ const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+ if (ctype_loc != NULL
+ && (strcmp(ctype_loc, "C") == 0
+ || strcmp(ctype_loc, "POSIX") == 0))
+ {
config->utf8_mode = 1;
}
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index b413f4e..e756c26 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
extern int _Py_normalize_encoding(const char *, char *, size_t);
-/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
- On these operating systems, nl_langinfo(CODESET) announces an alias of the
+/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
+ and POSIX locale. nl_langinfo(CODESET) announces an alias of the
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
locale.getpreferredencoding() codec. For example, if command line arguments
@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
workaround is also enabled on error, for example if getting the locale
failed.
+ On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
+ announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
+ ASCII encoding in this case.
+
Values of force_ascii:
1: the workaround is used: Py_EncodeLocale() uses
@@ -100,13 +104,46 @@ static int force_ascii = -1;
static int
check_force_ascii(void)
{
- char *loc;
+ char *loc = setlocale(LC_CTYPE, NULL);
+ if (loc == NULL) {
+ goto error;
+ }
+ if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
+ /* the LC_CTYPE locale is different than C and POSIX */
+ return 0;
+ }
+
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
- char *codeset, **alias;
+ const char *codeset = nl_langinfo(CODESET);
+ if (!codeset || codeset[0] == '\0') {
+ /* CODESET is not set or empty */
+ goto error;
+ }
+
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
- int is_ascii;
- unsigned int i;
- char* ascii_aliases[] = {
+ if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+ goto error;
+ }
+
+#ifdef __hpux
+ if (strcmp(encoding, "roman8") == 0) {
+ unsigned char ch;
+ wchar_t wch;
+ size_t res;
+
+ ch = (unsigned char)0xA7;
+ res = mbstowcs(&wch, (char*)&ch, 1);
+ if (res != (size_t)-1 && wch == L'\xA7') {
+ /* On HP-UX with the C locale or the POSIX locale,
+ nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
+ Latin1 encoding in practice. Force ASCII in this case.
+
+ Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
+ return 1;
+ }
+ }
+#else
+ const char* ascii_aliases[] = {
"ascii",
/* Aliases from Lib/encodings/aliases.py */
"646",
@@ -123,27 +160,9 @@ check_force_ascii(void)
"us_ascii",
NULL
};
-#endif
-
- loc = setlocale(LC_CTYPE, NULL);
- if (loc == NULL)
- goto error;
- if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
- /* the LC_CTYPE locale is different than C */
- return 0;
- }
-
-#if defined(HAVE_LANGINFO_H) && defined(CODESET)
- codeset = nl_langinfo(CODESET);
- if (!codeset || codeset[0] == '\0') {
- /* CODESET is not set or empty */
- goto error;
- }
- if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
- goto error;
- is_ascii = 0;
- for (alias=ascii_aliases; *alias != NULL; alias++) {
+ int is_ascii = 0;
+ for (const char **alias=ascii_aliases; *alias != NULL; alias++) {
if (strcmp(encoding, *alias) == 0) {
is_ascii = 1;
break;
@@ -154,13 +173,14 @@ check_force_ascii(void)
return 0;
}
- for (i=0x80; i<0xff; i++) {
- unsigned char ch;
- wchar_t wch;
+ for (unsigned int i=0x80; i<=0xff; i++) {
+ char ch[1];
+ wchar_t wch[1];
size_t res;
- ch = (unsigned char)i;
- res = mbstowcs(&wch, (char*)&ch, 1);
+ unsigned uch = (unsigned char)i;
+ ch[0] = (char)uch;
+ res = mbstowcs(wch, ch, 1);
if (res != (size_t)-1) {
/* decoding a non-ASCII character from the locale encoding succeed:
the locale encoding is not ASCII, force ASCII */
@@ -169,17 +189,29 @@ check_force_ascii(void)
}
/* None of the bytes in the range 0x80-0xff can be decoded from the locale
encoding: the locale encoding is really ASCII */
+#endif /* !defined(__hpux) */
return 0;
#else
/* nl_langinfo(CODESET) is not available: always force ASCII */
return 1;
-#endif
+#endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
error:
/* if an error occurred, force the ASCII encoding */
return 1;
}
+
+int
+_Py_GetForceASCII(void)
+{
+ if (force_ascii == -1) {
+ force_ascii = check_force_ascii();
+ }
+ return force_ascii;
+}
+
+
static int
encode_ascii(const wchar_t *text, char **str,
size_t *error_pos, const char **reason,
@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
*str = result;
return 0;
}
+#else
+int
+_Py_GetForceASCII(void)
+{
+ return 0;
+}
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 28704c1..cc64cf9 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp)
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
}
#else
- if (Py_FileSystemDefaultEncoding == NULL &&
- interp->core_config.utf8_mode)
- {
- Py_FileSystemDefaultEncoding = "utf-8";
- Py_HasFileSystemDefaultEncoding = 1;
- }
- else if (Py_FileSystemDefaultEncoding == NULL) {
- Py_FileSystemDefaultEncoding = get_locale_encoding();
- if (Py_FileSystemDefaultEncoding == NULL) {
- return _Py_INIT_ERR("Unable to get the locale encoding");
+ if (Py_FileSystemDefaultEncoding == NULL) {
+ if (interp->core_config.utf8_mode) {
+ Py_FileSystemDefaultEncoding = "utf-8";
+ Py_HasFileSystemDefaultEncoding = 1;
+ }
+ else if (_Py_GetForceASCII()) {
+ Py_FileSystemDefaultEncoding = "ascii";
+ Py_HasFileSystemDefaultEncoding = 1;
}
+ else {
+ Py_FileSystemDefaultEncoding = get_locale_encoding();
+ if (Py_FileSystemDefaultEncoding == NULL) {
+ return _Py_INIT_ERR("Unable to get the locale encoding");
+ }
- Py_HasFileSystemDefaultEncoding = 0;
- interp->fscodec_initialized = 1;
- return _Py_INIT_OK();
+ Py_HasFileSystemDefaultEncoding = 0;
+ interp->fscodec_initialized = 1;
+ return _Py_INIT_OK();
+ }
}
#endif