bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096)

author: Jakub Kulík <Kulikjak@gmail.com> 2021-04-30 13:21:42 (GMT)
committer: GitHub <noreply@github.com> 2021-04-30 13:21:42 (GMT)
commit: 9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86 (patch)
tree: 86ccc15aac78e1225299e09c12215d942b147d6f /Python
parent: 4908fae3d57f68694cf006e89fd7761f45003447 (diff)
download: cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.zip
cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.tar.gz
cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.tar.bz2
1 files changed, 106 insertions, 0 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 2a079bb..a8fab00 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -18,6 +18,10 @@ extern int winerror_to_errno(int);
 #include <sys/ioctl.h>
 #endif
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include <iconv.h>
+#endif
+
 #ifdef HAVE_FCNTL_H
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */
@@ -93,6 +97,12 @@ _Py_device_encoding(int fd)
 static size_t
 is_valid_wide_char(wchar_t ch)
 {
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
+       for non-Unicode locales, which makes values higher than MAX_UNICODE
+       possibly valid. */
+    return 1;
+#endif
     if (Py_UNICODE_IS_SURROGATE(ch)) {
         // Reject lone surrogate characters
         return 0;
@@ -922,6 +932,102 @@ _Py_GetLocaleEncodingObject(void)
     return str;
 }
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
+/* Return non-zero if the current locale uses a non-Unicode wchar_t form. */
+int
+_Py_LocaleUsesNonUnicodeWchar(void)
+{
+    /* Oracle Solaris uses a non-Unicode internal wchar_t form for
+       non-Unicode locales, so such strings need conversion to Unicode first. */
+    char* codeset = nl_langinfo(CODESET);
+    if (!codeset) {
+        return 0;  /* codeset unavailable: treated as a Unicode wchar_t form */
+    }
+    /* "646" refers to the ISO/IEC 646 standard, which corresponds to ASCII */
+    return (strcmp(codeset, "UTF-8") != 0 && strcmp(codeset, "646") != 0);
+}
+
+static wchar_t *
+_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
+                     const char *tocode, const char *fromcode)
+{
+    Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
+
+    /* Ensure size * sizeof(wchar_t) cannot overflow Py_ssize_t. */
+    if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* No room for a terminator: the string need not be null-terminated. */
+    wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
+    if (target == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    iconv_t cd = iconv_open(tocode, fromcode);
+    if (cd == (iconv_t)-1) {
+        PyErr_Format(PyExc_ValueError, "iconv_open() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+
+    char *inbuf = (char *) source;
+    char *outbuf = (char *) target;
+    size_t inbytesleft = sizeof(wchar_t) * size;
+    size_t outbytesleft = inbytesleft;  /* target has the same byte size */
+
+    size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+    if (ret == DECODE_ERROR) {
+        PyErr_Format(PyExc_ValueError, "iconv() failed");
+        PyMem_Free(target);
+        iconv_close(cd);
+        return NULL;
+    }
+
+    iconv_close(cd);
+    return target;
+}
+
+/* Convert a wide character string to a UCS-4 encoded string. This
+   is necessary on systems where the internal form of wchar_t is not Unicode
+   code points (e.g. Oracle Solaris).
+
+   Return a pointer to a newly allocated string; use PyMem_Free() to free
+   the memory. Return NULL and raise an exception on conversion or memory
+   allocation error. */
+wchar_t *
+_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
+{
+    return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
+}
+
+/* Convert a UCS-4 encoded string to a native wide character string. This
+   is necessary on systems where the internal form of wchar_t is not Unicode
+   code points (e.g. Oracle Solaris).
+
+   The conversion is done in place. This can be done because both wchar_t
+   and UCS-4 use 4-byte encoding, and one wchar_t symbol always corresponds
+   to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
+   which is currently the only system using these functions; it doesn't have
+   to be for other systems).
+
+   Return 0 on success. Return -1 and raise an exception on conversion
+   or memory allocation error. */
+int
+_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
+{
+    wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
+    if (!result) {
+        return -1;
+    }
+    memcpy(unicode, result, size * sizeof(wchar_t));  /* write back in place */
+    PyMem_Free(result);
+    return 0;
+}
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
 
 #ifdef MS_WINDOWS
 static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
author	Jakub Kulík <Kulikjak@gmail.com>	2021-04-30 13:21:42 (GMT)
committer	GitHub <noreply@github.com>	2021-04-30 13:21:42 (GMT)
commit	9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86 (patch)
tree	86ccc15aac78e1225299e09c12215d942b147d6f /Python
parent	4908fae3d57f68694cf006e89fd7761f45003447 (diff)
download	cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.zip cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.tar.gz cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.tar.bz2