summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJakub Kulík <Kulikjak@gmail.com>2021-04-30 13:21:42 (GMT)
committerGitHub <noreply@github.com>2021-04-30 13:21:42 (GMT)
commit9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86 (patch)
tree86ccc15aac78e1225299e09c12215d942b147d6f
parent4908fae3d57f68694cf006e89fd7761f45003447 (diff)
downloadcpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.zip
cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.tar.gz
cpython-9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86.tar.bz2
bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096)
-rw-r--r--Include/internal/pycore_fileutils.h12
-rw-r--r--Objects/unicodeobject.c40
-rw-r--r--Python/fileutils.c106
-rwxr-xr-xconfigure16
-rw-r--r--configure.ac16
-rw-r--r--pyconfig.h.in4
6 files changed, 194 insertions, 0 deletions
diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
index 9281f4e..c1c9244 100644
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -53,6 +53,18 @@ PyAPI_FUNC(void) _Py_closerange(int first, int last);
PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void);
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+/* Helpers for platforms whose wchar_t values are not Unicode code points
+   in non-Unicode locales (Oracle Solaris). */
+
+/* Return non-zero if the codeset reported by nl_langinfo(CODESET) is
+   neither "UTF-8" nor "646" (ASCII), i.e. wchar_t data must be converted.
+   Return 0 otherwise, including when no codeset is reported. */
+extern int _Py_LocaleUsesNonUnicodeWchar(void);
+
+/* Convert `size` native wchar_t characters to UCS-4.
+   Return a newly allocated buffer to be released with PyMem_Free(), or
+   NULL with an exception set on conversion or allocation failure. */
+extern wchar_t* _Py_DecodeNonUnicodeWchar(
+ const wchar_t* native,
+ Py_ssize_t size);
+
+/* Convert `size` UCS-4 characters stored in `unicode` to the native
+   wchar_t form, overwriting the buffer.
+   Return 0 on success, -1 with an exception set on failure. */
+extern int _Py_EncodeNonUnicodeWchar_InPlace(
+ wchar_t* unicode,
+ Py_ssize_t size);
+#endif
+
#ifdef __cplusplus
}
#endif
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 74c5888..bfd5c88 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -57,6 +57,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include <windows.h>
#endif
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
+#endif
+
/* Uncomment to display statistics on interned strings at exit
in _PyUnicode_ClearInterned(). */
/* #define INTERNED_STATS 1 */
@@ -2217,6 +2221,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+ /* Oracle Solaris uses non-Unicode internal wchar_t form for
+ non-Unicode locales and hence needs conversion to UCS-4 first. */
+ if (_Py_LocaleUsesNonUnicodeWchar()) {
+ wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
+ if (!converted) {
+ return NULL;
+ }
+ PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
+ PyMem_Free(converted);
+ return unicode;
+ }
+#endif
+
/* Single character Unicode objects in the Latin-1 range are
shared when using this constructor */
if (size == 1 && (Py_UCS4)*u < 256)
@@ -3295,6 +3313,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
res = size;
}
unicode_copy_as_widechar(unicode, w, size);
+
+#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+ /* Oracle Solaris uses non-Unicode internal wchar_t form for
+ non-Unicode locales and hence needs conversion first. */
+ if (_Py_LocaleUsesNonUnicodeWchar()) {
+ if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
+ return -1;
+ }
+ }
+#endif
+
return res;
}
@@ -3321,6 +3350,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
return NULL;
}
unicode_copy_as_widechar(unicode, buffer, buflen + 1);
+
+#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+ /* Oracle Solaris uses non-Unicode internal wchar_t form for
+ non-Unicode locales and hence needs conversion first. */
+ if (_Py_LocaleUsesNonUnicodeWchar()) {
+ if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
+ return NULL;
+ }
+ }
+#endif
+
if (size != NULL) {
*size = buflen;
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 2a079bb..a8fab00 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -18,6 +18,10 @@ extern int winerror_to_errno(int);
#include <sys/ioctl.h>
#endif
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include <iconv.h>
+#endif
+
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
@@ -93,6 +97,12 @@ _Py_device_encoding(int fd)
static size_t
is_valid_wide_char(wchar_t ch)
{
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+ /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
+ for non-Unicode locales, which makes values higher than MAX_UNICODE
+ possibly valid. */
+ return 1;
+#endif
if (Py_UNICODE_IS_SURROGATE(ch)) {
// Reject lone surrogate characters
return 0;
@@ -922,6 +932,102 @@ _Py_GetLocaleEncodingObject(void)
return str;
}
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
+/* Tell whether the current locale stores wchar_t values in a non-Unicode
+   internal form.
+
+   Return 1 when nl_langinfo(CODESET) names anything other than "UTF-8" or
+   "646"; return 0 for those two codesets or when no codeset is reported. */
+int
+_Py_LocaleUsesNonUnicodeWchar(void)
+{
+    const char *charset = nl_langinfo(CODESET);
+    if (charset == NULL) {
+        return 0;
+    }
+
+    /* Oracle Solaris only departs from Unicode code points in non-Unicode
+       locales. "646" is the ISO/IEC 646 standard, which corresponds to
+       the ASCII encoding, and is treated like UTF-8 here. */
+    if (strcmp(charset, "UTF-8") == 0) {
+        return 0;
+    }
+    if (strcmp(charset, "646") == 0) {
+        return 0;
+    }
+    return 1;
+}
+
+/* Convert `size` wide characters from the `fromcode` encoding to the
+   `tocode` encoding with iconv().
+
+   The output buffer is allocated with the same byte length as the input,
+   so both encodings are expected to use sizeof(wchar_t) bytes per
+   character.
+
+   Return a newly allocated array of `size` characters; release it with
+   PyMem_Free(). Return NULL with an exception set on failure:
+   MemoryError if the buffer cannot be allocated, ValueError if
+   iconv_open() or iconv() fails or the output buffer is not filled. */
+static wchar_t *
+_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
+                     const char *tocode, const char *fromcode)
+{
+    Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
+
+    /* Ensure we won't overflow the size. */
+    if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* the string doesn't have to be NULL terminated */
+    wchar_t *target = PyMem_Malloc(size * sizeof(wchar_t));
+    if (target == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    iconv_t cd = iconv_open(tocode, fromcode);
+    if (cd == (iconv_t)-1) {
+        PyErr_Format(PyExc_ValueError, "iconv_open() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+
+    /* iconv()'s prototype takes char **, hence the cast away from const. */
+    char *inbuf = (char *) source;
+    char *outbuf = (char *) target;
+    size_t inbytesleft = sizeof(wchar_t) * size;
+    size_t outbytesleft = inbytesleft;
+
+    size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+    /* The descriptor is not needed past this point, on any path. */
+    iconv_close(cd);
+
+    /* iconv() signals an error with (size_t)-1. Callers read all `size`
+       elements of the result, so a conversion that leaves part of the
+       output buffer unwritten is rejected as well instead of handing back
+       uninitialized memory. */
+    if (ret == (size_t)-1 || outbytesleft != 0) {
+        PyErr_Format(PyExc_ValueError, "iconv() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+
+    return target;
+}
+
+/* Translate a native wide character string into a UCS-4 encoded string.
+   This is required on systems where the internal form of wchar_t is not
+   Unicode code points (e.g. Oracle Solaris).
+
+   On success, return a pointer to a newly allocated string that the caller
+   must release with PyMem_Free(). On conversion or memory allocation
+   error, return NULL with an exception raised. */
+wchar_t *
+_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
+{
+    wchar_t *ucs4 = _Py_ConvertWCharForm(native, size,
+                                         "UCS-4-INTERNAL", "wchar_t");
+    return ucs4;
+}
+
+/* Translate a UCS-4 encoded string into the native wide character form,
+   overwriting the input buffer. This is required on systems where the
+   internal form of wchar_t is not Unicode code points (e.g. Oracle
+   Solaris).
+
+   Overwriting in place works because wchar_t and UCS-4 both use 4 bytes
+   per character and every wchar_t symbol corresponds to exactly one UCS-4
+   symbol and vice versa. (That holds for Oracle Solaris, currently the
+   only system using these functions; other systems may differ.)
+
+   Return 0 on success. Return -1 with an exception raised on conversion
+   or memory allocation error. */
+int
+_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
+{
+    wchar_t *native = _Py_ConvertWCharForm(unicode, size,
+                                           "wchar_t", "UCS-4-INTERNAL");
+    if (native == NULL) {
+        return -1;
+    }
+
+    /* Copy the converted characters back over the caller's buffer. */
+    size_t nbytes = size * sizeof(wchar_t);
+    memcpy(unicode, native, nbytes);
+    PyMem_Free(native);
+    return 0;
+}
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
#ifdef MS_WINDOWS
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
diff --git a/configure b/configure
index ad0367f..08a88aa 100755
--- a/configure
+++ b/configure
@@ -15264,6 +15264,22 @@ else
$as_echo "no" >&6; }
fi
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+ if test -f /etc/os-release; then
+ OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+ if test "x$OS_NAME" = "xOracle Solaris"; then
+ # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+ # non-Unicode locales is not Unicode and hence cannot be used directly.
+ # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+
+$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
+
+ fi
+ fi
+ ;;
+esac
+
# check for endianness
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
diff --git a/configure.ac b/configure.ac
index 3df9bd0..b264329 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4765,6 +4765,22 @@ else
AC_MSG_RESULT(no)
fi
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+ if test -f /etc/os-release; then
+ OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+ if test "x$OS_NAME" = "xOracle Solaris"; then
+ # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+ # non-Unicode locales is not Unicode and hence cannot be used directly.
+ # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+ AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
+ [Define if the internal form of wchar_t in non-Unicode locales
+ is not Unicode.])
+ fi
+ fi
+ ;;
+esac
+
# check for endianness
AC_C_BIGENDIAN
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 6e54d55..63438d8 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -748,6 +748,10 @@
/* Define to 1 if you have the `nice' function. */
#undef HAVE_NICE
+/* Define if the internal form of wchar_t in non-Unicode locales is not
+ Unicode. */
+#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
/* Define to 1 if you have the `openat' function. */
#undef HAVE_OPENAT