From 7303f06846b69016a075bca7ad7c6055f29ad024 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Tue, 17 Dec 2024 12:12:45 +0100
Subject: gh-126742: Add _PyErr_SetLocaleString, use it for gdbm & dlerror
 messages (GH-126746)

- Add a helper to set an error from locale-encoded `char*`
- Use the helper for gdbm & dlerror messages

Co-authored-by: Victor Stinner <vstinner@python.org>
---
 Include/internal/pycore_pyerrors.h   | 12 ++++++
 Lib/test/test_ctypes/test_dlerror.py | 80 +++++++++++++++++++++++++++++++-----
 Lib/test/test_dbm_gnu.py             | 21 +++++++---
 Modules/_ctypes/_ctypes.c            | 28 ++++---------
 Modules/_ctypes/callproc.c           | 38 ++++++++---------
 Modules/_gdbmmodule.c                | 44 +++++++++++++++-----
 Modules/_hashopenssl.c               |  1 +
 Modules/_sqlite/util.c               |  1 +
 Modules/_testcapi/exceptions.c       |  1 +
 Modules/main.c                       |  1 +
 Modules/pyexpat.c                    |  7 +++-
 Python/errors.c                      |  9 ++++
 12 files changed, 175 insertions(+), 68 deletions(-)

diff --git a/Include/internal/pycore_pyerrors.h b/Include/internal/pycore_pyerrors.h
index 02945f0..6f2fdda 100644
--- a/Include/internal/pycore_pyerrors.h
+++ b/Include/internal/pycore_pyerrors.h
@@ -130,6 +130,18 @@ PyAPI_FUNC(void) _PyErr_SetString(
     PyObject *exception,
     const char *string);
 
+/*
+ * Set an exception with the error message decoded from the current locale
+ * encoding (LC_CTYPE).
+ *
+ * Exceptions occurring in decoding take priority over the desired exception.
+ *
+ * Exported for '_ctypes' shared extensions.
+ */
+PyAPI_FUNC(void) _PyErr_SetLocaleString(
+    PyObject *exception,
+    const char *string);
+
 PyAPI_FUNC(PyObject*) _PyErr_Format(
     PyThreadState *tstate,
     PyObject *exception,
diff --git a/Lib/test/test_ctypes/test_dlerror.py b/Lib/test/test_ctypes/test_dlerror.py
index 4441e30..c3c34d4 100644
--- a/Lib/test/test_ctypes/test_dlerror.py
+++ b/Lib/test/test_ctypes/test_dlerror.py
@@ -1,7 +1,12 @@
+import _ctypes
 import os
+import platform
 import sys
+import test.support
 import unittest
-import platform
+from ctypes import CDLL, c_int
+from ctypes.util import find_library
+
 
 FOO_C = r"""
 #include <unistd.h>
@@ -26,7 +31,7 @@ void *foo(void)
 
 
 @unittest.skipUnless(sys.platform.startswith('linux'),
-                     'Test only valid for Linux')
+                     'test requires GNU IFUNC support')
 class TestNullDlsym(unittest.TestCase):
     """GH-126554: Ensure that we catch NULL dlsym return values
 
@@ -53,14 +58,6 @@ class TestNullDlsym(unittest.TestCase):
         import subprocess
         import tempfile
 
-        # To avoid ImportErrors on Windows, where _ctypes does not have
-        # dlopen and dlsym,
-        # import here, i.e., inside the test function.
-        # The skipUnless('linux') decorator ensures that we're on linux
-        # if we're executing these statements.
-        from ctypes import CDLL, c_int
-        from _ctypes import dlopen, dlsym
-
         retcode = subprocess.call(["gcc", "--version"],
                                   stdout=subprocess.DEVNULL,
                                   stderr=subprocess.DEVNULL)
@@ -111,6 +108,8 @@ class TestNullDlsym(unittest.TestCase):
             self.assertEqual(os.read(pipe_r, 2), b'OK')
 
             # Case #3: Test 'py_dl_sym' from Modules/_ctypes/callproc.c
+            dlopen = test.support.get_attribute(_ctypes, 'dlopen')
+            dlsym = test.support.get_attribute(_ctypes, 'dlsym')
             L = dlopen(dstname)
             with self.assertRaisesRegex(OSError, "symbol 'foo' not found"):
                 dlsym(L, "foo")
@@ -119,5 +118,66 @@ class TestNullDlsym(unittest.TestCase):
             self.assertEqual(os.read(pipe_r, 2), b'OK')
 
 
+@unittest.skipUnless(os.name != 'nt', 'test requires dlerror() calls')
+class TestLocalization(unittest.TestCase):
+
+    @staticmethod
+    def configure_locales(func):
+        return test.support.run_with_locale(
+            'LC_ALL',
+            'fr_FR.iso88591', 'ja_JP.sjis', 'zh_CN.gbk',
+            'fr_FR.utf8', 'en_US.utf8',
+            '',
+        )(func)
+
+    @classmethod
+    def setUpClass(cls):
+        cls.libc_filename = find_library("c")
+
+    @configure_locales
+    def test_localized_error_from_dll(self):
+        dll = CDLL(self.libc_filename)
+        with self.assertRaises(AttributeError) as cm:
+            dll.this_name_does_not_exist
+        if sys.platform.startswith('linux'):
+            # On macOS, the filename is not reported by dlerror().
+            self.assertIn(self.libc_filename, str(cm.exception))
+
+    @configure_locales
+    def test_localized_error_in_dll(self):
+        dll = CDLL(self.libc_filename)
+        with self.assertRaises(ValueError) as cm:
+            c_int.in_dll(dll, 'this_name_does_not_exist')
+        if sys.platform.startswith('linux'):
+            # On macOS, the filename is not reported by dlerror().
+            self.assertIn(self.libc_filename, str(cm.exception))
+
+    @unittest.skipUnless(hasattr(_ctypes, 'dlopen'),
+                         'test requires _ctypes.dlopen()')
+    @configure_locales
+    def test_localized_error_dlopen(self):
+        missing_filename = b'missing\xff.so'
+        # Depending whether the locale, we may encode '\xff' differently
+        # but we are only interested in avoiding a UnicodeDecodeError
+        # when reporting the dlerror() error message which contains
+        # the localized filename.
+        filename_pattern = r'missing.*?\.so'
+        with self.assertRaisesRegex(OSError, filename_pattern):
+            _ctypes.dlopen(missing_filename, 2)
+
+    @unittest.skipUnless(hasattr(_ctypes, 'dlopen'),
+                         'test requires _ctypes.dlopen()')
+    @unittest.skipUnless(hasattr(_ctypes, 'dlsym'),
+                         'test requires _ctypes.dlsym()')
+    @configure_locales
+    def test_localized_error_dlsym(self):
+        dll = _ctypes.dlopen(self.libc_filename)
+        with self.assertRaises(OSError) as cm:
+            _ctypes.dlsym(dll, 'this_name_does_not_exist')
+        if sys.platform.startswith('linux'):
+            # On macOS, the filename is not reported by dlerror().
+            self.assertIn(self.libc_filename, str(cm.exception))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/test/test_dbm_gnu.py b/Lib/test/test_dbm_gnu.py
index e20addf..66268c4 100644
--- a/Lib/test/test_dbm_gnu.py
+++ b/Lib/test/test_dbm_gnu.py
@@ -1,10 +1,11 @@
-from test import support
-from test.support import import_helper, cpython_only
-gdbm = import_helper.import_module("dbm.gnu") #skip if not supported
-import unittest
 import os
-from test.support.os_helper import TESTFN, TESTFN_NONASCII, unlink, FakePath
+import unittest
+from test import support
+from test.support import cpython_only, import_helper
+from test.support.os_helper import (TESTFN, TESTFN_NONASCII, FakePath,
+                                    create_empty_file, temp_dir, unlink)
 
+gdbm = import_helper.import_module("dbm.gnu")  # skip if not supported
 
 filename = TESTFN
 
@@ -205,6 +206,16 @@ class TestGdbm(unittest.TestCase):
                 self.assertNotIn(k, db)
             self.assertEqual(len(db), 0)
 
+    @support.run_with_locale(
+        'LC_ALL',
+        'fr_FR.iso88591', 'ja_JP.sjis', 'zh_CN.gbk',
+        'fr_FR.utf8', 'en_US.utf8',
+        '',
+    )
+    def test_localized_error(self):
+        with temp_dir() as d:
+            create_empty_file(os.path.join(d, 'test'))
+            self.assertRaises(gdbm.error, gdbm.open, filename, 'r')
 
 
 if __name__ == '__main__':
diff --git a/Modules/_ctypes/_ctypes.c b/Modules/_ctypes/_ctypes.c
index bb46998..3a3b1da 100644
--- a/Modules/_ctypes/_ctypes.c
+++ b/Modules/_ctypes/_ctypes.c
@@ -984,15 +984,8 @@ CDataType_in_dll_impl(PyObject *type, PyTypeObject *cls, PyObject *dll,
     #ifdef USE_DLERROR
     const char *dlerr = dlerror();
     if (dlerr) {
-        PyObject *message = PyUnicode_DecodeLocale(dlerr, "surrogateescape");
-        if (message) {
-            PyErr_SetObject(PyExc_ValueError, message);
-            Py_DECREF(message);
-            return NULL;
-        }
-        // Ignore errors from PyUnicode_DecodeLocale,
-        // fall back to the generic error below.
-        PyErr_Clear();
+        _PyErr_SetLocaleString(PyExc_ValueError, dlerr);
+        return NULL;
     }
     #endif
 #undef USE_DLERROR
@@ -3825,21 +3818,14 @@ PyCFuncPtr_FromDll(PyTypeObject *type, PyObject *args, PyObject *kwds)
     address = (PPROC)dlsym(handle, name);
 
     if (!address) {
-	#ifdef USE_DLERROR
+    #ifdef USE_DLERROR
         const char *dlerr = dlerror();
         if (dlerr) {
-            PyObject *message = PyUnicode_DecodeLocale(dlerr, "surrogateescape");
-            if (message) {
-                PyErr_SetObject(PyExc_AttributeError, message);
-                Py_DECREF(ftuple);
-                Py_DECREF(message);
-                return NULL;
-            }
-            // Ignore errors from PyUnicode_DecodeLocale,
-            // fall back to the generic error below.
-            PyErr_Clear();
+            _PyErr_SetLocaleString(PyExc_AttributeError, dlerr);
+            Py_DECREF(ftuple);
+            return NULL;
         }
-	#endif
+    #endif
         PyErr_Format(PyExc_AttributeError, "function '%s' not found", name);
         Py_DECREF(ftuple);
         return NULL;
diff --git a/Modules/_ctypes/callproc.c b/Modules/_ctypes/callproc.c
index 218c3a9..92eedff 100644
--- a/Modules/_ctypes/callproc.c
+++ b/Modules/_ctypes/callproc.c
@@ -1588,10 +1588,11 @@ static PyObject *py_dl_open(PyObject *self, PyObject *args)
     Py_XDECREF(name2);
     if (!handle) {
         const char *errmsg = dlerror();
-        if (!errmsg)
-            errmsg = "dlopen() error";
-        PyErr_SetString(PyExc_OSError,
-                               errmsg);
+        if (errmsg) {
+            _PyErr_SetLocaleString(PyExc_OSError, errmsg);
+            return NULL;
+        }
+        PyErr_SetString(PyExc_OSError, "dlopen() error");
         return NULL;
     }
     return PyLong_FromVoidPtr(handle);
@@ -1604,8 +1605,12 @@ static PyObject *py_dl_close(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "O&:dlclose", &_parse_voidp, &handle))
         return NULL;
     if (dlclose(handle)) {
-        PyErr_SetString(PyExc_OSError,
-                               dlerror());
+        const char *errmsg = dlerror();
+        if (errmsg) {
+            _PyErr_SetLocaleString(PyExc_OSError, errmsg);
+            return NULL;
+        }
+        PyErr_SetString(PyExc_OSError, "dlclose() error");
         return NULL;
     }
     Py_RETURN_NONE;
@@ -1639,21 +1644,14 @@ static PyObject *py_dl_sym(PyObject *self, PyObject *args)
     if (ptr) {
         return PyLong_FromVoidPtr(ptr);
     }
-	#ifdef USE_DLERROR
-    const char *dlerr = dlerror();
-    if (dlerr) {
-        PyObject *message = PyUnicode_DecodeLocale(dlerr, "surrogateescape");
-        if (message) {
-            PyErr_SetObject(PyExc_OSError, message);
-            Py_DECREF(message);
-            return NULL;
-        }
-        // Ignore errors from PyUnicode_DecodeLocale,
-        // fall back to the generic error below.
-        PyErr_Clear();
+    #ifdef USE_DLERROR
+    const char *errmsg = dlerror();
+    if (errmsg) {
+        _PyErr_SetLocaleString(PyExc_OSError, errmsg);
+        return NULL;
     }
-	#endif
-	#undef USE_DLERROR
+    #endif
+    #undef USE_DLERROR
     PyErr_Format(PyExc_OSError, "symbol '%s' not found", name);
     return NULL;
 }
diff --git a/Modules/_gdbmmodule.c b/Modules/_gdbmmodule.c
index df7fba6..ea4fe24 100644
--- a/Modules/_gdbmmodule.c
+++ b/Modules/_gdbmmodule.c
@@ -8,10 +8,11 @@
 #endif
 
 #include "Python.h"
+#include "pycore_pyerrors.h"        // _PyErr_SetLocaleString()
 #include "gdbm.h"
 
 #include <fcntl.h>
-#include <stdlib.h>               // free()
+#include <stdlib.h>                 // free()
 #include <sys/stat.h>
 #include <sys/types.h>
 
@@ -33,6 +34,24 @@ get_gdbm_state(PyObject *module)
     return (_gdbm_state *)state;
 }
 
+/*
+ * Set the gdbm error obtained by gdbm_strerror(gdbm_errno).
+ *
+ * If no error message exists, a generic (UTF-8) error message
+ * is used instead.
+ */
+static void
+set_gdbm_error(_gdbm_state *state, const char *generic_error)
+{
+    const char *gdbm_errmsg = gdbm_strerror(gdbm_errno);
+    if (gdbm_errmsg) {
+        _PyErr_SetLocaleString(state->gdbm_error, gdbm_errmsg);
+    }
+    else {
+        PyErr_SetString(state->gdbm_error, generic_error);
+    }
+}
+
 /*[clinic input]
 module _gdbm
 class _gdbm.gdbm "gdbmobject *" "&Gdbmtype"
@@ -91,7 +110,7 @@ newgdbmobject(_gdbm_state *state, const char *file, int flags, int mode)
             PyErr_SetFromErrnoWithFilename(state->gdbm_error, file);
         }
         else {
-            PyErr_SetString(state->gdbm_error, gdbm_strerror(gdbm_errno));
+            set_gdbm_error(state, "gdbm_open() error");
         }
         Py_DECREF(dp);
         return NULL;
@@ -136,7 +155,7 @@ gdbm_length(gdbmobject *dp)
                 PyErr_SetFromErrno(state->gdbm_error);
             }
             else {
-                PyErr_SetString(state->gdbm_error, gdbm_strerror(gdbm_errno));
+                set_gdbm_error(state, "gdbm_count() error");
             }
             return -1;
         }
@@ -286,7 +305,7 @@ gdbm_ass_sub(gdbmobject *dp, PyObject *v, PyObject *w)
                 PyErr_SetObject(PyExc_KeyError, v);
             }
             else {
-                PyErr_SetString(state->gdbm_error, gdbm_strerror(gdbm_errno));
+                set_gdbm_error(state, "gdbm_delete() error");
             }
             return -1;
         }
@@ -297,11 +316,12 @@ gdbm_ass_sub(gdbmobject *dp, PyObject *v, PyObject *w)
         }
         errno = 0;
         if (gdbm_store(dp->di_dbm, krec, drec, GDBM_REPLACE) < 0) {
-            if (errno != 0)
+            if (errno != 0) {
                 PyErr_SetFromErrno(state->gdbm_error);
-            else
-                PyErr_SetString(state->gdbm_error,
-                                gdbm_strerror(gdbm_errno));
+            }
+            else {
+                set_gdbm_error(state, "gdbm_store() error");
+            }
             return -1;
         }
     }
@@ -534,10 +554,12 @@ _gdbm_gdbm_reorganize_impl(gdbmobject *self, PyTypeObject *cls)
     check_gdbmobject_open(self, state->gdbm_error);
     errno = 0;
     if (gdbm_reorganize(self->di_dbm) < 0) {
-        if (errno != 0)
+        if (errno != 0) {
             PyErr_SetFromErrno(state->gdbm_error);
-        else
-            PyErr_SetString(state->gdbm_error, gdbm_strerror(gdbm_errno));
+        }
+        else {
+            set_gdbm_error(state, "gdbm_reorganize() error");
+        }
         return NULL;
     }
     Py_RETURN_NONE;
diff --git a/Modules/_hashopenssl.c b/Modules/_hashopenssl.c
index 2c9a9fe..082929b 100644
--- a/Modules/_hashopenssl.c
+++ b/Modules/_hashopenssl.c
@@ -319,6 +319,7 @@ _setException(PyObject *exc, const char* altmsg, ...)
     va_end(vargs);
     ERR_clear_error();
 
+    /* ERR_ERROR_STRING(3) ensures that the messages below are ASCII */
     lib = ERR_lib_error_string(errcode);
     func = ERR_func_error_string(errcode);
     reason = ERR_reason_error_string(errcode);
diff --git a/Modules/_sqlite/util.c b/Modules/_sqlite/util.c
index 9e8613e..b0622e6 100644
--- a/Modules/_sqlite/util.c
+++ b/Modules/_sqlite/util.c
@@ -134,6 +134,7 @@ _pysqlite_seterror(pysqlite_state *state, sqlite3 *db)
 
     /* Create and set the exception. */
     int extended_errcode = sqlite3_extended_errcode(db);
+    // sqlite3_errmsg() always returns an UTF-8 encoded message
     const char *errmsg = sqlite3_errmsg(db);
     raise_exception(exc_class, extended_errcode, errmsg);
     return extended_errcode;
diff --git a/Modules/_testcapi/exceptions.c b/Modules/_testcapi/exceptions.c
index e92d967..b647bfc 100644
--- a/Modules/_testcapi/exceptions.c
+++ b/Modules/_testcapi/exceptions.c
@@ -3,6 +3,7 @@
 
 #include "parts.h"
 #include "util.h"
+
 #include "clinic/exceptions.c.h"
 
 
diff --git a/Modules/main.c b/Modules/main.c
index 15ea49a..3bf2241 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -374,6 +374,7 @@ pymain_run_file_obj(PyObject *program_name, PyObject *filename,
     if (fp == NULL) {
         // Ignore the OSError
         PyErr_Clear();
+        // TODO(picnixz): strerror() is locale dependent but not PySys_FormatStderr().
         PySys_FormatStderr("%S: can't open file %R: [Errno %d] %s\n",
                            program_name, filename, errno, strerror(errno));
         return 2;
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
index 9733bc3..cf7714e 100644
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1782,7 +1782,12 @@ add_error(PyObject *errors_module, PyObject *codes_dict,
      *       with the other uses of the XML_ErrorString function
      *       elsewhere within this file.  pyexpat's copy of the messages
      *       only acts as a fallback in case of outdated runtime libexpat,
-     *       where it returns NULL. */
+     *       where it returns NULL.
+     *
+     *       In addition, XML_ErrorString is assumed to return UTF-8 encoded
+     *       strings (in conv_string_to_unicode, we decode them using 'strict'
+     *       error handling).
+     */
     const char *error_string = XML_ErrorString(error_code);
     if (error_string == NULL) {
         error_string = error_info_of[error_index].description;
diff --git a/Python/errors.c b/Python/errors.c
index 7f3b4aa..2d362c1 100644
--- a/Python/errors.c
+++ b/Python/errors.c
@@ -301,6 +301,15 @@ PyErr_SetString(PyObject *exception, const char *string)
     _PyErr_SetString(tstate, exception, string);
 }
 
+void
+_PyErr_SetLocaleString(PyObject *exception, const char *string)
+{
+    PyObject *value = PyUnicode_DecodeLocale(string, "surrogateescape");
+    if (value != NULL) {
+        PyErr_SetObject(exception, value);
+        Py_DECREF(value);
+    }
+}
 
 PyObject* _Py_HOT_FUNCTION
 PyErr_Occurred(void)
-- 
cgit v0.12