bpo-37388: Development mode check encoding and errors (GH-14341)

In development mode and in debug build, encoding and errors arguments are now checked on string encoding and decoding operations. Examples: open(), str.encode() and bytes.decode(). By default, for best performance, the errors argument is only checked at the first encoding/decoding error, and the encoding argument is sometimes ignored for empty strings.
author: Victor Stinner <vstinner@redhat.com> 2019-06-25 22:51:05 (GMT)
committer: GitHub <noreply@github.com> 2019-06-25 22:51:05 (GMT)
commit: 22eb689cf3de7972a2789db3ad01a86949508ab7 (patch)
tree: a1d63fa4cf235008e73f92a18ebef57be54ce4a5 /Objects
parent: e1a63c4f21011a3ae77dff624196561070c83446 (diff)
download: cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.zip
cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.gz
cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.bz2
1 files changed, 63 insertions, 5 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index cb1456e..b6f3d8f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,6 +427,48 @@ get_error_handler_wide(const wchar_t *errors)
 }
 
 
+static inline int
+unicode_check_encoding_errors(const char *encoding, const char *errors)
+{
+    if (encoding == NULL && errors == NULL) {  /* nothing to validate */
+        return 0;
+    }
+
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+#ifndef Py_DEBUG
+    /* In a release build (Py_DEBUG undefined), only check in development
+       mode (-X dev); otherwise report success without any lookup. */
+    if (!interp->config.dev_mode) {
+        return 0;
+    }
+#else
+    /* Always check in a debug build (Py_DEBUG defined) */
+#endif
+
+    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
+       codec registry is ready: before _PyUnicode_InitEncodings() is called. */
+    if (!interp->fs_codec.encoding) {
+        return 0;
+    }
+
+    if (encoding != NULL) {
+        PyObject *handler = _PyCodec_Lookup(encoding);
+        if (handler == NULL) {
+            return -1;
+        }
+        Py_DECREF(handler);  /* only the lookup result matters here */
+    }
+
+    if (errors != NULL) {
+        PyObject *handler = PyCodec_LookupError(errors);
+        if (handler == NULL) {
+            return -1;
+        }
+        Py_DECREF(handler);  /* only the lookup result matters here */
+    }
+    return 0;
+}
+
+
 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    This function is kept for backward compatibility with the old API. */
 Py_UNICODE
@@ -3211,12 +3253,15 @@ PyUnicode_FromEncodedObject(PyObject *obj,
 
     /* Decoding bytes objects is the most common case and should be fast */
     if (PyBytes_Check(obj)) {
-        if (PyBytes_GET_SIZE(obj) == 0)
+        if (PyBytes_GET_SIZE(obj) == 0) {
+            if (unicode_check_encoding_errors(encoding, errors) < 0) {
+                return NULL;
+            }
             _Py_RETURN_UNICODE_EMPTY();
-        v = PyUnicode_Decode(
+        }
+        return PyUnicode_Decode(
                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                 encoding, errors);
-        return v;
     }
 
     if (PyUnicode_Check(obj)) {
@@ -3235,6 +3280,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
 
     if (buffer.len == 0) {
         PyBuffer_Release(&buffer);
+        if (unicode_check_encoding_errors(encoding, errors) < 0) {
+            return NULL;
+        }
         _Py_RETURN_UNICODE_EMPTY();
     }
 
@@ -3302,6 +3350,10 @@ PyUnicode_Decode(const char *s,
     Py_buffer info;
     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
 
+    if (unicode_check_encoding_errors(encoding, errors) < 0) {
+        return NULL;
+    }
+
     if (encoding == NULL) {
         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
     }
@@ -3562,7 +3614,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
        cannot use it to encode and decode filenames before it is loaded. Load
        the Python codec requires to encode at least its own filename. Use the C
        implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded. See initfsencoding(). */
+       initialized and the Python codec is loaded.
+       See _PyUnicode_InitEncodings(). */
     if (interp->fs_codec.encoding) {
         return PyUnicode_AsEncodedString(unicode,
                                          interp->fs_codec.encoding,
@@ -3591,6 +3644,10 @@ PyUnicode_AsEncodedString(PyObject *unicode,
         return NULL;
     }
 
+    if (unicode_check_encoding_errors(encoding, errors) < 0) {
+        return NULL;
+    }
+
     if (encoding == NULL) {
         return _PyUnicode_AsUTF8String(unicode, errors);
     }
@@ -3800,7 +3857,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
        cannot use it to encode and decode filenames before it is loaded. Load
        the Python codec requires to encode at least its own filename. Use the C
        implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded. See initfsencoding(). */
+       initialized and the Python codec is loaded.
+       See _PyUnicode_InitEncodings(). */
     if (interp->fs_codec.encoding) {
         return PyUnicode_Decode(s, size,
                                 interp->fs_codec.encoding,
author	Victor Stinner <vstinner@redhat.com>	2019-06-25 22:51:05 (GMT)
committer	GitHub <noreply@github.com>	2019-06-25 22:51:05 (GMT)
commit	22eb689cf3de7972a2789db3ad01a86949508ab7 (patch)
tree	a1d63fa4cf235008e73f92a18ebef57be54ce4a5 /Objects
parent	e1a63c4f21011a3ae77dff624196561070c83446 (diff)
download	cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.zip cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.gz cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.bz2