diff options
author | Victor Stinner <vstinner@redhat.com> | 2019-06-25 22:51:05 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-25 22:51:05 (GMT) |
commit | 22eb689cf3de7972a2789db3ad01a86949508ab7 (patch) | |
tree | a1d63fa4cf235008e73f92a18ebef57be54ce4a5 /Objects | |
parent | e1a63c4f21011a3ae77dff624196561070c83446 (diff) | |
download | cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.zip cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.gz cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.bz2 |
bpo-37388: Development mode check encoding and errors (GH-14341)
In development mode and in debug build, encoding and errors arguments
are now checked on string encoding and decoding operations. Examples:
open(), str.encode() and bytes.decode().
By default, for best performances, the errors argument is only
checked at the first encoding/decoding error, and the encoding
argument is sometimes ignored for empty strings.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 68 |
1 files changed, 63 insertions, 5 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index cb1456e..b6f3d8f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -427,6 +427,48 @@ get_error_handler_wide(const wchar_t *errors) } +static inline int +unicode_check_encoding_errors(const char *encoding, const char *errors) +{ + if (encoding == NULL && errors == NULL) { + return 0; + } + + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); +#ifndef Py_DEBUG + /* In release mode, only check in development mode (-X dev) */ + if (!interp->config.dev_mode) { + return 0; + } +#else + /* Always check in debug mode */ +#endif + + /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the + codec registry is ready: before_PyUnicode_InitEncodings() is called. */ + if (!interp->fs_codec.encoding) { + return 0; + } + + if (encoding != NULL) { + PyObject *handler = _PyCodec_Lookup(encoding); + if (handler == NULL) { + return -1; + } + Py_DECREF(handler); + } + + if (errors != NULL) { + PyObject *handler = PyCodec_LookupError(errors); + if (handler == NULL) { + return -1; + } + Py_DECREF(handler); + } + return 0; +} + + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -3211,12 +3253,15 @@ PyUnicode_FromEncodedObject(PyObject *obj, /* Decoding bytes objects is the most common case and should be fast */ if (PyBytes_Check(obj)) { - if (PyBytes_GET_SIZE(obj) == 0) + if (PyBytes_GET_SIZE(obj) == 0) { + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } _Py_RETURN_UNICODE_EMPTY(); - v = PyUnicode_Decode( + } + return PyUnicode_Decode( PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), encoding, errors); - return v; } if (PyUnicode_Check(obj)) { @@ -3235,6 +3280,9 @@ PyUnicode_FromEncodedObject(PyObject *obj, if (buffer.len == 0) { PyBuffer_Release(&buffer); + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -3302,6 +3350,10 @@ PyUnicode_Decode(const char *s, Py_buffer info; char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */ + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } + if (encoding == NULL) { return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); } @@ -3562,7 +3614,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode) cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C implementation of the locale codec until the codec registry is - initialized and the Python codec is loaded. See initfsencoding(). */ + initialized and the Python codec is loaded. + See _PyUnicode_InitEncodings(). */ if (interp->fs_codec.encoding) { return PyUnicode_AsEncodedString(unicode, interp->fs_codec.encoding, @@ -3591,6 +3644,10 @@ PyUnicode_AsEncodedString(PyObject *unicode, return NULL; } + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } + if (encoding == NULL) { return _PyUnicode_AsUTF8String(unicode, errors); } @@ -3800,7 +3857,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C implementation of the locale codec until the codec registry is - initialized and the Python codec is loaded. See initfsencoding(). */ + initialized and the Python codec is loaded. + See _PyUnicode_InitEncodings(). */ if (interp->fs_codec.encoding) { return PyUnicode_Decode(s, size, interp->fs_codec.encoding, |