From 22eb689cf3de7972a2789db3ad01a86949508ab7 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 26 Jun 2019 00:51:05 +0200 Subject: bpo-37388: Development mode check encoding and errors (GH-14341) In development mode and in debug build, encoding and errors arguments are now checked on string encoding and decoding operations. Examples: open(), str.encode() and bytes.decode(). By default, for best performances, the errors argument is only checked at the first encoding/decoding error, and the encoding argument is sometimes ignored for empty strings. --- Doc/library/stdtypes.rst | 14 +++++ Doc/using/cmdline.rst | 7 +++ Doc/whatsnew/3.9.rst | 9 +++ Lib/_pyio.py | 4 ++ Lib/test/test_bytes.py | 58 ++++++++++++++++++ Lib/test/test_io.py | 49 +++++++++++++++- Lib/test/test_unicode.py | 62 ++++++++++++++++++++ .../2019-06-24-21-53-52.bpo-37388.0XTZmW.rst | 7 +++ Modules/_io/textio.c | 43 ++++++++++++++ Objects/unicodeobject.c | 68 ++++++++++++++++++++-- 10 files changed, 315 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 35a17a1..8575f8a 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1559,9 +1559,16 @@ expression support in the :mod:`re` module). :func:`codecs.register_error`, see section :ref:`error-handlers`. For a list of possible encodings, see section :ref:`standard-encodings`. + By default, the *errors* argument is not checked for best performances, but + only used at the first encoding error. Enable the development mode + (:option:`-X` ``dev`` option), or use a debug build, to check *errors*. + .. versionchanged:: 3.1 Support for keyword arguments added. + .. versionchanged:: 3.9 + The *errors* is now checked in development mode and in debug mode. + .. method:: str.endswith(suffix[, start[, end]]) @@ -2575,6 +2582,10 @@ arbitrary binary data. :func:`codecs.register_error`, see section :ref:`error-handlers`. For a list of possible encodings, see section :ref:`standard-encodings`. + By default, the *errors* argument is not checked for best performances, but + only used at the first decoding error. Enable the development mode + (:option:`-X` ``dev`` option), or use a debug build, to check *errors*. + .. note:: Passing the *encoding* argument to :class:`str` allows decoding any @@ -2584,6 +2595,9 @@ arbitrary binary data. .. versionchanged:: 3.1 Added support for keyword arguments. + .. versionchanged:: 3.9 + The *errors* is now checked in development mode and in debug mode. + .. method:: bytes.endswith(suffix[, start[, end]]) bytearray.endswith(suffix[, start[, end]]) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 87af7e8..e11fe31 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -429,6 +429,9 @@ Miscellaneous options not be more verbose than the default if the code is correct: new warnings are only emitted when an issue is detected. Effect of the developer mode: + * Check *encoding* and *errors* arguments on string encoding and decoding + operations. Examples: :func:`open`, :meth:`str.encode` and + :meth:`bytes.decode`. * Add ``default`` warning filter, as :option:`-W` ``default``. * Install debug hooks on memory allocators: see the :c:func:`PyMem_SetupDebugHooks` C function. @@ -469,6 +472,10 @@ Miscellaneous options The ``-X pycache_prefix`` option. The ``-X dev`` option now logs ``close()`` exceptions in :class:`io.IOBase` destructor. + .. versionchanged:: 3.9 + Using ``-X dev`` option, check *encoding* and *errors* arguments on + string encoding and decoding operations. + Options you shouldn't use ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Doc/whatsnew/3.9.rst b/Doc/whatsnew/3.9.rst index b58c99b..e9a31c8 100644 --- a/Doc/whatsnew/3.9.rst +++ b/Doc/whatsnew/3.9.rst @@ -84,6 +84,15 @@ Other Language Changes this case. (Contributed by Victor Stinner in :issue:`20443`.) +* In development mode and in debug build, *encoding* and *errors* arguments are + now checked on string encoding and decoding operations. Examples: + :func:`open`, :meth:`str.encode` and :meth:`bytes.decode`. + + By default, for best performances, the *errors* argument is only checked at + the first encoding/decoding error, and the *encoding* argument is sometimes + ignored for empty strings. + (Contributed by Victor Stinner in :issue:`37388`.) + New Modules =========== diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 0b6493b..c355164 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -36,6 +36,8 @@ BlockingIOError = BlockingIOError # Does io.IOBase finalizer log the exception if the close() method fails? # The exception is ignored silently by default in release build. _IOBASE_EMITS_UNRAISABLE = (hasattr(sys, "gettotalrefcount") or sys.flags.dev_mode) +# Does open() check its 'errors' argument? +_CHECK_ERRORS = _IOBASE_EMITS_UNRAISABLE def open(file, mode="r", buffering=-1, encoding=None, errors=None, @@ -2022,6 +2024,8 @@ class TextIOWrapper(TextIOBase): else: if not isinstance(errors, str): raise ValueError("invalid errors: %r" % errors) + if _CHECK_ERRORS: + codecs.lookup_error(errors) self._buffer = buffer self._decoded_chars = '' # buffer for text returned from decoder diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index bbd45c7..b5eeb2b 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -12,12 +12,14 @@ import copy import functools import pickle import tempfile +import textwrap import unittest import test.support import test.string_tests import test.list_tests from test.support import bigaddrspacetest, MAX_Py_ssize_t +from test.support.script_helper import assert_python_failure if sys.flags.bytes_warning: @@ -315,6 +317,62 @@ class BaseBytesTest: # Default encoding is utf-8 self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603') + def test_check_encoding_errors(self): + # bpo-37388: bytes(str) and bytes.encode() must check encoding + # and errors arguments in dev mode + invalid = 'Boom, Shaka Laka, Boom!' + encodings = ('ascii', 'utf8', 'latin1') + code = textwrap.dedent(f''' + import sys + type2test = {self.type2test.__name__} + encodings = {encodings!r} + + for data in ('', 'short string'): + try: + type2test(data, encoding={invalid!r}) + except LookupError: + pass + else: + sys.exit(21) + + for encoding in encodings: + try: + type2test(data, encoding=encoding, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(22) + + for data in (b'', b'short string'): + data = type2test(data) + print(repr(data)) + try: + data.decode(encoding={invalid!r}) + except LookupError: + sys.exit(10) + else: + sys.exit(23) + + try: + data.decode(errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(24) + + for encoding in encodings: + try: + data.decode(encoding=encoding, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(25) + + sys.exit(10) + ''') + proc = assert_python_failure('-X', 'dev', '-c', code) + self.assertEqual(proc.rc, 10, proc) + def test_from_int(self): b = self.type2test(0) self.assertEqual(b, self.type2test()) diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index fc474c9..1fe1cba 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -29,6 +29,7 @@ import random import signal import sys import sysconfig +import textwrap import threading import time import unittest @@ -37,7 +38,8 @@ import weakref from collections import deque, UserList from itertools import cycle, count from test import support -from test.support.script_helper import assert_python_ok, run_python_until_end +from test.support.script_helper import ( + assert_python_ok, assert_python_failure, run_python_until_end) from test.support import FakePath import codecs @@ -4130,6 +4132,51 @@ class MiscIOTest(unittest.TestCase): # there used to be a buffer overflow in the parser for rawmode self.assertRaises(ValueError, self.open, support.TESTFN, 'rwax+') + def test_check_encoding_errors(self): + # bpo-37388: open() and TextIOWrapper must check encoding and errors + # arguments in dev mode + mod = self.io.__name__ + filename = __file__ + invalid = 'Boom, Shaka Laka, Boom!' + code = textwrap.dedent(f''' + import sys + from {mod} import open, TextIOWrapper + + try: + open({filename!r}, encoding={invalid!r}) + except LookupError: + pass + else: + sys.exit(21) + + try: + open({filename!r}, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(22) + + fp = open({filename!r}, "rb") + with fp: + try: + TextIOWrapper(fp, encoding={invalid!r}) + except LookupError: + pass + else: + sys.exit(23) + + try: + TextIOWrapper(fp, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(24) + + sys.exit(10) + ''') + proc = assert_python_failure('-X', 'dev', '-c', code) + self.assertEqual(proc.rc, 10, proc) + class CMiscIOTest(MiscIOTest): io = io diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 36b72e4..177d80d 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -11,9 +11,11 @@ import itertools import operator import struct import sys +import textwrap import unittest import warnings from test import support, string_tests +from test.support.script_helper import assert_python_failure # Error handling (bad decoder return) def search_function(encoding): @@ -2436,6 +2438,66 @@ class UnicodeTest(string_tests.CommonTest, support.check_free_after_iterating(self, iter, str) support.check_free_after_iterating(self, reversed, str) + def test_check_encoding_errors(self): + # bpo-37388: str(bytes) and str.decode() must check encoding and errors + # arguments in dev mode + encodings = ('ascii', 'utf8', 'latin1') + invalid = 'Boom, Shaka Laka, Boom!' + code = textwrap.dedent(f''' + import sys + encodings = {encodings!r} + + for data in (b'', b'short string'): + try: + str(data, encoding={invalid!r}) + except LookupError: + pass + else: + sys.exit(21) + + try: + str(data, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(22) + + for encoding in encodings: + try: + str(data, encoding, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(22) + + for data in ('', 'short string'): + try: + data.encode(encoding={invalid!r}) + except LookupError: + pass + else: + sys.exit(23) + + try: + data.encode(errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(24) + + for encoding in encodings: + try: + data.encode(encoding, errors={invalid!r}) + except LookupError: + pass + else: + sys.exit(24) + + sys.exit(10) + ''') + proc = assert_python_failure('-X', 'dev', '-c', code) + self.assertEqual(proc.rc, 10, proc) + class CAPITest(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst b/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst new file mode 100644 index 0000000..295ef1b --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst @@ -0,0 +1,7 @@ +In development mode and in debug build, *encoding* and *errors* arguments are +now checked on string encoding and decoding operations. Examples: :func:`open`, +:meth:`str.encode` and :meth:`bytes.decode`. + +By default, for best performances, the *errors* argument is only checked at the +first encoding/decoding error, and the *encoding* argument is sometimes ignored +for empty strings. diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 73b2756..021231e 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -988,6 +988,46 @@ _textiowrapper_fix_encoder_state(textio *self) return 0; } +static int +io_check_errors(PyObject *errors) +{ + assert(errors != NULL && errors != Py_None); + + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); +#ifndef Py_DEBUG + /* In release mode, only check in development mode (-X dev) */ + if (!interp->config.dev_mode) { + return 0; + } +#else + /* Always check in debug mode */ +#endif + + /* Avoid calling PyCodec_LookupError() before the codec registry is ready: + before_PyUnicode_InitEncodings() is called. */ + if (!interp->fs_codec.encoding) { + return 0; + } + + Py_ssize_t name_length; + const char *name = PyUnicode_AsUTF8AndSize(errors, &name_length); + if (name == NULL) { + return -1; + } + if (strlen(name) != (size_t)name_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character in errors"); + return -1; + } + PyObject *handler = PyCodec_LookupError(name); + if (handler != NULL) { + Py_DECREF(handler); + return 0; + } + return -1; +} + + + /*[clinic input] _io.TextIOWrapper.__init__ buffer: object @@ -1057,6 +1097,9 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, errors->ob_type->tp_name); return -1; } + else if (io_check_errors(errors)) { + return -1; + } if (validate_newline(newline) < 0) { return -1; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index cb1456e..b6f3d8f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -427,6 +427,48 @@ get_error_handler_wide(const wchar_t *errors) } +static inline int +unicode_check_encoding_errors(const char *encoding, const char *errors) +{ + if (encoding == NULL && errors == NULL) { + return 0; + } + + PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); +#ifndef Py_DEBUG + /* In release mode, only check in development mode (-X dev) */ + if (!interp->config.dev_mode) { + return 0; + } +#else + /* Always check in debug mode */ +#endif + + /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the + codec registry is ready: before_PyUnicode_InitEncodings() is called. */ + if (!interp->fs_codec.encoding) { + return 0; + } + + if (encoding != NULL) { + PyObject *handler = _PyCodec_Lookup(encoding); + if (handler == NULL) { + return -1; + } + Py_DECREF(handler); + } + + if (errors != NULL) { + PyObject *handler = PyCodec_LookupError(errors); + if (handler == NULL) { + return -1; + } + Py_DECREF(handler); + } + return 0; +} + + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -3211,12 +3253,15 @@ PyUnicode_FromEncodedObject(PyObject *obj, /* Decoding bytes objects is the most common case and should be fast */ if (PyBytes_Check(obj)) { - if (PyBytes_GET_SIZE(obj) == 0) + if (PyBytes_GET_SIZE(obj) == 0) { + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } _Py_RETURN_UNICODE_EMPTY(); - v = PyUnicode_Decode( + } + return PyUnicode_Decode( PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), encoding, errors); - return v; } if (PyUnicode_Check(obj)) { @@ -3235,6 +3280,9 @@ PyUnicode_FromEncodedObject(PyObject *obj, if (buffer.len == 0) { PyBuffer_Release(&buffer); + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -3302,6 +3350,10 @@ PyUnicode_Decode(const char *s, Py_buffer info; char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */ + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } + if (encoding == NULL) { return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); } @@ -3562,7 +3614,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode) cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C implementation of the locale codec until the codec registry is - initialized and the Python codec is loaded. See initfsencoding(). */ + initialized and the Python codec is loaded. + See _PyUnicode_InitEncodings(). */ if (interp->fs_codec.encoding) { return PyUnicode_AsEncodedString(unicode, interp->fs_codec.encoding, @@ -3591,6 +3644,10 @@ PyUnicode_AsEncodedString(PyObject *unicode, return NULL; } + if (unicode_check_encoding_errors(encoding, errors) < 0) { + return NULL; + } + if (encoding == NULL) { return _PyUnicode_AsUTF8String(unicode, errors); } @@ -3800,7 +3857,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) cannot use it to encode and decode filenames before it is loaded. Load the Python codec requires to encode at least its own filename. Use the C implementation of the locale codec until the codec registry is - initialized and the Python codec is loaded. See initfsencoding(). */ + initialized and the Python codec is loaded. + See _PyUnicode_InitEncodings(). */ if (interp->fs_codec.encoding) { return PyUnicode_Decode(s, size, interp->fs_codec.encoding, -- cgit v0.12