summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <vstinner@redhat.com>2019-06-25 22:51:05 (GMT)
committerGitHub <noreply@github.com>2019-06-25 22:51:05 (GMT)
commit22eb689cf3de7972a2789db3ad01a86949508ab7 (patch)
treea1d63fa4cf235008e73f92a18ebef57be54ce4a5
parente1a63c4f21011a3ae77dff624196561070c83446 (diff)
downloadcpython-22eb689cf3de7972a2789db3ad01a86949508ab7.zip
cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.gz
cpython-22eb689cf3de7972a2789db3ad01a86949508ab7.tar.bz2
bpo-37388: Development mode check encoding and errors (GH-14341)
In development mode and in debug build, encoding and errors arguments are now checked on string encoding and decoding operations. Examples: open(), str.encode() and bytes.decode(). By default, for best performances, the errors argument is only checked at the first encoding/decoding error, and the encoding argument is sometimes ignored for empty strings.
-rw-r--r--Doc/library/stdtypes.rst14
-rw-r--r--Doc/using/cmdline.rst7
-rw-r--r--Doc/whatsnew/3.9.rst9
-rw-r--r--Lib/_pyio.py4
-rw-r--r--Lib/test/test_bytes.py58
-rw-r--r--Lib/test/test_io.py49
-rw-r--r--Lib/test/test_unicode.py62
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst7
-rw-r--r--Modules/_io/textio.c43
-rw-r--r--Objects/unicodeobject.c68
10 files changed, 315 insertions, 6 deletions
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
index 35a17a1..8575f8a 100644
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1559,9 +1559,16 @@ expression support in the :mod:`re` module).
:func:`codecs.register_error`, see section :ref:`error-handlers`. For a
list of possible encodings, see section :ref:`standard-encodings`.
+ By default, the *errors* argument is not checked for best performances, but
+ only used at the first encoding error. Enable the development mode
+ (:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
+
.. versionchanged:: 3.1
Support for keyword arguments added.
+ .. versionchanged:: 3.9
+ The *errors* is now checked in development mode and in debug mode.
+
.. method:: str.endswith(suffix[, start[, end]])
@@ -2575,6 +2582,10 @@ arbitrary binary data.
:func:`codecs.register_error`, see section :ref:`error-handlers`. For a
list of possible encodings, see section :ref:`standard-encodings`.
+ By default, the *errors* argument is not checked for best performances, but
+ only used at the first decoding error. Enable the development mode
+ (:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
+
.. note::
Passing the *encoding* argument to :class:`str` allows decoding any
@@ -2584,6 +2595,9 @@ arbitrary binary data.
.. versionchanged:: 3.1
Added support for keyword arguments.
+ .. versionchanged:: 3.9
+ The *errors* is now checked in development mode and in debug mode.
+
.. method:: bytes.endswith(suffix[, start[, end]])
bytearray.endswith(suffix[, start[, end]])
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index 87af7e8..e11fe31 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -429,6 +429,9 @@ Miscellaneous options
not be more verbose than the default if the code is correct: new warnings
are only emitted when an issue is detected. Effect of the developer mode:
+ * Check *encoding* and *errors* arguments on string encoding and decoding
+ operations. Examples: :func:`open`, :meth:`str.encode` and
+ :meth:`bytes.decode`.
* Add ``default`` warning filter, as :option:`-W` ``default``.
* Install debug hooks on memory allocators: see the
:c:func:`PyMem_SetupDebugHooks` C function.
@@ -469,6 +472,10 @@ Miscellaneous options
The ``-X pycache_prefix`` option. The ``-X dev`` option now logs
``close()`` exceptions in :class:`io.IOBase` destructor.
+ .. versionchanged:: 3.9
+ Using ``-X dev`` option, check *encoding* and *errors* arguments on
+ string encoding and decoding operations.
+
Options you shouldn't use
~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Doc/whatsnew/3.9.rst b/Doc/whatsnew/3.9.rst
index b58c99b..e9a31c8 100644
--- a/Doc/whatsnew/3.9.rst
+++ b/Doc/whatsnew/3.9.rst
@@ -84,6 +84,15 @@ Other Language Changes
this case.
(Contributed by Victor Stinner in :issue:`20443`.)
+* In development mode and in debug build, *encoding* and *errors* arguments are
+ now checked on string encoding and decoding operations. Examples:
+ :func:`open`, :meth:`str.encode` and :meth:`bytes.decode`.
+
+ By default, for best performances, the *errors* argument is only checked at
+ the first encoding/decoding error, and the *encoding* argument is sometimes
+ ignored for empty strings.
+ (Contributed by Victor Stinner in :issue:`37388`.)
+
New Modules
===========
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index 0b6493b..c355164 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -36,6 +36,8 @@ BlockingIOError = BlockingIOError
# Does io.IOBase finalizer log the exception if the close() method fails?
# The exception is ignored silently by default in release build.
_IOBASE_EMITS_UNRAISABLE = (hasattr(sys, "gettotalrefcount") or sys.flags.dev_mode)
+# Does open() check its 'errors' argument?
+_CHECK_ERRORS = _IOBASE_EMITS_UNRAISABLE
def open(file, mode="r", buffering=-1, encoding=None, errors=None,
@@ -2022,6 +2024,8 @@ class TextIOWrapper(TextIOBase):
else:
if not isinstance(errors, str):
raise ValueError("invalid errors: %r" % errors)
+ if _CHECK_ERRORS:
+ codecs.lookup_error(errors)
self._buffer = buffer
self._decoded_chars = '' # buffer for text returned from decoder
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index bbd45c7..b5eeb2b 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -12,12 +12,14 @@ import copy
import functools
import pickle
import tempfile
+import textwrap
import unittest
import test.support
import test.string_tests
import test.list_tests
from test.support import bigaddrspacetest, MAX_Py_ssize_t
+from test.support.script_helper import assert_python_failure
if sys.flags.bytes_warning:
@@ -315,6 +317,62 @@ class BaseBytesTest:
# Default encoding is utf-8
self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603')
+ def test_check_encoding_errors(self):
+ # bpo-37388: bytes(str) and bytes.encode() must check encoding
+ # and errors arguments in dev mode
+ invalid = 'Boom, Shaka Laka, Boom!'
+ encodings = ('ascii', 'utf8', 'latin1')
+ code = textwrap.dedent(f'''
+ import sys
+ type2test = {self.type2test.__name__}
+ encodings = {encodings!r}
+
+ for data in ('', 'short string'):
+ try:
+ type2test(data, encoding={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(21)
+
+ for encoding in encodings:
+ try:
+ type2test(data, encoding=encoding, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(22)
+
+ for data in (b'', b'short string'):
+ data = type2test(data)
+ print(repr(data))
+ try:
+ data.decode(encoding={invalid!r})
+ except LookupError:
+ sys.exit(10)
+ else:
+ sys.exit(23)
+
+ try:
+ data.decode(errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(24)
+
+ for encoding in encodings:
+ try:
+ data.decode(encoding=encoding, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(25)
+
+ sys.exit(10)
+ ''')
+ proc = assert_python_failure('-X', 'dev', '-c', code)
+ self.assertEqual(proc.rc, 10, proc)
+
def test_from_int(self):
b = self.type2test(0)
self.assertEqual(b, self.type2test())
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index fc474c9..1fe1cba 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -29,6 +29,7 @@ import random
import signal
import sys
import sysconfig
+import textwrap
import threading
import time
import unittest
@@ -37,7 +38,8 @@ import weakref
from collections import deque, UserList
from itertools import cycle, count
from test import support
-from test.support.script_helper import assert_python_ok, run_python_until_end
+from test.support.script_helper import (
+ assert_python_ok, assert_python_failure, run_python_until_end)
from test.support import FakePath
import codecs
@@ -4130,6 +4132,51 @@ class MiscIOTest(unittest.TestCase):
# there used to be a buffer overflow in the parser for rawmode
self.assertRaises(ValueError, self.open, support.TESTFN, 'rwax+')
+ def test_check_encoding_errors(self):
+ # bpo-37388: open() and TextIOWrapper must check encoding and errors
+ # arguments in dev mode
+ mod = self.io.__name__
+ filename = __file__
+ invalid = 'Boom, Shaka Laka, Boom!'
+ code = textwrap.dedent(f'''
+ import sys
+ from {mod} import open, TextIOWrapper
+
+ try:
+ open({filename!r}, encoding={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(21)
+
+ try:
+ open({filename!r}, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(22)
+
+ fp = open({filename!r}, "rb")
+ with fp:
+ try:
+ TextIOWrapper(fp, encoding={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(23)
+
+ try:
+ TextIOWrapper(fp, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(24)
+
+ sys.exit(10)
+ ''')
+ proc = assert_python_failure('-X', 'dev', '-c', code)
+ self.assertEqual(proc.rc, 10, proc)
+
class CMiscIOTest(MiscIOTest):
io = io
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 36b72e4..177d80d 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -11,9 +11,11 @@ import itertools
import operator
import struct
import sys
+import textwrap
import unittest
import warnings
from test import support, string_tests
+from test.support.script_helper import assert_python_failure
# Error handling (bad decoder return)
def search_function(encoding):
@@ -2436,6 +2438,66 @@ class UnicodeTest(string_tests.CommonTest,
support.check_free_after_iterating(self, iter, str)
support.check_free_after_iterating(self, reversed, str)
+ def test_check_encoding_errors(self):
+ # bpo-37388: str(bytes) and str.decode() must check encoding and errors
+ # arguments in dev mode
+ encodings = ('ascii', 'utf8', 'latin1')
+ invalid = 'Boom, Shaka Laka, Boom!'
+ code = textwrap.dedent(f'''
+ import sys
+ encodings = {encodings!r}
+
+ for data in (b'', b'short string'):
+ try:
+ str(data, encoding={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(21)
+
+ try:
+ str(data, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(22)
+
+ for encoding in encodings:
+ try:
+ str(data, encoding, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(22)
+
+ for data in ('', 'short string'):
+ try:
+ data.encode(encoding={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(23)
+
+ try:
+ data.encode(errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(24)
+
+ for encoding in encodings:
+ try:
+ data.encode(encoding, errors={invalid!r})
+ except LookupError:
+ pass
+ else:
+ sys.exit(24)
+
+ sys.exit(10)
+ ''')
+ proc = assert_python_failure('-X', 'dev', '-c', code)
+ self.assertEqual(proc.rc, 10, proc)
+
class CAPITest(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst b/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst
new file mode 100644
index 0000000..295ef1b
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst
@@ -0,0 +1,7 @@
+In development mode and in debug build, *encoding* and *errors* arguments are
+now checked on string encoding and decoding operations. Examples: :func:`open`,
+:meth:`str.encode` and :meth:`bytes.decode`.
+
+By default, for best performances, the *errors* argument is only checked at the
+first encoding/decoding error, and the *encoding* argument is sometimes ignored
+for empty strings.
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index 73b2756..021231e 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -988,6 +988,46 @@ _textiowrapper_fix_encoder_state(textio *self)
return 0;
}
+static int
+io_check_errors(PyObject *errors)
+{
+ assert(errors != NULL && errors != Py_None);
+
+ PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+#ifndef Py_DEBUG
+ /* In release mode, only check in development mode (-X dev) */
+ if (!interp->config.dev_mode) {
+ return 0;
+ }
+#else
+ /* Always check in debug mode */
+#endif
+
+ /* Avoid calling PyCodec_LookupError() before the codec registry is ready:
+ before_PyUnicode_InitEncodings() is called. */
+ if (!interp->fs_codec.encoding) {
+ return 0;
+ }
+
+ Py_ssize_t name_length;
+ const char *name = PyUnicode_AsUTF8AndSize(errors, &name_length);
+ if (name == NULL) {
+ return -1;
+ }
+ if (strlen(name) != (size_t)name_length) {
+ PyErr_SetString(PyExc_ValueError, "embedded null character in errors");
+ return -1;
+ }
+ PyObject *handler = PyCodec_LookupError(name);
+ if (handler != NULL) {
+ Py_DECREF(handler);
+ return 0;
+ }
+ return -1;
+}
+
+
+
/*[clinic input]
_io.TextIOWrapper.__init__
buffer: object
@@ -1057,6 +1097,9 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
errors->ob_type->tp_name);
return -1;
}
+ else if (io_check_errors(errors)) {
+ return -1;
+ }
if (validate_newline(newline) < 0) {
return -1;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index cb1456e..b6f3d8f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,6 +427,48 @@ get_error_handler_wide(const wchar_t *errors)
}
+static inline int
+unicode_check_encoding_errors(const char *encoding, const char *errors)
+{
+ if (encoding == NULL && errors == NULL) {
+ return 0;
+ }
+
+ PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+#ifndef Py_DEBUG
+ /* In release mode, only check in development mode (-X dev) */
+ if (!interp->config.dev_mode) {
+ return 0;
+ }
+#else
+ /* Always check in debug mode */
+#endif
+
+ /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
+ codec registry is ready: before_PyUnicode_InitEncodings() is called. */
+ if (!interp->fs_codec.encoding) {
+ return 0;
+ }
+
+ if (encoding != NULL) {
+ PyObject *handler = _PyCodec_Lookup(encoding);
+ if (handler == NULL) {
+ return -1;
+ }
+ Py_DECREF(handler);
+ }
+
+ if (errors != NULL) {
+ PyObject *handler = PyCodec_LookupError(errors);
+ if (handler == NULL) {
+ return -1;
+ }
+ Py_DECREF(handler);
+ }
+ return 0;
+}
+
+
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
This function is kept for backward compatibility with the old API. */
Py_UNICODE
@@ -3211,12 +3253,15 @@ PyUnicode_FromEncodedObject(PyObject *obj,
/* Decoding bytes objects is the most common case and should be fast */
if (PyBytes_Check(obj)) {
- if (PyBytes_GET_SIZE(obj) == 0)
+ if (PyBytes_GET_SIZE(obj) == 0) {
+ if (unicode_check_encoding_errors(encoding, errors) < 0) {
+ return NULL;
+ }
_Py_RETURN_UNICODE_EMPTY();
- v = PyUnicode_Decode(
+ }
+ return PyUnicode_Decode(
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
encoding, errors);
- return v;
}
if (PyUnicode_Check(obj)) {
@@ -3235,6 +3280,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
if (buffer.len == 0) {
PyBuffer_Release(&buffer);
+ if (unicode_check_encoding_errors(encoding, errors) < 0) {
+ return NULL;
+ }
_Py_RETURN_UNICODE_EMPTY();
}
@@ -3302,6 +3350,10 @@ PyUnicode_Decode(const char *s,
Py_buffer info;
char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
+ if (unicode_check_encoding_errors(encoding, errors) < 0) {
+ return NULL;
+ }
+
if (encoding == NULL) {
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
@@ -3562,7 +3614,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
cannot use it to encode and decode filenames before it is loaded. Load
the Python codec requires to encode at least its own filename. Use the C
implementation of the locale codec until the codec registry is
- initialized and the Python codec is loaded. See initfsencoding(). */
+ initialized and the Python codec is loaded.
+ See _PyUnicode_InitEncodings(). */
if (interp->fs_codec.encoding) {
return PyUnicode_AsEncodedString(unicode,
interp->fs_codec.encoding,
@@ -3591,6 +3644,10 @@ PyUnicode_AsEncodedString(PyObject *unicode,
return NULL;
}
+ if (unicode_check_encoding_errors(encoding, errors) < 0) {
+ return NULL;
+ }
+
if (encoding == NULL) {
return _PyUnicode_AsUTF8String(unicode, errors);
}
@@ -3800,7 +3857,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
cannot use it to encode and decode filenames before it is loaded. Load
the Python codec requires to encode at least its own filename. Use the C
implementation of the locale codec until the codec registry is
- initialized and the Python codec is loaded. See initfsencoding(). */
+ initialized and the Python codec is loaded.
+ See _PyUnicode_InitEncodings(). */
if (interp->fs_codec.encoding) {
return PyUnicode_Decode(s, size,
interp->fs_codec.encoding,