From a9b15241c6bdf8ac71f1dc598b7c01a20518b6a7 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Tue, 4 Feb 2014 22:11:18 +1000 Subject: Close #20404: blacklist non-text encodings in io.TextIOWrapper - io.TextIOWrapper (and hence the open() builtin) now use the internal codec marking system added for issue #19619 - also tweaked the C code to only look up the encoding once, rather than multiple times - the existing output type checks remain in place to deal with unmarked third party codecs. --- Include/codecs.h | 20 +++++++++++++ Lib/_pyio.py | 5 ++++ Lib/test/test_io.py | 33 ++++++++++++++++----- Misc/NEWS | 6 ++++ Modules/_io/textio.c | 34 +++++++++++++-------- Python/codecs.c | 84 +++++++++++++++++++++++++++++++++++++++------------- 6 files changed, 141 insertions(+), 41 deletions(-) diff --git a/Include/codecs.h b/Include/codecs.h index 8f0014e..611964c 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -104,7 +104,14 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode( Please note that these APIs are internal and should not be used in Python C extensions. + XXX (ncoghlan): should we make these, or something like them, public + in Python 3.5+? + */ +PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command + ); PyAPI_FUNC(PyObject *) _PyCodec_EncodeText( PyObject *object, @@ -117,6 +124,19 @@ PyAPI_FUNC(PyObject *) _PyCodec_DecodeText( const char *encoding, const char *errors ); + +/* These two aren't actually text encoding specific, but _io.TextIOWrapper + * is the only current API consumer. + */ +PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder( + PyObject *codec_info, + const char *errors + ); + +PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder( + PyObject *codec_info, + const char *errors + ); #endif diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 3961969..b04d23a 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1503,6 +1503,11 @@ class TextIOWrapper(TextIOBase): if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) + if not codecs.lookup(encoding)._is_text_encoding: + msg = ("%r is not a text encoding; " + "use codecs.open() to handle arbitrary codecs") + raise LookupError(msg % encoding) + if errors is None: errors = "strict" else: diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 355a33e..d5b274c 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -1929,6 +1929,15 @@ class TextIOWrapperTest(unittest.TestCase): self.assertRaises(TypeError, t.__init__, b, newline=42) self.assertRaises(ValueError, t.__init__, b, newline='xyzzy') + def test_non_text_encoding_codecs_are_rejected(self): + # Ensure the constructor complains if passed a codec that isn't + # marked as a text encoding + # http://bugs.python.org/issue20404 + r = self.BytesIO() + b = self.BufferedWriter(r) + with self.assertRaisesRegex(LookupError, "is not a text encoding"): + self.TextIOWrapper(b, encoding="hex") + def test_detach(self): r = self.BytesIO() b = self.BufferedWriter(r) @@ -2579,15 +2588,22 @@ class TextIOWrapperTest(unittest.TestCase): def test_illegal_decoder(self): # Issue #17106 + # Bypass the early encoding check added in issue 20404 + def _make_illegal_wrapper(): + quopri = codecs.lookup("quopri") + quopri._is_text_encoding = True + try: + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), + newline='\n', encoding="quopri") + finally: + quopri._is_text_encoding = False + return t # Crash when decoder returns non-string - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.read, 1) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.readline) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.read) def _check_create_at_shutdown(self, **kwargs): @@ -2616,8 +2632,7 @@ class TextIOWrapperTest(unittest.TestCase): if err: # Can error out with a RuntimeError if the module state # isn't found. - self.assertIn("RuntimeError: could not find io module state", - err.decode()) + self.assertIn(self.shutdown_error, err.decode()) else: self.assertEqual("ok", out.decode().strip()) @@ -2630,6 +2645,7 @@ class TextIOWrapperTest(unittest.TestCase): class CTextIOWrapperTest(TextIOWrapperTest): io = io + shutdown_error = "RuntimeError: could not find io module state" def test_initialization(self): r = self.BytesIO(b"\xc3\xa9\n\n") @@ -2674,6 +2690,7 @@ class CTextIOWrapperTest(TextIOWrapperTest): class PyTextIOWrapperTest(TextIOWrapperTest): io = pyio + shutdown_error = "LookupError: unknown encoding: ascii" class IncrementalNewlineDecoderTest(unittest.TestCase): diff --git a/Misc/NEWS b/Misc/NEWS index 9938367..91e3b53 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,12 @@ Release date: 2014-02-09 Core and Builtins ----------------- +- Issue #20404: io.TextIOWrapper (and hence the open() builtin) now uses the + internal codec marking system added for issue #19619 to throw LookupError + for known non-text encodings at stream construction time. The existing + output type checks remain in place to deal with unmarked third party + codecs. + - Issue #17162: Add PyType_GetSlot. - Issue #20162: Fix an alignment issue in the siphash24() hash function which diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 747f623..5739bc4 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -849,7 +849,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) char *kwlist[] = {"buffer", "encoding", "errors", "newline", "line_buffering", "write_through", NULL}; - PyObject *buffer, *raw; + PyObject *buffer, *raw, *codec_info = NULL; char *encoding = NULL; char *errors = NULL; char *newline = NULL; @@ -961,6 +961,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) "could not determine default encoding"); } + /* Check we have been asked for a real text encoding */ + codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()"); + if (codec_info == NULL) { + Py_CLEAR(self->encoding); + goto error; + } + + /* XXX: Failures beyond this point have the potential to leak elements + * of the partially constructed object (like self->encoding) + */ + if (errors == NULL) errors = "strict"; self->errors = PyBytes_FromString(errors); @@ -975,7 +986,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (newline) { self->readnl = PyUnicode_FromString(newline); if (self->readnl == NULL) - return -1; + goto error; } self->writetranslate = (newline == NULL || newline[0] != '\0'); if (!self->readuniversal && self->readnl) { @@ -999,8 +1010,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (r == -1) goto error; if (r == 1) { - self->decoder = PyCodec_IncrementalDecoder( - encoding, errors); + self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info, + errors); if (self->decoder == NULL) goto error; @@ -1024,17 +1035,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (r == -1) goto error; if (r == 1) { - PyObject *ci; - self->encoder = PyCodec_IncrementalEncoder( - encoding, errors); + self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info, + errors); if (self->encoder == NULL) goto error; /* Get the normalized named of the codec */ - ci = _PyCodec_Lookup(encoding); - if (ci == NULL) - goto error; - res = _PyObject_GetAttrId(ci, &PyId_name); - Py_DECREF(ci); + res = _PyObject_GetAttrId(codec_info, &PyId_name); if (res == NULL) { if (PyErr_ExceptionMatches(PyExc_AttributeError)) PyErr_Clear(); @@ -1054,6 +1060,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) Py_XDECREF(res); } + /* Finished sorting out the codec details */ + Py_DECREF(codec_info); + self->buffer = buffer; Py_INCREF(buffer); @@ -1116,6 +1125,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) return 0; error: + Py_XDECREF(codec_info); return -1; } diff --git a/Python/codecs.c b/Python/codecs.c index 5ff41b5..e06d6e0 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -243,20 +243,15 @@ PyObject *codec_getitem(const char *encoding, int index) return v; } -/* Helper function to create an incremental codec. */ - +/* Helper functions to create an incremental codec. */ static -PyObject *codec_getincrementalcodec(const char *encoding, - const char *errors, - const char *attrname) +PyObject *codec_makeincrementalcodec(PyObject *codec_info, + const char *errors, + const char *attrname) { - PyObject *codecs, *ret, *inccodec; + PyObject *ret, *inccodec; - codecs = _PyCodec_Lookup(encoding); - if (codecs == NULL) - return NULL; - inccodec = PyObject_GetAttrString(codecs, attrname); - Py_DECREF(codecs); + inccodec = PyObject_GetAttrString(codec_info, attrname); if (inccodec == NULL) return NULL; if (errors) @@ -267,6 +262,21 @@ PyObject *codec_getincrementalcodec(const char *encoding, return ret; } +static +PyObject *codec_getincrementalcodec(const char *encoding, + const char *errors, + const char *attrname) +{ + PyObject *codec_info, *ret; + + codec_info = _PyCodec_Lookup(encoding); + if (codec_info == NULL) + return NULL; + ret = codec_makeincrementalcodec(codec_info, errors, attrname); + Py_DECREF(codec_info); + return ret; +} + /* Helper function to create a stream codec. */ static @@ -290,6 +300,24 @@ PyObject *codec_getstreamcodec(const char *encoding, return streamcodec; } +/* Helpers to work with the result of _PyCodec_Lookup + + */ +PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementaldecoder"); +} + +PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementalencoder"); +} + + /* Convenience APIs to query the Codec registry. All APIs return a codec object with incremented refcount. @@ -467,15 +495,12 @@ PyObject *PyCodec_Decode(PyObject *object, } /* Text encoding/decoding API */ -static -PyObject *codec_getitem_checked(const char *encoding, - const char *operation_name, - int index) +PyObject * _PyCodec_LookupTextEncoding(const char *encoding, + const char *alternate_command) { _Py_IDENTIFIER(_is_text_encoding); PyObject *codec; PyObject *attr; - PyObject *v; int is_text_codec; codec = _PyCodec_Lookup(encoding); @@ -502,27 +527,44 @@ PyObject *codec_getitem_checked(const char *encoding, Py_DECREF(codec); PyErr_Format(PyExc_LookupError, "'%.400s' is not a text encoding; " - "use codecs.%s() to handle arbitrary codecs", - encoding, operation_name); + "use %s to handle arbitrary codecs", + encoding, alternate_command); return NULL; } } } + /* This appears to be a valid text encoding */ + return codec; +} + + +static +PyObject *codec_getitem_checked(const char *encoding, + const char *alternate_command, + int index) +{ + PyObject *codec; + PyObject *v; + + codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); + if (codec == NULL) + return NULL; + v = PyTuple_GET_ITEM(codec, index); - Py_DECREF(codec); Py_INCREF(v); + Py_DECREF(codec); return v; } static PyObject * _PyCodec_TextEncoder(const char *encoding) { - return codec_getitem_checked(encoding, "encode", 0); + return codec_getitem_checked(encoding, "codecs.encode()", 0); } static PyObject * _PyCodec_TextDecoder(const char *encoding) { - return codec_getitem_checked(encoding, "decode", 1); + return codec_getitem_checked(encoding, "codecs.decode()", 1); } PyObject *_PyCodec_EncodeText(PyObject *object, -- cgit v0.12