summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Include/codecs.h27
-rw-r--r--Lib/codecs.py14
-rw-r--r--Lib/encodings/base64_codec.py1
-rw-r--r--Lib/encodings/bz2_codec.py1
-rw-r--r--Lib/encodings/hex_codec.py1
-rw-r--r--Lib/encodings/quopri_codec.py1
-rwxr-xr-xLib/encodings/rot_13.py1
-rw-r--r--Lib/encodings/uu_codec.py1
-rw-r--r--Lib/encodings/zlib_codec.py1
-rw-r--r--Lib/test/test_codecs.py42
-rw-r--r--Misc/NEWS6
-rw-r--r--Objects/unicodeobject.c4
-rw-r--r--Python/codecs.c138
13 files changed, 219 insertions, 19 deletions
diff --git a/Include/codecs.h b/Include/codecs.h
index 0d9e9b4..5ca505f 100644
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
const char *errors
);
+#ifndef PY_LIMITED_API
+/* Text codec specific encoding and decoding API.
+
+ Checks the encoding against a list of codecs which do not
+ implement a str<->bytes encoding before attempting the
+ operation.
+
+ Please note that these APIs are internal and should not
+ be used in Python C extensions.
+
+ */
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+ PyObject *object,
+ const char *encoding,
+ const char *errors
+ );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+ PyObject *object,
+ const char *encoding,
+ const char *errors
+ );
+#endif
+
+
+
/* --- Codec Lookup APIs --------------------------------------------------
All APIs return a codec object with incremented refcount and are
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 01ae0f3..c2065da 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API)
class CodecInfo(tuple):
+ """Codec details when looking up the codec registry"""
+
+ # Private API to allow Python 3.4 to blacklist the known non-Unicode
+ # codecs in the standard library. A more general mechanism to
+ # reliably distinguish test encodings from other codecs will hopefully
+ # be defined for Python 3.5
+ #
+ # See http://bugs.python.org/issue19619
+ _is_text_encoding = True # Assume codecs are text encodings by default
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
- incrementalencoder=None, incrementaldecoder=None, name=None):
+ incrementalencoder=None, incrementaldecoder=None, name=None,
+ *, _is_text_encoding=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name
self.encode = encode
@@ -84,6 +94,8 @@ class CodecInfo(tuple):
self.incrementaldecoder = incrementaldecoder
self.streamwriter = streamwriter
self.streamreader = streamreader
+ if _is_text_encoding is not None:
+ self._is_text_encoding = _is_text_encoding
return self
def __repr__(self):
diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py
index 321a961..881d1ba 100644
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py
@@ -52,4 +52,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py
index e65d226..fd9495e 100644
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py
@@ -74,4 +74,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py
index e003fc3..f2ed0a7 100644
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py
@@ -52,4 +52,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py
index 9243fc4..70f7083 100644
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py
@@ -53,4 +53,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py
index 3140c14..fff9153 100755
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py
@@ -43,6 +43,7 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
### Map
diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py
index 69c6f17..e3269e4 100644
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py
@@ -96,4 +96,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py
index e0b9cda..4c81ca1 100644
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py
@@ -74,4 +74,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _is_text_encoding=False,
)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 1a199f7..a8b3da0 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -4,6 +4,7 @@ import locale
import sys
import unittest
import warnings
+import encodings
from test import support
@@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCase):
sout = reader.readline()
self.assertEqual(sout, b"\x80")
+ def test_text_to_binary_blacklists_binary_transforms(self):
+ # Check binary -> binary codecs give a good error for str input
+ bad_input = "bad input type"
+ for encoding in bytes_transform_encodings:
+ fmt = (r"{!r} is not a text encoding; "
+ r"use codecs.encode\(\) to handle arbitrary codecs")
+ msg = fmt.format(encoding)
+ with self.assertRaisesRegex(LookupError, msg) as failure:
+ bad_input.encode(encoding)
+ self.assertIsNone(failure.exception.__cause__)
+
+ def test_text_to_binary_blacklists_text_transforms(self):
+ # Check str.encode gives a good error message for str -> str codecs
+ msg = (r"^'rot_13' is not a text encoding; "
+ r"use codecs.encode\(\) to handle arbitrary codecs")
+ with self.assertRaisesRegex(LookupError, msg):
+ "just an example message".encode("rot_13")
+
+ def test_binary_to_text_blacklists_binary_transforms(self):
+ # Check bytes.decode and bytearray.decode give a good error
+ # message for binary -> binary codecs
+ data = b"encode first to ensure we meet any format restrictions"
+ for encoding in bytes_transform_encodings:
+ encoded_data = codecs.encode(data, encoding)
+ fmt = (r"{!r} is not a text encoding; "
+ r"use codecs.decode\(\) to handle arbitrary codecs")
+ msg = fmt.format(encoding)
+ with self.assertRaisesRegex(LookupError, msg):
+ encoded_data.decode(encoding)
+ with self.assertRaisesRegex(LookupError, msg):
+ bytearray(encoded_data).decode(encoding)
+
+ def test_binary_to_text_blacklists_text_transforms(self):
+ # Check str -> str codec gives a good error for binary input
+ for bad_input in (b"immutable", bytearray(b"mutable")):
+ msg = (r"^'rot_13' is not a text encoding; "
+ r"use codecs.decode\(\) to handle arbitrary codecs")
+ with self.assertRaisesRegex(LookupError, msg) as failure:
+ bad_input.decode("rot_13")
+ self.assertIsNone(failure.exception.__cause__)
+
@unittest.skipUnless(sys.platform == 'win32',
'code pages are specific to Windows')
diff --git a/Misc/NEWS b/Misc/NEWS
index 50ac145..4774f37 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candidate 1?
Core and Builtins
-----------------
+- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
+ internal API to throw LookupError for known non-text encodings, rather
+ than attempting the encoding or decoding operation and then throwing a
+ TypeError for an unexpected output type. (The latter mechanism remains
+ in place for third party non-text encodings)
+
- Issue #20588: Make Python-ast.c C89 compliant.
- Issue #20437: Fixed 21 potential bugs when deleting objects references.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 89094e0..0300753 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s,
buffer = PyMemoryView_FromBuffer(&info);
if (buffer == NULL)
goto onError;
- unicode = PyCodec_Decode(buffer, encoding, errors);
+ unicode = _PyCodec_DecodeText(buffer, encoding, errors);
if (unicode == NULL)
goto onError;
if (!PyUnicode_Check(unicode)) {
@@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
}
/* Encode via the codec registry */
- v = PyCodec_Encode(unicode, encoding, errors);
+ v = _PyCodec_EncodeText(unicode, encoding, errors);
if (v == NULL)
return NULL;
diff --git a/Python/codecs.c b/Python/codecs.c
index fd67d1b..5ebc4cb 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
errors is passed to the encoder factory as argument if non-NULL. */
-PyObject *PyCodec_Encode(PyObject *object,
- const char *encoding,
- const char *errors)
+static PyObject *
+_PyCodec_EncodeInternal(PyObject *object,
+ PyObject *encoder,
+ const char *encoding,
+ const char *errors)
{
- PyObject *encoder = NULL;
PyObject *args = NULL, *result = NULL;
PyObject *v = NULL;
- encoder = PyCodec_Encoder(encoding);
- if (encoder == NULL)
- goto onError;
-
args = args_tuple(object, errors);
if (args == NULL)
goto onError;
@@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *object,
errors is passed to the decoder factory as argument if non-NULL. */
-PyObject *PyCodec_Decode(PyObject *object,
- const char *encoding,
- const char *errors)
+static PyObject *
+_PyCodec_DecodeInternal(PyObject *object,
+ PyObject *decoder,
+ const char *encoding,
+ const char *errors)
{
- PyObject *decoder = NULL;
PyObject *args = NULL, *result = NULL;
PyObject *v;
- decoder = PyCodec_Decoder(encoding);
- if (decoder == NULL)
- goto onError;
-
args = args_tuple(object, errors);
if (args == NULL)
goto onError;
@@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject *object,
return NULL;
}
+/* Generic encoding/decoding API */
+PyObject *PyCodec_Encode(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *encoder;
+
+ encoder = PyCodec_Encoder(encoding);
+ if (encoder == NULL)
+ return NULL;
+
+ return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+PyObject *PyCodec_Decode(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *decoder;
+
+ decoder = PyCodec_Decoder(encoding);
+ if (decoder == NULL)
+ return NULL;
+
+ return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
+/* Text encoding/decoding API */
+static
+PyObject *codec_getitem_checked(const char *encoding,
+ const char *operation_name,
+ int index)
+{
+ _Py_IDENTIFIER(_is_text_encoding);
+ PyObject *codec;
+ PyObject *attr;
+ PyObject *v;
+ int is_text_codec;
+
+ codec = _PyCodec_Lookup(encoding);
+ if (codec == NULL)
+ return NULL;
+
+ /* Backwards compatibility: assume any raw tuple describes a text
+ * encoding, and the same for anything lacking the private
+ * attribute.
+ */
+ if (!PyTuple_CheckExact(codec)) {
+ attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
+ if (attr == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+ PyErr_Clear();
+ } else {
+ Py_DECREF(codec);
+ return NULL;
+ }
+ } else {
+ is_text_codec = PyObject_IsTrue(attr);
+ Py_DECREF(attr);
+ if (!is_text_codec) {
+ Py_DECREF(codec);
+ PyErr_Format(PyExc_LookupError,
+ "'%.400s' is not a text encoding; "
+ "use codecs.%s() to handle arbitrary codecs",
+ encoding, operation_name);
+ return NULL;
+ }
+ }
+ }
+
+ v = PyTuple_GET_ITEM(codec, index);
+ Py_DECREF(codec);
+ Py_INCREF(v);
+ return v;
+}
+
+static PyObject * _PyCodec_TextEncoder(const char *encoding)
+{
+ return codec_getitem_checked(encoding, "encode", 0);
+}
+
+static PyObject * _PyCodec_TextDecoder(const char *encoding)
+{
+ return codec_getitem_checked(encoding, "decode", 1);
+}
+
+PyObject *_PyCodec_EncodeText(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *encoder;
+
+ encoder = _PyCodec_TextEncoder(encoding);
+ if (encoder == NULL)
+ return NULL;
+
+ return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+PyObject *_PyCodec_DecodeText(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *decoder;
+
+ decoder = _PyCodec_TextDecoder(encoding);
+ if (decoder == NULL)
+ return NULL;
+
+ return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
/* Register the error handling callback function error under the name
name. This function will be called by the codec when it encounters
an unencodable characters/undecodable bytes and doesn't know the