Issue #19619: Blacklist non-text codecs in method API

str.encode, bytes.decode and bytearray.decode now use an internal API to raise LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then raising a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings. Backported changeset d68df99d7a57.
author: Serhiy Storchaka <storchaka@gmail.com> 2014-02-24 12:43:03 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2014-02-24 12:43:03 (GMT)
commit: 94ee389308ec9e0e07b3f7a944d5179aba540c5e (patch)
tree: 80bc231aff27723119beacbcfa2654b90f793060 /Lib
parent: 20f8728bf0cce877c1908b15ddc59e2d1011ad0f (diff)
download: cpython-94ee389308ec9e0e07b3f7a944d5179aba540c5e.zip
cpython-94ee389308ec9e0e07b3f7a944d5179aba540c5e.tar.gz
cpython-94ee389308ec9e0e07b3f7a944d5179aba540c5e.tar.bz2
9 files changed, 62 insertions, 1 deletion
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 01ae0f3..c2065da 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
 ### Codec base classes (defining the API)
 
 class CodecInfo(tuple):
+    """Codec details when looking up the codec registry"""
+
+    # Private API to allow Python 3.4 to blacklist the known non-Unicode
+    # codecs in the standard library. A more general mechanism to
+    # reliably distinguish text encodings from other codecs will hopefully
+    # be defined for Python 3.5
+    #
+    # See http://bugs.python.org/issue19619
+    _is_text_encoding = True # Assume codecs are text encodings by default
 
     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
-        incrementalencoder=None, incrementaldecoder=None, name=None):
+        incrementalencoder=None, incrementaldecoder=None, name=None,
+        *, _is_text_encoding=None):
         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
         self.name = name
         self.encode = encode
@@ -84,6 +94,8 @@ class CodecInfo(tuple):
         self.incrementaldecoder = incrementaldecoder
         self.streamwriter = streamwriter
         self.streamreader = streamreader
+        if _is_text_encoding is not None:
+            self._is_text_encoding = _is_text_encoding
         return self
 
     def __repr__(self):
diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py
index 321a961..881d1ba 100644
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py
@@ -52,4 +52,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py
index e65d226..fd9495e 100644
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py
@@ -74,4 +74,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py
index e003fc3..f2ed0a7 100644
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py
@@ -52,4 +52,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py
index 9243fc4..70f7083 100644
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py
@@ -53,4 +53,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py
index 3140c14..fff9153 100755
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py
@@ -43,6 +43,7 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamwriter=StreamWriter,
         streamreader=StreamReader,
+        _is_text_encoding=False,
     )
 
 ### Map
diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py
index 69c6f17..e3269e4 100644
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py
@@ -96,4 +96,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )
diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py
index e0b9cda..4c81ca1 100644
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py
@@ -74,4 +74,5 @@ def getregentry():
         incrementaldecoder=IncrementalDecoder,
         streamreader=StreamReader,
         streamwriter=StreamWriter,
+        _is_text_encoding=False,
     )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 1a199f7..a8b3da0 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -4,6 +4,7 @@ import locale
 import sys
 import unittest
 import warnings
+import encodings
 
 from test import support
 
@@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCase):
             sout = reader.readline()
             self.assertEqual(sout, b"\x80")
 
+    def test_text_to_binary_blacklists_binary_transforms(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.encode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.encode(encoding)
+            self.assertIsNone(failure.exception.__cause__)
+
+    def test_text_to_binary_blacklists_text_transforms(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = (r"^'rot_13' is not a text encoding; "
+               r"use codecs.encode\(\) to handle arbitrary codecs")
+        with self.assertRaisesRegex(LookupError, msg):
+            "just an example message".encode("rot_13")
+
+    def test_binary_to_text_blacklists_binary_transforms(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            encoded_data = codecs.encode(data, encoding)
+            fmt = (r"{!r} is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            msg = fmt.format(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                encoded_data.decode(encoding)
+            with self.assertRaisesRegex(LookupError, msg):
+                bytearray(encoded_data).decode(encoding)
+
+    def test_binary_to_text_blacklists_text_transforms(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            msg = (r"^'rot_13' is not a text encoding; "
+                   r"use codecs.decode\(\) to handle arbitrary codecs")
+            with self.assertRaisesRegex(LookupError, msg) as failure:
+                bad_input.decode("rot_13")
+            self.assertIsNone(failure.exception.__cause__)
+
 
 @unittest.skipUnless(sys.platform == 'win32',
                      'code pages are specific to Windows')
author	Serhiy Storchaka <storchaka@gmail.com>	2014-02-24 12:43:03 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2014-02-24 12:43:03 (GMT)
commit	94ee389308ec9e0e07b3f7a944d5179aba540c5e (patch)
tree	80bc231aff27723119beacbcfa2654b90f793060 /Lib
parent	20f8728bf0cce877c1908b15ddc59e2d1011ad0f (diff)
download	cpython-94ee389308ec9e0e07b3f7a944d5179aba540c5e.zip cpython-94ee389308ec9e0e07b3f7a944d5179aba540c5e.tar.gz cpython-94ee389308ec9e0e07b3f7a944d5179aba540c5e.tar.bz2