diff options
-rw-r--r-- | Doc/library/codecs.rst | 116 | ||||
-rw-r--r-- | Doc/whatsnew/3.4.rst | 50 | ||||
-rw-r--r-- | Lib/encodings/aliases.py | 36 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 20 |
4 files changed, 142 insertions, 80 deletions
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 358fde7..ef79918 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1188,6 +1188,9 @@ common use case for codecs, the underlying codec infrastructure supports arbitrary data transforms rather than just text encodings). For asymmetric codecs, the stated purpose describes the encoding direction. +Text Encodings +^^^^^^^^^^^^^^ + The following codecs provide :class:`str` to :class:`bytes` encoding and :term:`bytes-like object` to :class:`str` decoding, similar to the Unicode text encodings. @@ -1234,62 +1237,83 @@ encodings. | | | .. deprecated:: 3.3 | +--------------------+---------+---------------------------+ -The following codecs provide :term:`bytes-like object` to :class:`bytes` -mappings. - - -.. tabularcolumns:: |l|L|L| - -+----------------------+------------------------------+------------------------------+ -| Codec | Purpose | Encoder / decoder | -+======================+==============================+==============================+ -| base64_codec [#b64]_ | Convert operand to MIME | :meth:`base64.b64encode` / | -| | base64 (the result always | :meth:`base64.b64decode` | -| | includes a trailing | | -| | ``'\n'``) | | -| | | | -| | .. 
versionchanged:: 3.4 | | -| | accepts any | | -| | :term:`bytes-like object` | | -| | as input for encoding and | | -| | decoding | | -+----------------------+------------------------------+------------------------------+ -| bz2_codec | Compress the operand | :meth:`bz2.compress` / | -| | using bz2 | :meth:`bz2.decompress` | -+----------------------+------------------------------+------------------------------+ -| hex_codec | Convert operand to | :meth:`base64.b16encode` / | -| | hexadecimal | :meth:`base64.b16decode` | -| | representation, with two | | -| | digits per byte | | -+----------------------+------------------------------+------------------------------+ -| quopri_codec | Convert operand to MIME | :meth:`quopri.encodestring` /| -| | quoted printable | :meth:`quopri.decodestring` | -+----------------------+------------------------------+------------------------------+ -| uu_codec | Convert the operand using | :meth:`uu.encode` / | -| | uuencode | :meth:`uu.decode` | -+----------------------+------------------------------+------------------------------+ -| zlib_codec | Compress the operand | :meth:`zlib.compress` / | -| | using gzip | :meth:`zlib.decompress` | -+----------------------+------------------------------+------------------------------+ +.. _binary-transforms: + +Binary Transforms +^^^^^^^^^^^^^^^^^ + +The following codecs provide binary transforms: :term:`bytes-like object` +to :class:`bytes` mappings. + + +.. tabularcolumns:: |l|L|L|L| + ++----------------------+------------------+------------------------------+------------------------------+ +| Codec | Aliases | Purpose | Encoder / decoder | ++======================+==================+==============================+==============================+ +| base64_codec [#b64]_ | base64, base_64 | Convert operand to MIME | :meth:`base64.b64encode` / | +| | | base64 (the result always | :meth:`base64.b64decode` | +| | | includes a trailing | | +| | | ``'\n'``) | | +| | | | | +| | | .. 
versionchanged:: 3.4 | | +| | | accepts any | | +| | | :term:`bytes-like object` | | +| | | as input for encoding and | | +| | | decoding | | ++----------------------+------------------+------------------------------+------------------------------+ +| bz2_codec | bz2 | Compress the operand | :meth:`bz2.compress` / | +| | | using bz2 | :meth:`bz2.decompress` | ++----------------------+------------------+------------------------------+------------------------------+ +| hex_codec | hex | Convert operand to | :meth:`base64.b16encode` / | +| | | hexadecimal | :meth:`base64.b16decode` | +| | | representation, with two | | +| | | digits per byte | | ++----------------------+------------------+------------------------------+------------------------------+ +| quopri_codec | quopri, | Convert operand to MIME | :meth:`quopri.encodestring` /| +| | quotedprintable, | quoted printable | :meth:`quopri.decodestring` | +| | quoted_printable | | | ++----------------------+------------------+------------------------------+------------------------------+ +| uu_codec | uu | Convert the operand using | :meth:`uu.encode` / | +| | | uuencode | :meth:`uu.decode` | ++----------------------+------------------+------------------------------+------------------------------+ +| zlib_codec | zip, zlib | Compress the operand | :meth:`zlib.compress` / | +| | | using gzip | :meth:`zlib.decompress` | ++----------------------+------------------+------------------------------+------------------------------+ .. [#b64] In addition to :term:`bytes-like objects <bytes-like object>`, ``'base64_codec'`` also accepts ASCII-only instances of :class:`str` for decoding +.. versionadded:: 3.2 + Restoration of the binary transforms. -The following codecs provide :class:`str` to :class:`str` mappings. +.. versionchanged:: 3.4 + Restoration of the aliases for the binary transforms. -.. 
tabularcolumns:: |l|L| -+--------------------+---------------------------+ -| Codec | Purpose | -+====================+===========================+ -| rot_13 | Returns the Caesar-cypher | -| | encryption of the operand | -+--------------------+---------------------------+ +.. _text-transforms: + +Text Transforms +^^^^^^^^^^^^^^^ + +The following codec provides a text transform: a :class:`str` to :class:`str` +mapping. + +.. tabularcolumns:: |l|l|L| + ++--------------------+---------+---------------------------+ +| Codec | Aliases | Purpose | ++====================+=========+===========================+ +| rot_13 | rot13 | Returns the Caesar-cypher | +| | | encryption of the operand | ++--------------------+---------+---------------------------+ .. versionadded:: 3.2 - bytes-to-bytes and str-to-str codecs. + Restoration of the ``rot_13`` text transform. + +.. versionchanged:: 3.4 + Restoration of the ``rot13`` alias. :mod:`encodings.idna` --- Internationalized Domain Names in Applications diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst index 09d8be8..6fc0e48 100644 --- a/Doc/whatsnew/3.4.rst +++ b/Doc/whatsnew/3.4.rst @@ -103,7 +103,8 @@ New expected features for Python implementations: * :ref:`PEP 446: Make newly created file descriptors non-inheritable <pep-446>`. * command line option for :ref:`isolated mode <using-on-misc-options>`, (:issue:`16499`). -* improvements to handling of non-Unicode codecs +* :ref:`improvements <codec-handling-improvements>` in the handling of + codecs that are not text encodings Significantly Improved Library Modules: @@ -173,8 +174,10 @@ PEP 446: Make newly created file descriptors non-inheritable PEP written and implemented by Victor Stinner. -Improvements to handling of non-Unicode codecs -============================================== +.. 
_codec-handling-improvements: + +Improvements to codec handling +============================== Since it was first introduced, the :mod:`codecs` module has always been intended to operate as a type-neutral dynamic encoding and decoding @@ -186,7 +189,7 @@ fact. As a key step in clarifying the situation, the :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are now properly documented in Python 2.7, 3.3 and 3.4. These functions have existed in the :mod:`codecs` -module and have been covered by the regression test suite since Python 2.4, +module (and have been covered by the regression test suite) since Python 2.4, but were previously only discoverable through runtime introspection. Unlike the convenience methods on :class:`str`, :class:`bytes` and @@ -199,43 +202,58 @@ In Python 3.4, the interpreter is able to identify the known non-text encodings provided in the standard library and direct users towards these general purpose convenience functions when appropriate:: - >>> import codecs - - >>> b"abcdef".decode("hex_codec") + >>> b"abcdef".decode("hex") Traceback (most recent call last): File "<stdin>", line 1, in <module> - LookupError: 'hex_codec' is not a text encoding; use codecs.decode() to handle arbitrary codecs + LookupError: 'hex' is not a text encoding; use codecs.decode() to handle arbitrary codecs - >>> "hello".encode("rot_13") + >>> "hello".encode("rot13") Traceback (most recent call last): File "<stdin>", line 1, in <module> - LookupError: 'rot_13' is not a text encoding; use codecs.encode() to handle arbitrary codecs + LookupError: 'rot13' is not a text encoding; use codecs.encode() to handle arbitrary codecs In a related change, whenever it is feasible without breaking backwards compatibility, exceptions raised during encoding and decoding operations will be wrapped in a chained exception of the same type that mentions the name of the codec responsible for producing the error:: - >>> codecs.decode(b"abcdefgh", "hex_codec") + >>> 
import codecs + + >>> codecs.decode(b"abcdefgh", "hex") binascii.Error: Non-hexadecimal digit found The above exception was the direct cause of the following exception: Traceback (most recent call last): File "<stdin>", line 1, in <module> - binascii.Error: decoding with 'hex_codec' codec failed (Error: Non-hexadecimal digit found) + binascii.Error: decoding with 'hex' codec failed (Error: Non-hexadecimal digit found) - >>> codecs.encode("hello", "bz2_codec") + >>> codecs.encode("hello", "bz2") TypeError: 'str' does not support the buffer interface The above exception was the direct cause of the following exception: Traceback (most recent call last): File "<stdin>", line 1, in <module> - TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface) + TypeError: encoding with 'bz2' codec failed (TypeError: 'str' does not support the buffer interface) + +Finally, as the examples above show, these improvements have permitted +the restoration of the convenience aliases for the non-Unicode codecs that +were themselves restored in Python 3.2. This means that encoding binary data +to and from its hexadecimal representation (for example) can now be written +as:: + + >>> from codecs import encode, decode + >>> encode(b"hello", "hex") + b'68656c6c6f' + >>> decode(b"68656c6c6f", "hex") + b'hello' + +The binary and text transforms provided in the standard library are detailed +in :ref:`binary-transforms` and :ref:`text-transforms`. -(Contributed by Nick Coghlan in :issue:`17827`, :issue:`17828` and -:issue:`19619`) +(Contributed by Nick Coghlan in :issue:`7475`, :issue:`17827`, +:issue:`17828` and :issue:`19619`) .. 
_pep-451: diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index 235deb5..331095b 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -33,9 +33,9 @@ aliases = { 'us' : 'ascii', 'us_ascii' : 'ascii', - ## base64_codec codec - #'base64' : 'base64_codec', - #'base_64' : 'base64_codec', + # base64_codec codec + 'base64' : 'base64_codec', + 'base_64' : 'base64_codec', # big5 codec 'big5_tw' : 'big5', @@ -45,8 +45,8 @@ aliases = { 'big5_hkscs' : 'big5hkscs', 'hkscs' : 'big5hkscs', - ## bz2_codec codec - #'bz2' : 'bz2_codec', + # bz2_codec codec + 'bz2' : 'bz2_codec', # cp037 codec '037' : 'cp037', @@ -248,8 +248,8 @@ aliases = { 'cp936' : 'gbk', 'ms936' : 'gbk', - ## hex_codec codec - #'hex' : 'hex_codec', + # hex_codec codec + 'hex' : 'hex_codec', # hp_roman8 codec 'roman8' : 'hp_roman8', @@ -450,13 +450,13 @@ aliases = { 'cp154' : 'ptcp154', 'cyrillic_asian' : 'ptcp154', - ## quopri_codec codec - #'quopri' : 'quopri_codec', - #'quoted_printable' : 'quopri_codec', - #'quotedprintable' : 'quopri_codec', + # quopri_codec codec + 'quopri' : 'quopri_codec', + 'quoted_printable' : 'quopri_codec', + 'quotedprintable' : 'quopri_codec', - ## rot_13 codec - #'rot13' : 'rot_13', + # rot_13 codec + 'rot13' : 'rot_13', # shift_jis codec 'csshiftjis' : 'shift_jis', @@ -518,12 +518,12 @@ aliases = { 'utf8_ucs2' : 'utf_8', 'utf8_ucs4' : 'utf_8', - ## uu_codec codec - #'uu' : 'uu_codec', + # uu_codec codec + 'uu' : 'uu_codec', - ## zlib_codec codec - #'zip' : 'zlib_codec', - #'zlib' : 'zlib_codec', + # zlib_codec codec + 'zip' : 'zlib_codec', + 'zlib' : 'zlib_codec', # temporary mac CJK aliases, will be replaced by proper codecs in 3.1 'x_mac_japanese' : 'shift_jis', diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 506ba7d..07a6a5e 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -2320,18 +2320,29 @@ bytes_transform_encodings = [ "quopri_codec", "hex_codec", ] + +transform_aliases = { + "base64_codec": 
["base64", "base_64"], + "uu_codec": ["uu"], + "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"], + "hex_codec": ["hex"], + "rot_13": ["rot13"], +} + try: import zlib except ImportError: pass else: bytes_transform_encodings.append("zlib_codec") + transform_aliases["zlib_codec"] = ["zip", "zlib"] try: import bz2 except ImportError: pass else: bytes_transform_encodings.append("bz2_codec") + transform_aliases["bz2_codec"] = ["bz2"] class TransformCodecTest(unittest.TestCase): @@ -2445,6 +2456,15 @@ class TransformCodecTest(unittest.TestCase): # Unfortunately, the bz2 module throws OSError, which the codec # machinery currently can't wrap :( + # Ensure codec aliases from http://bugs.python.org/issue7475 work + def test_aliases(self): + for codec_name, aliases in transform_aliases.items(): + expected_name = codecs.lookup(codec_name).name + for alias in aliases: + with self.subTest(alias=alias): + info = codecs.lookup(alias) + self.assertEqual(info.name, expected_name) + # The codec system tries to wrap exceptions in order to ensure the error # mentions the operation being performed and the codec involved. We |