From de629d46f2a143e230398a820e9c090295f72ddb Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 5 May 2010 21:43:57 +0000 Subject: Issue #8390: tarfile uses surrogateescape as the default error handler (instead of replace in read mode or strict in write mode) --- Doc/library/tarfile.rst | 21 +++++++++++++-------- Lib/tarfile.py | 12 +++--------- Lib/test/test_tarfile.py | 4 ++-- Misc/NEWS | 3 +++ 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 50a5148..8f68c42 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -218,7 +218,7 @@ be finalized; only the internally used file object will be closed. See the .. versionadded:: 3.2 Added support for the context manager protocol. -.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors=None, pax_headers=None, debug=0, errorlevel=0) +.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=0) All following arguments are optional and can be accessed as instance attributes as well. @@ -267,6 +267,9 @@ be finalized; only the internally used file object will be closed. See the to be handled. The default settings will work for most users. See section :ref:`tar-unicode` for in-depth information. + .. versionchanged:: 3.2 + Use ``'surrogateescape'`` as the default for the *errors* argument. + The *pax_headers* argument is an optional dictionary of strings which will be added as a pax global header if *format* is :const:`PAX_FORMAT`. @@ -449,11 +452,14 @@ It does *not* contain the file's data itself. a :class:`TarInfo` object. -.. method:: TarInfo.tobuf(format=DEFAULT_FORMAT, encoding=ENCODING, errors='strict') +.. 
method:: TarInfo.tobuf(format=DEFAULT_FORMAT, encoding=ENCODING, errors='surrogateescape') Create a string buffer from a :class:`TarInfo` object. For information on the arguments see the constructor of the :class:`TarFile` class. + .. versionchanged:: 3.2 + Use ``'surrogateescape'`` as the default for the *errors* argument. + A ``TarInfo`` object has the following public data attributes: @@ -701,11 +707,10 @@ metadata must be either decoded or encoded. If *encoding* is not set appropriately, this conversion may fail. The *errors* argument defines how characters are treated that cannot be -converted. Possible values are listed in section :ref:`codec-base-classes`. In -read mode the default scheme is ``'replace'``. This avoids unexpected -:exc:`UnicodeError` exceptions and guarantees that an archive can always be -read. In write mode the default value for *errors* is ``'strict'``. This -ensures that name information is not altered unnoticed. +converted. Possible values are listed in section :ref:`codec-base-classes`. +The default scheme is ``'surrogateescape'`` which Python also uses for its +file system calls, see :ref:`os-filenames`. In case of writing :const:`PAX_FORMAT` archives, *encoding* is ignored because -non-ASCII metadata is stored using *UTF-8*. +non-ASCII metadata is stored using *UTF-8*. Storing surrogate characters is not +possible and will raise a :exc:`UnicodeEncodeError`. diff --git a/Lib/tarfile.py b/Lib/tarfile.py index a888d69..81b13a6 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -978,7 +978,7 @@ class TarInfo(object): return info - def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"): + def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"): """Return a tar header as a string of 512 byte blocks. 
""" info = self.get_info() @@ -1490,7 +1490,7 @@ class TarFile(object): def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, - errors=None, pax_headers=None, debug=None, errorlevel=None): + errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1531,13 +1531,7 @@ class TarFile(object): self.ignore_zeros = ignore_zeros if encoding is not None: self.encoding = encoding - - if errors is not None: - self.errors = errors - elif mode == "r": - self.errors = "replace" - else: - self.errors = "strict" + self.errors = errors if pax_headers is not None and self.format == PAX_FORMAT: self.pax_headers = pax_headers diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 43527ef..2db18fe 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -1118,8 +1118,8 @@ class UstarUnicodeTest(unittest.TestCase): if self.format != tarfile.PAX_FORMAT: tar = tarfile.open(tmpname, encoding="ascii") t = tar.getmember("foo") - self.assertEqual(t.uname, "\ufffd\ufffd\ufffd") - self.assertEqual(t.gname, "\ufffd\ufffd\ufffd") + self.assertEqual(t.uname, "\udce4\udcf6\udcfc") + self.assertEqual(t.gname, "\udce4\udcf6\udcfc") class GNUUnicodeTest(UstarUnicodeTest): diff --git a/Misc/NEWS b/Misc/NEWS index f3d27e1..2b76def 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -348,6 +348,9 @@ C-API Library ------- +- Issue #8390: tarfile uses surrogateescape as the default error handler + (instead of replace in read mode or strict in write mode) + - Issue #7755: Use an unencumbered audio file for tests. - Issue #8621: uuid.uuid4() returned the same sequence of values in the -- cgit v0.12