diff options
-rw-r--r-- | Doc/lib/libtarfile.tex | 166 | ||||
-rw-r--r-- | Lib/tarfile.py | 235 | ||||
-rw-r--r-- | Lib/test/test_tarfile.py | 246 | ||||
-rw-r--r-- | Lib/test/testtar.tar | bin | 256000 -> 264704 bytes | |||
-rw-r--r-- | Misc/NEWS | 3 |
5 files changed, 453 insertions, 197 deletions
diff --git a/Doc/lib/libtarfile.tex b/Doc/lib/libtarfile.tex index 73c35ed..54683a7 100644 --- a/Doc/lib/libtarfile.tex +++ b/Doc/lib/libtarfile.tex @@ -133,24 +133,20 @@ Some facts and figures: \versionadded{2.6} \end{excdesc} +Each of the following constants defines a tar archive format that the +\module{tarfile} module is able to create. See section \ref{tar-formats} for +details. + \begin{datadesc}{USTAR_FORMAT} - \POSIX{}.1-1988 (ustar) format. It supports filenames up to a length of - at best 256 characters and linknames up to 100 characters. The maximum - file size is 8 gigabytes. This is an old and limited but widely - supported format. + \POSIX{}.1-1988 (ustar) format. \end{datadesc} \begin{datadesc}{GNU_FORMAT} - GNU tar format. It supports arbitrarily long filenames and linknames and - files bigger than 8 gigabytes. It is the defacto standard on GNU/Linux - systems. + GNU tar format. \end{datadesc} \begin{datadesc}{PAX_FORMAT} - \POSIX{}.1-2001 (pax) format. It is the most flexible format with - virtually no limits. It supports long filenames and linknames, large files - and stores pathnames in a portable way. However, not all tar - implementations today are able to handle pax archives properly. + \POSIX{}.1-2001 (pax) format. \end{datadesc} \begin{datadesc}{DEFAULT_FORMAT} @@ -175,15 +171,15 @@ Some facts and figures: The \class{TarFile} object provides an interface to a tar archive. A tar archive is a sequence of blocks. An archive member (a stored file) is made up -of a header block followed by data blocks. It is possible, to store a file in a +of a header block followed by data blocks. It is possible to store a file in a tar archive several times. Each archive member is represented by a \class{TarInfo} object, see \citetitle{TarInfo Objects} (section \ref{tarinfo-objects}) for details. 
\begin{classdesc}{TarFile}{name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, - ignore_zeros=False, encoding=None, pax_headers=None, debug=0, - errorlevel=0} + ignore_zeros=False, encoding=None, errors=None, pax_headers=None, + debug=0, errorlevel=0} All following arguments are optional and can be accessed as instance attributes as well. @@ -231,18 +227,14 @@ tar archive several times. Each archive member is represented by a If \code{2}, all \emph{non-fatal} errors are raised as \exception{TarError} exceptions as well. - The \var{encoding} argument defines the local character encoding. It - defaults to the value from \function{sys.getfilesystemencoding()} or if - that is \code{None} to \code{"ascii"}. \var{encoding} is used only in - connection with the pax format which stores text data in \emph{UTF-8}. If - it is not set correctly, character conversion will fail with a - \exception{UnicodeError}. + The \var{encoding} and \var{errors} arguments control the way strings are + converted to unicode objects and vice versa. The default settings will work + for most users. See section \ref{tar-unicode} for in-depth information. \versionadded{2.6} - The \var{pax_headers} argument must be a dictionary whose elements are - either unicode objects, numbers or strings that can be decoded to unicode - using \var{encoding}. This information will be added to the archive as a - pax global header. + The \var{pax_headers} argument is an optional dictionary of unicode strings + which will be added as a pax global header if \var{format} is + \constant{PAX_FORMAT}. \versionadded{2.6} \end{classdesc} @@ -287,7 +279,7 @@ tar archive several times. Each archive member is represented by a Extract all members from the archive to the current working directory or directory \var{path}. If optional \var{members} is given, it must be a subset of the list returned by \method{getmembers()}. 
- Directory informations like owner, modification time and permissions are + Directory information like owner, modification time and permissions are set after all members have been extracted. This is done to work around two problems: A directory's modification time is reset each time a file is created in it. And, if a directory's permissions do not allow writing, @@ -365,6 +357,11 @@ tar archive several times. Each archive member is represented by a \deprecated{2.6}{Use the \member{format} attribute instead.} \end{memberdesc} +\begin{memberdesc}{pax_headers} + A dictionary containing key-value pairs of pax global headers. + \versionadded{2.6} +\end{memberdesc} + %----------------- % TarInfo Objects %----------------- @@ -384,8 +381,8 @@ the file's data itself. Create a \class{TarInfo} object. \end{classdesc} -\begin{methoddesc}{frombuf}{} - Create and return a \class{TarInfo} object from a string buffer. +\begin{methoddesc}{frombuf}{buf} + Create and return a \class{TarInfo} object from string buffer \var{buf}. \versionadded[Raises \exception{HeaderError} if the buffer is invalid.]{2.6} \end{methoddesc} @@ -396,10 +393,11 @@ the file's data itself. \versionadded{2.6} \end{methoddesc} -\begin{methoddesc}{tobuf}{\optional{format}} - Create a string buffer from a \class{TarInfo} object. See - \class{TarFile}'s \member{format} argument for information. - \versionchanged[The \var{format} parameter]{2.6} +\begin{methoddesc}{tobuf}{\optional{format\optional{, encoding + \optional{, errors}}}} + Create a string buffer from a \class{TarInfo} object. For information + on the arguments see the constructor of the \class{TarFile} class. + \versionchanged[The arguments were added]{2.6} \end{methoddesc} A \code{TarInfo} object has the following public data attributes: @@ -452,6 +450,12 @@ A \code{TarInfo} object has the following public data attributes: Group name. 
\end{memberdesc} +\begin{memberdesc}{pax_headers} + A dictionary containing key-value pairs of an associated pax + extended header. + \versionadded{2.6} +\end{memberdesc} + A \class{TarInfo} object also provides some convenient query methods: \begin{methoddesc}{isfile}{} @@ -554,3 +558,103 @@ for tarinfo in tar: tar.extract(tarinfo) tar.close() \end{verbatim} + +%------------ +% Tar format +%------------ + +\subsection{Supported tar formats \label{tar-formats}} + +There are three tar formats that can be created with the \module{tarfile} +module: + +\begin{itemize} + +\item +The \POSIX{}.1-1988 ustar format (\constant{USTAR_FORMAT}). It supports +filenames up to a length of at best 256 characters and linknames up to 100 +characters. The maximum file size is 8 gigabytes. This is an old and limited +but widely supported format. + +\item +The GNU tar format (\constant{GNU_FORMAT}). It supports long filenames and +linknames, files bigger than 8 gigabytes and sparse files. It is the de facto +standard on GNU/Linux systems. \module{tarfile} fully supports the GNU tar +extensions for long names; sparse file support is read-only. + +\item +The \POSIX{}.1-2001 pax format (\constant{PAX_FORMAT}). It is the most +flexible format with virtually no limits. It supports long filenames and +linknames, large files and stores pathnames in a portable way. However, not +all tar implementations today are able to handle pax archives properly. + +The \emph{pax} format is an extension to the existing \emph{ustar} format. It +uses extra headers for information that cannot be stored otherwise. There are +two flavours of pax headers: Extended headers only affect the subsequent file +header, whereas global headers are valid for the complete archive and affect all +following files. All the data in a pax header is encoded in \emph{UTF-8} for +portability reasons. 
+ +\end{itemize} + +There are some more variants of the tar format which can be read, but not +created: + +\begin{itemize} + +\item +The ancient V7 format. This is the first tar format from \UNIX{} Seventh +Edition, storing only regular files and directories. Names must not be longer +than 100 characters, and there is no user/group name information. Some archives +have miscalculated header checksums in case of fields with non-\ASCII{} +characters. + +\item +The SunOS tar extended format. This format is a variant of the \POSIX{}.1-2001 +pax format, but is not compatible. + +\end{itemize} + +%---------------- +% Unicode issues +%---------------- + +\subsection{Unicode issues \label{tar-unicode}} + +The tar format was originally conceived to make backups on tape drives with the +main focus on preserving file system information. Nowadays tar archives are +commonly used for file distribution and exchanging archives over networks. One +problem of the original format (that all other formats are merely variants of) +is that there is no concept of supporting different character encodings. +For example, an ordinary tar archive created on a \emph{UTF-8} system cannot be +read correctly on a \emph{Latin-1} system if it contains non-\ASCII{} +characters. Names (i.e. filenames, linknames, user/group names) containing +these characters will appear damaged. Unfortunately, there is no way to +autodetect the encoding of an archive. + +The pax format was designed to solve this problem. It stores non-\ASCII{} names +using the universal character encoding \emph{UTF-8}. When a pax archive is +read, these \emph{UTF-8} names are converted to the encoding of the local +file system. + +The details of unicode conversion are controlled by the \var{encoding} and +\var{errors} keyword arguments of the \class{TarFile} class. + +The default value for \var{encoding} is the local character encoding. It is +deduced from \function{sys.getfilesystemencoding()} and +\function{sys.getdefaultencoding()}. 
In read mode, \var{encoding} is used +exclusively to convert unicode names from a pax archive to strings in the local +character encoding. In write mode, the use of \var{encoding} depends on the +chosen archive format. In case of \constant{PAX_FORMAT}, input names that +contain non-\ASCII{} characters need to be decoded before being stored as +\emph{UTF-8} strings. The other formats do not make use of \var{encoding} +unless unicode objects are used as input names. These are converted to +8-bit character strings before they are added to the archive. + +The \var{errors} argument defines how characters are treated that cannot be +converted to or from \var{encoding}. Possible values are listed in section +\ref{codec-base-classes}. In read mode, there is an additional scheme +\code{'utf-8'} which means that bad characters are replaced by their +\emph{UTF-8} representation. This is the default scheme. In write mode the +default value for \var{errors} is \code{'strict'} to ensure that name +information is not altered unnoticed. diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 4f4a1d9..107041e 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -125,6 +125,17 @@ GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, PAX_FIELDS = ("path", "linkpath", "size", "mtime", "uid", "gid", "uname", "gname") +# Fields in a pax header that are numbers, all other fields +# are treated as strings. +PAX_NUMBER_FIELDS = { + "atime": float, + "ctime": float, + "mtime": float, + "uid": int, + "gid": int, + "size": int +} + #--------------------------------------------------------- # Bits used in the mode field, values in octal. 
#--------------------------------------------------------- @@ -154,7 +165,7 @@ TOEXEC = 0001 # execute/search by other #--------------------------------------------------------- ENCODING = sys.getfilesystemencoding() if ENCODING is None: - ENCODING = "ascii" + ENCODING = sys.getdefaultencoding() #--------------------------------------------------------- # Some useful functions @@ -218,6 +229,26 @@ def itn(n, digits=8, format=DEFAULT_FORMAT): s = chr(0200) + s return s +def uts(s, encoding, errors): + """Convert a unicode object to a string. + """ + if errors == "utf-8": + # An extra error handler similar to the -o invalid=UTF-8 option + # in POSIX.1-2001. Replace untranslatable characters with their + # UTF-8 representation. + try: + return s.encode(encoding, "strict") + except UnicodeEncodeError: + x = [] + for c in s: + try: + x.append(c.encode(encoding, "strict")) + except UnicodeEncodeError: + x.append(c.encode("utf8")) + return "".join(x) + else: + return s.encode(encoding, errors) + def calc_chksums(buf): """Calculate the checksum for a member's header by summing up all characters except for the chksum field which is treated as if @@ -922,7 +953,7 @@ class TarInfo(object): def __repr__(self): return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) - def get_info(self): + def get_info(self, encoding, errors): """Return the TarInfo's attributes as a dictionary. """ info = { @@ -944,24 +975,29 @@ class TarInfo(object): if info["type"] == DIRTYPE and not info["name"].endswith("/"): info["name"] += "/" + for key in ("name", "linkname", "uname", "gname"): + if type(info[key]) is unicode: + info[key] = info[key].encode(encoding, errors) + return info - def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING): + def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"): """Return a tar header as a string of 512 byte blocks. 
""" + info = self.get_info(encoding, errors) + if format == USTAR_FORMAT: - return self.create_ustar_header() + return self.create_ustar_header(info) elif format == GNU_FORMAT: - return self.create_gnu_header() + return self.create_gnu_header(info) elif format == PAX_FORMAT: - return self.create_pax_header(encoding) + return self.create_pax_header(info, encoding, errors) else: raise ValueError("invalid format") - def create_ustar_header(self): + def create_ustar_header(self, info): """Return the object as a ustar header block. """ - info = self.get_info() info["magic"] = POSIX_MAGIC if len(info["linkname"]) > LENGTH_LINK: @@ -972,10 +1008,9 @@ class TarInfo(object): return self._create_header(info, USTAR_FORMAT) - def create_gnu_header(self): + def create_gnu_header(self, info): """Return the object as a GNU header block sequence. """ - info = self.get_info() info["magic"] = GNU_MAGIC buf = "" @@ -987,12 +1022,11 @@ class TarInfo(object): return buf + self._create_header(info, GNU_FORMAT) - def create_pax_header(self, encoding): + def create_pax_header(self, info, encoding, errors): """Return the object as a ustar header block. If it cannot be represented this way, prepend a pax extended header sequence with supplement information. """ - info = self.get_info() info["magic"] = POSIX_MAGIC pax_headers = self.pax_headers.copy() @@ -1002,7 +1036,11 @@ class TarInfo(object): ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), ("uname", "uname", 32), ("gname", "gname", 32)): - val = info[name].decode(encoding) + if hname in pax_headers: + # The pax header has priority. + continue + + val = info[name].decode(encoding, errors) # Try to encode the string as ASCII. try: @@ -1011,27 +1049,23 @@ class TarInfo(object): pax_headers[hname] = val continue - if len(val) > length: - if name == "name": - # Try to squeeze a longname in the prefix and name fields as in - # ustar format. 
- try: - info["prefix"], info["name"] = self._posix_split_name(info["name"]) - except ValueError: - pax_headers[hname] = val - else: - continue - else: - pax_headers[hname] = val + if len(info[name]) > length: + pax_headers[hname] = val # Test number fields for values that exceed the field limit or values # that like to be stored as float. for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): + if name in pax_headers: + # The pax header has priority. Avoid overflow. + info[name] = 0 + continue + val = info[name] if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): pax_headers[name] = unicode(val) info[name] = 0 + # Create a pax extended header if necessary. if pax_headers: buf = self._create_pax_generic_header(pax_headers) else: @@ -1040,26 +1074,10 @@ class TarInfo(object): return buf + self._create_header(info, USTAR_FORMAT) @classmethod - def create_pax_global_header(cls, pax_headers, encoding): + def create_pax_global_header(cls, pax_headers): """Return the object as a pax global header block sequence. 
""" - new_headers = {} - for key, val in pax_headers.iteritems(): - key = cls._to_unicode(key, encoding) - val = cls._to_unicode(val, encoding) - new_headers[key] = val - return cls._create_pax_generic_header(new_headers, type=XGLTYPE) - - @staticmethod - def _to_unicode(value, encoding): - if isinstance(value, unicode): - return value - elif isinstance(value, (int, long, float)): - return unicode(value) - elif isinstance(value, str): - return unicode(value, encoding) - else: - raise ValueError("unable to convert to unicode: %r" % value) + return cls._create_pax_generic_header(pax_headers, type=XGLTYPE) def _posix_split_name(self, name): """Split a name longer than 100 chars into a prefix @@ -1091,9 +1109,9 @@ class TarInfo(object): " ", # checksum field info.get("type", REGTYPE), stn(info.get("linkname", ""), 100), - stn(info.get("magic", ""), 8), - stn(info.get("uname", ""), 32), - stn(info.get("gname", ""), 32), + stn(info.get("magic", POSIX_MAGIC), 8), + stn(info.get("uname", "root"), 32), + stn(info.get("gname", "root"), 32), itn(info.get("devmajor", 0), 8, format), itn(info.get("devminor", 0), 8, format), stn(info.get("prefix", ""), 155) @@ -1254,12 +1272,9 @@ class TarInfo(object): offset += self._block(self.size) tarfile.offset = offset - # Patch the TarInfo object with saved extended + # Patch the TarInfo object with saved global # header information. - for keyword, value in tarfile.pax_headers.iteritems(): - if keyword in PAX_FIELDS: - setattr(self, keyword, value) - self.pax_headers[keyword] = value + self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors) return self @@ -1270,18 +1285,17 @@ class TarInfo(object): buf = tarfile.fileobj.read(self._block(self.size)) # Fetch the next header and process it. 
- b = tarfile.fileobj.read(BLOCKSIZE) - t = self.frombuf(b) - t.offset = self.offset - next = t._proc_member(tarfile) + next = self.fromtarfile(tarfile) + if next is None: + raise HeaderError("missing subsequent header") # Patch the TarInfo object from the next header with # the longname information. next.offset = self.offset if self.type == GNUTYPE_LONGNAME: - next.name = buf.rstrip(NUL) + next.name = nts(buf) elif self.type == GNUTYPE_LONGLINK: - next.linkname = buf.rstrip(NUL) + next.linkname = nts(buf) return next @@ -1356,21 +1370,10 @@ class TarInfo(object): else: pax_headers = tarfile.pax_headers.copy() - # Fields in POSIX.1-2001 that are numbers, all other fields - # are treated as UTF-8 strings. - type_mapping = { - "atime": float, - "ctime": float, - "mtime": float, - "uid": int, - "gid": int, - "size": int - } - # Parse pax header information. A record looks like that: # "%d %s=%s\n" % (length, keyword, value). length is the size # of the complete record including the length field itself and - # the newline. + # the newline. keyword and value are both UTF-8 encoded strings. regex = re.compile(r"(\d+) ([^=]+)=", re.U) pos = 0 while True: @@ -1383,35 +1386,55 @@ class TarInfo(object): value = buf[match.end(2) + 1:match.start(1) + length - 1] keyword = keyword.decode("utf8") - keyword = keyword.encode(tarfile.encoding) - value = value.decode("utf8") - if keyword in type_mapping: - try: - value = type_mapping[keyword](value) - except ValueError: - value = 0 - else: - value = value.encode(tarfile.encoding) pax_headers[keyword] = value pos += length - # Fetch the next header that will be patched with the - # supplement information from the pax header (extended - # only). - t = self.fromtarfile(tarfile) + # Fetch the next header. + next = self.fromtarfile(tarfile) - if self.type != XGLTYPE and t is not None: - # Patch the TarInfo object from the next header with - # the pax header's information. 
- for keyword, value in pax_headers.items(): - if keyword in PAX_FIELDS: - setattr(t, keyword, value) - pax_headers[keyword] = value - t.pax_headers = pax_headers.copy() + if self.type in (XHDTYPE, SOLARIS_XHDTYPE): + if next is None: + raise HeaderError("missing subsequent header") - return t + # Patch the TarInfo object with the extended header info. + next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) + next.offset = self.offset + + if pax_headers.has_key("size"): + # If the extended header replaces the size field, + # we need to recalculate the offset where the next + # header starts. + offset = next.offset_data + if next.isreg() or next.type not in SUPPORTED_TYPES: + offset += next._block(next.size) + tarfile.offset = offset + + return next + + def _apply_pax_info(self, pax_headers, encoding, errors): + """Replace fields with supplemental information from a previous + pax extended or global header. + """ + for keyword, value in pax_headers.iteritems(): + if keyword not in PAX_FIELDS: + continue + + if keyword == "path": + value = value.rstrip("/") + + if keyword in PAX_NUMBER_FIELDS: + try: + value = PAX_NUMBER_FIELDS[keyword](value) + except ValueError: + value = 0 + else: + value = uts(value, encoding, errors) + + setattr(self, keyword, value) + + self.pax_headers = pax_headers.copy() def _block(self, count): """Round up a byte count by BLOCKSIZE and return it, @@ -1462,8 +1485,9 @@ class TarFile(object): format = DEFAULT_FORMAT # The format to use when creating an archive. - encoding = ENCODING # Transfer UTF-8 strings from POSIX.1-2001 - # headers to this encoding. + encoding = ENCODING # Encoding for 8-bit character strings. + + errors = None # Error handler for unicode conversion. tarinfo = TarInfo # The default TarInfo class to use. 
@@ -1471,7 +1495,7 @@ class TarFile(object): def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, - pax_headers=None, debug=None, errorlevel=None): + errors=None, pax_headers=None, debug=None, errorlevel=None): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1512,6 +1536,19 @@ class TarFile(object): self.ignore_zeros = ignore_zeros if encoding is not None: self.encoding = encoding + + if errors is not None: + self.errors = errors + elif mode == "r": + self.errors = "utf-8" + else: + self.errors = "strict" + + if pax_headers is not None and self.format == PAX_FORMAT: + self.pax_headers = pax_headers + else: + self.pax_headers = {} + if debug is not None: self.debug = debug if errorlevel is not None: @@ -1524,7 +1561,6 @@ class TarFile(object): self.offset = 0L # current position in the archive file self.inodes = {} # dictionary caching the inodes of # archive members already added - self.pax_headers = {} # save contents of global pax headers if self.mode == "r": self.firstmember = None @@ -1543,9 +1579,8 @@ class TarFile(object): if self.mode in "aw": self._loaded = True - if pax_headers: - buf = self.tarinfo.create_pax_global_header( - pax_headers.copy(), self.encoding) + if self.pax_headers: + buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) self.fileobj.write(buf) self.offset += len(buf) @@ -1817,8 +1852,6 @@ class TarFile(object): self.inodes[inode] = arcname elif stat.S_ISDIR(stmd): type = DIRTYPE - if arcname[-1:] != "/": - arcname += "/" elif stat.S_ISFIFO(stmd): type = FIFOTYPE elif stat.S_ISLNK(stmd): @@ -1952,7 +1985,7 @@ class TarFile(object): tarinfo = copy.copy(tarinfo) - buf = tarinfo.tobuf(self.format, self.encoding) + buf = tarinfo.tobuf(self.format, self.encoding, self.errors) 
self.fileobj.write(buf) self.offset += len(buf) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index f0fb6b1..04f9ba5 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -1,4 +1,4 @@ -# encoding: iso8859-1 +# -*- coding: iso-8859-15 -*- import sys import os @@ -372,9 +372,9 @@ class LongnameTest(ReadTest): def test_read_longname(self): # Test reading of longname (bug #1471427). - name = self.subdir + "/" + "123/" * 125 + "longname" + longname = self.subdir + "/" + "123/" * 125 + "longname" try: - tarinfo = self.tar.getmember(name) + tarinfo = self.tar.getmember(longname) except KeyError: self.fail("longname not found") self.assert_(tarinfo.type != tarfile.DIRTYPE, "read longname as dirtype") @@ -393,13 +393,24 @@ class LongnameTest(ReadTest): tarinfo = self.tar.getmember(longname) offset = tarinfo.offset self.tar.fileobj.seek(offset) - fobj = StringIO.StringIO(self.tar.fileobj.read(1536)) + fobj = StringIO.StringIO(self.tar.fileobj.read(3 * 512)) self.assertRaises(tarfile.ReadError, tarfile.open, name="foo.tar", fileobj=fobj) + def test_header_offset(self): + # Test if the start offset of the TarInfo object includes + # the preceding extended header. 
+ longname = self.subdir + "/" + "123/" * 125 + "longname" + offset = self.tar.getmember(longname).offset + fobj = open(tarname) + fobj.seek(offset) + tarinfo = tarfile.TarInfo.frombuf(fobj.read(512)) + self.assertEqual(tarinfo.type, self.longnametype) + class GNUReadTest(LongnameTest): subdir = "gnu" + longnametype = tarfile.GNUTYPE_LONGNAME def test_sparse_file(self): tarinfo1 = self.tar.getmember("ustar/sparse") @@ -410,26 +421,40 @@ class GNUReadTest(LongnameTest): "sparse file extraction failed") -class PaxReadTest(ReadTest): +class PaxReadTest(LongnameTest): subdir = "pax" + longnametype = tarfile.XHDTYPE - def test_pax_globheaders(self): + def test_pax_global_headers(self): tar = tarfile.open(tarname, encoding="iso8859-1") + tarinfo = tar.getmember("pax/regtype1") self.assertEqual(tarinfo.uname, "foo") self.assertEqual(tarinfo.gname, "bar") - self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), "ÄÖÜäöüß") + self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), u"ÄÖÜäöüß") tarinfo = tar.getmember("pax/regtype2") self.assertEqual(tarinfo.uname, "") self.assertEqual(tarinfo.gname, "bar") - self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), "ÄÖÜäöüß") + self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), u"ÄÖÜäöüß") tarinfo = tar.getmember("pax/regtype3") self.assertEqual(tarinfo.uname, "tarfile") self.assertEqual(tarinfo.gname, "tarfile") - self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), "ÄÖÜäöüß") + self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), u"ÄÖÜäöüß") + + def test_pax_number_fields(self): + # All following number fields are read from the pax header. 
+ tar = tarfile.open(tarname, encoding="iso8859-1") + tarinfo = tar.getmember("pax/regtype4") + self.assertEqual(tarinfo.size, 7011) + self.assertEqual(tarinfo.uid, 123) + self.assertEqual(tarinfo.gid, 123) + self.assertEqual(tarinfo.mtime, 1041808783.0) + self.assertEqual(type(tarinfo.mtime), float) + self.assertEqual(float(tarinfo.pax_headers["atime"]), 1041808783.0) + self.assertEqual(float(tarinfo.pax_headers["ctime"]), 1041808783.0) class WriteTest(unittest.TestCase): @@ -700,68 +725,161 @@ class PaxWriteTest(GNUWriteTest): n = tar.getmembers()[0].name self.assert_(name == n, "PAX longname creation failed") - def test_iso8859_15_filename(self): - self._test_unicode_filename("iso8859-15") + def test_pax_global_header(self): + pax_headers = { + u"foo": u"bar", + u"uid": u"0", + u"mtime": u"1.23", + u"test": u"äöü", + u"äöü": u"test"} + + tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, \ + pax_headers=pax_headers) + tar.addfile(tarfile.TarInfo("test")) + tar.close() + + # Test if the global header was written correctly. + tar = tarfile.open(tmpname, encoding="iso8859-1") + self.assertEqual(tar.pax_headers, pax_headers) + self.assertEqual(tar.getmembers()[0].pax_headers, pax_headers) + + # Test if all the fields are unicode. + for key, val in tar.pax_headers.iteritems(): + self.assert_(type(key) is unicode) + self.assert_(type(val) is unicode) + if key in tarfile.PAX_NUMBER_FIELDS: + try: + tarfile.PAX_NUMBER_FIELDS[key](val) + except (TypeError, ValueError): + self.fail("unable to convert pax header field") + + def test_pax_extended_header(self): + # The fields from the pax header have priority over the + # TarInfo. 
+ pax_headers = {u"path": u"foo", u"uid": u"123"} + + tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, encoding="iso8859-1") + t = tarfile.TarInfo() + t.name = u"äöü" # non-ASCII + t.uid = 8**8 # too large + t.pax_headers = pax_headers + tar.addfile(t) + tar.close() + + tar = tarfile.open(tmpname, encoding="iso8859-1") + t = tar.getmembers()[0] + self.assertEqual(t.pax_headers, pax_headers) + self.assertEqual(t.name, "foo") + self.assertEqual(t.uid, 123) + + +class UstarUnicodeTest(unittest.TestCase): + # All *UnicodeTests FIXME + + format = tarfile.USTAR_FORMAT + + def test_iso8859_1_filename(self): + self._test_unicode_filename("iso8859-1") + + def test_utf7_filename(self): + self._test_unicode_filename("utf7") def test_utf8_filename(self): self._test_unicode_filename("utf8") - def test_utf16_filename(self): - self._test_unicode_filename("utf16") - def _test_unicode_filename(self, encoding): - tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT) - name = u"\u20ac".encode(encoding) # Euro sign - tar.encoding = encoding + tar = tarfile.open(tmpname, "w", format=self.format, encoding=encoding, errors="strict") + name = u"äöü" tar.addfile(tarfile.TarInfo(name)) tar.close() tar = tarfile.open(tmpname, encoding=encoding) - self.assertEqual(tar.getmembers()[0].name, name) + self.assert_(type(tar.getnames()[0]) is not unicode) + self.assertEqual(tar.getmembers()[0].name, name.encode(encoding)) tar.close() def test_unicode_filename_error(self): - # The euro sign filename cannot be translated to iso8859-1 encoding. 
- tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, encoding="utf8") - name = u"\u20ac".encode("utf8") # Euro sign - tar.addfile(tarfile.TarInfo(name)) + tar = tarfile.open(tmpname, "w", format=self.format, encoding="ascii", errors="strict") + tarinfo = tarfile.TarInfo() + + tarinfo.name = "äöü" + if self.format == tarfile.PAX_FORMAT: + self.assertRaises(UnicodeError, tar.addfile, tarinfo) + else: + tar.addfile(tarinfo) + + tarinfo.name = u"äöü" + self.assertRaises(UnicodeError, tar.addfile, tarinfo) + + tarinfo.name = "foo" + tarinfo.uname = u"äöü" + self.assertRaises(UnicodeError, tar.addfile, tarinfo) + + def test_unicode_argument(self): + tar = tarfile.open(tarname, "r", encoding="iso8859-1", errors="strict") + for t in tar: + self.assert_(type(t.name) is str) + self.assert_(type(t.linkname) is str) + self.assert_(type(t.uname) is str) + self.assert_(type(t.gname) is str) tar.close() - self.assertRaises(UnicodeError, tarfile.open, tmpname, encoding="iso8859-1") + def test_uname_unicode(self): + for name in (u"äöü", "äöü"): + t = tarfile.TarInfo("foo") + t.uname = name + t.gname = name - def test_pax_headers(self): - self._test_pax_headers({"foo": "bar", "uid": 0, "mtime": 1.23}) + fobj = StringIO.StringIO() + tar = tarfile.open("foo.tar", mode="w", fileobj=fobj, format=self.format, encoding="iso8859-1") + tar.addfile(t) + tar.close() + fobj.seek(0) - self._test_pax_headers({"euro": u"\u20ac".encode("utf8")}) + tar = tarfile.open("foo.tar", fileobj=fobj, encoding="iso8859-1") + t = tar.getmember("foo") + self.assertEqual(t.uname, "äöü") + self.assertEqual(t.gname, "äöü") - self._test_pax_headers({"euro": u"\u20ac"}, - {"euro": u"\u20ac".encode("utf8")}) - self._test_pax_headers({u"\u20ac": "euro"}, - {u"\u20ac".encode("utf8"): "euro"}) +class GNUUnicodeTest(UstarUnicodeTest): - def _test_pax_headers(self, pax_headers, cmp_headers=None): - if cmp_headers is None: - cmp_headers = pax_headers + format = tarfile.GNU_FORMAT - tar = tarfile.open(tmpname, 
"w", format=tarfile.PAX_FORMAT, \ - pax_headers=pax_headers, encoding="utf8") - tar.addfile(tarfile.TarInfo("test")) - tar.close() - tar = tarfile.open(tmpname, encoding="utf8") - self.assertEqual(tar.pax_headers, cmp_headers) +class PaxUnicodeTest(UstarUnicodeTest): - def test_truncated_header(self): - tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT) - tarinfo = tarfile.TarInfo("123/" * 126 + "longname") - tar.addfile(tarinfo) + format = tarfile.PAX_FORMAT + + def _create_unicode_name(self, name): + tar = tarfile.open(tmpname, "w", format=self.format) + t = tarfile.TarInfo() + t.pax_headers["path"] = name + tar.addfile(t) tar.close() - # Simulate a premature EOF. - open(tmpname, "rb+").truncate(1536) - tar = tarfile.open(tmpname) - self.assertEqual(tar.getmembers(), []) + def test_error_handlers(self): + # Test if the unicode error handlers work correctly for characters + # that cannot be expressed in a given encoding. + self._create_unicode_name(u"äöü") + + for handler, name in (("utf-8", u"äöü".encode("utf8")), + ("replace", "???"), ("ignore", "")): + tar = tarfile.open(tmpname, format=self.format, encoding="ascii", + errors=handler) + self.assertEqual(tar.getnames()[0], name) + + self.assertRaises(UnicodeError, tarfile.open, tmpname, + encoding="ascii", errors="strict") + + def test_error_handler_utf8(self): + # Create a pathname that has one component representable using + # iso8859-1 and the other only in iso8859-15. 
+ self._create_unicode_name(u"äöü/¤") + + tar = tarfile.open(tmpname, format=self.format, encoding="iso8859-1", + errors="utf-8") + self.assertEqual(tar.getnames()[0], "äöü/" + u"¤".encode("utf8")) class AppendTest(unittest.TestCase): @@ -836,63 +954,58 @@ class LimitsTest(unittest.TestCase): def test_ustar_limits(self): # 100 char name tarinfo = tarfile.TarInfo("0123456789" * 10) - tarinfo.create_ustar_header() + tarinfo.tobuf(tarfile.USTAR_FORMAT) # 101 char name that cannot be stored tarinfo = tarfile.TarInfo("0123456789" * 10 + "0") - self.assertRaises(ValueError, tarinfo.create_ustar_header) + self.assertRaises(ValueError, tarinfo.tobuf, tarfile.USTAR_FORMAT) # 256 char name with a slash at pos 156 tarinfo = tarfile.TarInfo("123/" * 62 + "longname") - tarinfo.create_ustar_header() + tarinfo.tobuf(tarfile.USTAR_FORMAT) # 256 char name that cannot be stored tarinfo = tarfile.TarInfo("1234567/" * 31 + "longname") - self.assertRaises(ValueError, tarinfo.create_ustar_header) + self.assertRaises(ValueError, tarinfo.tobuf, tarfile.USTAR_FORMAT) # 512 char name tarinfo = tarfile.TarInfo("123/" * 126 + "longname") - self.assertRaises(ValueError, tarinfo.create_ustar_header) + self.assertRaises(ValueError, tarinfo.tobuf, tarfile.USTAR_FORMAT) # 512 char linkname tarinfo = tarfile.TarInfo("longlink") tarinfo.linkname = "123/" * 126 + "longname" - self.assertRaises(ValueError, tarinfo.create_ustar_header) + self.assertRaises(ValueError, tarinfo.tobuf, tarfile.USTAR_FORMAT) # uid > 8 digits tarinfo = tarfile.TarInfo("name") tarinfo.uid = 010000000 - self.assertRaises(ValueError, tarinfo.create_ustar_header) + self.assertRaises(ValueError, tarinfo.tobuf, tarfile.USTAR_FORMAT) def test_gnu_limits(self): tarinfo = tarfile.TarInfo("123/" * 126 + "longname") - tarinfo.create_gnu_header() + tarinfo.tobuf(tarfile.GNU_FORMAT) tarinfo = tarfile.TarInfo("longlink") tarinfo.linkname = "123/" * 126 + "longname" - tarinfo.create_gnu_header() + tarinfo.tobuf(tarfile.GNU_FORMAT) # uid >= 
256 ** 7 tarinfo = tarfile.TarInfo("name") tarinfo.uid = 04000000000000000000L - self.assertRaises(ValueError, tarinfo.create_gnu_header) + self.assertRaises(ValueError, tarinfo.tobuf, tarfile.GNU_FORMAT) def test_pax_limits(self): - # A 256 char name that can be stored without an extended header. - tarinfo = tarfile.TarInfo("123/" * 62 + "longname") - self.assert_(len(tarinfo.create_pax_header("utf8")) == 512, - "create_pax_header attached superfluous extended header") - tarinfo = tarfile.TarInfo("123/" * 126 + "longname") - tarinfo.create_pax_header("utf8") + tarinfo.tobuf(tarfile.PAX_FORMAT) tarinfo = tarfile.TarInfo("longlink") tarinfo.linkname = "123/" * 126 + "longname" - tarinfo.create_pax_header("utf8") + tarinfo.tobuf(tarfile.PAX_FORMAT) tarinfo = tarfile.TarInfo("name") tarinfo.uid = 04000000000000000000L - tarinfo.create_pax_header("utf8") + tarinfo.tobuf(tarfile.PAX_FORMAT) class GzipMiscReadTest(MiscReadTest): @@ -940,6 +1053,9 @@ def test_main(): StreamWriteTest, GNUWriteTest, PaxWriteTest, + UstarUnicodeTest, + GNUUnicodeTest, + PaxUnicodeTest, AppendTest, LimitsTest, ] diff --git a/Lib/test/testtar.tar b/Lib/test/testtar.tar Binary files differ index c4c82b8..3529bdf 100644 --- a/Lib/test/testtar.tar +++ b/Lib/test/testtar.tar @@ -220,6 +220,9 @@ Core and builtins Library ------- +- tarfile.py: Improved unicode support. Unicode input names are now + officially supported. Added "errors" argument to the TarFile class. + - urllib.ftpwrapper class now accepts an optional timeout. - shlex.split() now has an optional "posix" parameter. |