diff options
Diffstat (limited to 'Lib/tarfile.py')
| -rw-r--r-- | Lib/tarfile.py | 1097 |
1 files changed, 729 insertions, 368 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py index aac6a5d..98ae12f 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -33,7 +33,7 @@ __version__ = "$Revision$" # $Source$ -version = "0.8.0" +version = "0.9.0" __author__ = "Lars Gustäbel (lars@gustaebel.de)" __date__ = "$Date$" __cvsid__ = "$Id$" @@ -50,6 +50,8 @@ import errno import time import struct import copy +import re +import operator if sys.platform == 'mac': # This module needs work for MacOS9, especially in the area of pathname @@ -69,42 +71,71 @@ __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"] #--------------------------------------------------------- # tar constants #--------------------------------------------------------- -NUL = "\0" # the null character -BLOCKSIZE = 512 # length of processing blocks +NUL = "\0" # the null character +BLOCKSIZE = 512 # length of processing blocks RECORDSIZE = BLOCKSIZE * 20 # length of records -MAGIC = "ustar" # magic tar string -VERSION = "00" # version number +GNU_MAGIC = "ustar \0" # magic gnu tar string +POSIX_MAGIC = "ustar\x0000" # magic posix tar string -LENGTH_NAME = 100 # maximum length of a filename -LENGTH_LINK = 100 # maximum length of a linkname -LENGTH_PREFIX = 155 # maximum length of the prefix field -MAXSIZE_MEMBER = 077777777777L # maximum size of a file (11 octal digits) +LENGTH_NAME = 100 # maximum length of a filename +LENGTH_LINK = 100 # maximum length of a linkname +LENGTH_PREFIX = 155 # maximum length of the prefix field -REGTYPE = "0" # regular file +REGTYPE = "0" # regular file AREGTYPE = "\0" # regular file -LNKTYPE = "1" # link (inside tarfile) -SYMTYPE = "2" # symbolic link -CHRTYPE = "3" # character special device -BLKTYPE = "4" # block special device -DIRTYPE = "5" # directory +LNKTYPE = "1" # link (inside tarfile) +SYMTYPE = "2" # symbolic link +CHRTYPE = "3" # character special device +BLKTYPE = "4" # block special device +DIRTYPE = "5" # directory FIFOTYPE = "6" # fifo special device CONTTYPE = "7" # contiguous file -GNUTYPE_LONGNAME = "L" # GNU tar extension for longnames -GNUTYPE_LONGLINK = "K" # GNU tar extension for longlink -GNUTYPE_SPARSE = "S" # GNU tar extension for sparse file +GNUTYPE_LONGNAME = "L" # GNU tar longname +GNUTYPE_LONGLINK = "K" # GNU tar longlink +GNUTYPE_SPARSE = "S" # GNU tar sparse file + +XHDTYPE = "x" # POSIX.1-2001 extended header +XGLTYPE = "g" # POSIX.1-2001 global header +SOLARIS_XHDTYPE = "X" # Solaris extended header + +USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format +GNU_FORMAT = 1 # GNU tar format +PAX_FORMAT = 2 # POSIX.1-2001 (pax) format +DEFAULT_FORMAT = GNU_FORMAT #--------------------------------------------------------- # tarfile constants #--------------------------------------------------------- -SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, # file types that tarfile - SYMTYPE, DIRTYPE, FIFOTYPE, # can cope with. +# File types that tarfile supports: +SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, + SYMTYPE, DIRTYPE, FIFOTYPE, CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE) -REGULAR_TYPES = (REGTYPE, AREGTYPE, # file types that somehow - CONTTYPE, GNUTYPE_SPARSE) # represent regular files +# File types that will be treated as a regular file. +REGULAR_TYPES = (REGTYPE, AREGTYPE, + CONTTYPE, GNUTYPE_SPARSE) + +# File types that are part of the GNU tar format. +GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, + GNUTYPE_SPARSE) + +# Fields from a pax header that override a TarInfo attribute. +PAX_FIELDS = ("path", "linkpath", "size", "mtime", + "uid", "gid", "uname", "gname") + +# Fields in a pax header that are numbers, all other fields +# are treated as strings. +PAX_NUMBER_FIELDS = { + "atime": float, + "ctime": float, + "mtime": float, + "uid": int, + "gid": int, + "size": int +} #--------------------------------------------------------- # Bits used in the mode field, values in octal. @@ -131,6 +162,13 @@ TOWRITE = 0002 # write by other TOEXEC = 0001 # execute/search by other #--------------------------------------------------------- +# initialization +#--------------------------------------------------------- +ENCODING = sys.getfilesystemencoding() +if ENCODING is None: + ENCODING = sys.getdefaultencoding() + +#--------------------------------------------------------- # Some useful functions #--------------------------------------------------------- @@ -154,7 +192,10 @@ def nti(s): # There are two possible encodings for a number field, see # itn() below. if s[0] != chr(0200): - n = int(nts(s) or "0", 8) + try: + n = int(nts(s) or "0", 8) + except ValueError: + raise HeaderError("invalid header") else: n = 0L for i in xrange(len(s) - 1): @@ -162,7 +203,7 @@ def nti(s): n += ord(s[i + 1]) return n -def itn(n, digits=8, posix=False): +def itn(n, digits=8, format=DEFAULT_FORMAT): """Convert a python number to a number field. """ # POSIX 1003.1-1988 requires numbers to be encoded as a string of @@ -174,7 +215,7 @@ def itn(n, digits=8, posix=False): if 0 <= n < 8 ** (digits - 1): s = "%0*o" % (digits - 1, n) + NUL else: - if posix: + if format != GNU_FORMAT or n >= 256 ** (digits - 1): raise ValueError("overflow in number field") if n < 0: @@ -189,6 +230,26 @@ def itn(n, digits=8, posix=False): s = chr(0200) + s return s +def uts(s, encoding, errors): + """Convert a unicode object to a string. + """ + if errors == "utf-8": + # An extra error handler similar to the -o invalid=UTF-8 option + # in POSIX.1-2001. Replace untranslatable characters with their + # UTF-8 representation. + try: + return s.encode(encoding, "strict") + except UnicodeEncodeError: + x = [] + for c in s: + try: + x.append(c.encode(encoding, "strict")) + except UnicodeEncodeError: + x.append(c.encode("utf8")) + return "".join(x) + else: + return s.encode(encoding, errors) + def calc_chksums(buf): """Calculate the checksum for a member's header by summing up all characters except for the chksum field which is treated as if @@ -289,6 +350,9 @@ class CompressionError(TarError): class StreamError(TarError): """Exception for unsupported operations on stream-like TarFiles.""" pass +class HeaderError(TarError): + """Exception for invalid headers.""" + pass #--------------------------- # internal stream interface @@ -306,7 +370,7 @@ class _LowLevelFile: }[mode] if hasattr(os, "O_BINARY"): mode |= os.O_BINARY - self.fd = os.open(name, mode) + self.fd = os.open(name, mode, 0666) def close(self): os.close(self.fd) @@ -357,7 +421,7 @@ class _Stream: except ImportError: raise CompressionError("zlib module is not available") self.zlib = zlib - self.crc = zlib.crc32("") + self.crc = zlib.crc32("") & 0xffffffffL if mode == "r": self._init_read_gz() else: @@ -395,7 +459,7 @@ class _Stream: """Write string s to the stream. """ if self.comptype == "gz": - self.crc = self.zlib.crc32(s, self.crc) + self.crc = self.zlib.crc32(s, self.crc) & 0xffffffffL self.pos += len(s) if self.comptype != "tar": s = self.cmp.compress(s) @@ -517,7 +581,10 @@ class _Stream: buf = self.__read(self.bufsize) if not buf: break - buf = self.cmp.decompress(buf) + try: + buf = self.cmp.decompress(buf) + except IOError: + raise ReadError("invalid compressed data") t.append(buf) c += len(buf) t = "".join(t) @@ -578,6 +645,7 @@ class _BZ2Proxy(object): def __init__(self, fileobj, mode): self.fileobj = fileobj self.mode = mode + self.name = getattr(self.fileobj, "name", None) self.init() def init(self): @@ -627,7 +695,6 @@ class _BZ2Proxy(object): if self.mode == "w": raw = self.bz2obj.flush() self.fileobj.write(raw) - self.fileobj.close() # class _BZ2Proxy #------------------------ @@ -852,8 +919,8 @@ class TarInfo(object): """Construct a TarInfo object. name is the optional name of the member. """ - self.name = name # member name (dirnames must end with '/') - self.mode = 0666 # file permissions + self.name = name # member name + self.mode = 0644 # file permissions self.uid = 0 # user id self.gid = 0 # group id self.size = 0 # file size @@ -861,147 +928,524 @@ class TarInfo(object): self.chksum = 0 # header checksum self.type = REGTYPE # member type self.linkname = "" # link name - self.uname = "user" # user name - self.gname = "group" # group name + self.uname = "root" # user name + self.gname = "root" # group name self.devmajor = 0 # device major number self.devminor = 0 # device minor number self.offset = 0 # the tar header starts here self.offset_data = 0 # the file's data starts here + self.pax_headers = {} # pax header information + + # In pax headers the "name" and "linkname" field are called + # "path" and "linkpath". + def _getpath(self): + return self.name + def _setpath(self, name): + self.name = name + path = property(_getpath, _setpath) + + def _getlinkpath(self): + return self.linkname + def _setlinkpath(self, linkname): + self.linkname = linkname + linkpath = property(_getlinkpath, _setlinkpath) + def __repr__(self): return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) - @classmethod - def frombuf(cls, buf): - """Construct a TarInfo object from a 512 byte string buffer. + def get_info(self, encoding, errors): + """Return the TarInfo's attributes as a dictionary. + """ + info = { + "name": normpath(self.name), + "mode": self.mode & 07777, + "uid": self.uid, + "gid": self.gid, + "size": self.size, + "mtime": self.mtime, + "chksum": self.chksum, + "type": self.type, + "linkname": normpath(self.linkname) if self.linkname else "", + "uname": self.uname, + "gname": self.gname, + "devmajor": self.devmajor, + "devminor": self.devminor + } + + if info["type"] == DIRTYPE and not info["name"].endswith("/"): + info["name"] += "/" + + for key in ("name", "linkname", "uname", "gname"): + if type(info[key]) is unicode: + info[key] = info[key].encode(encoding, errors) + + return info + + def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"): + """Return a tar header as a string of 512 byte blocks. """ - if len(buf) != BLOCKSIZE: - raise ValueError("truncated header") - if buf.count(NUL) == BLOCKSIZE: - raise ValueError("empty header") - - tarinfo = cls() - tarinfo.buf = buf - tarinfo.name = nts(buf[0:100]) - tarinfo.mode = nti(buf[100:108]) - tarinfo.uid = nti(buf[108:116]) - tarinfo.gid = nti(buf[116:124]) - tarinfo.size = nti(buf[124:136]) - tarinfo.mtime = nti(buf[136:148]) - tarinfo.chksum = nti(buf[148:156]) - tarinfo.type = buf[156:157] - tarinfo.linkname = nts(buf[157:257]) - tarinfo.uname = nts(buf[265:297]) - tarinfo.gname = nts(buf[297:329]) - tarinfo.devmajor = nti(buf[329:337]) - tarinfo.devminor = nti(buf[337:345]) - prefix = nts(buf[345:500]) + info = self.get_info(encoding, errors) + + if format == USTAR_FORMAT: + return self.create_ustar_header(info) + elif format == GNU_FORMAT: + return self.create_gnu_header(info) + elif format == PAX_FORMAT: + return self.create_pax_header(info, encoding, errors) + else: + raise ValueError("invalid format") - if prefix and not tarinfo.issparse(): - tarinfo.name = prefix + "/" + tarinfo.name + def create_ustar_header(self, info): + """Return the object as a ustar header block. + """ + info["magic"] = POSIX_MAGIC - if tarinfo.chksum not in calc_chksums(buf): - raise ValueError("invalid header") - return tarinfo + if len(info["linkname"]) > LENGTH_LINK: + raise ValueError("linkname is too long") - def tobuf(self, posix=False): - """Return a tar header as a string of 512 byte blocks. + if len(info["name"]) > LENGTH_NAME: + info["prefix"], info["name"] = self._posix_split_name(info["name"]) + + return self._create_header(info, USTAR_FORMAT) + + def create_gnu_header(self, info): + """Return the object as a GNU header block sequence. """ + info["magic"] = GNU_MAGIC + buf = "" - type = self.type - prefix = "" + if len(info["linkname"]) > LENGTH_LINK: + buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK) - if self.name.endswith("/"): - type = DIRTYPE + if len(info["name"]) > LENGTH_NAME: + buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME) - if type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): - # Prevent "././@LongLink" from being normalized. - name = self.name - else: - name = normpath(self.name) + return buf + self._create_header(info, GNU_FORMAT) - if type == DIRTYPE: - # directories should end with '/' - name += "/" + def create_pax_header(self, info, encoding, errors): + """Return the object as a ustar header block. If it cannot be + represented this way, prepend a pax extended header sequence + with supplement information. + """ + info["magic"] = POSIX_MAGIC + pax_headers = self.pax_headers.copy() - linkname = self.linkname - if linkname: - # if linkname is empty we end up with a '.' - linkname = normpath(linkname) + # Test string fields for values that exceed the field length or cannot + # be represented in ASCII encoding. + for name, hname, length in ( + ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), + ("uname", "uname", 32), ("gname", "gname", 32)): - if posix: - if self.size > MAXSIZE_MEMBER: - raise ValueError("file is too large (>= 8 GB)") + if hname in pax_headers: + # The pax header has priority. + continue - if len(self.linkname) > LENGTH_LINK: - raise ValueError("linkname is too long (>%d)" % (LENGTH_LINK)) + val = info[name].decode(encoding, errors) - if len(name) > LENGTH_NAME: - prefix = name[:LENGTH_PREFIX + 1] - while prefix and prefix[-1] != "/": - prefix = prefix[:-1] + # Try to encode the string as ASCII. + try: + val.encode("ascii") + except UnicodeEncodeError: + pax_headers[hname] = val + continue + + if len(info[name]) > length: + pax_headers[hname] = val + + # Test number fields for values that exceed the field limit or values + # that like to be stored as float. + for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): + if name in pax_headers: + # The pax header has priority. Avoid overflow. + info[name] = 0 + continue + + val = info[name] + if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): + pax_headers[name] = unicode(val) + info[name] = 0 + + # Create a pax extended header if necessary. + if pax_headers: + buf = self._create_pax_generic_header(pax_headers) + else: + buf = "" - name = name[len(prefix):] - prefix = prefix[:-1] + return buf + self._create_header(info, USTAR_FORMAT) - if not prefix or len(name) > LENGTH_NAME: - raise ValueError("name is too long") + @classmethod + def create_pax_global_header(cls, pax_headers): + """Return the object as a pax global header block sequence. + """ + return cls._create_pax_generic_header(pax_headers, type=XGLTYPE) - else: - if len(self.linkname) > LENGTH_LINK: - buf += self._create_gnulong(self.linkname, GNUTYPE_LONGLINK) + def _posix_split_name(self, name): + """Split a name longer than 100 chars into a prefix + and a name part. + """ + prefix = name[:LENGTH_PREFIX + 1] + while prefix and prefix[-1] != "/": + prefix = prefix[:-1] + + name = name[len(prefix):] + prefix = prefix[:-1] - if len(name) > LENGTH_NAME: - buf += self._create_gnulong(name, GNUTYPE_LONGNAME) + if not prefix or len(name) > LENGTH_NAME: + raise ValueError("name is too long") + return prefix, name + @staticmethod + def _create_header(info, format): + """Return a header block. info is a dictionary with file + information, format must be one of the *_FORMAT constants. + """ parts = [ - stn(name, 100), - itn(self.mode & 07777, 8, posix), - itn(self.uid, 8, posix), - itn(self.gid, 8, posix), - itn(self.size, 12, posix), - itn(self.mtime, 12, posix), + stn(info.get("name", ""), 100), + itn(info.get("mode", 0) & 07777, 8, format), + itn(info.get("uid", 0), 8, format), + itn(info.get("gid", 0), 8, format), + itn(info.get("size", 0), 12, format), + itn(info.get("mtime", 0), 12, format), " ", # checksum field - type, - stn(self.linkname, 100), - stn(MAGIC, 6), - stn(VERSION, 2), - stn(self.uname, 32), - stn(self.gname, 32), - itn(self.devmajor, 8, posix), - itn(self.devminor, 8, posix), - stn(prefix, 155) + info.get("type", REGTYPE), + stn(info.get("linkname", ""), 100), + stn(info.get("magic", POSIX_MAGIC), 8), + stn(info.get("uname", "root"), 32), + stn(info.get("gname", "root"), 32), + itn(info.get("devmajor", 0), 8, format), + itn(info.get("devminor", 0), 8, format), + stn(info.get("prefix", ""), 155) ] - buf += "".join(parts).ljust(BLOCKSIZE, NUL) + buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts)) chksum = calc_chksums(buf[-BLOCKSIZE:])[0] buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] - self.buf = buf return buf - def _create_gnulong(self, name, type): - """Create a GNU longname/longlink header from name. - It consists of an extended tar header, with the length - of the longname as size, followed by data blocks, - which contain the longname as a null terminated string. + @staticmethod + def _create_payload(payload): + """Return the string payload filled with zero bytes + up to the next 512 byte border. + """ + blocks, remainder = divmod(len(payload), BLOCKSIZE) + if remainder > 0: + payload += (BLOCKSIZE - remainder) * NUL + return payload + + @classmethod + def _create_gnu_long_header(cls, name, type): + """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence + for name. """ name += NUL - tarinfo = self.__class__() - tarinfo.name = "././@LongLink" - tarinfo.type = type - tarinfo.mode = 0 - tarinfo.size = len(name) - - # create extended header - buf = tarinfo.tobuf() - # create name blocks - buf += name - blocks, remainder = divmod(len(name), BLOCKSIZE) - if remainder > 0: - buf += (BLOCKSIZE - remainder) * NUL - return buf + info = {} + info["name"] = "././@LongLink" + info["type"] = type + info["size"] = len(name) + info["magic"] = GNU_MAGIC + + # create extended header + name blocks. + return cls._create_header(info, USTAR_FORMAT) + \ + cls._create_payload(name) + + @classmethod + def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE): + """Return a POSIX.1-2001 extended or global header sequence + that contains a list of keyword, value pairs. The values + must be unicode objects. + """ + records = [] + for keyword, value in pax_headers.iteritems(): + keyword = keyword.encode("utf8") + value = value.encode("utf8") + l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' + n = p = 0 + while True: + n = l + len(str(p)) + if n == p: + break + p = n + records.append("%d %s=%s\n" % (p, keyword, value)) + records = "".join(records) + + # We use a hardcoded "././@PaxHeader" name like star does + # instead of the one that POSIX recommends. + info = {} + info["name"] = "././@PaxHeader" + info["type"] = type + info["size"] = len(records) + info["magic"] = POSIX_MAGIC + + # Create pax header + record blocks. + return cls._create_header(info, USTAR_FORMAT) + \ + cls._create_payload(records) + + @classmethod + def frombuf(cls, buf): + """Construct a TarInfo object from a 512 byte string buffer. + """ + if len(buf) != BLOCKSIZE: + raise HeaderError("truncated header") + if buf.count(NUL) == BLOCKSIZE: + raise HeaderError("empty header") + + chksum = nti(buf[148:156]) + if chksum not in calc_chksums(buf): + raise HeaderError("bad checksum") + + obj = cls() + obj.buf = buf + obj.name = nts(buf[0:100]) + obj.mode = nti(buf[100:108]) + obj.uid = nti(buf[108:116]) + obj.gid = nti(buf[116:124]) + obj.size = nti(buf[124:136]) + obj.mtime = nti(buf[136:148]) + obj.chksum = chksum + obj.type = buf[156:157] + obj.linkname = nts(buf[157:257]) + obj.uname = nts(buf[265:297]) + obj.gname = nts(buf[297:329]) + obj.devmajor = nti(buf[329:337]) + obj.devminor = nti(buf[337:345]) + prefix = nts(buf[345:500]) + + # Old V7 tar format represents a directory as a regular + # file with a trailing slash. + if obj.type == AREGTYPE and obj.name.endswith("/"): + obj.type = DIRTYPE + + # Remove redundant slashes from directories. + if obj.isdir(): + obj.name = obj.name.rstrip("/") + + # Reconstruct a ustar longname. + if prefix and obj.type not in GNU_TYPES: + obj.name = prefix + "/" + obj.name + return obj + + @classmethod + def fromtarfile(cls, tarfile): + """Return the next TarInfo object from TarFile object + tarfile. + """ + buf = tarfile.fileobj.read(BLOCKSIZE) + if not buf: + return + obj = cls.frombuf(buf) + obj.offset = tarfile.fileobj.tell() - BLOCKSIZE + return obj._proc_member(tarfile) + + #-------------------------------------------------------------------------- + # The following are methods that are called depending on the type of a + # member. The entry point is _proc_member() which can be overridden in a + # subclass to add custom _proc_*() methods. A _proc_*() method MUST + # implement the following + # operations: + # 1. Set self.offset_data to the position where the data blocks begin, + # if there is data that follows. + # 2. Set tarfile.offset to the position where the next member's header will + # begin. + # 3. Return self or another valid TarInfo object. + def _proc_member(self, tarfile): + """Choose the right processing method depending on + the type and call it. + """ + if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): + return self._proc_gnulong(tarfile) + elif self.type == GNUTYPE_SPARSE: + return self._proc_sparse(tarfile) + elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): + return self._proc_pax(tarfile) + else: + return self._proc_builtin(tarfile) + + def _proc_builtin(self, tarfile): + """Process a builtin type or an unknown type which + will be treated as a regular file. + """ + self.offset_data = tarfile.fileobj.tell() + offset = self.offset_data + if self.isreg() or self.type not in SUPPORTED_TYPES: + # Skip the following data blocks. + offset += self._block(self.size) + tarfile.offset = offset + + # Patch the TarInfo object with saved global + # header information. + self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors) + + return self + + def _proc_gnulong(self, tarfile): + """Process the blocks that hold a GNU longname + or longlink member. + """ + buf = tarfile.fileobj.read(self._block(self.size)) + + # Fetch the next header and process it. + next = self.fromtarfile(tarfile) + if next is None: + raise HeaderError("missing subsequent header") + + # Patch the TarInfo object from the next header with + # the longname information. + next.offset = self.offset + if self.type == GNUTYPE_LONGNAME: + next.name = nts(buf) + elif self.type == GNUTYPE_LONGLINK: + next.linkname = nts(buf) + + return next + + def _proc_sparse(self, tarfile): + """Process a GNU sparse header plus extra headers. + """ + buf = self.buf + sp = _ringbuffer() + pos = 386 + lastpos = 0L + realpos = 0L + # There are 4 possible sparse structs in the + # first header. + for i in xrange(4): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + if offset > lastpos: + sp.append(_hole(lastpos, offset - lastpos)) + sp.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes + pos += 24 + + isextended = ord(buf[482]) + origsize = nti(buf[483:495]) + + # If the isextended flag is given, + # there are extra headers to process. + while isextended == 1: + buf = tarfile.fileobj.read(BLOCKSIZE) + pos = 0 + for i in xrange(21): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + if offset > lastpos: + sp.append(_hole(lastpos, offset - lastpos)) + sp.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes + pos += 24 + isextended = ord(buf[504]) + + if lastpos < origsize: + sp.append(_hole(lastpos, origsize - lastpos)) + + self.sparse = sp + + self.offset_data = tarfile.fileobj.tell() + tarfile.offset = self.offset_data + self._block(self.size) + self.size = origsize + + return self + + def _proc_pax(self, tarfile): + """Process an extended or global header as described in + POSIX.1-2001. + """ + # Read the header information. + buf = tarfile.fileobj.read(self._block(self.size)) + + # A pax header stores supplemental information for either + # the following file (extended) or all following files + # (global). + if self.type == XGLTYPE: + pax_headers = tarfile.pax_headers + else: + pax_headers = tarfile.pax_headers.copy() + + # Parse pax header information. A record looks like that: + # "%d %s=%s\n" % (length, keyword, value). length is the size + # of the complete record including the length field itself and + # the newline. keyword and value are both UTF-8 encoded strings. + regex = re.compile(r"(\d+) ([^=]+)=", re.U) + pos = 0 + while True: + match = regex.match(buf, pos) + if not match: + break + + length, keyword = match.groups() + length = int(length) + value = buf[match.end(2) + 1:match.start(1) + length - 1] + + keyword = keyword.decode("utf8") + value = value.decode("utf8") + + pax_headers[keyword] = value + pos += length + + # Fetch the next header. + next = self.fromtarfile(tarfile) + + if self.type in (XHDTYPE, SOLARIS_XHDTYPE): + if next is None: + raise HeaderError("missing subsequent header") + + # Patch the TarInfo object with the extended header info. + next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) + next.offset = self.offset + + if "size" in pax_headers: + # If the extended header replaces the size field, + # we need to recalculate the offset where the next + # header starts. + offset = next.offset_data + if next.isreg() or next.type not in SUPPORTED_TYPES: + offset += next._block(next.size) + tarfile.offset = offset + + return next + + def _apply_pax_info(self, pax_headers, encoding, errors): + """Replace fields with supplemental information from a previous + pax extended or global header. + """ + for keyword, value in pax_headers.iteritems(): + if keyword not in PAX_FIELDS: + continue + + if keyword == "path": + value = value.rstrip("/") + + if keyword in PAX_NUMBER_FIELDS: + try: + value = PAX_NUMBER_FIELDS[keyword](value) + except ValueError: + value = 0 + else: + value = uts(value, encoding, errors) + + setattr(self, keyword, value) + + self.pax_headers = pax_headers.copy() + + def _block(self, count): + """Round up a byte count by BLOCKSIZE and return it, + e.g. _block(834) => 1024. + """ + blocks, remainder = divmod(count, BLOCKSIZE) + if remainder: + blocks += 1 + return blocks * BLOCKSIZE def isreg(self): return self.type in REGULAR_TYPES @@ -1041,12 +1485,19 @@ class TarFile(object): # messages (if debug >= 0). If > 0, errors # are passed to the caller as exceptions. - posix = False # If True, generates POSIX.1-1990-compliant - # archives (no GNU extensions!) + format = DEFAULT_FORMAT # The format to use when creating an archive. + + encoding = ENCODING # Encoding for 8-bit character strings. + + errors = None # Error handler for unicode conversion. + + tarinfo = TarInfo # The default TarInfo class to use. - fileobject = ExFileObject + fileobject = ExFileObject # The default ExFileObject class to use. - def __init__(self, name=None, mode="r", fileobj=None): + def __init__(self, name=None, mode="r", fileobj=None, format=None, + tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, + errors=None, pax_headers=None, debug=None, errorlevel=None): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1057,22 +1508,55 @@ class TarFile(object): """ if len(mode) > 1 or mode not in "raw": raise ValueError("mode must be 'r', 'a' or 'w'") - self._mode = mode - self.mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] + self.mode = mode + self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] if not fileobj: - fileobj = file(name, self.mode) + if self.mode == "a" and not os.path.exists(name): + # Create nonexistent files in append mode. + self.mode = "w" + self._mode = "wb" + fileobj = bltn_open(name, self._mode) self._extfileobj = False else: if name is None and hasattr(fileobj, "name"): name = fileobj.name if hasattr(fileobj, "mode"): - self.mode = fileobj.mode + self._mode = fileobj.mode self._extfileobj = True self.name = os.path.abspath(name) if name else None self.fileobj = fileobj - # Init datastructures + # Init attributes. + if format is not None: + self.format = format + if tarinfo is not None: + self.tarinfo = tarinfo + if dereference is not None: + self.dereference = dereference + if ignore_zeros is not None: + self.ignore_zeros = ignore_zeros + if encoding is not None: + self.encoding = encoding + + if errors is not None: + self.errors = errors + elif mode == "r": + self.errors = "utf-8" + else: + self.errors = "strict" + + if pax_headers is not None and self.format == PAX_FORMAT: + self.pax_headers = pax_headers + else: + self.pax_headers = {} + + if debug is not None: + self.debug = debug + if errorlevel is not None: + self.errorlevel = errorlevel + + # Init datastructures. self.closed = False self.members = [] # list of members as TarInfo objects self._loaded = False # flag if all members have been read @@ -1081,26 +1565,45 @@ class TarFile(object): self.inodes = {} # dictionary caching the inodes of # archive members already added - if self._mode == "r": - self.firstmember = None - self.firstmember = self.next() - - if self._mode == "a": - # Move to the end of the archive, - # before the first empty block. - self.firstmember = None - while True: - try: - tarinfo = self.next() - except ReadError: - self.fileobj.seek(0) - break - if tarinfo is None: - self.fileobj.seek(- BLOCKSIZE, 1) - break - - if self._mode in "aw": - self._loaded = True + try: + if self.mode == "r": + self.firstmember = None + self.firstmember = self.next() + + if self.mode == "a": + # Move to the end of the archive, + # before the first empty block. + self.firstmember = None + while True: + if self.next() is None: + if self.offset > 0: + self.fileobj.seek(- BLOCKSIZE, 1) + break + + if self.mode in "aw": + self._loaded = True + + if self.pax_headers: + buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) + self.fileobj.write(buf) + self.offset += len(buf) + except: + if not self._extfileobj: + self.fileobj.close() + self.closed = True + raise + + def _getposix(self): + return self.format == USTAR_FORMAT + def _setposix(self, value): + import warnings + warnings.warn("use the format attribute instead", DeprecationWarning, + 2) + if value: + self.format = USTAR_FORMAT + else: + self.format = GNU_FORMAT + posix = property(_getposix, _setposix) #-------------------------------------------------------------------------- # Below are the classmethods which act as alternate constructors to the @@ -1114,7 +1617,7 @@ class TarFile(object): # by adding it to the mapping in OPEN_METH. @classmethod - def open(cls, name=None, mode="r", fileobj=None, bufsize=20*512): + def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): """Open a tar archive for reading, writing or appending. Return an appropriate TarFile class. @@ -1123,7 +1626,7 @@ class TarFile(object): 'r:' open for reading exclusively uncompressed 'r:gz' open for reading with gzip compression 'r:bz2' open for reading with bzip2 compression - 'a' or 'a:' open for appending + 'a' or 'a:' open for appending, creating the file if necessary 'w' or 'w:' open for writing without compression 'w:gz' open for writing with gzip compression 'w:bz2' open for writing with bzip2 compression @@ -1147,8 +1650,8 @@ class TarFile(object): if fileobj is not None: saved_pos = fileobj.tell() try: - return func(name, "r", fileobj) - except (ReadError, CompressionError): + return func(name, "r", fileobj, **kwargs) + except (ReadError, CompressionError), e: if fileobj is not None: fileobj.seek(saved_pos) continue @@ -1165,7 +1668,7 @@ class TarFile(object): func = getattr(cls, cls.OPEN_METH[comptype]) else: raise CompressionError("unknown compression type %r" % comptype) - return func(name, filemode, fileobj) + return func(name, filemode, fileobj, **kwargs) elif "|" in mode: filemode, comptype = mode.split("|", 1) @@ -1176,25 +1679,26 @@ class TarFile(object): raise ValueError("mode must be 'r' or 'w'") t = cls(name, filemode, - _Stream(name, filemode, comptype, fileobj, bufsize)) + _Stream(name, filemode, comptype, fileobj, bufsize), + **kwargs) t._extfileobj = False return t elif mode in "aw": - return cls.taropen(name, mode, fileobj) + return cls.taropen(name, mode, fileobj, **kwargs) raise ValueError("undiscernible mode") @classmethod - def taropen(cls, name, mode="r", fileobj=None): + def taropen(cls, name, mode="r", fileobj=None, **kwargs): """Open uncompressed tar archive name for reading or writing. """ if len(mode) > 1 or mode not in "raw": raise ValueError("mode must be 'r', 'a' or 'w'") - return cls(name, mode, fileobj) + return cls(name, mode, fileobj, **kwargs) @classmethod - def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9): + def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): """Open gzip compressed tar archive name for reading or writing. Appending is not allowed. """ @@ -1208,18 +1712,19 @@ class TarFile(object): raise CompressionError("gzip module is not available") if fileobj is None: - fileobj = file(name, mode + "b") + fileobj = bltn_open(name, mode + "b") try: t = cls.taropen(name, mode, - gzip.GzipFile(name, mode, compresslevel, fileobj)) + gzip.GzipFile(name, mode, compresslevel, fileobj), + **kwargs) except IOError: raise ReadError("not a gzip file") t._extfileobj = False return t @classmethod - def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9): + def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): """Open bzip2 compressed tar archive name for reading or writing. Appending is not allowed. """ @@ -1237,7 +1742,7 @@ class TarFile(object): fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel) try: - t = cls.taropen(name, mode, fileobj) + t = cls.taropen(name, mode, fileobj, **kwargs) except IOError: raise ReadError("not a bzip2 file") t._extfileobj = False @@ -1260,7 +1765,7 @@ class TarFile(object): if self.closed: return - if self._mode in "aw": + if self.mode in "aw": self.fileobj.write(NUL * (BLOCKSIZE * 2)) self.offset += (BLOCKSIZE * 2) # fill up the end with zero-blocks @@ -1276,7 +1781,7 @@ class TarFile(object): def getmember(self, name): """Return a TarInfo object for member `name'. If `name' can not be found in the archive, KeyError is raised. If a member occurs more - than once in the archive, its last occurence is assumed to be the + than once in the archive, its last occurrence is assumed to be the most up-to-date version. """ tarinfo = self._getmember(name) @@ -1326,7 +1831,8 @@ class TarFile(object): # Now, fill the TarInfo object with # information specific for the file. - tarinfo = TarInfo() + tarinfo = self.tarinfo() + tarinfo.tarfile = self # Use os.stat or os.lstat, depending on platform # and if symlinks shall be resolved. @@ -1342,8 +1848,8 @@ class TarFile(object): stmd = statres.st_mode if stat.S_ISREG(stmd): inode = (statres.st_ino, statres.st_dev) - if not self.dereference and \ - statres.st_nlink > 1 and inode in self.inodes: + if not self.dereference and statres.st_nlink > 1 and \ + inode in self.inodes and arcname != self.inodes[inode]: # Is it a hardlink to an already # archived file? type = LNKTYPE @@ -1356,8 +1862,6 @@ class TarFile(object): self.inodes[inode] = arcname elif stat.S_ISDIR(stmd): type = DIRTYPE - if arcname[-1:] != "/": - arcname += "/" elif stat.S_ISFIFO(stmd): type = FIFOTYPE elif stat.S_ISLNK(stmd): @@ -1376,7 +1880,7 @@ class TarFile(object): tarinfo.mode = stmd tarinfo.uid = statres.st_uid tarinfo.gid = statres.st_gid - if stat.S_ISREG(stmd): + if type == REGTYPE: tarinfo.size = statres.st_size else: tarinfo.size = 0L @@ -1420,7 +1924,7 @@ class TarFile(object): print "%d-%02d-%02d %02d:%02d:%02d" \ % time.localtime(tarinfo.mtime)[:6], - print tarinfo.name, + print tarinfo.name + ("/" if tarinfo.isdir() else ""), if verbose: if tarinfo.issym(): @@ -1429,18 +1933,24 @@ class TarFile(object): print "link to", tarinfo.linkname, print - def add(self, name, arcname=None, recursive=True): + def add(self, name, arcname=None, recursive=True, exclude=None): """Add the file `name' to the archive. `name' may be any type of file (directory, fifo, symbolic link, etc.). If given, `arcname' specifies an alternative name for the file in the archive. Directories are added recursively by default. This can be avoided by - setting `recursive' to False. + setting `recursive' to False. `exclude' is a function that should + return True for each filename to be excluded. """ self._check("aw") if arcname is None: arcname = name + # Exclude pathnames. + if exclude is not None and exclude(name): + self._dbg(2, "tarfile: Excluded %r" % name) + return + # Skip if somebody tries to archive the archive... if self.name is not None and os.path.abspath(name) == self.name: self._dbg(2, "tarfile: Skipped %r" % name) @@ -1452,8 +1962,8 @@ class TarFile(object): if recursive: if arcname == ".": arcname = "" - for f in os.listdir("."): - self.add(f, os.path.join(arcname, f)) + for f in os.listdir(name): + self.add(f, os.path.join(arcname, f), recursive, exclude) return self._dbg(1, name) @@ -1467,7 +1977,7 @@ class TarFile(object): # Append the tar header and data to the archive. if tarinfo.isreg(): - f = file(name, "rb") + f = bltn_open(name, "rb") self.addfile(tarinfo, f) f.close() @@ -1475,7 +1985,7 @@ class TarFile(object): self.addfile(tarinfo) if recursive: for f in os.listdir(name): - self.add(os.path.join(name, f), os.path.join(arcname, f)) + self.add(os.path.join(name, f), os.path.join(arcname, f), recursive, exclude) else: self.addfile(tarinfo) @@ -1491,7 +2001,7 @@ class TarFile(object): tarinfo = copy.copy(tarinfo) - buf = tarinfo.tobuf(self.posix) + buf = tarinfo.tobuf(self.format, self.encoding, self.errors) self.fileobj.write(buf) self.offset += len(buf) @@ -1527,7 +2037,7 @@ class TarFile(object): self.extract(tarinfo, path) # Reverse sort directories. - directories.sort(lambda a, b: cmp(a.name, b.name)) + directories.sort(key=operator.attrgetter('name')) directories.reverse() # Set correct owner, mtime and filemode on directories. @@ -1551,10 +2061,10 @@ class TarFile(object): """ self._check("r") - if isinstance(member, TarInfo): - tarinfo = member - else: + if isinstance(member, basestring): tarinfo = self.getmember(member) + else: + tarinfo = member # Prepare the link target for makelink(). if tarinfo.islnk(): @@ -1587,10 +2097,10 @@ class TarFile(object): """ self._check("r") - if isinstance(member, TarInfo): - tarinfo = member - else: + if isinstance(member, basestring): tarinfo = self.getmember(member) + else: + tarinfo = member if tarinfo.isreg(): return self.fileobject(self, tarinfo) @@ -1678,7 +2188,7 @@ class TarFile(object): """Make a file called targetpath. """ source = self.extractfile(tarinfo) - target = file(targetpath, "wb") + target = bltn_open(targetpath, "wb") copyfileobj(source, target) source.close() target.close() @@ -1783,10 +2293,6 @@ class TarFile(object): """ if not hasattr(os, 'utime'): return - if sys.platform == "win32" and tarinfo.isdir(): - # According to msdn.microsoft.com, it is an error (EACCES) - # to use utime() on directories. - return try: os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) except EnvironmentError, e: @@ -1807,177 +2313,28 @@ class TarFile(object): # Read the next block. self.fileobj.seek(self.offset) while True: - buf = self.fileobj.read(BLOCKSIZE) - if not buf: - return None - try: - tarinfo = TarInfo.frombuf(buf) - - # Set the TarInfo object's offset to the current position of the - # TarFile and set self.offset to the position where the data blocks - # should begin. - tarinfo.offset = self.offset - self.offset += BLOCKSIZE - - tarinfo = self.proc_member(tarinfo) + tarinfo = self.tarinfo.fromtarfile(self) + if tarinfo is None: + return + self.members.append(tarinfo) - except ValueError, e: + except HeaderError, e: if self.ignore_zeros: - self._dbg(2, "0x%X: empty or invalid block: %s" % - (self.offset, e)) + self._dbg(2, "0x%X: %s" % (self.offset, e)) self.offset += BLOCKSIZE continue else: if self.offset == 0: - raise ReadError("empty, unreadable or compressed " - "file: %s" % e) + raise ReadError(str(e)) return None break - # Some old tar programs represent a directory as a regular - # file with a trailing slash. - if tarinfo.isreg() and tarinfo.name.endswith("/"): - tarinfo.type = DIRTYPE - - # Directory names should have a '/' at the end. - if tarinfo.isdir() and not tarinfo.name.endswith("/"): - tarinfo.name += "/" - - self.members.append(tarinfo) - return tarinfo - - #-------------------------------------------------------------------------- - # The following are methods that are called depending on the type of a - # member. The entry point is proc_member() which is called with a TarInfo - # object created from the header block from the current offset. The - # proc_member() method can be overridden in a subclass to add custom - # proc_*() methods. A proc_*() method MUST implement the following - # operations: - # 1. Set tarinfo.offset_data to the position where the data blocks begin, - # if there is data that follows. - # 2. Set self.offset to the position where the next member's header will - # begin. - # 3. Return tarinfo or another valid TarInfo object. - def proc_member(self, tarinfo): - """Choose the right processing method for tarinfo depending - on its type and call it. - """ - if tarinfo.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): - return self.proc_gnulong(tarinfo) - elif tarinfo.type == GNUTYPE_SPARSE: - return self.proc_sparse(tarinfo) - else: - return self.proc_builtin(tarinfo) - - def proc_builtin(self, tarinfo): - """Process a builtin type member or an unknown member - which will be treated as a regular file. - """ - tarinfo.offset_data = self.offset - if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: - # Skip the following data blocks. - self.offset += self._block(tarinfo.size) - return tarinfo - - def proc_gnulong(self, tarinfo): - """Process the blocks that hold a GNU longname - or longlink member. - """ - buf = "" - count = tarinfo.size - while count > 0: - block = self.fileobj.read(BLOCKSIZE) - buf += block - self.offset += BLOCKSIZE - count -= BLOCKSIZE - - # Fetch the next header and process it. - b = self.fileobj.read(BLOCKSIZE) - t = TarInfo.frombuf(b) - t.offset = self.offset - self.offset += BLOCKSIZE - next = self.proc_member(t) - - # Patch the TarInfo object from the next header with - # the longname information. - next.offset = tarinfo.offset - if tarinfo.type == GNUTYPE_LONGNAME: - next.name = nts(buf) - elif tarinfo.type == GNUTYPE_LONGLINK: - next.linkname = nts(buf) - - return next - - def proc_sparse(self, tarinfo): - """Process a GNU sparse header plus extra headers. - """ - buf = tarinfo.buf - sp = _ringbuffer() - pos = 386 - lastpos = 0L - realpos = 0L - # There are 4 possible sparse structs in the - # first header. - for i in xrange(4): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 - - isextended = ord(buf[482]) - origsize = nti(buf[483:495]) - - # If the isextended flag is given, - # there are extra headers to process. - while isextended == 1: - buf = self.fileobj.read(BLOCKSIZE) - self.offset += BLOCKSIZE - pos = 0 - for i in xrange(21): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 - isextended = ord(buf[504]) - - if lastpos < origsize: - sp.append(_hole(lastpos, origsize - lastpos)) - - tarinfo.sparse = sp - - tarinfo.offset_data = self.offset - self.offset += self._block(tarinfo.size) - tarinfo.size = origsize - return tarinfo #-------------------------------------------------------------------------- # Little helper methods: - def _block(self, count): - """Round up a byte count by BLOCKSIZE and return it, - e.g. _block(834) => 1024. - """ - blocks, remainder = divmod(count, BLOCKSIZE) - if remainder: - blocks += 1 - return blocks * BLOCKSIZE - def _getmember(self, name, tarinfo=None): """Find an archive member by name from bottom to top. If tarinfo is given, it is used as the starting point. @@ -2010,8 +2367,8 @@ class TarFile(object): """ if self.closed: raise IOError("%s is closed" % self.__class__.__name__) - if mode is not None and self._mode not in mode: - raise IOError("bad operation for mode %r" % self._mode) + if mode is not None and self.mode not in mode: + raise IOError("bad operation for mode %r" % self.mode) def __iter__(self): """Provide an iterator object. @@ -2117,6 +2474,9 @@ class TarFileCompat: ZipFile class. """ def __init__(self, file, mode="r", compression=TAR_PLAIN): + from warnings import warnpy3k + warnpy3k("the TarFileCompat class has been removed in Python 3.0", + stacklevel=2) if compression == TAR_PLAIN: self.tarfile = TarFile.taropen(file, mode) elif compression == TAR_GZIPPED: @@ -2150,10 +2510,10 @@ class TarFileCompat: except ImportError: from StringIO import StringIO import calendar - zinfo.name = zinfo.filename - zinfo.size = zinfo.file_size - zinfo.mtime = calendar.timegm(zinfo.date_time) - self.tarfile.addfile(zinfo, StringIO(bytes)) + tinfo = TarInfo(zinfo.filename) + tinfo.size = len(bytes) + tinfo.mtime = calendar.timegm(zinfo.date_time) + self.tarfile.addfile(tinfo, StringIO(bytes)) def close(self): self.tarfile.close() #class TarFileCompat @@ -2172,4 +2532,5 @@ def is_tarfile(name): except TarError: return False +bltn_open = open open = TarFile.open |
