diff options
Diffstat (limited to 'Lib/tarfile.py')
| -rw-r--r-- | Lib/tarfile.py | 980 | 
1 files changed, 646 insertions, 334 deletions
| diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 54bb1b8..b6dc3ee 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -33,7 +33,7 @@  __version__ = "$Revision$"  # $Source$ -version     = "0.8.0" +version     = "0.9.0"  __author__  = "Lars Gustäbel (lars@gustaebel.de)"  __date__    = "$Date$"  __cvsid__   = "$Id$" @@ -50,6 +50,7 @@ import errno  import time  import struct  import copy +import re  if sys.platform == 'mac':      # This module needs work for MacOS9, especially in the area of pathname @@ -69,42 +70,60 @@ __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]  #---------------------------------------------------------  # tar constants  #--------------------------------------------------------- -NUL        = "\0"               # the null character -BLOCKSIZE  = 512                # length of processing blocks +NUL = "\0"                      # the null character +BLOCKSIZE = 512                 # length of processing blocks  RECORDSIZE = BLOCKSIZE * 20     # length of records -MAGIC      = "ustar"            # magic tar string -VERSION    = "00"               # version number +GNU_MAGIC = "ustar  \0"         # magic gnu tar string +POSIX_MAGIC = "ustar\x0000"     # magic posix tar string -LENGTH_NAME    = 100            # maximum length of a filename -LENGTH_LINK    = 100            # maximum length of a linkname -LENGTH_PREFIX  = 155            # maximum length of the prefix field -MAXSIZE_MEMBER = 077777777777L  # maximum size of a file (11 octal digits) +LENGTH_NAME = 100               # maximum length of a filename +LENGTH_LINK = 100               # maximum length of a linkname +LENGTH_PREFIX = 155             # maximum length of the prefix field -REGTYPE  = "0"                  # regular file +REGTYPE = "0"                   # regular file  AREGTYPE = "\0"                 # regular file -LNKTYPE  = "1"                  # link (inside tarfile) -SYMTYPE  = "2"                  # symbolic link -CHRTYPE  = "3"                  # character special device -BLKTYPE  = "4"                  # block special device -DIRTYPE  = "5"                  # directory +LNKTYPE = "1"                   # link (inside tarfile) +SYMTYPE = "2"                   # symbolic link +CHRTYPE = "3"                   # character special device +BLKTYPE = "4"                   # block special device +DIRTYPE = "5"                   # directory  FIFOTYPE = "6"                  # fifo special device  CONTTYPE = "7"                  # contiguous file -GNUTYPE_LONGNAME = "L"          # GNU tar extension for longnames -GNUTYPE_LONGLINK = "K"          # GNU tar extension for longlink -GNUTYPE_SPARSE   = "S"          # GNU tar extension for sparse file +GNUTYPE_LONGNAME = "L"          # GNU tar longname +GNUTYPE_LONGLINK = "K"          # GNU tar longlink +GNUTYPE_SPARSE = "S"            # GNU tar sparse file + +XHDTYPE = "x"                   # POSIX.1-2001 extended header +XGLTYPE = "g"                   # POSIX.1-2001 global header +SOLARIS_XHDTYPE = "X"           # Solaris extended header + +USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format +GNU_FORMAT = 1                  # GNU tar format +PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format +DEFAULT_FORMAT = GNU_FORMAT  #---------------------------------------------------------  # tarfile constants  #--------------------------------------------------------- -SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,  # file types that tarfile -                   SYMTYPE, DIRTYPE, FIFOTYPE,  # can cope with. +# File types that tarfile supports: +SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, +                   SYMTYPE, DIRTYPE, FIFOTYPE,                     CONTTYPE, CHRTYPE, BLKTYPE,                     GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,                     GNUTYPE_SPARSE) -REGULAR_TYPES = (REGTYPE, AREGTYPE,             # file types that somehow -                 CONTTYPE, GNUTYPE_SPARSE)      # represent regular files +# File types that will be treated as a regular file. +REGULAR_TYPES = (REGTYPE, AREGTYPE, +                 CONTTYPE, GNUTYPE_SPARSE) + +# File types that are part of the GNU tar format. +GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, +             GNUTYPE_SPARSE) + +# Fields from a pax header that override a TarInfo attribute. +PAX_FIELDS = ("path", "linkpath", "size", "mtime", +              "uid", "gid", "uname", "gname")  #---------------------------------------------------------  # Bits used in the mode field, values in octal. @@ -131,6 +150,13 @@ TOWRITE = 0002           # write by other  TOEXEC  = 0001           # execute/search by other  #--------------------------------------------------------- +# initialization +#--------------------------------------------------------- +ENCODING = sys.getfilesystemencoding() +if ENCODING is None: +    ENCODING = "ascii" + +#---------------------------------------------------------  # Some useful functions  #--------------------------------------------------------- @@ -139,6 +165,15 @@ def stn(s, length):      """      return s[:length] + (length - len(s)) * NUL +def nts(s): +    """Convert a null-terminated string field to a python string. +    """ +    # Use the string up to the first null char. +    p = s.find("\0") +    if p == -1: +        return s +    return s[:p] +  def nti(s):      """Convert a number field to a python number.      """ @@ -146,7 +181,7 @@ def nti(s):      # itn() below.      if s[0] != chr(0200):          try: -            n = int(s.rstrip(NUL + " ") or "0", 8) +            n = int(nts(s) or "0", 8)          except ValueError:              raise HeaderError("invalid header")      else: @@ -156,7 +191,7 @@ def nti(s):              n += ord(s[i + 1])      return n -def itn(n, digits=8, posix=False): +def itn(n, digits=8, format=DEFAULT_FORMAT):      """Convert a python number to a number field.      """      # POSIX 1003.1-1988 requires numbers to be encoded as a string of @@ -168,7 +203,7 @@ def itn(n, digits=8, posix=False):      if 0 <= n < 8 ** (digits - 1):          s = "%0*o" % (digits - 1, n) + NUL      else: -        if posix: +        if format != GNU_FORMAT or n >= 256 ** (digits - 1):              raise ValueError("overflow in number field")          if n < 0: @@ -514,7 +549,10 @@ class _Stream:              buf = self.__read(self.bufsize)              if not buf:                  break -            buf = self.cmp.decompress(buf) +            try: +                buf = self.cmp.decompress(buf) +            except IOError: +                raise ReadError("invalid compressed data")              t.append(buf)              c += len(buf)          t = "".join(t) @@ -575,6 +613,7 @@ class _BZ2Proxy(object):      def __init__(self, fileobj, mode):          self.fileobj = fileobj          self.mode = mode +        self.name = getattr(self.fileobj, "name", None)          self.init()      def init(self): @@ -847,8 +886,8 @@ class TarInfo(object):          """Construct a TarInfo object. name is the optional name             of the member.          """ -        self.name = name        # member name (dirnames must end with '/') -        self.mode = 0666        # file permissions +        self.name = name        # member name +        self.mode = 0644        # file permissions          self.uid = 0            # user id          self.gid = 0            # group id          self.size = 0           # file size @@ -856,17 +895,274 @@ class TarInfo(object):          self.chksum = 0         # header checksum          self.type = REGTYPE     # member type          self.linkname = ""      # link name -        self.uname = "user"     # user name -        self.gname = "group"    # group name +        self.uname = "root"     # user name +        self.gname = "root"     # group name          self.devmajor = 0       # device major number          self.devminor = 0       # device minor number          self.offset = 0         # the tar header starts here          self.offset_data = 0    # the file's data starts here +        self.pax_headers = {}   # pax header information + +    # In pax headers the "name" and "linkname" field are called +    # "path" and "linkpath". +    def _getpath(self): +        return self.name +    def _setpath(self, name): +        self.name = name +    path = property(_getpath, _setpath) + +    def _getlinkpath(self): +        return self.linkname +    def _setlinkpath(self, linkname): +        self.linkname = linkname +    linkpath = property(_getlinkpath, _setlinkpath) +      def __repr__(self):          return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) +    def get_info(self): +        """Return the TarInfo's attributes as a dictionary. +        """ +        info = { +            "name":     normpath(self.name), +            "mode":     self.mode & 07777, +            "uid":      self.uid, +            "gid":      self.gid, +            "size":     self.size, +            "mtime":    self.mtime, +            "chksum":   self.chksum, +            "type":     self.type, +            "linkname": normpath(self.linkname) if self.linkname else "", +            "uname":    self.uname, +            "gname":    self.gname, +            "devmajor": self.devmajor, +            "devminor": self.devminor +        } + +        if info["type"] == DIRTYPE and not info["name"].endswith("/"): +            info["name"] += "/" + +        return info + +    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING): +        """Return a tar header as a string of 512 byte blocks. +        """ +        if format == USTAR_FORMAT: +            return self.create_ustar_header() +        elif format == GNU_FORMAT: +            return self.create_gnu_header() +        elif format == PAX_FORMAT: +            return self.create_pax_header(encoding) +        else: +            raise ValueError("invalid format") + +    def create_ustar_header(self): +        """Return the object as a ustar header block. +        """ +        info = self.get_info() +        info["magic"] = POSIX_MAGIC + +        if len(info["linkname"]) > LENGTH_LINK: +            raise ValueError("linkname is too long") + +        if len(info["name"]) > LENGTH_NAME: +            info["prefix"], info["name"] = self._posix_split_name(info["name"]) + +        return self._create_header(info, USTAR_FORMAT) + +    def create_gnu_header(self): +        """Return the object as a GNU header block sequence. +        """ +        info = self.get_info() +        info["magic"] = GNU_MAGIC + +        buf = "" +        if len(info["linkname"]) > LENGTH_LINK: +            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK) + +        if len(info["name"]) > LENGTH_NAME: +            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME) + +        return buf + self._create_header(info, GNU_FORMAT) + +    def create_pax_header(self, encoding): +        """Return the object as a ustar header block. If it cannot be +           represented this way, prepend a pax extended header sequence +           with supplement information. +        """ +        info = self.get_info() +        info["magic"] = POSIX_MAGIC +        pax_headers = self.pax_headers.copy() + +        # Test string fields for values that exceed the field length or cannot +        # be represented in ASCII encoding. +        for name, hname, length in ( +                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), +                ("uname", "uname", 32), ("gname", "gname", 32)): + +            val = info[name].decode(encoding) + +            # Try to encode the string as ASCII. +            try: +                val.encode("ascii") +            except UnicodeEncodeError: +                pax_headers[hname] = val +                continue + +            if len(val) > length: +                if name == "name": +                    # Try to squeeze a longname in the prefix and name fields as in +                    # ustar format. +                    try: +                        info["prefix"], info["name"] = self._posix_split_name(info["name"]) +                    except ValueError: +                        pax_headers[hname] = val +                    else: +                        continue +                else: +                    pax_headers[hname] = val + +        # Test number fields for values that exceed the field limit or values +        # that like to be stored as float. +        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): +            val = info[name] +            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): +                pax_headers[name] = unicode(val) +                info[name] = 0 + +        if pax_headers: +            buf = self._create_pax_generic_header(pax_headers) +        else: +            buf = "" + +        return buf + self._create_header(info, USTAR_FORMAT) + +    @classmethod +    def create_pax_global_header(cls, pax_headers, encoding): +        """Return the object as a pax global header block sequence. +        """ +        new_headers = {} +        for key, val in pax_headers.iteritems(): +            key = cls._to_unicode(key, encoding) +            val = cls._to_unicode(val, encoding) +            new_headers[key] = val +        return cls._create_pax_generic_header(new_headers, type=XGLTYPE) + +    @staticmethod +    def _to_unicode(value, encoding): +        if isinstance(value, unicode): +            return value +        elif isinstance(value, (int, long, float)): +            return unicode(value) +        elif isinstance(value, str): +            return unicode(value, encoding) +        else: +            raise ValueError("unable to convert to unicode: %r" % value) + +    def _posix_split_name(self, name): +        """Split a name longer than 100 chars into a prefix +           and a name part. +        """ +        prefix = name[:LENGTH_PREFIX + 1] +        while prefix and prefix[-1] != "/": +            prefix = prefix[:-1] + +        name = name[len(prefix):] +        prefix = prefix[:-1] + +        if not prefix or len(name) > LENGTH_NAME: +            raise ValueError("name is too long") +        return prefix, name + +    @staticmethod +    def _create_header(info, format): +        """Return a header block. info is a dictionary with file +           information, format must be one of the *_FORMAT constants. +        """ +        parts = [ +            stn(info.get("name", ""), 100), +            itn(info.get("mode", 0) & 07777, 8, format), +            itn(info.get("uid", 0), 8, format), +            itn(info.get("gid", 0), 8, format), +            itn(info.get("size", 0), 12, format), +            itn(info.get("mtime", 0), 12, format), +            "        ", # checksum field +            info.get("type", REGTYPE), +            stn(info.get("linkname", ""), 100), +            stn(info.get("magic", ""), 8), +            stn(info.get("uname", ""), 32), +            stn(info.get("gname", ""), 32), +            itn(info.get("devmajor", 0), 8, format), +            itn(info.get("devminor", 0), 8, format), +            stn(info.get("prefix", ""), 155) +        ] + +        buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts)) +        chksum = calc_chksums(buf[-BLOCKSIZE:])[0] +        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] +        return buf + +    @staticmethod +    def _create_payload(payload): +        """Return the string payload filled with zero bytes +           up to the next 512 byte border. +        """ +        blocks, remainder = divmod(len(payload), BLOCKSIZE) +        if remainder > 0: +            payload += (BLOCKSIZE - remainder) * NUL +        return payload + +    @classmethod +    def _create_gnu_long_header(cls, name, type): +        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence +           for name. +        """ +        name += NUL + +        info = {} +        info["name"] = "././@LongLink" +        info["type"] = type +        info["size"] = len(name) +        info["magic"] = GNU_MAGIC + +        # create extended header + name blocks. +        return cls._create_header(info, USTAR_FORMAT) + \ +                cls._create_payload(name) + +    @classmethod +    def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE): +        """Return a POSIX.1-2001 extended or global header sequence +           that contains a list of keyword, value pairs. The values +           must be unicode objects. +        """ +        records = [] +        for keyword, value in pax_headers.iteritems(): +            keyword = keyword.encode("utf8") +            value = value.encode("utf8") +            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n' +            n = p = 0 +            while True: +                n = l + len(str(p)) +                if n == p: +                    break +                p = n +            records.append("%d %s=%s\n" % (p, keyword, value)) +        records = "".join(records) + +        # We use a hardcoded "././@PaxHeader" name like star does +        # instead of the one that POSIX recommends. +        info = {} +        info["name"] = "././@PaxHeader" +        info["type"] = type +        info["size"] = len(records) +        info["magic"] = POSIX_MAGIC + +        # Create pax header + record blocks. +        return cls._create_header(info, USTAR_FORMAT) + \ +                cls._create_payload(records) +      @classmethod      def frombuf(cls, buf):          """Construct a TarInfo object from a 512 byte string buffer. @@ -880,125 +1176,251 @@ class TarInfo(object):          if chksum not in calc_chksums(buf):              raise HeaderError("bad checksum") -        tarinfo = cls() -        tarinfo.buf = buf -        tarinfo.name = buf[0:100].rstrip(NUL) -        tarinfo.mode = nti(buf[100:108]) -        tarinfo.uid = nti(buf[108:116]) -        tarinfo.gid = nti(buf[116:124]) -        tarinfo.size = nti(buf[124:136]) -        tarinfo.mtime = nti(buf[136:148]) -        tarinfo.chksum = chksum -        tarinfo.type = buf[156:157] -        tarinfo.linkname = buf[157:257].rstrip(NUL) -        tarinfo.uname = buf[265:297].rstrip(NUL) -        tarinfo.gname = buf[297:329].rstrip(NUL) -        tarinfo.devmajor = nti(buf[329:337]) -        tarinfo.devminor = nti(buf[337:345]) -        prefix = buf[345:500].rstrip(NUL) - -        if prefix and not tarinfo.issparse(): -            tarinfo.name = prefix + "/" + tarinfo.name +        obj = cls() +        obj.buf = buf +        obj.name = nts(buf[0:100]) +        obj.mode = nti(buf[100:108]) +        obj.uid = nti(buf[108:116]) +        obj.gid = nti(buf[116:124]) +        obj.size = nti(buf[124:136]) +        obj.mtime = nti(buf[136:148]) +        obj.chksum = chksum +        obj.type = buf[156:157] +        obj.linkname = nts(buf[157:257]) +        obj.uname = nts(buf[265:297]) +        obj.gname = nts(buf[297:329]) +        obj.devmajor = nti(buf[329:337]) +        obj.devminor = nti(buf[337:345]) +        prefix = nts(buf[345:500]) + +        # Old V7 tar format represents a directory as a regular +        # file with a trailing slash. +        if obj.type == AREGTYPE and obj.name.endswith("/"): +            obj.type = DIRTYPE -        return tarinfo +        # Remove redundant slashes from directories. +        if obj.isdir(): +            obj.name = obj.name.rstrip("/") -    def tobuf(self, posix=False): -        """Return a tar header as a string of 512 byte blocks. -        """ -        buf = "" -        type = self.type -        prefix = "" +        # Reconstruct a ustar longname. +        if prefix and obj.type not in GNU_TYPES: +            obj.name = prefix + "/" + obj.name +        return obj -        if self.name.endswith("/"): -            type = DIRTYPE +    @classmethod +    def fromtarfile(cls, tarfile): +        """Return the next TarInfo object from TarFile object +           tarfile. +        """ +        buf = tarfile.fileobj.read(BLOCKSIZE) +        if not buf: +            return +        obj = cls.frombuf(buf) +        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE +        return obj._proc_member(tarfile) -        if type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): -            # Prevent "././@LongLink" from being normalized. -            name = self.name +    #-------------------------------------------------------------------------- +    # The following are methods that are called depending on the type of a +    # member. The entry point is _proc_member() which can be overridden in a +    # subclass to add custom _proc_*() methods. A _proc_*() method MUST +    # implement the following +    # operations: +    # 1. Set self.offset_data to the position where the data blocks begin, +    #    if there is data that follows. +    # 2. Set tarfile.offset to the position where the next member's header will +    #    begin. +    # 3. Return self or another valid TarInfo object. +    def _proc_member(self, tarfile): +        """Choose the right processing method depending on +           the type and call it. +        """ +        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): +            return self._proc_gnulong(tarfile) +        elif self.type == GNUTYPE_SPARSE: +            return self._proc_sparse(tarfile) +        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): +            return self._proc_pax(tarfile)          else: -            name = normpath(self.name) +            return self._proc_builtin(tarfile) -        if type == DIRTYPE: -            # directories should end with '/' -            name += "/" +    def _proc_builtin(self, tarfile): +        """Process a builtin type or an unknown type which +           will be treated as a regular file. +        """ +        self.offset_data = tarfile.fileobj.tell() +        offset = self.offset_data +        if self.isreg() or self.type not in SUPPORTED_TYPES: +            # Skip the following data blocks. +            offset += self._block(self.size) +        tarfile.offset = offset -        linkname = self.linkname -        if linkname: -            # if linkname is empty we end up with a '.' -            linkname = normpath(linkname) +        # Patch the TarInfo object with saved extended +        # header information. +        for keyword, value in tarfile.pax_headers.iteritems(): +            if keyword in PAX_FIELDS: +                setattr(self, keyword, value) +            self.pax_headers[keyword] = value -        if posix: -            if self.size > MAXSIZE_MEMBER: -                raise ValueError("file is too large (>= 8 GB)") +        return self -            if len(self.linkname) > LENGTH_LINK: -                raise ValueError("linkname is too long (>%d)" % (LENGTH_LINK)) +    def _proc_gnulong(self, tarfile): +        """Process the blocks that hold a GNU longname +           or longlink member. +        """ +        buf = tarfile.fileobj.read(self._block(self.size)) -            if len(name) > LENGTH_NAME: -                prefix = name[:LENGTH_PREFIX + 1] -                while prefix and prefix[-1] != "/": -                    prefix = prefix[:-1] +        # Fetch the next header and process it. +        b = tarfile.fileobj.read(BLOCKSIZE) +        t = self.frombuf(b) +        t.offset = self.offset +        next = t._proc_member(tarfile) -                name = name[len(prefix):] -                prefix = prefix[:-1] +        # Patch the TarInfo object from the next header with +        # the longname information. +        next.offset = self.offset +        if self.type == GNUTYPE_LONGNAME: +            next.name = buf.rstrip(NUL) +        elif self.type == GNUTYPE_LONGLINK: +            next.linkname = buf.rstrip(NUL) -                if not prefix or len(name) > LENGTH_NAME: -                    raise ValueError("name is too long") +        return next -        else: -            if len(self.linkname) > LENGTH_LINK: -                buf += self._create_gnulong(self.linkname, GNUTYPE_LONGLINK) +    def _proc_sparse(self, tarfile): +        """Process a GNU sparse header plus extra headers. +        """ +        buf = self.buf +        sp = _ringbuffer() +        pos = 386 +        lastpos = 0L +        realpos = 0L +        # There are 4 possible sparse structs in the +        # first header. +        for i in xrange(4): +            try: +                offset = nti(buf[pos:pos + 12]) +                numbytes = nti(buf[pos + 12:pos + 24]) +            except ValueError: +                break +            if offset > lastpos: +                sp.append(_hole(lastpos, offset - lastpos)) +            sp.append(_data(offset, numbytes, realpos)) +            realpos += numbytes +            lastpos = offset + numbytes +            pos += 24 -            if len(name) > LENGTH_NAME: -                buf += self._create_gnulong(name, GNUTYPE_LONGNAME) +        isextended = ord(buf[482]) +        origsize = nti(buf[483:495]) -        parts = [ -            stn(name, 100), -            itn(self.mode & 07777, 8, posix), -            itn(self.uid, 8, posix), -            itn(self.gid, 8, posix), -            itn(self.size, 12, posix), -            itn(self.mtime, 12, posix), -            "        ", # checksum field -            type, -            stn(self.linkname, 100), -            stn(MAGIC, 6), -            stn(VERSION, 2), -            stn(self.uname, 32), -            stn(self.gname, 32), -            itn(self.devmajor, 8, posix), -            itn(self.devminor, 8, posix), -            stn(prefix, 155) -        ] +        # If the isextended flag is given, +        # there are extra headers to process. +        while isextended == 1: +            buf = tarfile.fileobj.read(BLOCKSIZE) +            pos = 0 +            for i in xrange(21): +                try: +                    offset = nti(buf[pos:pos + 12]) +                    numbytes = nti(buf[pos + 12:pos + 24]) +                except ValueError: +                    break +                if offset > lastpos: +                    sp.append(_hole(lastpos, offset - lastpos)) +                sp.append(_data(offset, numbytes, realpos)) +                realpos += numbytes +                lastpos = offset + numbytes +                pos += 24 +            isextended = ord(buf[504]) -        buf += struct.pack("%ds" % BLOCKSIZE, "".join(parts)) -        chksum = calc_chksums(buf[-BLOCKSIZE:])[0] -        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] -        self.buf = buf -        return buf +        if lastpos < origsize: +            sp.append(_hole(lastpos, origsize - lastpos)) + +        self.sparse = sp -    def _create_gnulong(self, name, type): -        """Create a GNU longname/longlink header from name. -           It consists of an extended tar header, with the length -           of the longname as size, followed by data blocks, -           which contain the longname as a null terminated string. +        self.offset_data = tarfile.fileobj.tell() +        tarfile.offset = self.offset_data + self._block(self.size) +        self.size = origsize + +        return self + +    def _proc_pax(self, tarfile): +        """Process an extended or global header as described in +           POSIX.1-2001.          """ -        name += NUL +        # Read the header information. +        buf = tarfile.fileobj.read(self._block(self.size)) -        tarinfo = self.__class__() -        tarinfo.name = "././@LongLink" -        tarinfo.type = type -        tarinfo.mode = 0 -        tarinfo.size = len(name) - -        # create extended header -        buf = tarinfo.tobuf() -        # create name blocks -        buf += name -        blocks, remainder = divmod(len(name), BLOCKSIZE) -        if remainder > 0: -            buf += (BLOCKSIZE - remainder) * NUL -        return buf +        # A pax header stores supplemental information for either +        # the following file (extended) or all following files +        # (global). +        if self.type == XGLTYPE: +            pax_headers = tarfile.pax_headers +        else: +            pax_headers = tarfile.pax_headers.copy() + +        # Fields in POSIX.1-2001 that are numbers, all other fields +        # are treated as UTF-8 strings. +        type_mapping = { +            "atime":        float, +            "ctime":        float, +            "mtime":        float, +            "uid":          int, +            "gid":          int, +            "size":         int +        } + +        # Parse pax header information. A record looks like that: +        # "%d %s=%s\n" % (length, keyword, value). length is the size +        # of the complete record including the length field itself and +        # the newline. +        regex = re.compile(r"(\d+) ([^=]+)=", re.U) +        pos = 0 +        while True: +            match = regex.match(buf, pos) +            if not match: +                break + +            length, keyword = match.groups() +            length = int(length) +            value = buf[match.end(2) + 1:match.start(1) + length - 1] + +            keyword = keyword.decode("utf8") +            keyword = keyword.encode(tarfile.encoding) + +            value = value.decode("utf8") +            if keyword in type_mapping: +                try: +                    value = type_mapping[keyword](value) +                except ValueError: +                    value = 0 +            else: +                value = value.encode(tarfile.encoding) + +            pax_headers[keyword] = value +            pos += length + +        # Fetch the next header that will be patched with the +        # supplement information from the pax header (extended +        # only). +        t = self.fromtarfile(tarfile) + +        if self.type != XGLTYPE and t is not None: +            # Patch the TarInfo object from the next header with +            # the pax header's information. +            for keyword, value in pax_headers.items(): +                if keyword in PAX_FIELDS: +                    setattr(t, keyword, value) +                pax_headers[keyword] = value +            t.pax_headers = pax_headers.copy() + +        return t + +    def _block(self, count): +        """Round up a byte count by BLOCKSIZE and return it, +           e.g. _block(834) => 1024. +        """ +        blocks, remainder = divmod(count, BLOCKSIZE) +        if remainder: +            blocks += 1 +        return blocks * BLOCKSIZE      def isreg(self):          return self.type in REGULAR_TYPES @@ -1038,12 +1460,18 @@ class TarFile(object):                                  # messages (if debug >= 0). If > 0, errors                                  # are passed to the caller as exceptions. -    posix = False               # If True, generates POSIX.1-1990-compliant -                                # archives (no GNU extensions!) +    format = DEFAULT_FORMAT     # The format to use when creating an archive. + +    encoding = ENCODING         # Transfer UTF-8 strings from POSIX.1-2001 +                                # headers to this encoding. + +    tarinfo = TarInfo           # The default TarInfo class to use. -    fileobject = ExFileObject +    fileobject = ExFileObject   # The default ExFileObject class to use. -    def __init__(self, name=None, mode="r", fileobj=None): +    def __init__(self, name=None, mode="r", fileobj=None, format=None, +            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, +            pax_headers=None, debug=None, errorlevel=None):          """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to             read from an existing archive, 'a' to append data to an existing             file or 'w' to create a new file overwriting an existing one. `mode' @@ -1052,58 +1480,86 @@ class TarFile(object):             can be determined, `mode' is overridden by `fileobj's mode.             `fileobj' is not closed, when TarFile is closed.          """ -        self.name = os.path.abspath(name) -          if len(mode) > 1 or mode not in "raw":              raise ValueError("mode must be 'r', 'a' or 'w'") -        self._mode = mode -        self.mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] +        self.mode = mode +        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]          if not fileobj: -            if self._mode == "a" and not os.path.exists(self.name): +            if self.mode == "a" and not os.path.exists(name):                  # Create nonexistent files in append mode. -                self._mode = "w" -                self.mode = "wb" -            fileobj = file(self.name, self.mode) +                self.mode = "w" +                self._mode = "wb" +            fileobj = file(name, self._mode)              self._extfileobj = False          else: -            if self.name is None and hasattr(fileobj, "name"): -                self.name = os.path.abspath(fileobj.name) +            if name is None and hasattr(fileobj, "name"): +                name = fileobj.name              if hasattr(fileobj, "mode"): -                self.mode = fileobj.mode +                self._mode = fileobj.mode              self._extfileobj = True +        self.name = os.path.abspath(name)          self.fileobj = fileobj -        # Init datastructures +        # Init attributes. +        if format is not None: +            self.format = format +        if tarinfo is not None: +            self.tarinfo = tarinfo +        if dereference is not None: +            self.dereference = dereference +        if ignore_zeros is not None: +            self.ignore_zeros = ignore_zeros +        if encoding is not None: +            self.encoding = encoding +        if debug is not None: +            self.debug = debug +        if errorlevel is not None: +            self.errorlevel = errorlevel + +        # Init datastructures.          self.closed = False          self.members = []       # list of members as TarInfo objects          self._loaded = False    # flag if all members have been read          self.offset = 0L        # current position in the archive file          self.inodes = {}        # dictionary caching the inodes of                                  # archive members already added +        self.pax_headers = {}   # save contents of global pax headers -        if self._mode == "r": +        if self.mode == "r":              self.firstmember = None              self.firstmember = self.next() -        if self._mode == "a": +        if self.mode == "a":              # Move to the end of the archive,              # before the first empty block.              self.firstmember = None              while True: -                try: -                    tarinfo = self.next() -                except ReadError: -                    self.fileobj.seek(0) -                    break -                if tarinfo is None: +                if self.next() is None:                      if self.offset > 0:                          self.fileobj.seek(- BLOCKSIZE, 1)                      break -        if self._mode in "aw": +        if self.mode in "aw":              self._loaded = True +            if pax_headers: +                buf = self.tarinfo.create_pax_global_header( +                        pax_headers.copy(), self.encoding) +                self.fileobj.write(buf) +                self.offset += len(buf) + +    def _getposix(self): +        return self.format == USTAR_FORMAT +    def _setposix(self, value): +        import warnings +        warnings.warn("use the format attribute instead", DeprecationWarning) +        if value: +            self.format = USTAR_FORMAT +        else: +            self.format = GNU_FORMAT +    posix = property(_getposix, _setposix) +      #--------------------------------------------------------------------------      # Below are the classmethods which act as alternate constructors to the      # TarFile class. The open() method is the only one that is needed for @@ -1116,7 +1572,7 @@ class TarFile(object):      # by adding it to the mapping in OPEN_METH.      @classmethod -    def open(cls, name=None, mode="r", fileobj=None, bufsize=20*512): +    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):          """Open a tar archive for reading, writing or appending. Return             an appropriate TarFile class. @@ -1149,8 +1605,8 @@ class TarFile(object):                  if fileobj is not None:                      saved_pos = fileobj.tell()                  try: -                    return func(name, "r", fileobj) -                except (ReadError, CompressionError): +                    return func(name, "r", fileobj, **kwargs) +                except (ReadError, CompressionError), e:                      if fileobj is not None:                          fileobj.seek(saved_pos)                      continue @@ -1167,7 +1623,7 @@ class TarFile(object):                  func = getattr(cls, cls.OPEN_METH[comptype])              else:                  raise CompressionError("unknown compression type %r" % comptype) -            return func(name, filemode, fileobj) +            return func(name, filemode, fileobj, **kwargs)          elif "|" in mode:              filemode, comptype = mode.split("|", 1) @@ -1178,25 +1634,26 @@ class TarFile(object):                  raise ValueError("mode must be 'r' or 'w'")              t = cls(name, filemode, -                    _Stream(name, filemode, comptype, fileobj, bufsize)) +                    _Stream(name, filemode, comptype, fileobj, bufsize), +                    **kwargs)              t._extfileobj = False              return t          elif mode in "aw": -            return cls.taropen(name, mode, fileobj) +            return cls.taropen(name, mode, fileobj, **kwargs)          raise ValueError("undiscernible mode")      @classmethod -    def taropen(cls, name, mode="r", fileobj=None): +    def taropen(cls, name, mode="r", fileobj=None, **kwargs):          """Open uncompressed tar archive name for reading or writing.          """          if len(mode) > 1 or mode not in "raw":              raise ValueError("mode must be 'r', 'a' or 'w'") -        return cls(name, mode, fileobj) +        return cls(name, mode, fileobj, **kwargs)      @classmethod -    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9): +    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):          """Open gzip compressed tar archive name for reading or writing.             Appending is not allowed.          """ @@ -1214,14 +1671,15 @@ class TarFile(object):          try:              t = cls.taropen(name, mode, -                gzip.GzipFile(name, mode, compresslevel, fileobj)) +                gzip.GzipFile(name, mode, compresslevel, fileobj), +                **kwargs)          except IOError:              raise ReadError("not a gzip file")          t._extfileobj = False          return t      @classmethod -    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9): +    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):          """Open bzip2 compressed tar archive name for reading or writing.             Appending is not allowed.          """ @@ -1239,7 +1697,7 @@ class TarFile(object):              fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)          try: -            t = cls.taropen(name, mode, fileobj) +            t = cls.taropen(name, mode, fileobj, **kwargs)          except IOError:              raise ReadError("not a bzip2 file")          t._extfileobj = False @@ -1262,7 +1720,7 @@ class TarFile(object):          if self.closed:              return -        if self._mode in "aw": +        if self.mode in "aw":              self.fileobj.write(NUL * (BLOCKSIZE * 2))              self.offset += (BLOCKSIZE * 2)              # fill up the end with zero-blocks @@ -1328,7 +1786,8 @@ class TarFile(object):          # Now, fill the TarInfo object with          # information specific for the file. -        tarinfo = TarInfo() +        tarinfo = self.tarinfo() +        tarinfo.tarfile = self          # Use os.stat or os.lstat, depending on platform          # and if symlinks shall be resolved. @@ -1344,8 +1803,8 @@ class TarFile(object):          stmd = statres.st_mode          if stat.S_ISREG(stmd):              inode = (statres.st_ino, statres.st_dev) -            if not self.dereference and \ -                    statres.st_nlink > 1 and inode in self.inodes: +            if not self.dereference and statres.st_nlink > 1 and \ +                    inode in self.inodes and arcname != self.inodes[inode]:                  # Is it a hardlink to an already                  # archived file?                  type = LNKTYPE @@ -1422,7 +1881,7 @@ class TarFile(object):                  print "%d-%02d-%02d %02d:%02d:%02d" \                        % time.localtime(tarinfo.mtime)[:6], -            print tarinfo.name, +            print tarinfo.name + ("/" if tarinfo.isdir() else ""),              if verbose:                  if tarinfo.issym(): @@ -1454,7 +1913,7 @@ class TarFile(object):              if recursive:                  if arcname == ".":                      arcname = "" -                for f in os.listdir("."): +                for f in os.listdir(name):                      self.add(f, os.path.join(arcname, f))              return @@ -1493,7 +1952,7 @@ class TarFile(object):          tarinfo = copy.copy(tarinfo) -        buf = tarinfo.tobuf(self.posix) +        buf = tarinfo.tobuf(self.format, self.encoding)          self.fileobj.write(buf)          self.offset += len(buf) @@ -1525,7 +1984,7 @@ class TarFile(object):                  # Extract directory with a safe mode, so that                  # all files below can be extracted as well.                  try: -                    os.makedirs(os.path.join(path, tarinfo.name), 0777) +                    os.makedirs(os.path.join(path, tarinfo.name), 0700)                  except EnvironmentError:                      pass                  directories.append(tarinfo) @@ -1557,10 +2016,10 @@ class TarFile(object):          """          self._check("r") -        if isinstance(member, TarInfo): -            tarinfo = member -        else: +        if isinstance(member, basestring):              tarinfo = self.getmember(member) +        else: +            tarinfo = member          # Prepare the link target for makelink().          if tarinfo.islnk(): @@ -1593,10 +2052,10 @@ class TarFile(object):          """          self._check("r") -        if isinstance(member, TarInfo): -            tarinfo = member -        else: +        if isinstance(member, basestring):              tarinfo = self.getmember(member) +        else: +            tarinfo = member          if tarinfo.isreg():              return self.fileobject(self, tarinfo) @@ -1809,20 +2268,11 @@ class TarFile(object):          # Read the next block.          self.fileobj.seek(self.offset)          while True: -            buf = self.fileobj.read(BLOCKSIZE) -            if not buf: -                return None -              try: -                tarinfo = TarInfo.frombuf(buf) - -                # Set the TarInfo object's offset to the current position of the -                # TarFile and set self.offset to the position where the data blocks -                # should begin. -                tarinfo.offset = self.offset -                self.offset += BLOCKSIZE - -                tarinfo = self.proc_member(tarinfo) +                tarinfo = self.tarinfo.fromtarfile(self) +                if tarinfo is None: +                    return +                self.members.append(tarinfo)              except HeaderError, e:                  if self.ignore_zeros: @@ -1835,149 +2285,11 @@ class TarFile(object):                      return None              break -        # Some old tar programs represent a directory as a regular -        # file with a trailing slash. -        if tarinfo.isreg() and tarinfo.name.endswith("/"): -            tarinfo.type = DIRTYPE - -        # Directory names should have a '/' at the end. -        if tarinfo.isdir(): -            tarinfo.name += "/" - -        self.members.append(tarinfo) -        return tarinfo - -    #-------------------------------------------------------------------------- -    # The following are methods that are called depending on the type of a -    # member. The entry point is proc_member() which is called with a TarInfo -    # object created from the header block from the current offset. The -    # proc_member() method can be overridden in a subclass to add custom -    # proc_*() methods. A proc_*() method MUST implement the following -    # operations: -    # 1. Set tarinfo.offset_data to the position where the data blocks begin, -    #    if there is data that follows. -    # 2. Set self.offset to the position where the next member's header will -    #    begin. -    # 3. Return tarinfo or another valid TarInfo object. -    def proc_member(self, tarinfo): -        """Choose the right processing method for tarinfo depending -           on its type and call it. -        """ -        if tarinfo.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): -            return self.proc_gnulong(tarinfo) -        elif tarinfo.type == GNUTYPE_SPARSE: -            return self.proc_sparse(tarinfo) -        else: -            return self.proc_builtin(tarinfo) - -    def proc_builtin(self, tarinfo): -        """Process a builtin type member or an unknown member -           which will be treated as a regular file. -        """ -        tarinfo.offset_data = self.offset -        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: -            # Skip the following data blocks. -            self.offset += self._block(tarinfo.size) -        return tarinfo - -    def proc_gnulong(self, tarinfo): -        """Process the blocks that hold a GNU longname -           or longlink member. -        """ -        buf = "" -        count = tarinfo.size -        while count > 0: -            block = self.fileobj.read(BLOCKSIZE) -            buf += block -            self.offset += BLOCKSIZE -            count -= BLOCKSIZE - -        # Fetch the next header and process it. -        b = self.fileobj.read(BLOCKSIZE) -        t = TarInfo.frombuf(b) -        t.offset = self.offset -        self.offset += BLOCKSIZE -        next = self.proc_member(t) - -        # Patch the TarInfo object from the next header with -        # the longname information. -        next.offset = tarinfo.offset -        if tarinfo.type == GNUTYPE_LONGNAME: -            next.name = buf.rstrip(NUL) -        elif tarinfo.type == GNUTYPE_LONGLINK: -            next.linkname = buf.rstrip(NUL) - -        return next - -    def proc_sparse(self, tarinfo): -        """Process a GNU sparse header plus extra headers. -        """ -        buf = tarinfo.buf -        sp = _ringbuffer() -        pos = 386 -        lastpos = 0L -        realpos = 0L -        # There are 4 possible sparse structs in the -        # first header. -        for i in xrange(4): -            try: -                offset = nti(buf[pos:pos + 12]) -                numbytes = nti(buf[pos + 12:pos + 24]) -            except ValueError: -                break -            if offset > lastpos: -                sp.append(_hole(lastpos, offset - lastpos)) -            sp.append(_data(offset, numbytes, realpos)) -            realpos += numbytes -            lastpos = offset + numbytes -            pos += 24 - -        isextended = ord(buf[482]) -        origsize = nti(buf[483:495]) - -        # If the isextended flag is given, -        # there are extra headers to process. -        while isextended == 1: -            buf = self.fileobj.read(BLOCKSIZE) -            self.offset += BLOCKSIZE -            pos = 0 -            for i in xrange(21): -                try: -                    offset = nti(buf[pos:pos + 12]) -                    numbytes = nti(buf[pos + 12:pos + 24]) -                except ValueError: -                    break -                if offset > lastpos: -                    sp.append(_hole(lastpos, offset - lastpos)) -                sp.append(_data(offset, numbytes, realpos)) -                realpos += numbytes -                lastpos = offset + numbytes -                pos += 24 -            isextended = ord(buf[504]) - -        if lastpos < origsize: -            sp.append(_hole(lastpos, origsize - lastpos)) - -        tarinfo.sparse = sp - -        tarinfo.offset_data = self.offset -        self.offset += self._block(tarinfo.size) -        tarinfo.size = origsize -          return tarinfo      #--------------------------------------------------------------------------      # Little helper methods: -    def _block(self, count): -        """Round up a byte count by BLOCKSIZE and return it, -           e.g. _block(834) => 1024. -        """ -        blocks, remainder = divmod(count, BLOCKSIZE) -        if remainder: -            blocks += 1 -        return blocks * BLOCKSIZE -      def _getmember(self, name, tarinfo=None):          """Find an archive member by name from bottom to top.             If tarinfo is given, it is used as the starting point. @@ -2010,8 +2322,8 @@ class TarFile(object):          """          if self.closed:              raise IOError("%s is closed" % self.__class__.__name__) -        if mode is not None and self._mode not in mode: -            raise IOError("bad operation for mode %r" % self._mode) +        if mode is not None and self.mode not in mode: +            raise IOError("bad operation for mode %r" % self.mode)      def __iter__(self):          """Provide an iterator object. | 
