Diffstat (limited to 'Lib/tarfile.py')
-rw-r--r-- | Lib/tarfile.py | 980
1 file changed, 646 insertions, 334 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 963127c..efade27 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -33,7 +33,7 @@ __version__ = "$Revision$" # $Source$ -version = "0.8.0" +version = "0.9.0" __author__ = "Lars Gustäbel (lars@gustaebel.de)" __date__ = "$Date$" __cvsid__ = "$Id$" @@ -50,6 +50,7 @@ import errno import time import struct import copy +import re if sys.platform == 'mac': # This module needs work for MacOS9, especially in the area of pathname @@ -71,42 +72,60 @@ from __builtin__ import open as _open # Since 'open' is TarFile.open #--------------------------------------------------------- # tar constants #--------------------------------------------------------- -NUL = "\0" # the null character -BLOCKSIZE = 512 # length of processing blocks +NUL = "\0" # the null character +BLOCKSIZE = 512 # length of processing blocks RECORDSIZE = BLOCKSIZE * 20 # length of records -MAGIC = "ustar" # magic tar string -VERSION = "00" # version number +GNU_MAGIC = "ustar \0" # magic gnu tar string +POSIX_MAGIC = "ustar\x0000" # magic posix tar string -LENGTH_NAME = 100 # maximum length of a filename -LENGTH_LINK = 100 # maximum length of a linkname -LENGTH_PREFIX = 155 # maximum length of the prefix field -MAXSIZE_MEMBER = 077777777777 # maximum size of a file (11 octal digits) +LENGTH_NAME = 100 # maximum length of a filename +LENGTH_LINK = 100 # maximum length of a linkname +LENGTH_PREFIX = 155 # maximum length of the prefix field -REGTYPE = "0" # regular file +REGTYPE = "0" # regular file AREGTYPE = "\0" # regular file -LNKTYPE = "1" # link (inside tarfile) -SYMTYPE = "2" # symbolic link -CHRTYPE = "3" # character special device -BLKTYPE = "4" # block special device -DIRTYPE = "5" # directory +LNKTYPE = "1" # link (inside tarfile) +SYMTYPE = "2" # symbolic link +CHRTYPE = "3" # character special device +BLKTYPE = "4" # block special device +DIRTYPE = "5" # directory FIFOTYPE = "6" # fifo special device CONTTYPE = "7" # contiguous file -GNUTYPE_LONGNAME = "L" # GNU tar extension for longnames -GNUTYPE_LONGLINK = "K" # GNU tar extension for longlink -GNUTYPE_SPARSE = "S" # GNU tar extension for sparse file +GNUTYPE_LONGNAME = "L" # GNU tar longname +GNUTYPE_LONGLINK = "K" # GNU tar longlink +GNUTYPE_SPARSE = "S" # GNU tar sparse file + +XHDTYPE = "x" # POSIX.1-2001 extended header +XGLTYPE = "g" # POSIX.1-2001 global header +SOLARIS_XHDTYPE = "X" # Solaris extended header + +USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format +GNU_FORMAT = 1 # GNU tar format +PAX_FORMAT = 2 # POSIX.1-2001 (pax) format +DEFAULT_FORMAT = GNU_FORMAT #--------------------------------------------------------- # tarfile constants #--------------------------------------------------------- -SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, # file types that tarfile - SYMTYPE, DIRTYPE, FIFOTYPE, # can cope with. +# File types that tarfile supports: +SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, + SYMTYPE, DIRTYPE, FIFOTYPE, CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, GNUTYPE_SPARSE) -REGULAR_TYPES = (REGTYPE, AREGTYPE, # file types that somehow - CONTTYPE, GNUTYPE_SPARSE) # represent regular files +# File types that will be treated as a regular file. +REGULAR_TYPES = (REGTYPE, AREGTYPE, + CONTTYPE, GNUTYPE_SPARSE) + +# File types that are part of the GNU tar format. +GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, + GNUTYPE_SPARSE) + +# Fields from a pax header that override a TarInfo attribute. 
+PAX_FIELDS = ("path", "linkpath", "size", "mtime", + "uid", "gid", "uname", "gname") #--------------------------------------------------------- # Bits used in the mode field, values in octal. @@ -133,6 +152,13 @@ TOWRITE = 0002 # write by other TOEXEC = 0001 # execute/search by other #--------------------------------------------------------- +# initialization +#--------------------------------------------------------- +ENCODING = sys.getfilesystemencoding() +if ENCODING is None: + ENCODING = "ascii" + +#--------------------------------------------------------- # Some useful functions #--------------------------------------------------------- @@ -141,6 +167,15 @@ def stn(s, length): """ return s[:length] + (length - len(s)) * NUL +def nts(s): + """Convert a null-terminated string field to a python string. + """ + # Use the string up to the first null char. + p = s.find("\0") + if p == -1: + return s + return s[:p] + def nti(s): """Convert a number field to a python number. """ @@ -148,7 +183,7 @@ def nti(s): # itn() below. if s[0] != chr(0200): try: - n = int(s.rstrip(NUL + " ") or "0", 8) + n = int(nts(s) or "0", 8) except ValueError: raise HeaderError("invalid header") else: @@ -158,7 +193,7 @@ def nti(s): n += ord(s[i + 1]) return n -def itn(n, digits=8, posix=False): +def itn(n, digits=8, format=DEFAULT_FORMAT): """Convert a python number to a number field. """ # POSIX 1003.1-1988 requires numbers to be encoded as a string of @@ -170,7 +205,7 @@ def itn(n, digits=8, posix=False): if 0 <= n < 8 ** (digits - 1): s = "%0*o" % (digits - 1, n) + NUL else: - if posix: + if format != GNU_FORMAT or n >= 256 ** (digits - 1): raise ValueError("overflow in number field") if n < 0: @@ -516,7 +551,10 @@ class _Stream: buf = self.__read(self.bufsize) if not buf: break - buf = self.cmp.decompress(buf) + try: + buf = self.cmp.decompress(buf) + except IOError: + raise ReadError("invalid compressed data") t.append(buf) c += len(buf) t = "".join(t) @@ -577,6 +615,7 @@ class _BZ2Proxy(object): def __init__(self, fileobj, mode): self.fileobj = fileobj self.mode = mode + self.name = getattr(self.fileobj, "name", None) self.init() def init(self): @@ -849,8 +888,8 @@ class TarInfo(object): """Construct a TarInfo object. name is the optional name of the member. """ - self.name = name # member name (dirnames must end with '/') - self.mode = 0666 # file permissions + self.name = name # member name + self.mode = 0644 # file permissions self.uid = 0 # user id self.gid = 0 # group id self.size = 0 # file size @@ -858,17 +897,274 @@ class TarInfo(object): self.chksum = 0 # header checksum self.type = REGTYPE # member type self.linkname = "" # link name - self.uname = "user" # user name - self.gname = "group" # group name + self.uname = "root" # user name + self.gname = "root" # group name self.devmajor = 0 # device major number self.devminor = 0 # device minor number self.offset = 0 # the tar header starts here self.offset_data = 0 # the file's data starts here + self.pax_headers = {} # pax header information + + # In pax headers the "name" and "linkname" field are called + # "path" and "linkpath". 
+ def _getpath(self): + return self.name + def _setpath(self, name): + self.name = name + path = property(_getpath, _setpath) + + def _getlinkpath(self): + return self.linkname + def _setlinkpath(self, linkname): + self.linkname = linkname + linkpath = property(_getlinkpath, _setlinkpath) + def __repr__(self): return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) + def get_info(self): + """Return the TarInfo's attributes as a dictionary. + """ + info = { + "name": normpath(self.name), + "mode": self.mode & 07777, + "uid": self.uid, + "gid": self.gid, + "size": self.size, + "mtime": self.mtime, + "chksum": self.chksum, + "type": self.type, + "linkname": normpath(self.linkname) if self.linkname else "", + "uname": self.uname, + "gname": self.gname, + "devmajor": self.devmajor, + "devminor": self.devminor + } + + if info["type"] == DIRTYPE and not info["name"].endswith("/"): + info["name"] += "/" + + return info + + def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING): + """Return a tar header as a string of 512 byte blocks. + """ + if format == USTAR_FORMAT: + return self.create_ustar_header() + elif format == GNU_FORMAT: + return self.create_gnu_header() + elif format == PAX_FORMAT: + return self.create_pax_header(encoding) + else: + raise ValueError("invalid format") + + def create_ustar_header(self): + """Return the object as a ustar header block. + """ + info = self.get_info() + info["magic"] = POSIX_MAGIC + + if len(info["linkname"]) > LENGTH_LINK: + raise ValueError("linkname is too long") + + if len(info["name"]) > LENGTH_NAME: + info["prefix"], info["name"] = self._posix_split_name(info["name"]) + + return self._create_header(info, USTAR_FORMAT) + + def create_gnu_header(self): + """Return the object as a GNU header block sequence. + """ + info = self.get_info() + info["magic"] = GNU_MAGIC + + buf = "" + if len(info["linkname"]) > LENGTH_LINK: + buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK) + + if len(info["name"]) > LENGTH_NAME: + buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME) + + return buf + self._create_header(info, GNU_FORMAT) + + def create_pax_header(self, encoding): + """Return the object as a ustar header block. If it cannot be + represented this way, prepend a pax extended header sequence + with supplement information. + """ + info = self.get_info() + info["magic"] = POSIX_MAGIC + pax_headers = self.pax_headers.copy() + + # Test string fields for values that exceed the field length or cannot + # be represented in ASCII encoding. + for name, hname, length in ( + ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), + ("uname", "uname", 32), ("gname", "gname", 32)): + + val = info[name].decode(encoding) + + # Try to encode the string as ASCII. + try: + val.encode("ascii") + except UnicodeEncodeError: + pax_headers[hname] = val + continue + + if len(val) > length: + if name == "name": + # Try to squeeze a longname in the prefix and name fields as in + # ustar format. + try: + info["prefix"], info["name"] = self._posix_split_name(info["name"]) + except ValueError: + pax_headers[hname] = val + else: + continue + else: + pax_headers[hname] = val + + # Test number fields for values that exceed the field limit or values + # that like to be stored as float. 
+ for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): + val = info[name] + if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): + pax_headers[name] = unicode(val) + info[name] = 0 + + if pax_headers: + buf = self._create_pax_generic_header(pax_headers) + else: + buf = "" + + return buf + self._create_header(info, USTAR_FORMAT) + + @classmethod + def create_pax_global_header(cls, pax_headers, encoding): + """Return the object as a pax global header block sequence. + """ + new_headers = {} + for key, val in pax_headers.items(): + key = cls._to_unicode(key, encoding) + val = cls._to_unicode(val, encoding) + new_headers[key] = val + return cls._create_pax_generic_header(new_headers, type=XGLTYPE) + + @staticmethod + def _to_unicode(value, encoding): + if isinstance(value, unicode): + return value + elif isinstance(value, (int, float)): + return unicode(value) + elif isinstance(value, str): + return unicode(value, encoding) + else: + raise ValueError("unable to convert to unicode: %r" % value) + + def _posix_split_name(self, name): + """Split a name longer than 100 chars into a prefix + and a name part. + """ + prefix = name[:LENGTH_PREFIX + 1] + while prefix and prefix[-1] != "/": + prefix = prefix[:-1] + + name = name[len(prefix):] + prefix = prefix[:-1] + + if not prefix or len(name) > LENGTH_NAME: + raise ValueError("name is too long") + return prefix, name + + @staticmethod + def _create_header(info, format): + """Return a header block. info is a dictionary with file + information, format must be one of the *_FORMAT constants. + """ + parts = [ + stn(info.get("name", ""), 100), + itn(info.get("mode", 0) & 07777, 8, format), + itn(info.get("uid", 0), 8, format), + itn(info.get("gid", 0), 8, format), + itn(info.get("size", 0), 12, format), + itn(info.get("mtime", 0), 12, format), + " ", # checksum field + info.get("type", REGTYPE), + stn(info.get("linkname", ""), 100), + stn(info.get("magic", ""), 8), + stn(info.get("uname", ""), 32), + stn(info.get("gname", ""), 32), + itn(info.get("devmajor", 0), 8, format), + itn(info.get("devminor", 0), 8, format), + stn(info.get("prefix", ""), 155) + ] + + buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts)) + chksum = calc_chksums(buf[-BLOCKSIZE:])[0] + buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] + return buf + + @staticmethod + def _create_payload(payload): + """Return the string payload filled with zero bytes + up to the next 512 byte border. + """ + blocks, remainder = divmod(len(payload), BLOCKSIZE) + if remainder > 0: + payload += (BLOCKSIZE - remainder) * NUL + return payload + + @classmethod + def _create_gnu_long_header(cls, name, type): + """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence + for name. + """ + name += NUL + + info = {} + info["name"] = "././@LongLink" + info["type"] = type + info["size"] = len(name) + info["magic"] = GNU_MAGIC + + # create extended header + name blocks. + return cls._create_header(info, USTAR_FORMAT) + \ + cls._create_payload(name) + + @classmethod + def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE): + """Return a POSIX.1-2001 extended or global header sequence + that contains a list of keyword, value pairs. The values + must be unicode objects. 
+ """ + records = [] + for keyword, value in pax_headers.items(): + keyword = keyword.encode("utf8") + value = value.encode("utf8") + l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' + n = p = 0 + while True: + n = l + len(str(p)) + if n == p: + break + p = n + records.append("%d %s=%s\n" % (p, keyword, value)) + records = "".join(records) + + # We use a hardcoded "././@PaxHeader" name like star does + # instead of the one that POSIX recommends. + info = {} + info["name"] = "././@PaxHeader" + info["type"] = type + info["size"] = len(records) + info["magic"] = POSIX_MAGIC + + # Create pax header + record blocks. + return cls._create_header(info, USTAR_FORMAT) + \ + cls._create_payload(records) + @classmethod def frombuf(cls, buf): """Construct a TarInfo object from a 512 byte string buffer. @@ -882,125 +1178,251 @@ class TarInfo(object): if chksum not in calc_chksums(buf): raise HeaderError("bad checksum") - tarinfo = cls() - tarinfo.buf = buf - tarinfo.name = buf[0:100].rstrip(NUL) - tarinfo.mode = nti(buf[100:108]) - tarinfo.uid = nti(buf[108:116]) - tarinfo.gid = nti(buf[116:124]) - tarinfo.size = nti(buf[124:136]) - tarinfo.mtime = nti(buf[136:148]) - tarinfo.chksum = chksum - tarinfo.type = buf[156:157] - tarinfo.linkname = buf[157:257].rstrip(NUL) - tarinfo.uname = buf[265:297].rstrip(NUL) - tarinfo.gname = buf[297:329].rstrip(NUL) - tarinfo.devmajor = nti(buf[329:337]) - tarinfo.devminor = nti(buf[337:345]) - prefix = buf[345:500].rstrip(NUL) - - if prefix and not tarinfo.issparse(): - tarinfo.name = prefix + "/" + tarinfo.name + obj = cls() + obj.buf = buf + obj.name = nts(buf[0:100]) + obj.mode = nti(buf[100:108]) + obj.uid = nti(buf[108:116]) + obj.gid = nti(buf[116:124]) + obj.size = nti(buf[124:136]) + obj.mtime = nti(buf[136:148]) + obj.chksum = chksum + obj.type = buf[156:157] + obj.linkname = nts(buf[157:257]) + obj.uname = nts(buf[265:297]) + obj.gname = nts(buf[297:329]) + obj.devmajor = nti(buf[329:337]) + obj.devminor = nti(buf[337:345]) + prefix = nts(buf[345:500]) + + # Old V7 tar format represents a directory as a regular + # file with a trailing slash. + if obj.type == AREGTYPE and obj.name.endswith("/"): + obj.type = DIRTYPE - return tarinfo + # Remove redundant slashes from directories. + if obj.isdir(): + obj.name = obj.name.rstrip("/") - def tobuf(self, posix=False): - """Return a tar header as a string of 512 byte blocks. - """ - buf = "" - type = self.type - prefix = "" + # Reconstruct a ustar longname. + if prefix and obj.type not in GNU_TYPES: + obj.name = prefix + "/" + obj.name + return obj - if self.name.endswith("/"): - type = DIRTYPE + @classmethod + def fromtarfile(cls, tarfile): + """Return the next TarInfo object from TarFile object + tarfile. + """ + buf = tarfile.fileobj.read(BLOCKSIZE) + if not buf: + return + obj = cls.frombuf(buf) + obj.offset = tarfile.fileobj.tell() - BLOCKSIZE + return obj._proc_member(tarfile) - if type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): - # Prevent "././@LongLink" from being normalized. - name = self.name + #-------------------------------------------------------------------------- + # The following are methods that are called depending on the type of a + # member. The entry point is _proc_member() which can be overridden in a + # subclass to add custom _proc_*() methods. A _proc_*() method MUST + # implement the following + # operations: + # 1. Set self.offset_data to the position where the data blocks begin, + # if there is data that follows. + # 2. 
Set tarfile.offset to the position where the next member's header will + # begin. + # 3. Return self or another valid TarInfo object. + def _proc_member(self, tarfile): + """Choose the right processing method depending on + the type and call it. + """ + if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): + return self._proc_gnulong(tarfile) + elif self.type == GNUTYPE_SPARSE: + return self._proc_sparse(tarfile) + elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): + return self._proc_pax(tarfile) else: - name = normpath(self.name) + return self._proc_builtin(tarfile) - if type == DIRTYPE: - # directories should end with '/' - name += "/" + def _proc_builtin(self, tarfile): + """Process a builtin type or an unknown type which + will be treated as a regular file. + """ + self.offset_data = tarfile.fileobj.tell() + offset = self.offset_data + if self.isreg() or self.type not in SUPPORTED_TYPES: + # Skip the following data blocks. + offset += self._block(self.size) + tarfile.offset = offset - linkname = self.linkname - if linkname: - # if linkname is empty we end up with a '.' - linkname = normpath(linkname) + # Patch the TarInfo object with saved extended + # header information. + for keyword, value in tarfile.pax_headers.items(): + if keyword in PAX_FIELDS: + setattr(self, keyword, value) + self.pax_headers[keyword] = value - if posix: - if self.size > MAXSIZE_MEMBER: - raise ValueError("file is too large (>= 8 GB)") + return self - if len(self.linkname) > LENGTH_LINK: - raise ValueError("linkname is too long (>%d)" % (LENGTH_LINK)) + def _proc_gnulong(self, tarfile): + """Process the blocks that hold a GNU longname + or longlink member. + """ + buf = tarfile.fileobj.read(self._block(self.size)) - if len(name) > LENGTH_NAME: - prefix = name[:LENGTH_PREFIX + 1] - while prefix and prefix[-1] != "/": - prefix = prefix[:-1] + # Fetch the next header and process it. + b = tarfile.fileobj.read(BLOCKSIZE) + t = self.frombuf(b) + t.offset = self.offset + next = t._proc_member(tarfile) - name = name[len(prefix):] - prefix = prefix[:-1] + # Patch the TarInfo object from the next header with + # the longname information. + next.offset = self.offset + if self.type == GNUTYPE_LONGNAME: + next.name = buf.rstrip(NUL) + elif self.type == GNUTYPE_LONGLINK: + next.linkname = buf.rstrip(NUL) - if not prefix or len(name) > LENGTH_NAME: - raise ValueError("name is too long") + return next - else: - if len(self.linkname) > LENGTH_LINK: - buf += self._create_gnulong(self.linkname, GNUTYPE_LONGLINK) + def _proc_sparse(self, tarfile): + """Process a GNU sparse header plus extra headers. + """ + buf = self.buf + sp = _ringbuffer() + pos = 386 + lastpos = 0 + realpos = 0 + # There are 4 possible sparse structs in the + # first header. 
+ for i in xrange(4): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + if offset > lastpos: + sp.append(_hole(lastpos, offset - lastpos)) + sp.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes + pos += 24 - if len(name) > LENGTH_NAME: - buf += self._create_gnulong(name, GNUTYPE_LONGNAME) + isextended = ord(buf[482]) + origsize = nti(buf[483:495]) - parts = [ - stn(name, 100), - itn(self.mode & 07777, 8, posix), - itn(self.uid, 8, posix), - itn(self.gid, 8, posix), - itn(self.size, 12, posix), - itn(self.mtime, 12, posix), - " ", # checksum field - type, - stn(self.linkname, 100), - stn(MAGIC, 6), - stn(VERSION, 2), - stn(self.uname, 32), - stn(self.gname, 32), - itn(self.devmajor, 8, posix), - itn(self.devminor, 8, posix), - stn(prefix, 155) - ] + # If the isextended flag is given, + # there are extra headers to process. + while isextended == 1: + buf = tarfile.fileobj.read(BLOCKSIZE) + pos = 0 + for i in xrange(21): + try: + offset = nti(buf[pos:pos + 12]) + numbytes = nti(buf[pos + 12:pos + 24]) + except ValueError: + break + if offset > lastpos: + sp.append(_hole(lastpos, offset - lastpos)) + sp.append(_data(offset, numbytes, realpos)) + realpos += numbytes + lastpos = offset + numbytes + pos += 24 + isextended = ord(buf[504]) - buf += struct.pack("%ds" % BLOCKSIZE, "".join(parts)) - chksum = calc_chksums(buf[-BLOCKSIZE:])[0] - buf = buf[:-364] + "%06o\0" % chksum + buf[-357:] - self.buf = buf - return buf + if lastpos < origsize: + sp.append(_hole(lastpos, origsize - lastpos)) + + self.sparse = sp - def _create_gnulong(self, name, type): - """Create a GNU longname/longlink header from name. - It consists of an extended tar header, with the length - of the longname as size, followed by data blocks, - which contain the longname as a null terminated string. + self.offset_data = tarfile.fileobj.tell() + tarfile.offset = self.offset_data + self._block(self.size) + self.size = origsize + + return self + + def _proc_pax(self, tarfile): + """Process an extended or global header as described in + POSIX.1-2001. """ - name += NUL + # Read the header information. + buf = tarfile.fileobj.read(self._block(self.size)) - tarinfo = self.__class__() - tarinfo.name = "././@LongLink" - tarinfo.type = type - tarinfo.mode = 0 - tarinfo.size = len(name) - - # create extended header - buf = tarinfo.tobuf() - # create name blocks - buf += name - blocks, remainder = divmod(len(name), BLOCKSIZE) - if remainder > 0: - buf += (BLOCKSIZE - remainder) * NUL - return buf + # A pax header stores supplemental information for either + # the following file (extended) or all following files + # (global). + if self.type == XGLTYPE: + pax_headers = tarfile.pax_headers + else: + pax_headers = tarfile.pax_headers.copy() + + # Fields in POSIX.1-2001 that are numbers, all other fields + # are treated as UTF-8 strings. + type_mapping = { + "atime": float, + "ctime": float, + "mtime": float, + "uid": int, + "gid": int, + "size": int + } + + # Parse pax header information. A record looks like that: + # "%d %s=%s\n" % (length, keyword, value). length is the size + # of the complete record including the length field itself and + # the newline. 
+ regex = re.compile(r"(\d+) ([^=]+)=", re.U) + pos = 0 + while True: + match = regex.match(buf, pos) + if not match: + break + + length, keyword = match.groups() + length = int(length) + value = buf[match.end(2) + 1:match.start(1) + length - 1] + + keyword = keyword.decode("utf8") + keyword = keyword.encode(tarfile.encoding) + + value = value.decode("utf8") + if keyword in type_mapping: + try: + value = type_mapping[keyword](value) + except ValueError: + value = 0 + else: + value = value.encode(tarfile.encoding) + + pax_headers[keyword] = value + pos += length + + # Fetch the next header that will be patched with the + # supplement information from the pax header (extended + # only). + t = self.fromtarfile(tarfile) + + if self.type != XGLTYPE and t is not None: + # Patch the TarInfo object from the next header with + # the pax header's information. + for keyword, value in pax_headers.items(): + if keyword in PAX_FIELDS: + setattr(t, keyword, value) + pax_headers[keyword] = value + t.pax_headers = pax_headers.copy() + + return t + + def _block(self, count): + """Round up a byte count by BLOCKSIZE and return it, + e.g. _block(834) => 1024. + """ + blocks, remainder = divmod(count, BLOCKSIZE) + if remainder: + blocks += 1 + return blocks * BLOCKSIZE def isreg(self): return self.type in REGULAR_TYPES @@ -1040,12 +1462,18 @@ class TarFile(object): # messages (if debug >= 0). If > 0, errors # are passed to the caller as exceptions. - posix = False # If True, generates POSIX.1-1990-compliant - # archives (no GNU extensions!) + format = DEFAULT_FORMAT # The format to use when creating an archive. + + encoding = ENCODING # Transfer UTF-8 strings from POSIX.1-2001 + # headers to this encoding. + + tarinfo = TarInfo # The default TarInfo class to use. - fileobject = ExFileObject + fileobject = ExFileObject # The default ExFileObject class to use. - def __init__(self, name=None, mode="r", fileobj=None): + def __init__(self, name=None, mode="r", fileobj=None, format=None, + tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, + pax_headers=None, debug=None, errorlevel=None): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1054,58 +1482,86 @@ class TarFile(object): can be determined, `mode' is overridden by `fileobj's mode. `fileobj' is not closed, when TarFile is closed. """ - self.name = os.path.abspath(name) - if len(mode) > 1 or mode not in "raw": raise ValueError("mode must be 'r', 'a' or 'w'") - self._mode = mode - self.mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] + self.mode = mode + self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode] if not fileobj: - if self._mode == "a" and not os.path.exists(self.name): + if self.mode == "a" and not os.path.exists(name): # Create nonexistent files in append mode. - self._mode = "w" - self.mode = "wb" - fileobj = _open(self.name, self.mode) + self.mode = "w" + self._mode = "wb" + fileobj = _open(name, self._mode) self._extfileobj = False else: - if self.name is None and hasattr(fileobj, "name"): - self.name = os.path.abspath(fileobj.name) + if name is None and hasattr(fileobj, "name"): + name = fileobj.name if hasattr(fileobj, "mode"): - self.mode = fileobj.mode + self._mode = fileobj.mode self._extfileobj = True + self.name = os.path.abspath(name) self.fileobj = fileobj - # Init datastructures + # Init attributes. 
+ if format is not None: + self.format = format + if tarinfo is not None: + self.tarinfo = tarinfo + if dereference is not None: + self.dereference = dereference + if ignore_zeros is not None: + self.ignore_zeros = ignore_zeros + if encoding is not None: + self.encoding = encoding + if debug is not None: + self.debug = debug + if errorlevel is not None: + self.errorlevel = errorlevel + + # Init datastructures. self.closed = False self.members = [] # list of members as TarInfo objects self._loaded = False # flag if all members have been read self.offset = 0 # current position in the archive file self.inodes = {} # dictionary caching the inodes of # archive members already added + self.pax_headers = {} # save contents of global pax headers - if self._mode == "r": + if self.mode == "r": self.firstmember = None self.firstmember = self.next() - if self._mode == "a": + if self.mode == "a": # Move to the end of the archive, # before the first empty block. self.firstmember = None while True: - try: - tarinfo = self.next() - except ReadError: - self.fileobj.seek(0) - break - if tarinfo is None: + if self.next() is None: if self.offset > 0: self.fileobj.seek(- BLOCKSIZE, 1) break - if self._mode in "aw": + if self.mode in "aw": self._loaded = True + if pax_headers: + buf = self.tarinfo.create_pax_global_header( + pax_headers.copy(), self.encoding) + self.fileobj.write(buf) + self.offset += len(buf) + + def _getposix(self): + return self.format == USTAR_FORMAT + def _setposix(self, value): + import warnings + warnings.warn("use the format attribute instead", DeprecationWarning) + if value: + self.format = USTAR_FORMAT + else: + self.format = GNU_FORMAT + posix = property(_getposix, _setposix) + #-------------------------------------------------------------------------- # Below are the classmethods which act as alternate constructors to the # TarFile class. The open() method is the only one that is needed for @@ -1118,7 +1574,7 @@ class TarFile(object): # by adding it to the mapping in OPEN_METH. @classmethod - def open(cls, name=None, mode="r", fileobj=None, bufsize=20*512): + def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): """Open a tar archive for reading, writing or appending. Return an appropriate TarFile class. @@ -1151,8 +1607,8 @@ class TarFile(object): if fileobj is not None: saved_pos = fileobj.tell() try: - return func(name, "r", fileobj) - except (ReadError, CompressionError): + return func(name, "r", fileobj, **kwargs) + except (ReadError, CompressionError) as e: if fileobj is not None: fileobj.seek(saved_pos) continue @@ -1169,7 +1625,7 @@ class TarFile(object): func = getattr(cls, cls.OPEN_METH[comptype]) else: raise CompressionError("unknown compression type %r" % comptype) - return func(name, filemode, fileobj) + return func(name, filemode, fileobj, **kwargs) elif "|" in mode: filemode, comptype = mode.split("|", 1) @@ -1180,25 +1636,26 @@ class TarFile(object): raise ValueError("mode must be 'r' or 'w'") t = cls(name, filemode, - _Stream(name, filemode, comptype, fileobj, bufsize)) + _Stream(name, filemode, comptype, fileobj, bufsize), + **kwargs) t._extfileobj = False return t elif mode in "aw": - return cls.taropen(name, mode, fileobj) + return cls.taropen(name, mode, fileobj, **kwargs) raise ValueError("undiscernible mode") @classmethod - def taropen(cls, name, mode="r", fileobj=None): + def taropen(cls, name, mode="r", fileobj=None, **kwargs): """Open uncompressed tar archive name for reading or writing. 
""" if len(mode) > 1 or mode not in "raw": raise ValueError("mode must be 'r', 'a' or 'w'") - return cls(name, mode, fileobj) + return cls(name, mode, fileobj, **kwargs) @classmethod - def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9): + def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): """Open gzip compressed tar archive name for reading or writing. Appending is not allowed. """ @@ -1216,14 +1673,15 @@ class TarFile(object): try: t = cls.taropen(name, mode, - gzip.GzipFile(name, mode, compresslevel, fileobj)) + gzip.GzipFile(name, mode, compresslevel, fileobj), + **kwargs) except IOError: raise ReadError("not a gzip file") t._extfileobj = False return t @classmethod - def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9): + def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): """Open bzip2 compressed tar archive name for reading or writing. Appending is not allowed. """ @@ -1241,7 +1699,7 @@ class TarFile(object): fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel) try: - t = cls.taropen(name, mode, fileobj) + t = cls.taropen(name, mode, fileobj, **kwargs) except IOError: raise ReadError("not a bzip2 file") t._extfileobj = False @@ -1264,7 +1722,7 @@ class TarFile(object): if self.closed: return - if self._mode in "aw": + if self.mode in "aw": self.fileobj.write(NUL * (BLOCKSIZE * 2)) self.offset += (BLOCKSIZE * 2) # fill up the end with zero-blocks @@ -1330,7 +1788,8 @@ class TarFile(object): # Now, fill the TarInfo object with # information specific for the file. - tarinfo = TarInfo() + tarinfo = self.tarinfo() + tarinfo.tarfile = self # Use os.stat or os.lstat, depending on platform # and if symlinks shall be resolved. @@ -1346,8 +1805,8 @@ class TarFile(object): stmd = statres.st_mode if stat.S_ISREG(stmd): inode = (statres.st_ino, statres.st_dev) - if not self.dereference and \ - statres.st_nlink > 1 and inode in self.inodes: + if not self.dereference and statres.st_nlink > 1 and \ + inode in self.inodes and arcname != self.inodes[inode]: # Is it a hardlink to an already # archived file? type = LNKTYPE @@ -1424,7 +1883,7 @@ class TarFile(object): print("%d-%02d-%02d %02d:%02d:%02d" \ % time.localtime(tarinfo.mtime)[:6], end=' ') - print(tarinfo.name, end=' ') + print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ') if verbose: if tarinfo.issym(): @@ -1456,7 +1915,7 @@ class TarFile(object): if recursive: if arcname == ".": arcname = "" - for f in os.listdir("."): + for f in os.listdir(name): self.add(f, os.path.join(arcname, f)) return @@ -1495,7 +1954,7 @@ class TarFile(object): tarinfo = copy.copy(tarinfo) - buf = tarinfo.tobuf(self.posix) + buf = tarinfo.tobuf(self.format, self.encoding) self.fileobj.write(buf) self.offset += len(buf) @@ -1527,7 +1986,7 @@ class TarFile(object): # Extract directory with a safe mode, so that # all files below can be extracted as well. try: - os.makedirs(os.path.join(path, tarinfo.name), 0777) + os.makedirs(os.path.join(path, tarinfo.name), 0700) except EnvironmentError: pass directories.append(tarinfo) @@ -1559,10 +2018,10 @@ class TarFile(object): """ self._check("r") - if isinstance(member, TarInfo): - tarinfo = member - else: + if isinstance(member, basestring): tarinfo = self.getmember(member) + else: + tarinfo = member # Prepare the link target for makelink(). 
if tarinfo.islnk(): @@ -1595,10 +2054,10 @@ class TarFile(object): """ self._check("r") - if isinstance(member, TarInfo): - tarinfo = member - else: + if isinstance(member, basestring): tarinfo = self.getmember(member) + else: + tarinfo = member if tarinfo.isreg(): return self.fileobject(self, tarinfo) @@ -1811,20 +2270,11 @@ class TarFile(object): # Read the next block. self.fileobj.seek(self.offset) while True: - buf = self.fileobj.read(BLOCKSIZE) - if not buf: - return None - try: - tarinfo = TarInfo.frombuf(buf) - - # Set the TarInfo object's offset to the current position of the - # TarFile and set self.offset to the position where the data blocks - # should begin. - tarinfo.offset = self.offset - self.offset += BLOCKSIZE - - tarinfo = self.proc_member(tarinfo) + tarinfo = self.tarinfo.fromtarfile(self) + if tarinfo is None: + return + self.members.append(tarinfo) except HeaderError as e: if self.ignore_zeros: @@ -1837,149 +2287,11 @@ class TarFile(object): return None break - # Some old tar programs represent a directory as a regular - # file with a trailing slash. - if tarinfo.isreg() and tarinfo.name.endswith("/"): - tarinfo.type = DIRTYPE - - # Directory names should have a '/' at the end. - if tarinfo.isdir(): - tarinfo.name += "/" - - self.members.append(tarinfo) - return tarinfo - - #-------------------------------------------------------------------------- - # The following are methods that are called depending on the type of a - # member. The entry point is proc_member() which is called with a TarInfo - # object created from the header block from the current offset. The - # proc_member() method can be overridden in a subclass to add custom - # proc_*() methods. A proc_*() method MUST implement the following - # operations: - # 1. Set tarinfo.offset_data to the position where the data blocks begin, - # if there is data that follows. - # 2. Set self.offset to the position where the next member's header will - # begin. - # 3. Return tarinfo or another valid TarInfo object. - def proc_member(self, tarinfo): - """Choose the right processing method for tarinfo depending - on its type and call it. - """ - if tarinfo.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): - return self.proc_gnulong(tarinfo) - elif tarinfo.type == GNUTYPE_SPARSE: - return self.proc_sparse(tarinfo) - else: - return self.proc_builtin(tarinfo) - - def proc_builtin(self, tarinfo): - """Process a builtin type member or an unknown member - which will be treated as a regular file. - """ - tarinfo.offset_data = self.offset - if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: - # Skip the following data blocks. - self.offset += self._block(tarinfo.size) - return tarinfo - - def proc_gnulong(self, tarinfo): - """Process the blocks that hold a GNU longname - or longlink member. - """ - buf = "" - count = tarinfo.size - while count > 0: - block = self.fileobj.read(BLOCKSIZE) - buf += block - self.offset += BLOCKSIZE - count -= BLOCKSIZE - - # Fetch the next header and process it. - b = self.fileobj.read(BLOCKSIZE) - t = TarInfo.frombuf(b) - t.offset = self.offset - self.offset += BLOCKSIZE - next = self.proc_member(t) - - # Patch the TarInfo object from the next header with - # the longname information. - next.offset = tarinfo.offset - if tarinfo.type == GNUTYPE_LONGNAME: - next.name = buf.rstrip(NUL) - elif tarinfo.type == GNUTYPE_LONGLINK: - next.linkname = buf.rstrip(NUL) - - return next - - def proc_sparse(self, tarinfo): - """Process a GNU sparse header plus extra headers. 
- """ - buf = tarinfo.buf - sp = _ringbuffer() - pos = 386 - lastpos = 0 - realpos = 0 - # There are 4 possible sparse structs in the - # first header. - for i in xrange(4): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 - - isextended = ord(buf[482]) - origsize = nti(buf[483:495]) - - # If the isextended flag is given, - # there are extra headers to process. - while isextended == 1: - buf = self.fileobj.read(BLOCKSIZE) - self.offset += BLOCKSIZE - pos = 0 - for i in xrange(21): - try: - offset = nti(buf[pos:pos + 12]) - numbytes = nti(buf[pos + 12:pos + 24]) - except ValueError: - break - if offset > lastpos: - sp.append(_hole(lastpos, offset - lastpos)) - sp.append(_data(offset, numbytes, realpos)) - realpos += numbytes - lastpos = offset + numbytes - pos += 24 - isextended = ord(buf[504]) - - if lastpos < origsize: - sp.append(_hole(lastpos, origsize - lastpos)) - - tarinfo.sparse = sp - - tarinfo.offset_data = self.offset - self.offset += self._block(tarinfo.size) - tarinfo.size = origsize - return tarinfo #-------------------------------------------------------------------------- # Little helper methods: - def _block(self, count): - """Round up a byte count by BLOCKSIZE and return it, - e.g. _block(834) => 1024. - """ - blocks, remainder = divmod(count, BLOCKSIZE) - if remainder: - blocks += 1 - return blocks * BLOCKSIZE - def _getmember(self, name, tarinfo=None): """Find an archive member by name from bottom to top. If tarinfo is given, it is used as the starting point. @@ -2012,8 +2324,8 @@ class TarFile(object): """ if self.closed: raise IOError("%s is closed" % self.__class__.__name__) - if mode is not None and self._mode not in mode: - raise IOError("bad operation for mode %r" % self._mode) + if mode is not None and self.mode not in mode: + raise IOError("bad operation for mode %r" % self.mode) def __iter__(self): """Provide an iterator object. |