summaryrefslogtreecommitdiffstats
path: root/Lib/tarfile.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/tarfile.py')
-rw-r--r--Lib/tarfile.py399
1 files changed, 119 insertions, 280 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index f26953e..7b9f407 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -29,8 +29,6 @@
"""Read from and write to tar format archives.
"""
-__version__ = "$Revision$"
-
version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
@@ -42,9 +40,9 @@ __credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
#---------
import sys
import os
+import io
import shutil
import stat
-import errno
import time
import struct
import copy
@@ -247,8 +245,8 @@ def calc_chksums(buf):
the high bit set. So we calculate two checksums, unsigned and
signed.
"""
- unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
- signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
+ unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
+ signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None):
@@ -276,47 +274,13 @@ def copyfileobj(src, dst, length=None):
dst.write(buf)
return
-filemode_table = (
- ((S_IFLNK, "l"),
- (S_IFREG, "-"),
- (S_IFBLK, "b"),
- (S_IFDIR, "d"),
- (S_IFCHR, "c"),
- (S_IFIFO, "p")),
-
- ((TUREAD, "r"),),
- ((TUWRITE, "w"),),
- ((TUEXEC|TSUID, "s"),
- (TSUID, "S"),
- (TUEXEC, "x")),
-
- ((TGREAD, "r"),),
- ((TGWRITE, "w"),),
- ((TGEXEC|TSGID, "s"),
- (TSGID, "S"),
- (TGEXEC, "x")),
-
- ((TOREAD, "r"),),
- ((TOWRITE, "w"),),
- ((TOEXEC|TSVTX, "t"),
- (TSVTX, "T"),
- (TOEXEC, "x"))
-)
-
def filemode(mode):
- """Convert a file's mode to a string of the form
- -rwxrwxrwx.
- Used by TarFile.list()
- """
- perm = []
- for table in filemode_table:
- for bit, char in table:
- if mode & bit == bit:
- perm.append(char)
- break
- else:
- perm.append("-")
- return "".join(perm)
+ """Deprecated in this location; use stat.filemode."""
+ import warnings
+ warnings.warn("deprecated in favor of stat.filemode",
+ DeprecationWarning, 2)
+ return stat.filemode(mode)
+
class TarError(Exception):
"""Base exception."""
@@ -423,10 +387,11 @@ class _Stream:
self.crc = zlib.crc32(b"")
if mode == "r":
self._init_read_gz()
+ self.exception = zlib.error
else:
self._init_write_gz()
- if comptype == "bz2":
+ elif comptype == "bz2":
try:
import bz2
except ImportError:
@@ -434,8 +399,25 @@ class _Stream:
if mode == "r":
self.dbuf = b""
self.cmp = bz2.BZ2Decompressor()
+ self.exception = IOError
else:
self.cmp = bz2.BZ2Compressor()
+
+ elif comptype == "xz":
+ try:
+ import lzma
+ except ImportError:
+ raise CompressionError("lzma module is not available")
+ if mode == "r":
+ self.dbuf = b""
+ self.cmp = lzma.LZMADecompressor()
+ self.exception = lzma.LZMAError
+ else:
+ self.cmp = lzma.LZMACompressor()
+
+ elif comptype != "tar":
+ raise CompressionError("unknown compression type %r" % comptype)
+
except:
if not self._extfileobj:
self.fileobj.close()
@@ -587,7 +569,7 @@ class _Stream:
break
try:
buf = self.cmp.decompress(buf)
- except IOError:
+ except self.exception:
raise ReadError("invalid compressed data")
self.dbuf += buf
c += len(buf)
@@ -625,76 +607,19 @@ class _StreamProxy(object):
return self.buf
def getcomptype(self):
- if self.buf.startswith(b"\037\213\010"):
+ if self.buf.startswith(b"\x1f\x8b\x08"):
return "gz"
- if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
+ elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
return "bz2"
- return "tar"
+ elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
+ return "xz"
+ else:
+ return "tar"
def close(self):
self.fileobj.close()
# class StreamProxy
-class _BZ2Proxy(object):
- """Small proxy class that enables external file object
- support for "r:bz2" and "w:bz2" modes. This is actually
- a workaround for a limitation in bz2 module's BZ2File
- class which (unlike gzip.GzipFile) has no support for
- a file object argument.
- """
-
- blocksize = 16 * 1024
-
- def __init__(self, fileobj, mode):
- self.fileobj = fileobj
- self.mode = mode
- self.name = getattr(self.fileobj, "name", None)
- self.init()
-
- def init(self):
- import bz2
- self.pos = 0
- if self.mode == "r":
- self.bz2obj = bz2.BZ2Decompressor()
- self.fileobj.seek(0)
- self.buf = b""
- else:
- self.bz2obj = bz2.BZ2Compressor()
-
- def read(self, size):
- x = len(self.buf)
- while x < size:
- raw = self.fileobj.read(self.blocksize)
- if not raw:
- break
- data = self.bz2obj.decompress(raw)
- self.buf += data
- x += len(data)
-
- buf = self.buf[:size]
- self.buf = self.buf[size:]
- self.pos += len(buf)
- return buf
-
- def seek(self, pos):
- if pos < self.pos:
- self.init()
- self.read(pos - self.pos)
-
- def tell(self):
- return self.pos
-
- def write(self, data):
- self.pos += len(data)
- raw = self.bz2obj.compress(data)
- self.fileobj.write(raw)
-
- def close(self):
- if self.mode == "w":
- raw = self.bz2obj.flush()
- self.fileobj.write(raw)
-# class _BZ2Proxy
-
#------------------------
# Extraction file object
#------------------------
@@ -709,6 +634,8 @@ class _FileInFile(object):
self.offset = offset
self.size = size
self.position = 0
+ self.name = getattr(fileobj, "name", None)
+ self.closed = False
if blockinfo is None:
blockinfo = [(0, size)]
@@ -727,10 +654,16 @@ class _FileInFile(object):
if lastpos < self.size:
self.map.append((False, lastpos, self.size, None))
+ def flush(self):
+ pass
+
+ def readable(self):
+ return True
+
+ def writable(self):
+ return False
+
def seekable(self):
- if not hasattr(self.fileobj, "seekable"):
- # XXX gzip.GzipFile and bz2.BZ2File
- return True
return self.fileobj.seekable()
def tell(self):
@@ -738,10 +671,21 @@ class _FileInFile(object):
"""
return self.position
- def seek(self, position):
+ def seek(self, position, whence=io.SEEK_SET):
"""Seek to a position in the file.
"""
- self.position = position
+ if whence == io.SEEK_SET:
+ self.position = min(max(position, 0), self.size)
+ elif whence == io.SEEK_CUR:
+ if position < 0:
+ self.position = max(self.position + position, 0)
+ else:
+ self.position = min(self.position + position, self.size)
+ elif whence == io.SEEK_END:
+ self.position = max(min(self.size + position, self.size), 0)
+ else:
+ raise ValueError("Invalid argument")
+ return self.position
def read(self, size=None):
"""Read data from the file.
@@ -770,145 +714,22 @@ class _FileInFile(object):
size -= length
self.position += length
return buf
-#class _FileInFile
-
-
-class ExFileObject(object):
- """File-like object for reading an archive member.
- Is returned by TarFile.extractfile().
- """
- blocksize = 1024
-
- def __init__(self, tarfile, tarinfo):
- self.fileobj = _FileInFile(tarfile.fileobj,
- tarinfo.offset_data,
- tarinfo.size,
- tarinfo.sparse)
- self.name = tarinfo.name
- self.mode = "r"
- self.closed = False
- self.size = tarinfo.size
-
- self.position = 0
- self.buffer = b""
-
- def readable(self):
- return True
-
- def writable(self):
- return False
-
- def seekable(self):
- return self.fileobj.seekable()
-
- def read(self, size=None):
- """Read at most size bytes from the file. If size is not
- present or None, read all data until EOF is reached.
- """
- if self.closed:
- raise ValueError("I/O operation on closed file")
-
- buf = b""
- if self.buffer:
- if size is None:
- buf = self.buffer
- self.buffer = b""
- else:
- buf = self.buffer[:size]
- self.buffer = self.buffer[size:]
-
- if size is None:
- buf += self.fileobj.read()
- else:
- buf += self.fileobj.read(size - len(buf))
-
- self.position += len(buf)
- return buf
-
- # XXX TextIOWrapper uses the read1() method.
- read1 = read
-
- def readline(self, size=-1):
- """Read one entire line from the file. If size is present
- and non-negative, return a string with at most that
- size, which may be an incomplete line.
- """
- if self.closed:
- raise ValueError("I/O operation on closed file")
-
- pos = self.buffer.find(b"\n") + 1
- if pos == 0:
- # no newline found.
- while True:
- buf = self.fileobj.read(self.blocksize)
- self.buffer += buf
- if not buf or b"\n" in buf:
- pos = self.buffer.find(b"\n") + 1
- if pos == 0:
- # no newline found.
- pos = len(self.buffer)
- break
-
- if size != -1:
- pos = min(size, pos)
-
- buf = self.buffer[:pos]
- self.buffer = self.buffer[pos:]
- self.position += len(buf)
- return buf
-
- def readlines(self):
- """Return a list with all remaining lines.
- """
- result = []
- while True:
- line = self.readline()
- if not line: break
- result.append(line)
- return result
-
- def tell(self):
- """Return the current file position.
- """
- if self.closed:
- raise ValueError("I/O operation on closed file")
-
- return self.position
-
- def seek(self, pos, whence=os.SEEK_SET):
- """Seek to a position in the file.
- """
- if self.closed:
- raise ValueError("I/O operation on closed file")
- if whence == os.SEEK_SET:
- self.position = min(max(pos, 0), self.size)
- elif whence == os.SEEK_CUR:
- if pos < 0:
- self.position = max(self.position + pos, 0)
- else:
- self.position = min(self.position + pos, self.size)
- elif whence == os.SEEK_END:
- self.position = max(min(self.size + pos, self.size), 0)
- else:
- raise ValueError("Invalid argument")
-
- self.buffer = b""
- self.fileobj.seek(self.position)
+ def readinto(self, b):
+ buf = self.read(len(b))
+ b[:len(buf)] = buf
+ return len(buf)
def close(self):
- """Close the file object.
- """
self.closed = True
+#class _FileInFile
- def __iter__(self):
- """Get an iterator over the file's lines.
- """
- while True:
- line = self.readline()
- if not line:
- break
- yield line
+class ExFileObject(io.BufferedReader):
+
+ def __init__(self, tarfile, tarinfo):
+ fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
+ tarinfo.size, tarinfo.sparse)
+ super().__init__(fileobj)
#class ExFileObject
#------------------
@@ -1087,7 +908,7 @@ class TarInfo(object):
def create_pax_global_header(cls, pax_headers):
"""Return the object as a pax global header block sequence.
"""
- return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf8")
+ return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
def _posix_split_name(self, name):
"""Split a name longer than 100 chars into a prefix
@@ -1170,7 +991,7 @@ class TarInfo(object):
binary = False
for keyword, value in pax_headers.items():
try:
- value.encode("utf8", "strict")
+ value.encode("utf-8", "strict")
except UnicodeEncodeError:
binary = True
break
@@ -1181,13 +1002,13 @@ class TarInfo(object):
records += b"21 hdrcharset=BINARY\n"
for keyword, value in pax_headers.items():
- keyword = keyword.encode("utf8")
+ keyword = keyword.encode("utf-8")
if binary:
# Try to restore the original byte representation of `value'.
# Needless to say, that the encoding must match the string.
value = value.encode(encoding, "surrogateescape")
else:
- value = value.encode("utf8")
+ value = value.encode("utf-8")
l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
n = p = 0
@@ -1396,7 +1217,7 @@ class TarInfo(object):
# the translation to UTF-8 fails.
match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
if match is not None:
- pax_headers["hdrcharset"] = match.group(1).decode("utf8")
+ pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
# For the time being, we don't care about anything other than "BINARY".
# The only other value that is currently allowed by the standard is
@@ -1405,7 +1226,7 @@ class TarInfo(object):
if hdrcharset == "BINARY":
encoding = tarfile.encoding
else:
- encoding = "utf8"
+ encoding = "utf-8"
# Parse pax header information. A record looks like that:
# "%d %s=%s\n" % (length, keyword, value). length is the size
@@ -1422,20 +1243,20 @@ class TarInfo(object):
length = int(length)
value = buf[match.end(2) + 1:match.start(1) + length - 1]
- # Normally, we could just use "utf8" as the encoding and "strict"
+ # Normally, we could just use "utf-8" as the encoding and "strict"
# as the error handler, but we better not take the risk. For
# example, GNU tar <= 1.23 is known to store filenames it cannot
# translate to UTF-8 as raw strings (unfortunately without a
# hdrcharset=BINARY header).
# We first try the strict standard encoding, and if that fails we
# fall back on the user's encoding and error handler.
- keyword = self._decode_pax_field(keyword, "utf8", "utf8",
+ keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
tarfile.errors)
if keyword in PAX_NAME_FIELDS:
value = self._decode_pax_field(value, encoding, tarfile.encoding,
tarfile.errors)
else:
- value = self._decode_pax_field(value, "utf8", "utf8",
+ value = self._decode_pax_field(value, "utf-8", "utf-8",
tarfile.errors)
pax_headers[keyword] = value
@@ -1595,7 +1416,7 @@ class TarFile(object):
tarinfo = TarInfo # The default TarInfo class to use.
- fileobject = ExFileObject # The default ExFileObject class to use.
+ fileobject = ExFileObject # The file-object for extractfile().
def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
@@ -1714,18 +1535,22 @@ class TarFile(object):
'r:' open for reading exclusively uncompressed
'r:gz' open for reading with gzip compression
'r:bz2' open for reading with bzip2 compression
+ 'r:xz' open for reading with lzma compression
'a' or 'a:' open for appending, creating the file if necessary
'w' or 'w:' open for writing without compression
'w:gz' open for writing with gzip compression
'w:bz2' open for writing with bzip2 compression
+ 'w:xz' open for writing with lzma compression
'r|*' open a stream of tar blocks with transparent compression
'r|' open an uncompressed stream of tar blocks for reading
'r|gz' open a gzip compressed stream of tar blocks
'r|bz2' open a bzip2 compressed stream of tar blocks
+ 'r|xz' open an lzma compressed stream of tar blocks
'w|' open an uncompressed stream for writing
'w|gz' open a gzip compressed stream for writing
'w|bz2' open a bzip2 compressed stream for writing
+ 'w|xz' open an lzma compressed stream for writing
"""
if not name and not fileobj:
@@ -1832,10 +1657,8 @@ class TarFile(object):
except ImportError:
raise CompressionError("bz2 module is not available")
- if fileobj is not None:
- fileobj = _BZ2Proxy(fileobj, mode)
- else:
- fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)
+ fileobj = bz2.BZ2File(fileobj or name, mode,
+ compresslevel=compresslevel)
try:
t = cls.taropen(name, mode, fileobj, **kwargs)
@@ -1845,11 +1668,35 @@ class TarFile(object):
t._extfileobj = False
return t
+ @classmethod
+ def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
+ """Open lzma compressed tar archive name for reading or writing.
+ Appending is not allowed.
+ """
+ if mode not in ("r", "w"):
+ raise ValueError("mode must be 'r' or 'w'")
+
+ try:
+ import lzma
+ except ImportError:
+ raise CompressionError("lzma module is not available")
+
+ fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
+
+ try:
+ t = cls.taropen(name, mode, fileobj, **kwargs)
+ except (lzma.LZMAError, EOFError):
+ fileobj.close()
+ raise ReadError("not an lzma file")
+ t._extfileobj = False
+ return t
+
# All *open() methods are registered here.
OPEN_METH = {
"tar": "taropen", # uncompressed tar
"gz": "gzopen", # gzip compressed tar
- "bz2": "bz2open" # bzip2 compressed tar
+ "bz2": "bz2open", # bzip2 compressed tar
+ "xz": "xzopen" # lzma compressed tar
}
#--------------------------------------------------------------------------
@@ -2009,7 +1856,7 @@ class TarFile(object):
for tarinfo in self:
if verbose:
- print(filemode(tarinfo.mode), end=' ')
+ print(stat.filemode(tarinfo.mode), end=' ')
print("%s/%s" % (tarinfo.uname or tarinfo.uid,
tarinfo.gname or tarinfo.gid), end=' ')
if tarinfo.ischr() or tarinfo.isblk():
@@ -2192,12 +2039,9 @@ class TarFile(object):
def extractfile(self, member):
"""Extract a member from the archive as a file object. `member' may be
- a filename or a TarInfo object. If `member' is a regular file, a
- file-like object is returned. If `member' is a link, a file-like
- object is constructed from the link's target. If `member' is none of
- the above, None is returned.
- The file-like object is read-only and provides the following
- methods: read(), readline(), readlines(), seek() and tell()
+ a filename or a TarInfo object. If `member' is a regular file or a
+ link, an io.BufferedReader object is returned. Otherwise, None is
+ returned.
"""
self._check("r")
@@ -2206,12 +2050,8 @@ class TarFile(object):
else:
tarinfo = member
- if tarinfo.isreg():
- return self.fileobject(self, tarinfo)
-
- elif tarinfo.type not in SUPPORTED_TYPES:
- # If a member's type is unknown, it is treated as a
- # regular file.
+ if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
+ # Members with unknown types are treated as regular files.
return self.fileobject(self, tarinfo)
elif tarinfo.islnk() or tarinfo.issym():
@@ -2283,9 +2123,8 @@ class TarFile(object):
# Use a safe mode for the directory, the real mode is set
# later in _extract_member().
os.mkdir(targetpath, 0o700)
- except EnvironmentError as e:
- if e.errno != errno.EEXIST:
- raise
+ except FileExistsError:
+ pass
def makefile(self, tarinfo, targetpath):
"""Make a file called targetpath.