author     Lars Gustäbel <lars@gustaebel.de>    2007-05-27 19:49:30 (GMT)
committer  Lars Gustäbel <lars@gustaebel.de>    2007-05-27 19:49:30 (GMT)
commit     a0fcb9384ead24c412b93a4de903788eb5828dbe
tree       6bf71c1d2d2943690bd59f838561520fcaadfdbf /Lib/tarfile.py
parent     0ac601995ccd123696b44b0194c3718f8d364c07
Added errors argument to TarFile class that allows the user to
specify an error handling scheme for character conversion. An additional
scheme "utf-8" is available in read mode. Unicode input filenames are now
supported by design. The values of the pax_headers dictionary are now
limited to unicode objects.
Fixed: The prefix field is no longer used in PAX_FORMAT (in
conformance with POSIX).
Fixed: In read mode, use the size field from a pax header if one is present.
Fixed: Strip trailing slashes from pax header name values.
Fixed: Give values in user-specified pax_headers precedence when
writing.
Added unicode tests. Added pax/regtype4 member to testtar.tar with all
possible number fields in a pax header.
Added two chapters to the documentation about the different formats
tarfile.py supports and how unicode issues are handled.
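
For illustration, a rough usage sketch of the interface described above (the
archive name, encoding and header values are invented, and the keyword
arguments assume the post-patch TarFile signature):

    import tarfile

    # Write a PAX_FORMAT archive with unicode pax_headers and an explicit
    # error handling scheme for character conversion.
    tar = tarfile.open("example.tar", "w", format=tarfile.PAX_FORMAT,
                       encoding="iso8859-1", errors="strict",
                       pax_headers={u"comment": u"created by tarfile"})
    tar.add("Lib/tarfile.py")
    tar.close()

    # In read mode, errors defaults to the special "utf-8" scheme.
    tar = tarfile.open("example.tar", "r")
    for tarinfo in tar:
        print tarinfo.name
    tar.close()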
Diffstat (limited to 'Lib/tarfile.py')
-rw-r--r-- | Lib/tarfile.py | 235
1 file changed, 134 insertions, 101 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 4f4a1d9..107041e 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -125,6 +125,17 @@ GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
               "uid", "gid", "uname", "gname")
 
+# Fields in a pax header that are numbers, all other fields
+# are treated as strings.
+PAX_NUMBER_FIELDS = {
+    "atime": float,
+    "ctime": float,
+    "mtime": float,
+    "uid": int,
+    "gid": int,
+    "size": int
+}
+
 #---------------------------------------------------------
 # Bits used in the mode field, values in octal.
 #---------------------------------------------------------
@@ -154,7 +165,7 @@ TOEXEC = 0001           # execute/search by other
 #---------------------------------------------------------
 ENCODING = sys.getfilesystemencoding()
 if ENCODING is None:
-    ENCODING = "ascii"
+    ENCODING = sys.getdefaultencoding()
 
 #---------------------------------------------------------
 # Some useful functions
@@ -218,6 +229,26 @@ def itn(n, digits=8, format=DEFAULT_FORMAT):
         s = chr(0200) + s
     return s
 
+def uts(s, encoding, errors):
+    """Convert a unicode object to a string.
+    """
+    if errors == "utf-8":
+        # An extra error handler similar to the -o invalid=UTF-8 option
+        # in POSIX.1-2001. Replace untranslatable characters with their
+        # UTF-8 representation.
+        try:
+            return s.encode(encoding, "strict")
+        except UnicodeEncodeError:
+            x = []
+            for c in s:
+                try:
+                    x.append(c.encode(encoding, "strict"))
+                except UnicodeEncodeError:
+                    x.append(c.encode("utf8"))
+            return "".join(x)
+    else:
+        return s.encode(encoding, errors)
+
 def calc_chksums(buf):
     """Calculate the checksum for a member's header by summing up all
        characters except for the chksum field which is treated as if
@@ -922,7 +953,7 @@ class TarInfo(object):
     def __repr__(self):
         return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
 
-    def get_info(self):
+    def get_info(self, encoding, errors):
         """Return the TarInfo's attributes as a dictionary.
         """
         info = {
@@ -944,24 +975,29 @@ class TarInfo(object):
         if info["type"] == DIRTYPE and not info["name"].endswith("/"):
             info["name"] += "/"
 
+        for key in ("name", "linkname", "uname", "gname"):
+            if type(info[key]) is unicode:
+                info[key] = info[key].encode(encoding, errors)
+
         return info
 
-    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING):
+    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
         """Return a tar header as a string of 512 byte blocks.
         """
+        info = self.get_info(encoding, errors)
+
         if format == USTAR_FORMAT:
-            return self.create_ustar_header()
+            return self.create_ustar_header(info)
         elif format == GNU_FORMAT:
-            return self.create_gnu_header()
+            return self.create_gnu_header(info)
         elif format == PAX_FORMAT:
-            return self.create_pax_header(encoding)
+            return self.create_pax_header(info, encoding, errors)
         else:
            raise ValueError("invalid format")
 
-    def create_ustar_header(self):
+    def create_ustar_header(self, info):
         """Return the object as a ustar header block.
         """
-        info = self.get_info()
         info["magic"] = POSIX_MAGIC
 
         if len(info["linkname"]) > LENGTH_LINK:
@@ -972,10 +1008,9 @@ class TarInfo(object):
 
         return self._create_header(info, USTAR_FORMAT)
 
-    def create_gnu_header(self):
+    def create_gnu_header(self, info):
         """Return the object as a GNU header block sequence.
         """
-        info = self.get_info()
         info["magic"] = GNU_MAGIC
 
         buf = ""
@@ -987,12 +1022,11 @@ class TarInfo(object):
 
         return buf + self._create_header(info, GNU_FORMAT)
 
-    def create_pax_header(self, encoding):
+    def create_pax_header(self, info, encoding, errors):
         """Return the object as a ustar header block. If it cannot be
            represented this way, prepend a pax extended header sequence
            with supplement information.
         """
-        info = self.get_info()
         info["magic"] = POSIX_MAGIC
         pax_headers = self.pax_headers.copy()
 
@@ -1002,7 +1036,11 @@ class TarInfo(object):
                 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                 ("uname", "uname", 32), ("gname", "gname", 32)):
 
-            val = info[name].decode(encoding)
+            if hname in pax_headers:
+                # The pax header has priority.
+                continue
+
+            val = info[name].decode(encoding, errors)
 
             # Try to encode the string as ASCII.
             try:
@@ -1011,27 +1049,23 @@ class TarInfo(object):
                 pax_headers[hname] = val
                 continue
 
-            if len(val) > length:
-                if name == "name":
-                    # Try to squeeze a longname in the prefix and name fields as in
-                    # ustar format.
-                    try:
-                        info["prefix"], info["name"] = self._posix_split_name(info["name"])
-                    except ValueError:
-                        pax_headers[hname] = val
-                    else:
-                        continue
-                else:
-                    pax_headers[hname] = val
+            if len(info[name]) > length:
+                pax_headers[hname] = val
 
         # Test number fields for values that exceed the field limit or values
         # that like to be stored as float.
         for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
+            if name in pax_headers:
+                # The pax header has priority. Avoid overflow.
+                info[name] = 0
+                continue
+
             val = info[name]
             if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                 pax_headers[name] = unicode(val)
                 info[name] = 0
 
+        # Create a pax extended header if necessary.
         if pax_headers:
             buf = self._create_pax_generic_header(pax_headers)
         else:
@@ -1040,26 +1074,10 @@ class TarInfo(object):
         return buf + self._create_header(info, USTAR_FORMAT)
 
     @classmethod
-    def create_pax_global_header(cls, pax_headers, encoding):
+    def create_pax_global_header(cls, pax_headers):
         """Return the object as a pax global header block sequence.
         """
-        new_headers = {}
-        for key, val in pax_headers.iteritems():
-            key = cls._to_unicode(key, encoding)
-            val = cls._to_unicode(val, encoding)
-            new_headers[key] = val
-        return cls._create_pax_generic_header(new_headers, type=XGLTYPE)
-
-    @staticmethod
-    def _to_unicode(value, encoding):
-        if isinstance(value, unicode):
-            return value
-        elif isinstance(value, (int, long, float)):
-            return unicode(value)
-        elif isinstance(value, str):
-            return unicode(value, encoding)
-        else:
-            raise ValueError("unable to convert to unicode: %r" % value)
+        return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
 
     def _posix_split_name(self, name):
         """Split a name longer than 100 chars into a prefix
@@ -1091,9 +1109,9 @@ class TarInfo(object):
             "        ", # checksum field
             info.get("type", REGTYPE),
             stn(info.get("linkname", ""), 100),
-            stn(info.get("magic", ""), 8),
-            stn(info.get("uname", ""), 32),
-            stn(info.get("gname", ""), 32),
+            stn(info.get("magic", POSIX_MAGIC), 8),
+            stn(info.get("uname", "root"), 32),
+            stn(info.get("gname", "root"), 32),
             itn(info.get("devmajor", 0), 8, format),
             itn(info.get("devminor", 0), 8, format),
             stn(info.get("prefix", ""), 155)
@@ -1254,12 +1272,9 @@ class TarInfo(object):
             offset += self._block(self.size)
         tarfile.offset = offset
 
-        # Patch the TarInfo object with saved extended
+        # Patch the TarInfo object with saved global
         # header information.
-        for keyword, value in tarfile.pax_headers.iteritems():
-            if keyword in PAX_FIELDS:
-                setattr(self, keyword, value)
-                self.pax_headers[keyword] = value
+        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
 
         return self
 
@@ -1270,18 +1285,17 @@ class TarInfo(object):
         buf = tarfile.fileobj.read(self._block(self.size))
 
         # Fetch the next header and process it.
-        b = tarfile.fileobj.read(BLOCKSIZE)
-        t = self.frombuf(b)
-        t.offset = self.offset
-        next = t._proc_member(tarfile)
+        next = self.fromtarfile(tarfile)
+        if next is None:
+            raise HeaderError("missing subsequent header")
 
         # Patch the TarInfo object from the next header with
         # the longname information.
         next.offset = self.offset
         if self.type == GNUTYPE_LONGNAME:
-            next.name = buf.rstrip(NUL)
+            next.name = nts(buf)
         elif self.type == GNUTYPE_LONGLINK:
-            next.linkname = buf.rstrip(NUL)
+            next.linkname = nts(buf)
 
         return next
 
@@ -1356,21 +1370,10 @@ class TarInfo(object):
         else:
             pax_headers = tarfile.pax_headers.copy()
 
-        # Fields in POSIX.1-2001 that are numbers, all other fields
-        # are treated as UTF-8 strings.
-        type_mapping = {
-            "atime": float,
-            "ctime": float,
-            "mtime": float,
-            "uid": int,
-            "gid": int,
-            "size": int
-        }
-
         # Parse pax header information. A record looks like that:
         # "%d %s=%s\n" % (length, keyword, value). length is the size
         # of the complete record including the length field itself and
-        # the newline.
+        # the newline. keyword and value are both UTF-8 encoded strings.
         regex = re.compile(r"(\d+) ([^=]+)=", re.U)
         pos = 0
         while True:
@@ -1383,35 +1386,55 @@ class TarInfo(object):
             value = buf[match.end(2) + 1:match.start(1) + length - 1]
 
             keyword = keyword.decode("utf8")
-            keyword = keyword.encode(tarfile.encoding)
-
             value = value.decode("utf8")
-            if keyword in type_mapping:
-                try:
-                    value = type_mapping[keyword](value)
-                except ValueError:
-                    value = 0
-            else:
-                value = value.encode(tarfile.encoding)
 
             pax_headers[keyword] = value
             pos += length
 
-        # Fetch the next header that will be patched with the
-        # supplement information from the pax header (extended
-        # only).
-        t = self.fromtarfile(tarfile)
+        # Fetch the next header.
+        next = self.fromtarfile(tarfile)
 
-        if self.type != XGLTYPE and t is not None:
-            # Patch the TarInfo object from the next header with
-            # the pax header's information.
-            for keyword, value in pax_headers.items():
-                if keyword in PAX_FIELDS:
-                    setattr(t, keyword, value)
-                pax_headers[keyword] = value
-            t.pax_headers = pax_headers.copy()
+        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
+            if next is None:
+                raise HeaderError("missing subsequent header")
 
-        return t
+            # Patch the TarInfo object with the extended header info.
+            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
+            next.offset = self.offset
+
+            if pax_headers.has_key("size"):
+                # If the extended header replaces the size field,
+                # we need to recalculate the offset where the next
+                # header starts.
+                offset = next.offset_data
+                if next.isreg() or next.type not in SUPPORTED_TYPES:
+                    offset += next._block(next.size)
+                tarfile.offset = offset
+
+        return next
+
+    def _apply_pax_info(self, pax_headers, encoding, errors):
+        """Replace fields with supplemental information from a previous
+           pax extended or global header.
+        """
+        for keyword, value in pax_headers.iteritems():
+            if keyword not in PAX_FIELDS:
+                continue
+
+            if keyword == "path":
+                value = value.rstrip("/")
+
+            if keyword in PAX_NUMBER_FIELDS:
+                try:
+                    value = PAX_NUMBER_FIELDS[keyword](value)
+                except ValueError:
+                    value = 0
+            else:
+                value = uts(value, encoding, errors)
+
+            setattr(self, keyword, value)
+
+        self.pax_headers = pax_headers.copy()
 
     def _block(self, count):
         """Round up a byte count by BLOCKSIZE and return it,
@@ -1462,8 +1485,9 @@ class TarFile(object):
 
     format = DEFAULT_FORMAT     # The format to use when creating an archive.
 
-    encoding = ENCODING         # Transfer UTF-8 strings from POSIX.1-2001
-                                # headers to this encoding.
+    encoding = ENCODING         # Encoding for 8-bit character strings.
+
+    errors = None               # Error handler for unicode conversion.
 
     tarinfo = TarInfo           # The default TarInfo class to use.
 
@@ -1471,7 +1495,7 @@ class TarFile(object):
 
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
-            pax_headers=None, debug=None, errorlevel=None):
+            errors=None, pax_headers=None, debug=None, errorlevel=None):
         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
            read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1512,6 +1536,19 @@ class TarFile(object):
             self.ignore_zeros = ignore_zeros
         if encoding is not None:
             self.encoding = encoding
+
+        if errors is not None:
+            self.errors = errors
+        elif mode == "r":
+            self.errors = "utf-8"
+        else:
+            self.errors = "strict"
+
+        if pax_headers is not None and self.format == PAX_FORMAT:
+            self.pax_headers = pax_headers
+        else:
+            self.pax_headers = {}
+
         if debug is not None:
             self.debug = debug
         if errorlevel is not None:
@@ -1524,7 +1561,6 @@ class TarFile(object):
         self.offset = 0L        # current position in the archive file
         self.inodes = {}        # dictionary caching the inodes of
                                 # archive members already added
-        self.pax_headers = {}   # save contents of global pax headers
 
         if self.mode == "r":
             self.firstmember = None
@@ -1543,9 +1579,8 @@ class TarFile(object):
         if self.mode in "aw":
             self._loaded = True
 
-            if pax_headers:
-                buf = self.tarinfo.create_pax_global_header(
-                    pax_headers.copy(), self.encoding)
+            if self.pax_headers:
+                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                 self.fileobj.write(buf)
                 self.offset += len(buf)
 
@@ -1817,8 +1852,6 @@ class TarFile(object):
                 self.inodes[inode] = arcname
         elif stat.S_ISDIR(stmd):
             type = DIRTYPE
-            if arcname[-1:] != "/":
-                arcname += "/"
        elif stat.S_ISFIFO(stmd):
             type = FIFOTYPE
         elif stat.S_ISLNK(stmd):
@@ -1952,7 +1985,7 @@ class TarFile(object):
 
         tarinfo = copy.copy(tarinfo)
 
-        buf = tarinfo.tobuf(self.format, self.encoding)
+        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
         self.fileobj.write(buf)
         self.offset += len(buf)