diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2008-07-03 14:13:42 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2008-07-03 14:13:42 (GMT) |
commit | b09b844a5c7bb41cf67af8aafe0682c3bfdc12da (patch) | |
tree | 09cb453d31b46761a3da71b09ec4927fdaf61373 /Lib/zipfile.py | |
parent | 451a356f11d3ed498a359f94d85a7df5d6a2a843 (diff) | |
download | cpython-b09b844a5c7bb41cf67af8aafe0682c3bfdc12da.zip cpython-b09b844a5c7bb41cf67af8aafe0682c3bfdc12da.tar.gz cpython-b09b844a5c7bb41cf67af8aafe0682c3bfdc12da.tar.bz2 |
Merged revisions 64688 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r64688 | martin.v.loewis | 2008-07-03 14:51:14 +0200 (Do, 03 Jul 2008) | 9 lines
Patch #1622: Correct interpretation of various ZIP header fields.
Also fixes
- Issue #1526: Allow more than 64k files to be added to Zip64 file.
- Issue #1746: Correct handling of zipfile archive comments (previously
archives with comments over 4k were flagged as invalid). Allow writing
Zip files with archive comments by setting the 'comment' attribute of a ZipFile.
........
Diffstat (limited to 'Lib/zipfile.py')
-rw-r--r-- | Lib/zipfile.py | 330 |
1 files changed, 223 insertions, 107 deletions
diff --git a/Lib/zipfile.py b/Lib/zipfile.py index fd923c8..59a86e2 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -29,31 +29,79 @@ class LargeZipFile(Exception): error = BadZipfile # The exception raised by this module ZIP64_LIMIT= (1 << 31) - 1 +ZIP_FILECOUNT_LIMIT = 1 << 16 +ZIP_MAX_COMMENT = (1 << 16) - 1 # constants for Zip file compression methods ZIP_STORED = 0 ZIP_DEFLATED = 8 # Other ZIP compression methods not supported -# Here are some struct module formats for reading headers -structEndArchive = "<4s4H2LH" # 9 items, end of archive, 22 bytes -stringEndArchive = b"PK\005\006" # magic number for end of archive record -structCentralDir = "<4s4B4HLLL5HLL"# 19 items, central directory, 46 bytes -stringCentralDir = b"PK\001\002" # magic number for central directory -structFileHeader = "<4s2B4HLLL2H" # 12 items, file header record, 30 bytes -stringFileHeader = b"PK\003\004" # magic number for file header -structEndArchive64Locator = "<4sLQL" # 4 items, locate Zip64 header, 20 bytes -stringEndArchive64Locator = b"PK\x06\x07" # magic token for locator header -structEndArchive64 = "<4sQHHLLQQQQ" # 10 items, end of archive (Zip64), 56 bytes -stringEndArchive64 = b"PK\x06\x06" # magic token for Zip64 header - +# Below are some formats and associated data for reading/writing headers using +# the struct module. 
The names and structures of headers/records are those used +# in the PKWARE description of the ZIP file format: +# http://www.pkware.com/documents/casestudies/APPNOTE.TXT +# (URL valid as of January 2008) + +# The "end of central directory" structure, magic number, size, and indices +# (section V.I in the format document) +structEndCentDir = b"<4s4H2LH" +magicEndCentDir = b"PK\005\006" +sizeEndCentDir = struct.calcsize(structEndCentDir) + +_ECD_SIGNATURE = 0 +_ECD_DISK_NUMBER = 1 +_ECD_DISK_START = 2 +_ECD_ENTRIES_THIS_DISK = 3 +_ECD_ENTRIES_TOTAL = 4 +_ECD_SIZE = 5 +_ECD_OFFSET = 6 +_ECD_COMMENT_SIZE = 7 +# These last two indices are not part of the structure as defined in the +# spec, but they are used internally by this module as a convenience +_ECD_COMMENT = 8 +_ECD_LOCATION = 9 + +# The "central directory" structure, magic number, size, and indices +# of entries in the structure (section V.F in the format document) +structCentralDir = "<4s4B4HL2L5H2L" +magicCentralDir = b"PK\001\002" +sizeCentralDir = struct.calcsize(structCentralDir) + +# The "local file header" structure, magic number, size, and indices +# (section V.A in the format document) +structFileHeader = "<4s2B4HL2L2H" +magicFileHeader = b"PK\003\004" +sizeFileHeader = struct.calcsize(structFileHeader) + +# The "Zip64 end of central directory locator" structure, magic number, and size +structEndCentDir64Locator = "<4sLQL" +magicEndCentDir64Locator = b"PK\x06\x07" +sizeEndCentDir64Locator = struct.calcsize(structEndCentDir64Locator) + +# The "Zip64 end of central directory" record, magic number, size, and indices +# (section V.G in the format document) +structEndCentDir64 = "<4sQ2H2L4Q" +magicEndCentDir64 = b"PK\x06\x06" +sizeEndCentDir64 = struct.calcsize(structEndCentDir64) + +_CD64_SIGNATURE = 0 +_CD64_DIRECTORY_RECSIZE = 1 +_CD64_CREATE_VERSION = 2 +_CD64_EXTRACT_VERSION = 3 +_CD64_DISK_NUMBER = 4 +_CD64_DISK_NUMBER_START = 5 +_CD64_NUMBER_ENTRIES_THIS_DISK = 6 +_CD64_NUMBER_ENTRIES_TOTAL = 7 
+_CD64_DIRECTORY_SIZE = 8 +_CD64_OFFSET_START_CENTDIR = 9 # indexes of entries in the central directory structure _CD_SIGNATURE = 0 _CD_CREATE_VERSION = 1 _CD_CREATE_SYSTEM = 2 _CD_EXTRACT_VERSION = 3 -_CD_EXTRACT_SYSTEM = 4 # is this meaningful? +_CD_EXTRACT_SYSTEM = 4 _CD_FLAG_BITS = 5 _CD_COMPRESS_TYPE = 6 _CD_TIME = 7 @@ -69,10 +117,15 @@ _CD_INTERNAL_FILE_ATTRIBUTES = 16 _CD_EXTERNAL_FILE_ATTRIBUTES = 17 _CD_LOCAL_HEADER_OFFSET = 18 -# indexes of entries in the local file header structure +# The "local file header" structure, magic number, size, and indices +# (section V.A in the format document) +structFileHeader = "<4s2B4HL2L2H" +magicFileHeader = b"PK\003\004" +sizeFileHeader = struct.calcsize(structFileHeader) + _FH_SIGNATURE = 0 _FH_EXTRACT_VERSION = 1 -_FH_EXTRACT_SYSTEM = 2 # is this meaningful? +_FH_EXTRACT_SYSTEM = 2 _FH_GENERAL_PURPOSE_FLAG_BITS = 3 _FH_COMPRESSION_METHOD = 4 _FH_LAST_MOD_TIME = 5 @@ -83,6 +136,28 @@ _FH_UNCOMPRESSED_SIZE = 9 _FH_FILENAME_LENGTH = 10 _FH_EXTRA_FIELD_LENGTH = 11 +# The "Zip64 end of central directory locator" structure, magic number, and size +structEndCentDir64Locator = "<4sLQL" +magicEndCentDir64Locator = b"PK\x06\x07" +sizeEndCentDir64Locator = struct.calcsize(structEndCentDir64Locator) + +# The "Zip64 end of central directory" record, magic number, size, and indices +# (section V.G in the format document) +structEndCentDir64 = "<4sQ2H2L4Q" +magicEndCentDir64 = b"PK\x06\x06" +sizeEndCentDir64 = struct.calcsize(structEndCentDir64) + +_CD64_SIGNATURE = 0 +_CD64_DIRECTORY_RECSIZE = 1 +_CD64_CREATE_VERSION = 2 +_CD64_EXTRACT_VERSION = 3 +_CD64_DISK_NUMBER = 4 +_CD64_DISK_NUMBER_START = 5 +_CD64_NUMBER_ENTRIES_THIS_DISK = 6 +_CD64_NUMBER_ENTRIES_TOTAL = 7 +_CD64_DIRECTORY_SIZE = 8 +_CD64_OFFSET_START_CENTDIR = 9 + def is_zipfile(filename): """Quickly see if file is a ZIP file by checking the magic number.""" try: @@ -99,33 +174,31 @@ def _EndRecData64(fpin, offset, endrec): """ Read the ZIP64 end-of-archive records and 
use that to update endrec """ - locatorSize = struct.calcsize(structEndArchive64Locator) - fpin.seek(offset - locatorSize, 2) - data = fpin.read(locatorSize) - sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) - if sig != stringEndArchive64Locator: + fpin.seek(offset - sizeEndCentDir64Locator, 2) + data = fpin.read(sizeEndCentDir64Locator) + sig, diskno, reloff, disks = struct.unpack(structEndCentDir64Locator, data) + if sig != magicEndCentDir64Locator: return endrec if diskno != 0 or disks != 1: raise BadZipfile("zipfiles that span multiple disks are not supported") # Assume no 'zip64 extensible data' - endArchiveSize = struct.calcsize(structEndArchive64) - fpin.seek(offset - locatorSize - endArchiveSize, 2) - data = fpin.read(endArchiveSize) + fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) + data = fpin.read(sizeEndCentDir64) sig, sz, create_version, read_version, disk_num, disk_dir, \ dircount, dircount2, dirsize, diroffset = \ - struct.unpack(structEndArchive64, data) - if sig != stringEndArchive64: + struct.unpack(structEndCentDir64, data) + if sig != magicEndCentDir64: return endrec # Update the original endrec using data from the ZIP64 record - endrec[1] = disk_num - endrec[2] = disk_dir - endrec[3] = dircount - endrec[4] = dircount2 - endrec[5] = dirsize - endrec[6] = diroffset + endrec[_ECD_DISK_NUMBER] = disk_num + endrec[_ECD_DISK_START] = disk_dir + endrec[_ECD_ENTRIES_THIS_DISK] = dircount + endrec[_ECD_ENTRIES_TOTAL] = dircount2 + endrec[_ECD_SIZE] = dirsize + endrec[_ECD_OFFSET] = diroffset return endrec @@ -134,38 +207,59 @@ def _EndRecData(fpin): The data is a list of the nine items in the ZIP "End of central dir" record followed by a tenth item, the file seek offset of this record.""" - fpin.seek(-22, 2) # Assume no archive comment. 
- filesize = fpin.tell() + 22 # Get file size + + # Determine file size + fpin.seek(0, 2) + filesize = fpin.tell() + + # Check to see if this is ZIP file with no archive comment (the + # "end of central directory" structure should be the last item in the + # file if this is the case). + fpin.seek(-sizeEndCentDir, 2) data = fpin.read() - if data[0:4] == stringEndArchive and data[-2:] == b"\000\000": - endrec = struct.unpack(structEndArchive, data) - endrec = list(endrec) - endrec.append("") # Append the archive comment - endrec.append(filesize - 22) # Append the record start offset - if endrec[-4] == 0xffffffff: - return _EndRecData64(fpin, -22, endrec) + if data[0:4] == magicEndCentDir and data[-2:] == b"\000\000": + # the signature is correct and there's no comment, unpack structure + endrec = struct.unpack(structEndCentDir, data) + endrec=list(endrec) + + # Append a blank comment and record start offset + endrec.append(b"") + endrec.append(filesize - sizeEndCentDir) + if endrec[_ECD_OFFSET] == 0xffffffff: + # the value for the "offset of the start of the central directory" + # indicates that there is a "Zip64 end of central directory" + # structure present, so go look for it + return _EndRecData64(fpin, -sizeEndCentDir, endrec) + return endrec - # Search the last END_BLOCK bytes of the file for the record signature. - # The comment is appended to the ZIP file and has a 16 bit length. - # So the comment may be up to 64K long. We limit the search for the - # signature to a few Kbytes at the end of the file for efficiency. - # also, the signature must not appear in the comment. - END_BLOCK = min(filesize, 1024 * 4) - fpin.seek(filesize - END_BLOCK, 0) + + # Either this is not a ZIP file, or it is a ZIP file with an archive + # comment. Search the end of the file for the "end of central directory" + # record signature. The comment is the last item in the ZIP file and may be + # up to 64K long. 
It is assumed that the "end of central directory" magic + # number does not appear in the comment. + maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) + fpin.seek(maxCommentStart, 0) data = fpin.read() - start = data.rfind(stringEndArchive) - if start >= 0: # Correct signature string was found - endrec = struct.unpack(structEndArchive, data[start:start+22]) - endrec = list(endrec) - comment = data[start+22:] - if endrec[7] == len(comment): # Comment length checks out + start = data.rfind(magicEndCentDir) + if start >= 0: + # found the magic number; attempt to unpack and interpret + recData = data[start:start+sizeEndCentDir] + endrec = list(struct.unpack(structEndCentDir, recData)) + comment = data[start+sizeEndCentDir:] + # check that comment length is correct + if endrec[_ECD_COMMENT_SIZE] == len(comment): # Append the archive comment and start offset endrec.append(comment) - endrec.append(filesize - END_BLOCK + start) - if endrec[-4] == 0xffffffff: - return _EndRecData64(fpin, - END_BLOCK + start, endrec) + endrec.append(maxCommentStart + start) + if endrec[_ECD_OFFSET] == 0xffffffff: + # There is apparently a "Zip64 end of central directory" + # structure present, so go look for it + return _EndRecData64(fpin, start - filesize, endrec) return endrec - return # Error, return None + + # Unable to find a valid end of central directory structure + return class ZipInfo (object): @@ -252,13 +346,13 @@ class ZipInfo (object): fmt = '<HHQQ' extra = extra + struct.pack(fmt, 1, struct.calcsize(fmt)-4, file_size, compress_size) - file_size = 0xffffffff # -1 - compress_size = 0xffffffff # -1 + file_size = 0xffffffff + compress_size = 0xffffffff self.extract_version = max(45, self.extract_version) self.create_version = max(45, self.extract_version) filename, flag_bits = self._encodeFilenameFlags() - header = struct.pack(structFileHeader, stringFileHeader, + header = struct.pack(structFileHeader, magicFileHeader, self.extract_version, self.reserved, flag_bits, 
self.compress_type, dostime, dosdate, CRC, compress_size, file_size, @@ -292,16 +386,15 @@ class ZipInfo (object): idx = 0 # ZIP64 extension (large files and/or large archives) - # XXX Is this correct? won't this exclude 2**32-1 byte files? if self.file_size in (0xffffffffffffffff, 0xffffffff): self.file_size = counts[idx] idx += 1 - if self.compress_size == -1 or self.compress_size == 0xFFFFFFFF: + if self.compress_size == 0xFFFFFFFF: self.compress_size = counts[idx] idx += 1 - if self.header_offset == -1 or self.header_offset == 0xffffffff: + if self.header_offset == 0xffffffff: old = self.header_offset self.header_offset = counts[idx] idx+=1 @@ -569,7 +662,7 @@ class ZipExtFile: class ZipFile: """ Class with methods to open, read, write, close, list zip files. - z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True) + z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=False) file: Either the path to the file, or a file-like object. If it is a path, the file will be opened and closed by ZipFile. 
@@ -605,6 +698,7 @@ class ZipFile: self.compression = compression # Method of compression self.mode = key = mode.replace('b', '')[0] self.pwd = None + self.comment = b'' # Check if we were passed a file-like object if isinstance(file, str): @@ -661,18 +755,20 @@ class ZipFile: raise BadZipfile("File is not a zip file") if self.debug > 1: print(endrec) - size_cd = endrec[5] # bytes in central directory - offset_cd = endrec[6] # offset of central directory - self.comment = endrec[8] # archive comment - # endrec[9] is the offset of the "End of Central Dir" record - if endrec[9] > ZIP64_LIMIT: - x = endrec[9] - size_cd - 56 - 20 - else: - x = endrec[9] - size_cd + size_cd = endrec[_ECD_SIZE] # bytes in central directory + offset_cd = endrec[_ECD_OFFSET] # offset of central directory + self.comment = endrec[_ECD_COMMENT] # archive comment + # "concat" is zero, unless zip was concatenated to another file - concat = x - offset_cd + concat = endrec[_ECD_LOCATION] - size_cd - offset_cd + if endrec[_ECD_LOCATION] > ZIP64_LIMIT: + # If the offset of the "End of Central Dir" record requires Zip64 + # extension structures, account for them + concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) + if self.debug > 2: - print("given, inferred, offset", offset_cd, x, concat) + inferred = concat + offset_cd + print("given, inferred, offset", offset_cd, inferred, concat) # self.start_dir: Position of start of central directory self.start_dir = offset_cd + concat fp.seek(self.start_dir, 0) @@ -680,9 +776,8 @@ class ZipFile: fp = io.BytesIO(data) total = 0 while total < size_cd: - centdir = fp.read(46) - total = total + 46 - if centdir[0:4] != stringCentralDir: + centdir = fp.read(sizeCentralDir) + if centdir[0:4] != magicCentralDir: raise BadZipfile("Bad magic number for central directory") centdir = struct.unpack(structCentralDir, centdir) if self.debug > 2: @@ -699,9 +794,6 @@ class ZipFile: x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) x.comment = 
fp.read(centdir[_CD_COMMENT_LENGTH]) - total = (total + centdir[_CD_FILENAME_LENGTH] - + centdir[_CD_EXTRA_FIELD_LENGTH] - + centdir[_CD_COMMENT_LENGTH]) x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] (x.create_version, x.create_system, x.extract_version, x.reserved, x.flag_bits, x.compress_type, t, d, @@ -716,6 +808,12 @@ class ZipFile: x.header_offset = x.header_offset + concat self.filelist.append(x) self.NameToInfo[x.filename] = x + + # update total bytes read from central directory + total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] + + centdir[_CD_EXTRA_FIELD_LENGTH] + + centdir[_CD_COMMENT_LENGTH]) + if self.debug > 2: print("total", total) @@ -749,7 +847,6 @@ class ZipFile: except BadZipfile: return zinfo.filename - def getinfo(self, name): """Return the instance of ZipInfo given 'name'.""" info = self.NameToInfo.get(name) @@ -794,8 +891,8 @@ class ZipFile: zef_file.seek(zinfo.header_offset, 0) # Skip the file header: - fheader = zef_file.read(30) - if fheader[0:4] != stringFileHeader: + fheader = zef_file.read(sizeFileHeader) + if fheader[0:4] != magicFileHeader: raise BadZipfile("Bad magic number for file header") fheader = struct.unpack(structFileHeader, fheader) @@ -1059,15 +1156,15 @@ class ZipFile: or zinfo.compress_size > ZIP64_LIMIT: extra.append(zinfo.file_size) extra.append(zinfo.compress_size) - file_size = 0xffffffff #-1 - compress_size = 0xffffffff #-1 + file_size = 0xffffffff + compress_size = 0xffffffff else: file_size = zinfo.file_size compress_size = zinfo.compress_size if zinfo.header_offset > ZIP64_LIMIT: extra.append(zinfo.header_offset) - header_offset = 0xffffffff # -1 32 bit + header_offset = 0xffffffff else: header_offset = zinfo.header_offset @@ -1084,15 +1181,26 @@ class ZipFile: extract_version = zinfo.extract_version create_version = zinfo.create_version - filename, flag_bits = zinfo._encodeFilenameFlags() - centdir = struct.pack(structCentralDir, - stringCentralDir, create_version, - zinfo.create_system, 
extract_version, zinfo.reserved, - flag_bits, zinfo.compress_type, dostime, dosdate, - zinfo.CRC, compress_size, file_size, - len(filename), len(extra_data), len(zinfo.comment), - 0, zinfo.internal_attr, zinfo.external_attr, - header_offset) + try: + filename, flag_bits = zinfo._encodeFilenameFlags() + centdir = struct.pack(structCentralDir, + magicCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, compress_size, file_size, + len(filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset) + except DeprecationWarning: + print >>sys.stderr, (structCentralDir, + stringCentralDir, create_version, + zinfo.create_system, extract_version, zinfo.reserved, + zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, + zinfo.CRC, compress_size, file_size, + len(zinfo.filename), len(extra_data), len(zinfo.comment), + 0, zinfo.internal_attr, zinfo.external_attr, + header_offset) + raise self.fp.write(centdir) self.fp.write(filename) self.fp.write(extra_data) @@ -1100,27 +1208,35 @@ class ZipFile: pos2 = self.fp.tell() # Write end-of-zip-archive record + centDirOffset = pos1 if pos1 > ZIP64_LIMIT: # Need to write the ZIP64 end-of-archive records zip64endrec = struct.pack( - structEndArchive64, stringEndArchive64, + structEndCentDir64, magicEndCentDir64, 44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1) self.fp.write(zip64endrec) zip64locrec = struct.pack( - structEndArchive64Locator, - stringEndArchive64Locator, 0, pos2, 1) + structEndCentDir64Locator, + magicEndCentDir64Locator, 0, pos2, 1) self.fp.write(zip64locrec) - - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, count, count, pos2 - pos1, 0xffffffff, 0) - self.fp.write(endrec) - - else: - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, count, count, pos2 - pos1, pos1, 0) - self.fp.write(endrec) + centDirOffset = 0xFFFFFFFF + + # check for 
valid comment length + if len(self.comment) >= ZIP_MAX_COMMENT: + if self.debug > 0: + msg = 'Archive comment is too long; truncating to %d bytes' \ + % ZIP_MAX_COMMENT + self.comment = self.comment[:ZIP_MAX_COMMENT] + + endrec = struct.pack(structEndCentDir, magicEndCentDir, + 0, 0, count % ZIP_FILECOUNT_LIMIT, + count % ZIP_FILECOUNT_LIMIT, pos2 - pos1, + centDirOffset, len(self.comment)) + self.fp.write(endrec) + self.fp.write(self.comment) self.fp.flush() + if not self._filePassed: self.fp.close() self.fp = None |