summaryrefslogtreecommitdiffstats
path: root/Lib/zipimport.py
diff options
context:
space:
mode:
authorTim Hatch <tim@timhatch.com>2024-03-28 06:54:51 (GMT)
committerGitHub <noreply@github.com>2024-03-28 06:54:51 (GMT)
commitb44898299a2ed97045c270f6474785da2ff07ced (patch)
tree3cca65e4c9fbf1d34ed10d8656609d5370e23cdc /Lib/zipimport.py
parent2cedd25c14d3acfdcb5e8ee55132ce3e334ab8fe (diff)
downloadcpython-b44898299a2ed97045c270f6474785da2ff07ced.zip
cpython-b44898299a2ed97045c270f6474785da2ff07ced.tar.gz
cpython-b44898299a2ed97045c270f6474785da2ff07ced.tar.bz2
gh-89739: gh-77140: Support zip64 in zipimport (GH-94146)
* Reads zip64 files as produced by the zipfile module
* Includes tests (somewhat slow, however, because of the need to create "large" zips)
* About the same amount of strictness reading invalid zip files as zipfile has
* Still works on files with prepended data (like pex)

There are a lot more test cases at https://github.com/thatch/zipimport64/ that give me confidence that this works for real-world files.

Fixes #89739 and #77140.

---------

Co-authored-by: Itamar Ostricher <itamarost@gmail.com>
Reviewed-by: Gregory P. Smith <greg@krypto.org>
Diffstat (limited to 'Lib/zipimport.py')
-rw-r--r--Lib/zipimport.py166
1 files changed, 127 insertions, 39 deletions
diff --git a/Lib/zipimport.py b/Lib/zipimport.py
index 823a82e..21d2dca 100644
--- a/Lib/zipimport.py
+++ b/Lib/zipimport.py
@@ -15,7 +15,7 @@ to Zip archives.
#from importlib import _bootstrap_external
#from importlib import _bootstrap # for _verbose_message
import _frozen_importlib_external as _bootstrap_external
-from _frozen_importlib_external import _unpack_uint16, _unpack_uint32
+from _frozen_importlib_external import _unpack_uint16, _unpack_uint32, _unpack_uint64
import _frozen_importlib as _bootstrap # for _verbose_message
import _imp # for check_hash_based_pycs
import _io # for open
@@ -40,8 +40,14 @@ _zip_directory_cache = {}
_module_type = type(sys)
END_CENTRAL_DIR_SIZE = 22
-STRING_END_ARCHIVE = b'PK\x05\x06'
+END_CENTRAL_DIR_SIZE_64 = 56
+END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
+STRING_END_ARCHIVE = b'PK\x05\x06' # standard EOCD signature
+STRING_END_LOCATOR_64 = b'PK\x06\x07' # Zip64 EOCD Locator signature
+STRING_END_ZIP_64 = b'PK\x06\x06' # Zip64 EOCD signature
MAX_COMMENT_LEN = (1 << 16) - 1
+MAX_UINT32 = 0xffffffff
+ZIP64_EXTRA_TAG = 0x1
class zipimporter(_bootstrap_external._LoaderBasics):
"""zipimporter(archivepath) -> zipimporter object
@@ -356,49 +362,72 @@ def _read_directory(archive):
# to not cause problems when someone runs 'python3 /dev/fd/9 9<some_script'
start_offset = fp.tell()
try:
+ # Check if there's a comment.
try:
- fp.seek(-END_CENTRAL_DIR_SIZE, 2)
- header_position = fp.tell()
- buffer = fp.read(END_CENTRAL_DIR_SIZE)
+ fp.seek(0, 2)
+ file_size = fp.tell()
except OSError:
- raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
- if len(buffer) != END_CENTRAL_DIR_SIZE:
- raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
- if buffer[:4] != STRING_END_ARCHIVE:
- # Bad: End of Central Dir signature
- # Check if there's a comment.
- try:
- fp.seek(0, 2)
- file_size = fp.tell()
- except OSError:
- raise ZipImportError(f"can't read Zip file: {archive!r}",
- path=archive)
- max_comment_start = max(file_size - MAX_COMMENT_LEN -
- END_CENTRAL_DIR_SIZE, 0)
- try:
- fp.seek(max_comment_start)
- data = fp.read()
- except OSError:
- raise ZipImportError(f"can't read Zip file: {archive!r}",
- path=archive)
- pos = data.rfind(STRING_END_ARCHIVE)
- if pos < 0:
- raise ZipImportError(f'not a Zip file: {archive!r}',
- path=archive)
+ raise ZipImportError(f"can't read Zip file: {archive!r}",
+ path=archive)
+ max_comment_plus_dirs_size = (
+ MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
+ END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64)
+ max_comment_start = max(file_size - max_comment_plus_dirs_size, 0)
+ try:
+ fp.seek(max_comment_start)
+ data = fp.read(max_comment_plus_dirs_size)
+ except OSError:
+ raise ZipImportError(f"can't read Zip file: {archive!r}",
+ path=archive)
+ pos = data.rfind(STRING_END_ARCHIVE)
+ pos64 = data.rfind(STRING_END_ZIP_64)
+
+ if (pos64 >= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos):
+ # Zip64 at "correct" offset from standard EOCD
+ buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64]
+ if len(buffer) != END_CENTRAL_DIR_SIZE_64:
+ raise ZipImportError(
+ f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte "
+ f"zip64 central directory, but read {len(buffer)} bytes.",
+ path=archive)
+ header_position = file_size - len(data) + pos64
+
+ central_directory_size = _unpack_uint64(buffer[40:48])
+ central_directory_position = _unpack_uint64(buffer[48:56])
+ num_entries = _unpack_uint64(buffer[24:32])
+ elif pos >= 0:
buffer = data[pos:pos+END_CENTRAL_DIR_SIZE]
if len(buffer) != END_CENTRAL_DIR_SIZE:
raise ZipImportError(f"corrupt Zip file: {archive!r}",
path=archive)
+
header_position = file_size - len(data) + pos
- header_size = _unpack_uint32(buffer[12:16])
- header_offset = _unpack_uint32(buffer[16:20])
- if header_position < header_size:
+ # Buffer now contains a valid EOCD, and header_position gives the
+ # starting position of it.
+ central_directory_size = _unpack_uint32(buffer[12:16])
+ central_directory_position = _unpack_uint32(buffer[16:20])
+ num_entries = _unpack_uint16(buffer[8:10])
+
+ # N.b. if someday you want to prefer the standard (non-zip64) EOCD,
+ # you need to adjust position by 76 for arc to be 0.
+ else:
+ raise ZipImportError(f'not a Zip file: {archive!r}',
+ path=archive)
+
+ # Buffer now contains a valid EOCD, and header_position gives the
+ # starting position of it.
+ # XXX: These are cursory checks but are not as exact or strict as they
+ # could be. Checking the arc-adjusted value is probably good too.
+ if header_position < central_directory_size:
raise ZipImportError(f'bad central directory size: {archive!r}', path=archive)
- if header_position < header_offset:
+ if header_position < central_directory_position:
raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive)
- header_position -= header_size
- arc_offset = header_position - header_offset
+ header_position -= central_directory_size
+ # On just-a-zipfile these values are the same and arc_offset is zero; if
+ # the file has some bytes prepended, `arc_offset` is the number of such
+ # bytes. This is used for pex as well as self-extracting .exe.
+ arc_offset = header_position - central_directory_position
if arc_offset < 0:
raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive)
@@ -415,6 +444,11 @@ def _read_directory(archive):
raise EOFError('EOF read where not expected')
# Start of file header
if buffer[:4] != b'PK\x01\x02':
+ if count != num_entries:
+ raise ZipImportError(
+ f"mismatched num_entries: {count} should be {num_entries} in {archive!r}",
+ path=archive,
+ )
break # Bad: Central Dir File Header
if len(buffer) != 46:
raise EOFError('EOF read where not expected')
@@ -430,9 +464,6 @@ def _read_directory(archive):
comment_size = _unpack_uint16(buffer[32:34])
file_offset = _unpack_uint32(buffer[42:46])
header_size = name_size + extra_size + comment_size
- if file_offset > header_offset:
- raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
- file_offset += arc_offset
try:
name = fp.read(name_size)
@@ -444,7 +475,10 @@ def _read_directory(archive):
# slower than reading the data because fseek flushes stdio's
# internal buffers. See issue #8745.
try:
- if len(fp.read(header_size - name_size)) != header_size - name_size:
+ extra_data_len = header_size - name_size
+ extra_data = memoryview(fp.read(extra_data_len))
+
+ if len(extra_data) != extra_data_len:
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
except OSError:
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
@@ -461,6 +495,60 @@ def _read_directory(archive):
name = name.replace('/', path_sep)
path = _bootstrap_external._path_join(archive, name)
+
+ # Ordering matches unpacking below.
+ if (
+ file_size == MAX_UINT32 or
+ data_size == MAX_UINT32 or
+ file_offset == MAX_UINT32
+ ):
+ # need to decode extra_data looking for a zip64 extra (which might not
+ # be present)
+ while extra_data:
+ if len(extra_data) < 4:
+ raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+ tag = _unpack_uint16(extra_data[:2])
+ size = _unpack_uint16(extra_data[2:4])
+ if len(extra_data) < 4 + size:
+ raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+ if tag == ZIP64_EXTRA_TAG:
+ if (len(extra_data) - 4) % 8 != 0:
+ raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+ num_extra_values = (len(extra_data) - 4) // 8
+ if num_extra_values > 3:
+ raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+ values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
+ extra_data, offset=4)
+
+ # N.b. Here be dragons: the ordering of these is different than
+ # the header fields, and it's really easy to get it wrong since
+ # naturally-occurring zips that use all 3 are >4GB
+ if file_size == MAX_UINT32:
+ file_size = values.pop(0)
+ if data_size == MAX_UINT32:
+ data_size = values.pop(0)
+ if file_offset == MAX_UINT32:
+ file_offset = values.pop(0)
+
+ break
+
+ # For a typical zip, this bytes-slicing only happens 2-3 times, on
+ # small data like timestamps and filesizes.
+ extra_data = extra_data[4+size:]
+ else:
+ _bootstrap._verbose_message(
+ "zipimport: suspected zip64 but no zip64 extra for {!r}",
+ path,
+ )
+ # XXX These two statements seem swapped because `central_directory_position`
+ # is a position within the actual file, but `file_offset` (when compared) is
+ # as encoded in the entry, not adjusted for this file.
+ # N.b. this must be after we've potentially read the zip64 extra which can
+ # change `file_offset`.
+ if file_offset > central_directory_position:
+ raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
+ file_offset += arc_offset
+
t = (path, compress, data_size, file_size, file_offset, time, date, crc)
files[name] = t
count += 1