summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/library/gzip.rst3
-rw-r--r--Lib/gzip.py91
-rw-r--r--Lib/test/test_gzip.py21
-rw-r--r--Misc/ACKS1
-rw-r--r--Misc/NEWS3
5 files changed, 101 insertions, 18 deletions
diff --git a/Doc/library/gzip.rst b/Doc/library/gzip.rst
index edd5587..934fcb3 100644
--- a/Doc/library/gzip.rst
+++ b/Doc/library/gzip.rst
@@ -74,6 +74,9 @@ The module defines the following items:
.. versionchanged:: 3.2
Support for zero-padded files was added.
+ .. versionchanged:: 3.2
+ Support for unseekable files was added.
+
.. function:: open(filename, mode='rb', compresslevel=9)
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 83311cc..3edc839 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9):
"""
return GzipFile(filename, mode, compresslevel)
+class _PaddedFile:
+ """Minimal read-only file object that prepends a string to the contents
+ of an actual file. Shouldn't be used outside of gzip.py, as it lacks
+ essential functionality."""
+
+ def __init__(self, f, prepend=b''):
+ self._buffer = prepend
+ self._length = len(prepend)
+ self.file = f
+ self._read = 0
+
+ def read(self, size):
+ if self._read is None:
+ return self.file.read(size)
+ if self._read + size <= self._length:
+ read = self._read
+ self._read += size
+ return self._buffer[read:self._read]
+ else:
+ read = self._read
+ self._read = None
+ return self._buffer[read:] + \
+ self.file.read(size-self._length+read)
+
+ def prepend(self, prepend=b'', readprevious=False):
+ if self._read is None:
+ self._buffer = prepend
+ elif readprevious and len(prepend) <= self._read:
+ self._read -= len(prepend)
+ return
+ else:
+ self._buffer = self._buffer[read:] + prepend
+ self._length = len(self._buffer)
+ self._read = 0
+
+ def unused(self):
+ if self._read is None:
+ return b''
+ return self._buffer[self._read:]
+
+ def seek(self, offset, whence=0):
+ # This is only ever called with offset=whence=0
+ if whence == 1 and self._read is not None:
+ if 0 <= offset + self._read <= self._length:
+ self._read += offset
+ return
+ else:
+ offset += self._length - self._read
+ self._read = None
+ self._buffer = None
+ return self.file.seek(offset, whence)
+
+ def __getattr__(self, name):
+ return getattr(name, self.file)
+
+
class GzipFile(io.BufferedIOBase):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods.
@@ -119,6 +175,7 @@ class GzipFile(io.BufferedIOBase):
self.name = filename
# Starts small, scales exponentially
self.min_readsize = 100
+ fileobj = _PaddedFile(fileobj)
elif mode[0:1] == 'w' or mode[0:1] == 'a':
self.mode = WRITE
@@ -188,6 +245,9 @@ class GzipFile(io.BufferedIOBase):
def _read_gzip_header(self):
magic = self.fileobj.read(2)
+ if magic == b'':
+ raise EOFError("Reached EOF")
+
if magic != b'\037\213':
raise IOError('Not a gzipped file')
method = ord( self.fileobj.read(1) )
@@ -219,6 +279,11 @@ class GzipFile(io.BufferedIOBase):
if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC
+ unused = self.fileobj.unused()
+ if unused:
+ uncompress = self.decompress.decompress(unused)
+ self._add_read_data(uncompress)
+
def write(self,data):
if self.mode != WRITE:
import errno
@@ -282,16 +347,6 @@ class GzipFile(io.BufferedIOBase):
if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
- #
- # First, check if we're at the end of the file;
- # if so, it's time to stop; no more members to read.
- pos = self.fileobj.tell() # Save current position
- self.fileobj.seek(0, 2) # Seek to end of file
- if pos == self.fileobj.tell():
- raise EOFError("Reached EOF")
- else:
- self.fileobj.seek( pos ) # Return to original position
-
self._init_read()
self._read_gzip_header()
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
@@ -305,6 +360,9 @@ class GzipFile(io.BufferedIOBase):
if buf == b"":
uncompress = self.decompress.flush()
+ # Prepend the already read bytes to the fileobj so they can be
+ # seen by _read_eof()
+ self.fileobj.prepend(self.decompress.unused_data, True)
self._read_eof()
self._add_read_data( uncompress )
raise EOFError('Reached EOF')
@@ -316,10 +374,9 @@ class GzipFile(io.BufferedIOBase):
# Ending case: we've come to the end of a member in the file,
# so seek back to the start of the unused data, finish up
# this member, and read a new gzip header.
- # (The number of bytes to seek back is the length of the unused
- # data, minus 8 because _read_eof() will rewind a further 8 bytes)
- self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
-
+ # Prepend the already read bytes to the fileobj so they can be
+ # seen by _read_eof() and _read_gzip_header()
+ self.fileobj.prepend(self.decompress.unused_data, True)
# Check the CRC and file size, and set the flag so we read
# a new member on the next call
self._read_eof()
@@ -334,12 +391,10 @@ class GzipFile(io.BufferedIOBase):
self.size = self.size + len(data)
def _read_eof(self):
- # We've read to the end of the file, so we have to rewind in order
- # to reread the 8 bytes containing the CRC and the file size.
+ # We've read to the end of the file
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
- self.fileobj.seek(-8, 1)
crc32 = read32(self.fileobj)
isize = read32(self.fileobj) # may exceed 2GB
if crc32 != self.crc:
@@ -355,7 +410,7 @@ class GzipFile(io.BufferedIOBase):
while c == b"\x00":
c = self.fileobj.read(1)
if c:
- self.fileobj.seek(-1, 1)
+ self.fileobj.prepend(c, True)
@property
def closed(self):
diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py
index a95af05..e49fe00 100644
--- a/Lib/test/test_gzip.py
+++ b/Lib/test/test_gzip.py
@@ -22,6 +22,17 @@ data2 = b"""/* zlibmodule.c -- gzip-compatible data compression */
"""
+class UnseekableIO(io.BytesIO):  # BytesIO variant for tests that refuses every positioning operation
+ def seekable(self):  # always reports that seeking is unavailable
+ return False
+
+ def tell(self):  # querying the position is refused
+ raise io.UnsupportedOperation
+
+ def seek(self, *args):  # repositioning is refused whatever the arguments
+ raise io.UnsupportedOperation
+
+
class TestGzip(unittest.TestCase):
filename = support.TESTFN
@@ -265,6 +276,16 @@ class TestGzip(unittest.TestCase):
d = f.read()
self.assertEqual(d, data1 * 50, "Incorrect data in file")
+ def test_non_seekable_file(self):  # round-trip data through GzipFile over a stream that cannot seek or tell
+ uncompressed = data1 * 50
+ buf = UnseekableIO()
+ with gzip.GzipFile(fileobj=buf, mode="wb") as f:  # compress into the unseekable buffer
+ f.write(uncompressed)
+ compressed = buf.getvalue()
+ buf = UnseekableIO(compressed)  # fresh unseekable stream holding the compressed bytes
+ with gzip.GzipFile(fileobj=buf, mode="rb") as f:
+ self.assertEqual(f.read(), uncompressed)  # decompressed output must equal the original input
+
# Testing compress/decompress shortcut functions
def test_compress(self):
diff --git a/Misc/ACKS b/Misc/ACKS
index 4f2780a..5d12c1f 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -260,6 +260,7 @@ Bill Fancher
Mark Favas
Niels Ferguson
Sebastian Fernandez
+Florian Festi
Vincent Fiack
Tomer Filiba
Jeffrey Finkelstein
diff --git a/Misc/NEWS b/Misc/NEWS
index 15b4da2..514ea6d 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -62,6 +62,9 @@ Core and Builtins
Library
-------
+- Issue #1675951: Allow GzipFile to work with unseekable file objects.
+ Patch by Florian Festi.
+
- Logging: Added QueueListener class to facilitate logging usage for
performance-critical threads.