diff options
-rw-r--r-- | Doc/library/gzip.rst | 3 | ||||
-rw-r--r-- | Lib/gzip.py | 91 | ||||
-rw-r--r-- | Lib/test/test_gzip.py | 21 | ||||
-rw-r--r-- | Misc/ACKS | 1 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
5 files changed, 101 insertions, 18 deletions
diff --git a/Doc/library/gzip.rst b/Doc/library/gzip.rst index edd5587..934fcb3 100644 --- a/Doc/library/gzip.rst +++ b/Doc/library/gzip.rst @@ -74,6 +74,9 @@ The module defines the following items: .. versionchanged:: 3.2 Support for zero-padded files was added. + .. versionchanged:: 3.2 + Support for unseekable files was added. + .. function:: open(filename, mode='rb', compresslevel=9) diff --git a/Lib/gzip.py b/Lib/gzip.py index 83311cc..3edc839 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) +class _PaddedFile: + """Minimal read-only file object that prepends a string to the contents + of an actual file. Shouldn't be used outside of gzip.py, as it lacks + essential functionality.""" + + def __init__(self, f, prepend=b''): + self._buffer = prepend + self._length = len(prepend) + self.file = f + self._read = 0 + + def read(self, size): + if self._read is None: + return self.file.read(size) + if self._read + size <= self._length: + read = self._read + self._read += size + return self._buffer[read:self._read] + else: + read = self._read + self._read = None + return self._buffer[read:] + \ + self.file.read(size-self._length+read) + + def prepend(self, prepend=b'', readprevious=False): + if self._read is None: + self._buffer = prepend + elif readprevious and len(prepend) <= self._read: + self._read -= len(prepend) + return + else: + self._buffer = self._buffer[self._read:] + prepend + self._length = len(self._buffer) + self._read = 0 + + def unused(self): + if self._read is None: + return b'' + return self._buffer[self._read:] + + def seek(self, offset, whence=0): + # This is only ever called with offset=whence=0 + if whence == 1 and self._read is not None: + if 0 <= offset + self._read <= self._length: + self._read += offset + return + else: + offset += self._length - self._read + self._read = None + self._buffer = None + return self.file.seek(offset, whence) 
+ def __getattr__(self, name): + return getattr(self.file, name) + + class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. @@ -119,6 +175,7 @@ class GzipFile(io.BufferedIOBase): self.name = filename # Starts small, scales exponentially self.min_readsize = 100 + fileobj = _PaddedFile(fileobj) elif mode[0:1] == 'w' or mode[0:1] == 'a': self.mode = WRITE @@ -188,6 +245,9 @@ class GzipFile(io.BufferedIOBase): def _read_gzip_header(self): magic = self.fileobj.read(2) + if magic == b'': + raise EOFError("Reached EOF") + if magic != b'\037\213': raise IOError('Not a gzipped file') method = ord( self.fileobj.read(1) ) @@ -219,6 +279,11 @@ class GzipFile(io.BufferedIOBase): if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC + unused = self.fileobj.unused() + if unused: + uncompress = self.decompress.decompress(unused) + self._add_read_data(uncompress) + def write(self,data): if self.mode != WRITE: import errno @@ -282,16 +347,6 @@ class GzipFile(io.BufferedIOBase): if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. 
- pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError("Reached EOF") - else: - self.fileobj.seek( pos ) # Return to original position - self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) @@ -305,6 +360,9 @@ class GzipFile(io.BufferedIOBase): if buf == b"": uncompress = self.decompress.flush() + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() + self.fileobj.prepend(self.decompress.unused_data, True) self._read_eof() self._add_read_data( uncompress ) raise EOFError('Reached EOF') @@ -316,10 +374,9 @@ class GzipFile(io.BufferedIOBase): # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. - # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() and _read_gzip_header() + self.fileobj.prepend(self.decompress.unused_data, True) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -334,12 +391,10 @@ class GzipFile(io.BufferedIOBase): self.size = self.size + len(data) def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. + # We've read to the end of the file # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. 
- self.fileobj.seek(-8, 1) crc32 = read32(self.fileobj) isize = read32(self.fileobj) # may exceed 2GB if crc32 != self.crc: @@ -355,7 +410,7 @@ class GzipFile(io.BufferedIOBase): while c == b"\x00": c = self.fileobj.read(1) if c: - self.fileobj.seek(-1, 1) + self.fileobj.prepend(c, True) @property def closed(self): diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py index a95af05..e49fe00 100644 --- a/Lib/test/test_gzip.py +++ b/Lib/test/test_gzip.py @@ -22,6 +22,17 @@ data2 = b"""/* zlibmodule.c -- gzip-compatible data compression */ """ +class UnseekableIO(io.BytesIO): + def seekable(self): + return False + + def tell(self): + raise io.UnsupportedOperation + + def seek(self, *args): + raise io.UnsupportedOperation + + class TestGzip(unittest.TestCase): filename = support.TESTFN @@ -265,6 +276,16 @@ class TestGzip(unittest.TestCase): d = f.read() self.assertEqual(d, data1 * 50, "Incorrect data in file") + def test_non_seekable_file(self): + uncompressed = data1 * 50 + buf = UnseekableIO() + with gzip.GzipFile(fileobj=buf, mode="wb") as f: + f.write(uncompressed) + compressed = buf.getvalue() + buf = UnseekableIO(compressed) + with gzip.GzipFile(fileobj=buf, mode="rb") as f: + self.assertEqual(f.read(), uncompressed) + # Testing compress/decompress shortcut functions def test_compress(self): @@ -260,6 +260,7 @@ Bill Fancher Mark Favas Niels Ferguson Sebastian Fernandez +Florian Festi Vincent Fiack Tomer Filiba Jeffrey Finkelstein @@ -62,6 +62,9 @@ Core and Builtins Library ------- +- Issue #1675951: Allow GzipFile to work with unseekable file objects. + Patch by Florian Festi. + - Logging: Added QueueListener class to facilitate logging usage for performance-critical threads. |