author | Ruben Vorderman <r.h.p.vorderman@lumc.nl> | 2022-10-17 02:10:58 (GMT)
---|---|---
committer | GitHub <noreply@github.com> | 2022-10-17 02:10:58 (GMT)
commit | eae7dad40255bad42e4abce53ff8143dcbc66af5 (patch) |
tree | 7cea56066a6db7c451712f8375034c2d8b8914f4 /Lib/gzip.py |
parent | bb38b39b339191c5fc001c8fbfbc3037c13bc7bb (diff) |
gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant, set to 128 KiB. Other programs that read in 128 KiB chunks include pigz and cat, so this seems to be best practice among well-optimized programs. It is also faster than reading in 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. This is the `_bz2.BZ2Decompressor` ported to zlib. Since the `zlib.Decompress` object is better for in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library already implements, so there is no need to bother users with it. (A sketch of the resulting read loop follows this list.)
+ The `ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file; that is now handled by the `ZlibDecompressor` itself, which has an internal buffer. `_add_read_data` has been inlined, as it consisted of just two calls.
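The read loop described in the second and fourth items mirrors the public `bz2.BZ2Decompressor` API that `_ZlibDecompressor` was ported from. Below is a minimal sketch of that pattern, written against the bz2 class so it runs on any recent Python; the `decompress_stream` helper, chunk sizes, and `example.bz2` file name are illustrative, not part of the commit:

```python
import bz2

READ_BUFFER_SIZE = 128 * 1024  # mirrors the new gzip constant

def decompress_stream(fp, size=8192):
    """Yield up to `size` decompressed bytes per iteration."""
    decomp = bz2.BZ2Decompressor()
    while not decomp.eof:
        if decomp.needs_input:
            # Internal input buffer is exhausted: feed a fresh chunk.
            buf = fp.read(READ_BUFFER_SIZE)
            if not buf:
                raise EOFError("Compressed stream ended prematurely")
            data = decomp.decompress(buf, size)
        else:
            # Leftover input is buffered inside the decompressor:
            # drain it before touching the file again.
            data = decomp.decompress(b"", size)
        if data:
            yield data

with open("example.bz2", "rb") as fp:
    for block in decompress_stream(fp):
        pass  # process block
```

Because the decompressor buffers its own unconsumed input, the caller never has to push bytes back onto the file object mid-stream, which is exactly what makes the `unconsumed_tail`/`prepend` bookkeeping in the old gzip code unnecessary.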
EDIT: While I am adding improvements anyway, I figured I could add another one-liner optimization to the `python -m gzip` application. It previously read chunks of `io.DEFAULT_BUFFER_SIZE`, but has been updated to use `READ_BUFFER_SIZE` chunks.
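To sanity-check the headline ~10% figure on your own data, a rough timing harness like the following can be run on interpreters with and without this change (the file name is a placeholder; none of this is from the commit):

```python
import gzip
import time

start = time.perf_counter()
with gzip.open("large_file.gz", "rb") as f:  # placeholder file name
    while f.read(128 * 1024):
        pass
print(f"decompressed in {time.perf_counter() - start:.2f} s")
```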
Diffstat (limited to 'Lib/gzip.py')
-rw-r--r-- | Lib/gzip.py | 24
1 file changed, 12 insertions, 12 deletions
```diff
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 8edcda4..75c6ddc 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -21,6 +21,8 @@ _COMPRESS_LEVEL_FAST = 1
 _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9
 
+READ_BUFFER_SIZE = 128 * 1024
+
 
 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
          encoding=None, errors=None, newline=None):
@@ -446,7 +448,7 @@ def _read_gzip_header(fp):
 
 class _GzipReader(_compression.DecompressReader):
     def __init__(self, fp):
-        super().__init__(_PaddedFile(fp), zlib.decompressobj,
+        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                          wbits=-zlib.MAX_WBITS)
         # Set flag indicating start of a new member
         self._new_member = True
@@ -494,12 +496,13 @@ class _GzipReader(_compression.DecompressReader):
                 self._new_member = False
 
             # Read a chunk of data from the file
-            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
+            if self._decompressor.needs_input:
+                buf = self._fp.read(READ_BUFFER_SIZE)
+                uncompress = self._decompressor.decompress(buf, size)
+            else:
+                uncompress = self._decompressor.decompress(b"", size)
 
-            uncompress = self._decompressor.decompress(buf, size)
-            if self._decompressor.unconsumed_tail != b"":
-                self._fp.prepend(self._decompressor.unconsumed_tail)
-            elif self._decompressor.unused_data != b"":
+            if self._decompressor.unused_data != b"":
                 # Prepend the already read bytes to the fileobj so they can
                 # be seen by _read_eof() and _read_gzip_header()
                 self._fp.prepend(self._decompressor.unused_data)
@@ -510,14 +513,11 @@ class _GzipReader(_compression.DecompressReader):
                 raise EOFError("Compressed file ended before the "
                                "end-of-stream marker was reached")
 
-            self._add_read_data( uncompress )
+            self._crc = zlib.crc32(uncompress, self._crc)
+            self._stream_size += len(uncompress)
             self._pos += len(uncompress)
             return uncompress
 
-    def _add_read_data(self, data):
-        self._crc = zlib.crc32(data, self._crc)
-        self._stream_size = self._stream_size + len(data)
-
     def _read_eof(self):
         # We've read to the end of the file
         # We check that the computed CRC and size of the
@@ -647,7 +647,7 @@ def main():
             f = builtins.open(arg, "rb")
             g = open(arg + ".gz", "wb")
             while True:
-                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
+                chunk = f.read(READ_BUFFER_SIZE)
                 if not chunk:
                     break
                 g.write(chunk)
```
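Worth noting about the `unused_data` branch kept in `_read` above: it is what lets `_GzipReader` step across member boundaries in multi-member gzip files, with leftover bytes prepended back onto the file object and parsed as the next member's header. A small self-contained check (the `multi.gz` path is illustrative):

```python
import gzip

# Build a file containing two concatenated gzip members.
payload = gzip.compress(b"first member ") + gzip.compress(b"second member")
with open("multi.gz", "wb") as f:  # placeholder path
    f.write(payload)

# gzip.open (via _GzipReader) crosses the member boundary transparently.
with gzip.open("multi.gz", "rb") as f:
    assert f.read() == b"first member second member"
```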