summary | refs | log | tree | commit | diff | stats
path: root/Lib/gzip.py
diff options
context:
space:
mode:
author: Arjun <ccldarjun@icloud.com>, 2023-05-08 17:55:59 (GMT)
committer: GitHub <noreply@github.com>, 2023-05-08 17:55:59 (GMT)
commit: 9af485436b83003b5705a6e54bdeb900c70e0c69 (patch)
tree: 69506164b3b413a22146cbf7250eaec8836d7b3d /Lib/gzip.py
parent: 405eacc1b87a42e19fd176131e70537f0539e05e (diff)
download: cpython-9af485436b83003b5705a6e54bdeb900c70e0c69.zip
cpython-9af485436b83003b5705a6e54bdeb900c70e0c69.tar.gz
cpython-9af485436b83003b5705a6e54bdeb900c70e0c69.tar.bz2
gh-89550: Buffer GzipFile.write to reduce execution time by ~15% (#101251)
Use `io.BufferedWriter` to buffer gzip writes.
---------
Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
Diffstat (limited to 'Lib/gzip.py')
-rw-r--r-- Lib/gzip.py 40
1 files changed, 35 insertions, 5 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 75c6ddc..8796c8d 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -22,6 +22,7 @@ _COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
READ_BUFFER_SIZE = 128 * 1024
+_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
@@ -120,6 +121,21 @@ class BadGzipFile(OSError):
"""Exception raised in some cases for invalid gzip files."""
+class _WriteBufferStream(io.RawIOBase):
+ """Minimal object to pass WriteBuffer flushes into GzipFile"""
+ def __init__(self, gzip_file):
+ self.gzip_file = gzip_file
+
+ def write(self, data):
+ return self.gzip_file._write_raw(data)
+
+ def seekable(self):
+ return False
+
+ def writable(self):
+ return True
+
+
class GzipFile(_compression.BaseStream):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the truncate() method.
@@ -184,6 +200,7 @@ class GzipFile(_compression.BaseStream):
if mode is None:
mode = getattr(fileobj, 'mode', 'rb')
+
if mode.startswith('r'):
self.mode = READ
raw = _GzipReader(fileobj)
@@ -206,6 +223,9 @@ class GzipFile(_compression.BaseStream):
zlib.DEF_MEM_LEVEL,
0)
self._write_mtime = mtime
+ self._buffer_size = _WRITE_BUFFER_SIZE
+ self._buffer = io.BufferedWriter(_WriteBufferStream(self),
+ buffer_size=self._buffer_size)
else:
raise ValueError("Invalid mode: {!r}".format(mode))
@@ -231,6 +251,11 @@ class GzipFile(_compression.BaseStream):
self.bufsize = 0
self.offset = 0 # Current file offset for seek(), tell(), etc
+ def tell(self):
+ self._check_not_closed()
+ self._buffer.flush()
+ return super().tell()
+
def _write_gzip_header(self, compresslevel):
self.fileobj.write(b'\037\213') # magic header
self.fileobj.write(b'\010') # compression method
@@ -272,6 +297,10 @@ class GzipFile(_compression.BaseStream):
if self.fileobj is None:
raise ValueError("write() on closed GzipFile object")
+ return self._buffer.write(data)
+
+ def _write_raw(self, data):
+ # Called by our self._buffer underlying WriteBufferStream.
if isinstance(data, (bytes, bytearray)):
length = len(data)
else:
@@ -322,9 +351,9 @@ class GzipFile(_compression.BaseStream):
fileobj = self.fileobj
if fileobj is None:
return
- self.fileobj = None
try:
if self.mode == WRITE:
+ self._buffer.flush()
fileobj.write(self.compress.flush())
write32u(fileobj, self.crc)
# self.size may exceed 2 GiB, or even 4 GiB
@@ -332,6 +361,7 @@ class GzipFile(_compression.BaseStream):
elif self.mode == READ:
self._buffer.close()
finally:
+ self.fileobj = None
myfileobj = self.myfileobj
if myfileobj:
self.myfileobj = None
@@ -341,7 +371,7 @@ class GzipFile(_compression.BaseStream):
self._check_not_closed()
if self.mode == WRITE:
# Ensure the compressor's buffer is flushed
- self.fileobj.write(self.compress.flush(zlib_mode))
+ self._buffer.flush()
self.fileobj.flush()
def fileno(self):
@@ -378,10 +408,10 @@ class GzipFile(_compression.BaseStream):
if offset < self.offset:
raise OSError('Negative seek in write mode')
count = offset - self.offset
- chunk = b'\0' * 1024
- for i in range(count // 1024):
+ chunk = b'\0' * self._buffer_size
+ for i in range(count // self._buffer_size):
self.write(chunk)
- self.write(b'\0' * (count % 1024))
+ self.write(b'\0' * (count % self._buffer_size))
elif self.mode == READ:
self._check_not_closed()
return self._buffer.seek(offset, whence)