summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorINADA Naoki <methane@users.noreply.github.com>2018-07-06 05:06:00 (GMT)
committerGitHub <noreply@github.com>2018-07-06 05:06:00 (GMT)
commit8d130913cb9359c01de412178f9942419e921170 (patch)
treef144f1eead7e16809b69813e05b008fd7a8598c1
parentf12028809b0e37ee003d06e7fce8dc6a6f447a94 (diff)
downloadcpython-8d130913cb9359c01de412178f9942419e921170.zip
cpython-8d130913cb9359c01de412178f9942419e921170.tar.gz
cpython-8d130913cb9359c01de412178f9942419e921170.tar.bz2
bpo-34043: Optimize tarfile uncompress performance (GH-8089)
tarfile._Stream has two buffers, one for compressed and one for uncompressed data. These buffers are not aligned, so unnecessary bytes slicing happens for every chunk that is read. This commit bypasses the buffer for compressed data. In this benchmark [1], user time drops from 300ms to 250ms. [1]: https://bugs.python.org/msg320763
-rwxr-xr-xLib/tarfile.py30
-rw-r--r--Misc/NEWS.d/next/Library/2018-07-04-21-14-35.bpo-34043.0YJNq9.rst1
2 files changed, 13 insertions, 18 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 59f044c..ba3e95f 100755
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -513,21 +513,10 @@ class _Stream:
raise StreamError("seeking backwards is not allowed")
return self.pos
- def read(self, size=None):
- """Return the next size number of bytes from the stream.
- If size is not defined, return all bytes of the stream
- up to EOF.
- """
- if size is None:
- t = []
- while True:
- buf = self._read(self.bufsize)
- if not buf:
- break
- t.append(buf)
- buf = b"".join(t)
- else:
- buf = self._read(size)
+ def read(self, size):
+ """Return the next size number of bytes from the stream."""
+ assert size is not None
+ buf = self._read(size)
self.pos += len(buf)
return buf
@@ -540,9 +529,14 @@ class _Stream:
c = len(self.dbuf)
t = [self.dbuf]
while c < size:
- buf = self.__read(self.bufsize)
- if not buf:
- break
+ # Skip underlying buffer to avoid unaligned double buffering.
+ if self.buf:
+ buf = self.buf
+ self.buf = b""
+ else:
+ buf = self.fileobj.read(self.bufsize)
+ if not buf:
+ break
try:
buf = self.cmp.decompress(buf)
except self.exception:
diff --git a/Misc/NEWS.d/next/Library/2018-07-04-21-14-35.bpo-34043.0YJNq9.rst b/Misc/NEWS.d/next/Library/2018-07-04-21-14-35.bpo-34043.0YJNq9.rst
new file mode 100644
index 0000000..c035ba7
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-07-04-21-14-35.bpo-34043.0YJNq9.rst
@@ -0,0 +1 @@
+Optimize tarfile uncompress performance about 15% when gzip is used.