summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2018-07-04 08:43:42 (GMT)
committer GitHub <noreply@github.com> 2018-07-04 08:43:42 (GMT)
commit d7a0ad7dd7bd7dfbdbf6be2c89fde5a71813628a (patch)
tree 6a7e1a7251cfe7b9e536fdf19891f8d0174d1752
parent de6a2dec9c2b1280d70a29396d4e141bd1614655 (diff)
download cpython-d7a0ad7dd7bd7dfbdbf6be2c89fde5a71813628a.zip
cpython-d7a0ad7dd7bd7dfbdbf6be2c89fde5a71813628a.tar.gz
cpython-d7a0ad7dd7bd7dfbdbf6be2c89fde5a71813628a.tar.bz2
bpo-34010: Fix tarfile read performance regression (GH-8020)
During buffered read, use a list followed by join instead of extending a bytes object. This is how it was done before, but it was changed in commit b506dc32c1a.

(cherry picked from commit 12a08c47601cadea8e7d3808502cdbcca87b2ce2)

Co-authored-by: hajoscher <hajoscher@gmail.com>
-rwxr-xr-x Lib/tarfile.py 20
-rw-r--r-- Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst 2
2 files changed, 13 insertions, 9 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 395b846..62d2215 100755
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -534,7 +534,7 @@ class _Stream:
if not buf:
break
t.append(buf)
- buf = "".join(t)
+ buf = b"".join(t)
else:
buf = self._read(size)
self.pos += len(buf)
@@ -547,6 +547,7 @@ class _Stream:
return self.__read(size)
c = len(self.dbuf)
+ t = [self.dbuf]
while c < size:
buf = self.__read(self.bufsize)
if not buf:
@@ -555,26 +556,27 @@ class _Stream:
buf = self.cmp.decompress(buf)
except self.exception:
raise ReadError("invalid compressed data")
- self.dbuf += buf
+ t.append(buf)
c += len(buf)
- buf = self.dbuf[:size]
- self.dbuf = self.dbuf[size:]
- return buf
+ t = b"".join(t)
+ self.dbuf = t[size:]
+ return t[:size]
def __read(self, size):
"""Return size bytes from stream. If internal buffer is empty,
read another block from the stream.
"""
c = len(self.buf)
+ t = [self.buf]
while c < size:
buf = self.fileobj.read(self.bufsize)
if not buf:
break
- self.buf += buf
+ t.append(buf)
c += len(buf)
- buf = self.buf[:size]
- self.buf = self.buf[size:]
- return buf
+ t = b"".join(t)
+ self.buf = t[size:]
+ return t[:size]
# class _Stream
class _StreamProxy(object):
diff --git a/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
new file mode 100644
index 0000000..4cb7892
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
@@ -0,0 +1,2 @@
+Fixed a performance regression for reading streams with tarfile. The
+buffered read should use a list, instead of appending to a bytes object.