summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Lars Gustäbel <lars@gustaebel.de> 2011-02-23 11:42:22 (GMT)
committer: Lars Gustäbel <lars@gustaebel.de> 2011-02-23 11:42:22 (GMT)
commit: dd071045e776e1c3e8cf6750a2fd1d0958bf19b3 (patch)
tree: 3afb00727522ffb897602ec1ae5d2a9ccfd3dce4
parent: 3eeee833915b96a15c60eafc317bb6822af2084c (diff)
download: cpython-dd071045e776e1c3e8cf6750a2fd1d0958bf19b3.zip
cpython-dd071045e776e1c3e8cf6750a2fd1d0958bf19b3.tar.gz
cpython-dd071045e776e1c3e8cf6750a2fd1d0958bf19b3.tar.bz2
Issue #11224: Improved sparse file read support (r85916) introduced a
regression in _FileInFile which is used in file-like objects returned by
TarFile.extractfile(). The inefficient design of the _FileInFile.read()
method causes various dramatic side-effects and errors:

- The data segment of a file member is read completely into memory
  every(!) time a small block is accessed. This is not only slow but may
  cause unexpected MemoryErrors with very large files.
- Reading members from compressed tar archives is even slower because of
  the excessive backwards seeking which is done when the same data
  segment is read over and over again.
- As a backwards seek on a TarFile opened in stream mode is not possible,
  using extractfile() fails with a StreamError.
-rw-r--r-- Lib/tarfile.py 5
-rw-r--r-- Lib/test/test_tarfile.py 16
-rw-r--r-- Misc/NEWS 4
3 files changed, 22 insertions, 3 deletions
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index e3747e9..0f9d1da 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -760,9 +760,8 @@ class _FileInFile(object):
self.map_index = 0
length = min(size, stop - self.position)
if data:
- self.fileobj.seek(offset)
- block = self.fileobj.read(stop - start)
- buf += block[self.position - start:self.position + length]
+ self.fileobj.seek(offset + (self.position - start))
+ buf += self.fileobj.read(length)
else:
buf += NUL * length
size -= length
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
index 94ef61c..68e094d 100644
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -419,6 +419,22 @@ class StreamReadTest(CommonReadTest):
mode="r|"
+ def test_read_through(self):
+ # Issue #11224: A poorly designed _FileInFile.read() method
+ # caused seeking errors with stream tar files.
+ for tarinfo in self.tar:
+ if not tarinfo.isreg():
+ continue
+ fobj = self.tar.extractfile(tarinfo)
+ while True:
+ try:
+ buf = fobj.read(512)
+ except tarfile.StreamError:
+ self.fail("simple read-through using TarFile.extractfile() failed")
+ if not buf:
+ break
+ fobj.close()
+
def test_fileobj_regular_file(self):
tarinfo = self.tar.next() # get "regtype" (can't use getmember)
fobj = self.tar.extractfile(tarinfo)
diff --git a/Misc/NEWS b/Misc/NEWS
index a3a246c..9dcd309 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -27,6 +27,10 @@ Core and Builtins
Library
-------
+- Issue #11224: Fixed a regression in tarfile that affected the file-like
+ objects returned by TarFile.extractfile() regarding performance, memory
+ consumption and failures with the stream interface.
+
- Issue #10924: Adding salt and Modular Crypt Format to crypt library.
Moved old C wrapper to _crypt, and added a Python wrapper with
enhanced salt generation and simpler API for password generation.