diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2010-01-27 21:18:57 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2010-01-27 21:18:57 (GMT) |
commit | a32f9a24df24e3169ce39d7871910595488465a1 (patch) | |
tree | a2d719e715f2d77cbe438ffb637e601236d3ec26 | |
parent | 176d6c40e06b768966038cff9b95a06b19938ac1 (diff) | |
download | cpython-a32f9a24df24e3169ce39d7871910595488465a1.zip cpython-a32f9a24df24e3169ce39d7871910595488465a1.tar.gz cpython-a32f9a24df24e3169ce39d7871910595488465a1.tar.bz2 |
Merged revisions 77798 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r77798 | antoine.pitrou | 2010-01-27 21:59:50 +0100 (mer., 27 janv. 2010) | 8 lines
Issue #7610: Reworked implementation of the internal
:class:`zipfile.ZipExtFile` class used to represent files stored inside
an archive. The new implementation is significantly faster and can
be wrapped in a :class:`io.BufferedReader` object for more speedups.
It also solves an issue where interleaved calls to `read()` and
`readline()` give wrong results. Patch by Nir Aides.
........
-rw-r--r-- | Lib/test/test_zipfile.py | 84 | ||||
-rw-r--r-- | Lib/zipfile.py | 342 | ||||
-rw-r--r-- | Misc/NEWS | 8 |
3 files changed, 242 insertions, 192 deletions
diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 1afd475..a925560 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -168,6 +168,45 @@ class TestsWithSourceFile(unittest.TestCase): for f in (TESTFN2, TemporaryFile(), io.BytesIO()): self.zip_random_open_test(f, zipfile.ZIP_STORED) + def test_univeral_readaheads(self): + f = io.BytesIO() + + data = b'a\r\n' * 16 * 1024 + zipfp = zipfile.ZipFile(f, 'w', zipfile.ZIP_STORED) + zipfp.writestr(TESTFN, data) + zipfp.close() + + data2 = b'' + zipfp = zipfile.ZipFile(f, 'r') + zipopen = zipfp.open(TESTFN, 'rU') + for line in zipopen: + data2 += line + zipfp.close() + + self.assertEqual(data, data2.replace(b'\n', b'\r\n')) + + def zip_readline_read_test(self, f, compression): + self.make_test_archive(f, compression) + + # Read the ZIP archive + zipfp = zipfile.ZipFile(f, "r") + zipopen = zipfp.open(TESTFN) + + data = b'' + while True: + read = zipopen.readline() + if not read: + break + data += read + + read = zipopen.read(100) + if not read: + break + data += read + + self.assertEqual(data, self.data) + zipfp.close() + def zip_readline_test(self, f, compression): self.make_test_archive(f, compression) @@ -195,6 +234,11 @@ class TestsWithSourceFile(unittest.TestCase): for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)): self.assertEqual(zipline, line + '\n') + def test_readline_read_stored(self): + # Issue #7610: calls to readline() interleaved with calls to read(). + for f in (TESTFN2, TemporaryFile(), io.BytesIO()): + self.zip_readline_read_test(f, zipfile.ZIP_STORED) + def test_readline_stored(self): for f in (TESTFN2, TemporaryFile(), io.BytesIO()): self.zip_readline_test(f, zipfile.ZIP_STORED) @@ -224,6 +268,12 @@ class TestsWithSourceFile(unittest.TestCase): self.zip_random_open_test(f, zipfile.ZIP_DEFLATED) @skipUnless(zlib, "requires zlib") + def test_readline_read_deflated(self): + # Issue #7610: calls to readline() interleaved with calls to read(). + for f in (TESTFN2, TemporaryFile(), io.BytesIO()): + self.zip_readline_read_test(f, zipfile.ZIP_DEFLATED) + + @skipUnless(zlib, "requires zlib") def test_readline_deflated(self): for f in (TESTFN2, TemporaryFile(), io.BytesIO()): self.zip_readline_test(f, zipfile.ZIP_DEFLATED) @@ -1067,6 +1117,29 @@ class UniversalNewlineTests(unittest.TestCase): zipdata = zipfp.open(fn, "rU").read() self.assertEqual(self.arcdata[sep], zipdata) + def readline_read_test(self, f, compression): + self.make_test_archive(f, compression) + + # Read the ZIP archive + zipfp = zipfile.ZipFile(f, "r") + for sep, fn in self.arcfiles.items(): + zipopen = zipfp.open(fn, "rU") + data = b'' + while True: + read = zipopen.readline() + if not read: + break + data += read + + read = zipopen.read(5) + if not read: + break + data += read + + self.assertEqual(data, self.arcdata['\n']) + + zipfp.close() + def readline_test(self, f, compression): self.make_test_archive(f, compression) @@ -1101,6 +1174,11 @@ class UniversalNewlineTests(unittest.TestCase): for f in (TESTFN2, TemporaryFile(), io.BytesIO()): self.read_test(f, zipfile.ZIP_STORED) + def test_readline_read_stored(self): + # Issue #7610: calls to readline() interleaved with calls to read(). + for f in (TESTFN2, TemporaryFile(), io.BytesIO()): + self.readline_read_test(f, zipfile.ZIP_STORED) + def test_readline_stored(self): for f in (TESTFN2, TemporaryFile(), io.BytesIO()): self.readline_test(f, zipfile.ZIP_STORED) @@ -1119,6 +1197,12 @@ class UniversalNewlineTests(unittest.TestCase): self.read_test(f, zipfile.ZIP_DEFLATED) @skipUnless(zlib, "requires zlib") + def test_readline_read_deflated(self): + # Issue #7610: calls to readline() interleaved with calls to read(). + for f in (TESTFN2, TemporaryFile(), io.BytesIO()): + self.readline_read_test(f, zipfile.ZIP_DEFLATED) + + @skipUnless(zlib, "requires zlib") def test_readline_deflated(self): for f in (TESTFN2, TemporaryFile(), io.BytesIO()): self.readline_test(f, zipfile.ZIP_DEFLATED) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index f70cf63..7ee5e2f 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -5,6 +5,8 @@ XXX references to utf-8 need further investigation. """ import struct, os, time, sys, shutil import binascii, io, stat +import io +import re try: import zlib # We may need its compression method @@ -443,205 +445,172 @@ class _ZipDecrypter: self._UpdateKeys(c) return c -class ZipExtFile: +class ZipExtFile(io.BufferedIOBase): """File-like object for reading an archive member. Is returned by ZipFile.open(). """ - def __init__(self, fileobj, zipinfo, decrypt=None): - self.fileobj = fileobj - self.decrypter = decrypt - self.bytes_read = 0 - self.rawbuffer = b'' - self.readbuffer = b'' - self.linebuffer = b'' - self.eof = False - self.univ_newlines = False - self.nlSeps = (b"\n", ) - self.lastdiscard = b'' - - self.compress_type = zipinfo.compress_type - self.compress_size = zipinfo.compress_size - - self.closed = False - self.mode = "r" - self.name = zipinfo.filename + # Max size supported by decompressor. + MAX_N = 1 << 31 - 1 - # read from compressed files in 64k blocks - self.compreadsize = 64*1024 - if self.compress_type == ZIP_DEFLATED: - self.dc = zlib.decompressobj(-15) + # Read from compressed files in 4k blocks. + MIN_READ_SIZE = 4096 - def set_univ_newlines(self, univ_newlines): - self.univ_newlines = univ_newlines + # Search for universal newlines or line chunks. + PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') - # pick line separator char(s) based on universal newlines flag - self.nlSeps = (b"\n", ) - if self.univ_newlines: - self.nlSeps = (b"\r\n", b"\r", b"\n") + def __init__(self, fileobj, mode, zipinfo, decrypter=None): + self._fileobj = fileobj + self._decrypter = decrypter - def __iter__(self): - return self + self._decompressor = zlib.decompressobj(-15) + self._unconsumed = b'' - def __next__(self): - nextline = self.readline() - if not nextline: - raise StopIteration() + self._readbuffer = b'' + self._offset = 0 - return nextline + self._universal = 'U' in mode + self.newlines = None - def close(self): - self.closed = True - - def _checkfornewline(self): - nl, nllen = -1, -1 - if self.linebuffer: - # ugly check for cases where half of an \r\n pair was - # read on the last pass, and the \r was discarded. In this - # case we just throw away the \n at the start of the buffer. - if (self.lastdiscard, self.linebuffer[:1]) == (b'\r', b'\n'): - self.linebuffer = self.linebuffer[1:] - - for sep in self.nlSeps: - nl = self.linebuffer.find(sep) - if nl >= 0: - nllen = len(sep) - return nl, nllen - - return nl, nllen - - def readline(self, size = -1): - """Read a line with approx. size. If size is negative, - read a whole line. - """ - if size < 0: - size = sys.maxsize - elif size == 0: - return b'' + self._compress_type = zipinfo.compress_type + self._compress_size = zipinfo.compress_size + self._compress_left = zipinfo.compress_size - # check for a newline already in buffer - nl, nllen = self._checkfornewline() + # Adjust read size for encrypted files since the first 12 bytes + # are for the encryption/password information. + if self._decrypter is not None: + self._compress_left -= 12 - if nl >= 0: - # the next line was already in the buffer - nl = min(nl, size) - else: - # no line break in buffer - try to read more - size -= len(self.linebuffer) - while nl < 0 and size > 0: - buf = self.read(min(size, 100)) - if not buf: - break - self.linebuffer += buf - size -= len(buf) - - # check for a newline in buffer - nl, nllen = self._checkfornewline() - - # we either ran out of bytes in the file, or - # met the specified size limit without finding a newline, - # so return current buffer - if nl < 0: - s = self.linebuffer - self.linebuffer = b'' - return s - - buf = self.linebuffer[:nl] - self.lastdiscard = self.linebuffer[nl:nl + nllen] - self.linebuffer = self.linebuffer[nl + nllen:] - - # line is always returned with \n as newline char (except possibly - # for a final incomplete line in the file, which is handled above). - return buf + b"\n" - - def readlines(self, sizehint = -1): - """Return a list with all (following) lines. The sizehint parameter - is ignored in this implementation. + self.mode = mode + self.name = zipinfo.filename + + def readline(self, limit=-1): + """Read and return a line from the stream. + + If limit is specified, at most limit bytes will be read. """ - result = [] - while True: - line = self.readline() - if not line: break - result.append(line) - return result - - def read(self, size = None): - # act like file obj and return empty string if size is 0 - if size == 0: - return b'' - - # determine read size - bytesToRead = self.compress_size - self.bytes_read - - # adjust read size for encrypted files since the first 12 bytes - # are for the encryption/password information - if self.decrypter is not None: - bytesToRead -= 12 - - if size is not None and size >= 0: - if self.compress_type == ZIP_STORED: - lr = len(self.readbuffer) - bytesToRead = min(bytesToRead, size - lr) - elif self.compress_type == ZIP_DEFLATED: - if len(self.readbuffer) > size: - # the user has requested fewer bytes than we've already - # pulled through the decompressor; don't read any more - bytesToRead = 0 - else: - # user will use up the buffer, so read some more - lr = len(self.rawbuffer) - bytesToRead = min(bytesToRead, self.compreadsize - lr) - - # avoid reading past end of file contents - if bytesToRead + self.bytes_read > self.compress_size: - bytesToRead = self.compress_size - self.bytes_read - - # try to read from file (if necessary) - if bytesToRead > 0: - data = self.fileobj.read(bytesToRead) - self.bytes_read += len(data) - try: - self.rawbuffer += data - except: - print(repr(self.fileobj), repr(self.rawbuffer), - repr(data)) - raise - - # handle contents of raw buffer - if self.rawbuffer: - newdata = self.rawbuffer - self.rawbuffer = b'' - - # decrypt new data if we were given an object to handle that - if newdata and self.decrypter is not None: - newdata = bytes(map(self.decrypter, newdata)) - - # decompress newly read data if necessary - if newdata and self.compress_type == ZIP_DEFLATED: - newdata = self.dc.decompress(newdata) - self.rawbuffer = self.dc.unconsumed_tail - if self.eof and len(self.rawbuffer) == 0: - # we're out of raw bytes (both from the file and - # the local buffer); flush just to make sure the - # decompressor is done - newdata += self.dc.flush() - # prevent decompressor from being used again - self.dc = None - - self.readbuffer += newdata - - - # return what the user asked for - if size is None or len(self.readbuffer) <= size: - data = self.readbuffer - self.readbuffer = b'' - else: - data = self.readbuffer[:size] - self.readbuffer = self.readbuffer[size:] + if not self._universal and limit < 0: + # Shortcut common case - newline found in buffer. + i = self._readbuffer.find(b'\n', self._offset) + 1 + if i > 0: + line = self._readbuffer[self._offset: i] + self._offset = i + return line + + if not self._universal: + return io.BufferedIOBase.readline(self, limit) + + line = b'' + while limit < 0 or len(line) < limit: + readahead = self.peek(2) + if readahead == b'': + return line + + # + # Search for universal newlines or line chunks. + # + # The pattern returns either a line chunk or a newline, but not + # both. Combined with peek(2), we are assured that the sequence + # '\r\n' is always retrieved completely and never split into + # separate newlines - '\r', '\n' due to coincidental readaheads. + # + match = self.PATTERN.search(readahead) + newline = match.group('newline') + if newline is not None: + if self.newlines is None: + self.newlines = [] + if newline not in self.newlines: + self.newlines.append(newline) + self._offset += len(newline) + return line + b'\n' + + chunk = match.group('chunk') + if limit >= 0: + chunk = chunk[: limit - len(line)] + + self._offset += len(chunk) + line += chunk + + return line + + def peek(self, n=1): + """Returns buffered bytes without advancing the position.""" + if n > len(self._readbuffer) - self._offset: + chunk = self.read(n) + self._offset -= len(chunk) + + # Return up to 512 bytes to reduce allocation overhead for tight loops. + return self._readbuffer[self._offset: self._offset + 512] + + def readable(self): + return True + + def read(self, n=-1): + """Read and return up to n bytes. + If the argument is omitted, None, or negative, data is read and returned until EOF is reached.. + """ + + buf = b'' + while n < 0 or n is None or n > len(buf): + data = self.read1(n) + if len(data) == 0: + return buf + + buf += data + + return buf + + def read1(self, n): + """Read up to n bytes with at most one read() system call.""" + + # Simplify algorithm (branching) by transforming negative n to large n. + if n < 0 or n is None: + n = self.MAX_N + + # Bytes available in read buffer. + len_readbuffer = len(self._readbuffer) - self._offset + + # Read from file. + if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed): + nbytes = n - len_readbuffer - len(self._unconsumed) + nbytes = max(nbytes, self.MIN_READ_SIZE) + nbytes = min(nbytes, self._compress_left) + + data = self._fileobj.read(nbytes) + self._compress_left -= len(data) + + if data and self._decrypter is not None: + data = bytes(map(self._decrypter, data)) + + if self._compress_type == ZIP_STORED: + self._readbuffer = self._readbuffer[self._offset:] + data + self._offset = 0 + else: + # Prepare deflated bytes for decompression. + self._unconsumed += data + + # Handle unconsumed data. + if len(self._unconsumed) > 0 and n > len_readbuffer: + data = self._decompressor.decompress( + self._unconsumed, + max(n - len_readbuffer, self.MIN_READ_SIZE) + ) + + self._unconsumed = self._decompressor.unconsumed_tail + if len(self._unconsumed) == 0 and self._compress_left == 0: + data += self._decompressor.flush() + + self._readbuffer = self._readbuffer[self._offset:] + data + self._offset = 0 + + # Read from buffer. + data = self._readbuffer[self._offset: self._offset + n] + self._offset += len(data) return data + class ZipFile: """ Class with methods to open, read, write, close, list zip files. @@ -925,16 +894,7 @@ class ZipFile: if h[11] != check_byte: raise RuntimeError("Bad password for file", name) - # build and return a ZipExtFile - if zd is None: - zef = ZipExtFile(zef_file, zinfo) - else: - zef = ZipExtFile(zef_file, zinfo, zd) - - # set universal newlines on ZipExtFile if necessary - if "U" in mode: - zef.set_univ_newlines(True) - return zef + return ZipExtFile(zef_file, mode, zinfo, zd) def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, @@ -225,7 +225,6 @@ C-API - Issue #1419652: Change the first argument to PyImport_AppendInittab() to ``const char *`` as the string is stored beyond the call. - - Issue #2422: When compiled with the ``--with-valgrind`` option, the pymalloc allocator will be automatically disabled when running under Valgrind. This gives improved memory leak detection when running @@ -234,6 +233,13 @@ C-API Library ------- +- Issue #7610: Reworked implementation of the internal + :class:`zipfile.ZipExtFile` class used to represent files stored inside + an archive. The new implementation is significantly faster and can + be wrapped in a :class:`io.BufferedReader` object for more speedups. + It also solves an issue where interleaved calls to `read()` and + `readline()` give wrong results. Patch by Nir Aides. + - Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool, allowing for a maximum number of tasks within the pool to be completed by the worker before that worker is terminated, and a new one created to |