summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Antoine Pitrou <solipsis@pitrou.net> 2010-01-03 22:37:40 (GMT)
committer Antoine Pitrou <solipsis@pitrou.net> 2010-01-03 22:37:40 (GMT)
commit b1f8835b213411d059d0e2ba4b78125328afeee6 (patch)
tree 42cf60ff63d11840481e1ee373fa213c950a99cc
parent a81d881e136021a84656620b3bf11dfa8b0556ec (diff)
download cpython-b1f8835b213411d059d0e2ba4b78125328afeee6.zip
cpython-b1f8835b213411d059d0e2ba4b78125328afeee6.tar.gz
cpython-b1f8835b213411d059d0e2ba4b78125328afeee6.tar.bz2
Merged revisions 77288 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77288 | antoine.pitrou | 2010-01-03 23:29:56 +0100 (dim., 03 janv. 2010) | 5 lines

  Issue #7471: Improve the performance of GzipFile's buffering mechanism,
  and make it implement the `io.BufferedIOBase` ABC to allow for further
  speedups by wrapping it in an `io.BufferedReader`. Patch by Nir Aides.
........
-rw-r--r-- Lib/gzip.py 99
-rw-r--r-- Lib/test/test_gzip.py 11
-rw-r--r-- Misc/NEWS 6
3 files changed, 58 insertions, 58 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py
index f9a59d7..66fc88d 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -8,6 +8,7 @@ but random access is not allowed."""
import struct, sys, time, os
import zlib
import builtins
+import io
__all__ = ["GzipFile","open"]
@@ -44,7 +45,7 @@ def open(filename, mode="rb", compresslevel=9):
"""
return GzipFile(filename, mode, compresslevel)
-class GzipFile:
+class GzipFile(io.BufferedIOBase):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods.
@@ -109,8 +110,12 @@ class GzipFile:
self.mode = READ
# Set flag indicating start of a new member
self._new_member = True
+ # Buffer data read from gzip file. extrastart is offset in
+ # stream where buffer starts. extrasize is number of
+ # bytes remaining in buffer from current stream position.
self.extrabuf = b""
self.extrasize = 0
+ self.extrastart = 0
self.name = filename
# Starts small, scales exponentially
self.min_readsize = 100
@@ -214,7 +219,6 @@ class GzipFile:
if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC
-
def write(self,data):
if self.mode != WRITE:
import errno
@@ -222,12 +226,19 @@ class GzipFile:
if self.fileobj is None:
raise ValueError("write() on closed GzipFile object")
+
+ # Convert data type if called by io.BufferedWriter.
+ if isinstance(data, memoryview):
+ data = data.tobytes()
+
if len(data) > 0:
self.size = self.size + len(data)
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
self.fileobj.write( self.compress.compress(data) )
self.offset += len(data)
+ return len(data)
+
def read(self, size=-1):
if self.mode != READ:
import errno
@@ -253,15 +264,14 @@ class GzipFile:
if size > self.extrasize:
size = self.extrasize
- chunk = self.extrabuf[:size]
- self.extrabuf = self.extrabuf[size:]
+ offset = self.offset - self.extrastart
+ chunk = self.extrabuf[offset: offset + size]
self.extrasize = self.extrasize - size
self.offset += size
return chunk
def _unread(self, buf):
- self.extrabuf = buf + self.extrabuf
self.extrasize = len(buf) + self.extrasize
self.offset -= len(buf)
@@ -317,8 +327,10 @@ class GzipFile:
def _add_read_data(self, data):
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
- self.extrabuf = self.extrabuf + data
+ offset = self.offset - self.extrastart
+ self.extrabuf = self.extrabuf[offset:] + data
self.extrasize = self.extrasize + len(data)
+ self.extrastart = self.offset
self.size = self.size + len(data)
def _read_eof(self):
@@ -336,6 +348,10 @@ class GzipFile:
elif isize != (self.size & 0xffffffff):
raise IOError("Incorrect length of data produced")
+ @property
+ def closed(self):
+ return self.fileobj is None
+
def close(self):
if self.fileobj is None:
return
@@ -351,15 +367,6 @@ class GzipFile:
self.myfileobj.close()
self.myfileobj = None
- def __del__(self):
- try:
- if (self.myfileobj is None and
- self.fileobj is None):
- return
- except AttributeError:
- return
- self.close()
-
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
if self.mode == WRITE:
# Ensure the compressor's buffer is flushed
@@ -374,12 +381,6 @@ class GzipFile:
"""
return self.fileobj.fileno()
- def isatty(self):
- return False
-
- def tell(self):
- return self.offset
-
def rewind(self):
'''Return the uncompressed stream file position indicator to the
beginning of the file'''
@@ -389,8 +390,18 @@ class GzipFile:
self._new_member = True
self.extrabuf = b""
self.extrasize = 0
+ self.extrastart = 0
self.offset = 0
+ def readable(self):
+ return self.mode == READ
+
+ def writable(self):
+ return self.mode == WRITE
+
+ def seekable(self):
+ return True
+
def seek(self, offset, whence=0):
if whence:
if whence == 1:
@@ -414,8 +425,18 @@ class GzipFile:
self.read(1024)
self.read(count % 1024)
+ return self.offset
+
def readline(self, size=-1):
if size < 0:
+ # Shortcut common case - newline found in buffer.
+ offset = self.offset - self.extrastart
+ i = self.extrabuf.find(b'\n', offset) + 1
+ if i > 0:
+ self.extrasize -= i - offset
+ self.offset += i - offset
+ return self.extrabuf[offset: i]
+
size = sys.maxsize
readsize = self.min_readsize
else:
@@ -445,42 +466,6 @@ class GzipFile:
self.min_readsize = min(readsize, self.min_readsize * 2, 512)
return b''.join(bufs) # Return resulting line
- def readlines(self, sizehint=0):
- # Negative numbers result in reading all the lines
- if sizehint <= 0:
- sizehint = sys.maxsize
- L = []
- while sizehint > 0:
- line = self.readline()
- if line == b"":
- break
- L.append(line)
- sizehint = sizehint - len(line)
-
- return L
-
- def writelines(self, L):
- for line in L:
- self.write(line)
-
- def __iter__(self):
- return self
-
- def __next__(self):
- line = self.readline()
- if line:
- return line
- else:
- raise StopIteration
-
- def __enter__(self):
- if self.fileobj is None:
- raise ValueError("I/O operation on closed GzipFile object")
- return self
-
- def __exit__(self, *args):
- self.close()
-
def _test():
# Act like gzip; with -d, act like gunzip.
diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py
index fa91dc0..320adfd 100644
--- a/Lib/test/test_gzip.py
+++ b/Lib/test/test_gzip.py
@@ -5,6 +5,7 @@
import unittest
from test import support
import os
+import io
import struct
gzip = support.import_module('gzip')
@@ -80,6 +81,16 @@ class TestGzip(unittest.TestCase):
zgfile.close()
self.assertEquals(contents, b'a'*201)
+ def test_buffered_reader(self):
+ # Issue #7471: a GzipFile can be wrapped in a BufferedReader for
+ # performance.
+ self.test_write()
+
+ f = gzip.GzipFile(self.filename, 'rb')
+ with io.BufferedReader(f) as r:
+ lines = [line for line in r]
+
+ self.assertEqual(lines, 50 * data1.splitlines(True))
def test_readline(self):
self.test_write()
diff --git a/Misc/NEWS b/Misc/NEWS
index 451a2a0..815e392 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -191,7 +191,11 @@ C-API
Library
-------
-_ Issue #3972: http.client.HTTPConnection now accepts an optional source_address
+- Issue #7471: Improve the performance of GzipFile's buffering mechanism,
+ and make it implement the `io.BufferedIOBase` ABC to allow for further
+ speedups by wrapping it in an `io.BufferedReader`. Patch by Nir Aides.
+
+- Issue #3972: http.client.HTTPConnection now accepts an optional source_address
parameter to allow specifying where your connections come from.
- socket.create_connection now accepts an optional source_address parameter.