GzipFile.readline performance improvement (~30-40%), patch #1281707

author: Bob Ippolito <bob@redivi.com> 2006-05-22 14:31:24 (GMT)
committer: Bob Ippolito <bob@redivi.com> 2006-05-22 14:31:24 (GMT)
commit: d72aab5e31f831edb2b8e837e2ab387f2db07aee (patch)
tree: 60dad6606806bc6b6a001f29972acb51b41f2bed /Lib/gzip.py
parent: 31a4262d3dab7eea8ad0ab0a54ea386c924b5c94 (diff)
download: cpython-d72aab5e31f831edb2b8e837e2ab387f2db07aee.zip
cpython-d72aab5e31f831edb2b8e837e2ab387f2db07aee.tar.gz
cpython-d72aab5e31f831edb2b8e837e2ab387f2db07aee.tar.bz2
1 files changed, 28 insertions, 20 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 3c1ebf2..8c7870e 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -107,6 +107,7 @@ class GzipFile:
             self.extrabuf = ""
             self.extrasize = 0
             self.filename = filename
+            self.min_readsize = 64       # Starts small, scales exponentially
 
         elif mode[0:1] == 'w' or mode[0:1] == 'a':
             self.mode = WRITE
@@ -381,32 +382,39 @@ class GzipFile:
             self.read(count % 1024)
 
     def readline(self, size=-1):
-        if size < 0: size = sys.maxint
-        bufs = []
-        readsize = min(100, size)    # Read from the file in small chunks
+        if size < 0:
+            size = sys.maxint              # Line can be as long as maxint
+            readsize = self.min_readsize   # Read from file in small chunks
+        else:
+            readsize = size                # Only read in as much as specified
+
+        bufs = ""
+
         while True:
-            if size == 0:
-                return "".join(bufs) # Return resulting line
+            if size == 0: return bufs      # Return line (reached max len)
 
             c = self.read(readsize)
             i = c.find('\n')
-            if size is not None:
-                # We set i=size to break out of the loop under two
-                # conditions: 1) there's no newline, and the chunk is
-                # larger than size, or 2) there is a newline, but the
-                # resulting line would be longer than 'size'.
-                if i==-1 and len(c) > size: i=size-1
-                elif size <= i: i = size -1
 
+            # If there is a newline, or the string is empty
             if i >= 0 or c == '':
-                bufs.append(c[:i+1])    # Add portion of last chunk
-                self._unread(c[i+1:])   # Push back rest of chunk
-                return ''.join(bufs)    # Return resulting line
-
-            # Append chunk to list, decrease 'size',
-            bufs.append(c)
-            size = size - len(c)
-            readsize = min(size, readsize * 2)
+                if size <= i: i = size - 1 # Another larger than size check
+
+                self._unread(c[i+1:])      # Push back rest of chunk
+
+                return bufs + c[:i+1]      # Stored line, plus new segment
+
+            # If there is no newline
+            else:
+                if len(c) > size: i = size - 1   # If lineis larger than size
+
+                bufs = bufs + c
+                size = size - len(c)
+                readsize = min(size, int(readsize * 1.1))
+
+                # Optimize future readline() calls
+                if readsize > self.min_readsize:
+                    self.min_readsize = readsize
 
     def readlines(self, sizehint=0):
         # Negative numbers result in reading all the lines
author	Bob Ippolito <bob@redivi.com>	2006-05-22 14:31:24 (GMT)
committer	Bob Ippolito <bob@redivi.com>	2006-05-22 14:31:24 (GMT)
commit	d72aab5e31f831edb2b8e837e2ab387f2db07aee (patch)
tree	60dad6606806bc6b6a001f29972acb51b41f2bed /Lib/gzip.py
parent	31a4262d3dab7eea8ad0ab0a54ea386c924b5c94 (diff)
download	cpython-d72aab5e31f831edb2b8e837e2ab387f2db07aee.zip cpython-d72aab5e31f831edb2b8e837e2ab387f2db07aee.tar.gz cpython-d72aab5e31f831edb2b8e837e2ab387f2db07aee.tar.bz2