summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Gregory P. Smith <greg@mad-scientist.com> 2008-05-02 07:26:52 (GMT)
committer: Gregory P. Smith <greg@mad-scientist.com> 2008-05-02 07:26:52 (GMT)
commit: f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31 (patch)
tree: 95a9a74e85ba4b09ef95e1c9dad2de5fba56076b
parent: b457ddaff2094a0ec02176184beb74f600178ed4 (diff)
download: cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.zip
cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.tar.gz
cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.tar.bz2
This should fix issue2632. A long description of the two competing
problems is in the bug report (one old, one recently introduced while trying to fix the old one). In short: buffer data during socket._fileobject.read() and readline() within a cStringIO object instead of a list of str objects returned from the recv() calls. This prevents excessive memory use caused by the size parameter passed to recv() being grossly larger than the actual size of the data returned *and* prevents excessive CPU usage caused by looping in Python, calling recv() with a very tiny size value, which is what happens if min() is used as the previous memory-use bug "fix" did. It also documents what the socket._fileobject._rbufsize member is actually used for. This is a candidate for backporting to 2.5.
-rw-r--r-- Lib/socket.py 163
1 files changed, 104 insertions, 59 deletions
diff --git a/Lib/socket.py b/Lib/socket.py
index 2ca8ff6..f778f3b 100644
--- a/Lib/socket.py
+++ b/Lib/socket.py
@@ -79,6 +79,11 @@ else:
import os, sys, warnings
try:
+ from cStringIO import StringIO
+except ImportError:
+ from StringIO import StringIO
+
+try:
from errno import EBADF
except ImportError:
EBADF = 9
@@ -234,6 +239,9 @@ class _fileobject(object):
bufsize = self.default_bufsize
self.bufsize = bufsize
self.softspace = False
+ # _rbufsize is the suggested recv buffer size. It is *strictly*
+ # obeyed within readline() for recv calls. If it is larger than
+ # default_bufsize it will be used for recv calls within read().
if bufsize == 0:
self._rbufsize = 1
elif bufsize == 1:
@@ -241,7 +249,11 @@ class _fileobject(object):
else:
self._rbufsize = bufsize
self._wbufsize = bufsize
- self._rbuf = "" # A string
+ # We use StringIO for the read buffer to avoid holding a list
+ # of variously sized string objects which have been known to
+ # fragment the heap due to how they are malloc()ed and often
+ # realloc()ed down much smaller than their original allocation.
+ self._rbuf = StringIO()
self._wbuf = [] # A list of strings
self._close = close
@@ -299,56 +311,86 @@ class _fileobject(object):
return buf_len
def read(self, size=-1):
- data = self._rbuf
+ # Use max, disallow tiny reads in a loop as they are very inefficient.
+ # We never leave read() with any leftover data in our internal buffer.
+ rbufsize = max(self._rbufsize, self.default_bufsize)
+ # Our use of StringIO rather than lists of string objects returned by
+ # recv() minimizes memory usage and fragmentation that occurs when
+ # rbufsize is large compared to the typical return value of recv().
+ buf = self._rbuf
+ buf.seek(0, 2) # seek end
if size < 0:
# Read until EOF
- buffers = []
- if data:
- buffers.append(data)
- self._rbuf = ""
- if self._rbufsize <= 1:
- recv_size = self.default_bufsize
- else:
- recv_size = self._rbufsize
+ self._rbuf = StringIO() # reset _rbuf. we consume it via buf.
while True:
- data = self._sock.recv(recv_size)
+ data = self._sock.recv(rbufsize)
if not data:
break
- buffers.append(data)
- return "".join(buffers)
+ buf.write(data)
+ return buf.getvalue()
else:
# Read until size bytes or EOF seen, whichever comes first
- buf_len = len(data)
+ buf_len = buf.tell()
if buf_len >= size:
- self._rbuf = data[size:]
- return data[:size]
- buffers = []
- if data:
- buffers.append(data)
- self._rbuf = ""
+ # Already have size bytes in our buffer? Extract and return.
+ buf.seek(0)
+ rv = buf.read(size)
+ self._rbuf = StringIO()
+ self._rbuf.write(buf.read())
+ return rv
+
+ self._rbuf = StringIO() # reset _rbuf. we consume it via buf.
while True:
left = size - buf_len
- recv_size = min(self._rbufsize, left)
+ # Using max() here means that recv() can malloc a
+ # large amount of memory even though recv may return
+ # much less data than that. But the returned data
+ # string is short lived in that case as we copy it
+ # into a StringIO and free it.
+ recv_size = max(rbufsize, left)
data = self._sock.recv(recv_size)
if not data:
break
- buffers.append(data)
n = len(data)
+ if n == size and not buf_len:
+ # Shortcut. Avoid buffer data copies when:
+ # - We have no data in our buffer.
+ # AND
+ # - Our call to recv returned exactly the
+ # number of bytes we were asked to read.
+ return data
if n >= left:
- self._rbuf = data[left:]
- buffers[-1] = data[:left]
+ # avoids data copy of: buf.write(data[:left])
+ buf.write(buffer(data, 0, left))
+ # avoids data copy of: self._rbuf.write(data[left:])
+ self._rbuf.write(buffer(data, left))
+ del data # explicit free
break
+ buf.write(data)
buf_len += n
- return "".join(buffers)
+ del data # explicit free
+ #assert buf_len == buf.tell()
+ return buf.getvalue()
def readline(self, size=-1):
- data = self._rbuf
+ buf = self._rbuf
+ if self._rbufsize > 1:
+ # if we're buffering, check if we already have it in our buffer
+ buf.seek(0)
+ bline = buf.readline(size)
+ if bline.endswith('\n') or len(bline) == size:
+ self._rbuf = StringIO()
+ self._rbuf.write(buf.read())
+ return bline
+ del bline
+ buf.seek(0, 2) # seek end
if size < 0:
# Read until \n or EOF, whichever comes first
if self._rbufsize <= 1:
# Speed up unbuffered case
- assert data == ""
+ assert buf.tell() == 0
buffers = []
+ data = None
recv = self._sock.recv
while data != "\n":
data = recv(1)
@@ -356,61 +398,64 @@ class _fileobject(object):
break
buffers.append(data)
return "".join(buffers)
- nl = data.find('\n')
- if nl >= 0:
- nl += 1
- self._rbuf = data[nl:]
- return data[:nl]
- buffers = []
- if data:
- buffers.append(data)
- self._rbuf = ""
+
+ buf = self._rbuf
+ buf.seek(0, 2) # seek end
+ self._rbuf = StringIO() # reset _rbuf. we consume it via buf.
while True:
data = self._sock.recv(self._rbufsize)
if not data:
break
- buffers.append(data)
nl = data.find('\n')
if nl >= 0:
nl += 1
- self._rbuf = data[nl:]
- buffers[-1] = data[:nl]
+ buf.write(buffer(data, 0, nl))
+ self._rbuf.write(buffer(data, nl))
+ del data
break
- return "".join(buffers)
+ buf.write(data)
+ return buf.getvalue()
else:
# Read until size bytes or \n or EOF seen, whichever comes first
- nl = data.find('\n', 0, size)
- if nl >= 0:
- nl += 1
- self._rbuf = data[nl:]
- return data[:nl]
- buf_len = len(data)
+ buf_len = buf.tell()
if buf_len >= size:
- self._rbuf = data[size:]
- return data[:size]
- buffers = []
- if data:
- buffers.append(data)
- self._rbuf = ""
+ buf.seek(0)
+ rv = buf.read(size)
+ self._rbuf = StringIO()
+ self._rbuf.write(buf.read())
+ return rv
+ self._rbuf = StringIO() # reset _rbuf. we consume it via buf.
while True:
data = self._sock.recv(self._rbufsize)
if not data:
break
- buffers.append(data)
left = size - buf_len
+ # did we just receive a newline?
nl = data.find('\n', 0, left)
if nl >= 0:
nl += 1
- self._rbuf = data[nl:]
- buffers[-1] = data[:nl]
- break
+ # save the excess data to _rbuf
+ self._rbuf.write(buffer(data, nl))
+ if buf_len:
+ buf.write(buffer(data, 0, nl))
+ break
+ else:
+ # Shortcut. Avoid data copy through buf when returning
+ # a substring of our first recv().
+ return data[:nl]
n = len(data)
+ if n == size and not buf_len:
+ # Shortcut. Avoid data copy through buf when
+ # returning exactly all of our first recv().
+ return data
if n >= left:
- self._rbuf = data[left:]
- buffers[-1] = data[:left]
+ buf.write(buffer(data, 0, left))
+ self._rbuf.write(buffer(data, left))
break
+ buf.write(data)
buf_len += n
- return "".join(buffers)
+ #assert buf_len == buf.tell()
+ return buf.getvalue()
def readlines(self, sizehint=0):
total = 0