diff options
author | Gregory P. Smith <greg@mad-scientist.com> | 2008-05-02 07:26:52 (GMT) |
---|---|---|
committer | Gregory P. Smith <greg@mad-scientist.com> | 2008-05-02 07:26:52 (GMT) |
commit | f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31 (patch) | |
tree | 95a9a74e85ba4b09ef95e1c9dad2de5fba56076b /Lib/socket.py | |
parent | b457ddaff2094a0ec02176184beb74f600178ed4 (diff) | |
download | cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.zip cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.tar.gz cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.tar.bz2 |
This should fix issue2632. A long description of the two competing
problems is in the bug report (one old, one recently introduced trying
to fix the old one). In short:
buffer data during socket._fileobject.read() and readlines() within a
cStringIO object instead of a [] of str()s returned from the recv()
call.
This prevents excessive memory use due to the size parameter being
passed to recv() being grossly larger than the actual size of the data
returned *and* prevents excessive cpu usage due to looping in python
calling recv() with a very tiny size value if min() is used as the
previous memory-use bug "fix" did.
It also documents what the socket._fileobject._rbufsize member is
actually used for.
This is a candidate for back porting to 2.5.
Diffstat (limited to 'Lib/socket.py')
-rw-r--r-- | Lib/socket.py | 163 |
1 files changed, 104 insertions, 59 deletions
diff --git a/Lib/socket.py b/Lib/socket.py index 2ca8ff6..f778f3b 100644 --- a/Lib/socket.py +++ b/Lib/socket.py @@ -79,6 +79,11 @@ else: import os, sys, warnings try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +try: from errno import EBADF except ImportError: EBADF = 9 @@ -234,6 +239,9 @@ class _fileobject(object): bufsize = self.default_bufsize self.bufsize = bufsize self.softspace = False + # _rbufsize is the suggested recv buffer size. It is *strictly* + # obeyed within readline() for recv calls. If it is larger than + # default_bufsize it will be used for recv calls within read(). if bufsize == 0: self._rbufsize = 1 elif bufsize == 1: @@ -241,7 +249,11 @@ class _fileobject(object): else: self._rbufsize = bufsize self._wbufsize = bufsize - self._rbuf = "" # A string + # We use StringIO for the read buffer to avoid holding a list + # of variously sized string objects which have been known to + # fragment the heap due to how they are malloc()ed and often + # realloc()ed down much smaller than their original allocation. + self._rbuf = StringIO() self._wbuf = [] # A list of strings self._close = close @@ -299,56 +311,86 @@ class _fileobject(object): return buf_len def read(self, size=-1): - data = self._rbuf + # Use max, disallow tiny reads in a loop as they are very inefficient. + # We never leave read() with any leftover data in our internal buffer. + rbufsize = max(self._rbufsize, self.default_bufsize) + # Our use of StringIO rather than lists of string objects returned by + # recv() minimizes memory usage and fragmentation that occurs when + # rbufsize is large compared to the typical return value of recv(). + buf = self._rbuf + buf.seek(0, 2) # seek end if size < 0: # Read until EOF - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" - if self._rbufsize <= 1: - recv_size = self.default_bufsize - else: - recv_size = self._rbufsize + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: - data = self._sock.recv(recv_size) + data = self._sock.recv(rbufsize) if not data: break - buffers.append(data) - return "".join(buffers) + buf.write(data) + return buf.getvalue() else: # Read until size bytes or EOF seen, whichever comes first - buf_len = len(data) + buf_len = buf.tell() if buf_len >= size: - self._rbuf = data[size:] - return data[:size] - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" + # Already have size bytes in our buffer? Extract and return. + buf.seek(0) + rv = buf.read(size) + self._rbuf = StringIO() + self._rbuf.write(buf.read()) + return rv + + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: left = size - buf_len - recv_size = min(self._rbufsize, left) + # Using max() here means that recv() can malloc a + # large amount of memory even though recv may return + # much less data than that. But the returned data + # string is short lived in that case as we copy it + # into a StringIO and free it. + recv_size = max(rbufsize, left) data = self._sock.recv(recv_size) if not data: break - buffers.append(data) n = len(data) + if n == size and not buf_len: + # Shortcut. Avoid buffer data copies when: + # - We have no data in our buffer. + # AND + # - Our call to recv returned exactly the + # number of bytes we were asked to read. + return data if n >= left: - self._rbuf = data[left:] - buffers[-1] = data[:left] + # avoids data copy of: buf.write(data[:left]) + buf.write(buffer(data, 0, left)) + # avoids data copy of: self._rbuf.write(data[left:]) + self._rbuf.write(buffer(data, left)) + del data # explicit free break + buf.write(data) buf_len += n - return "".join(buffers) + del data # explicit free + #assert buf_len == buf.tell() + return buf.getvalue() def readline(self, size=-1): - data = self._rbuf + buf = self._rbuf + if self._rbufsize > 1: + # if we're buffering, check if we already have it in our buffer + buf.seek(0) + bline = buf.readline(size) + if bline.endswith('\n') or len(bline) == size: + self._rbuf = StringIO() + self._rbuf.write(buf.read()) + return bline + del bline + buf.seek(0, 2) # seek end if size < 0: # Read until \n or EOF, whichever comes first if self._rbufsize <= 1: # Speed up unbuffered case - assert data == "" + assert buf.tell() == 0 buffers = [] + data = None recv = self._sock.recv while data != "\n": data = recv(1) @@ -356,61 +398,64 @@ class _fileobject(object): break buffers.append(data) return "".join(buffers) - nl = data.find('\n') - if nl >= 0: - nl += 1 - self._rbuf = data[nl:] - return data[:nl] - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" + + buf = self._rbuf + buf.seek(0, 2) # seek end + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: data = self._sock.recv(self._rbufsize) if not data: break - buffers.append(data) nl = data.find('\n') if nl >= 0: nl += 1 - self._rbuf = data[nl:] - buffers[-1] = data[:nl] + buf.write(buffer(data, 0, nl)) + self._rbuf.write(buffer(data, nl)) + del data break - return "".join(buffers) + buf.write(data) + return buf.getvalue() else: # Read until size bytes or \n or EOF seen, whichever comes first - nl = data.find('\n', 0, size) - if nl >= 0: - nl += 1 - self._rbuf = data[nl:] - return data[:nl] - buf_len = len(data) + buf_len = buf.tell() if buf_len >= size: - self._rbuf = data[size:] - return data[:size] - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" + buf.seek(0) + rv = buf.read(size) + self._rbuf = StringIO() + self._rbuf.write(buf.read()) + return rv + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: data = self._sock.recv(self._rbufsize) if not data: break - buffers.append(data) left = size - buf_len + # did we just receive a newline? nl = data.find('\n', 0, left) if nl >= 0: nl += 1 - self._rbuf = data[nl:] - buffers[-1] = data[:nl] - break + # save the excess data to _rbuf + self._rbuf.write(buffer(data, nl)) + if buf_len: + buf.write(buffer(data, 0, nl)) + break + else: + # Shortcut. Avoid data copy through buf when returning + # a substring of our first recv(). + return data[:nl] n = len(data) + if n == size and not buf_len: + # Shortcut. Avoid data copy through buf when + # returning exactly all of our first recv(). + return data if n >= left: - self._rbuf = data[left:] - buffers[-1] = data[:left] + buf.write(buffer(data, 0, left)) + self._rbuf.write(buffer(data, left)) break + buf.write(data) buf_len += n - return "".join(buffers) + #assert buf_len == buf.tell() + return buf.getvalue() def readlines(self, sizehint=0): total = 0 |