diff options
| author | Gregory P. Smith <greg@mad-scientist.com> | 2008-05-02 07:26:52 (GMT) | 
|---|---|---|
| committer | Gregory P. Smith <greg@mad-scientist.com> | 2008-05-02 07:26:52 (GMT) | 
| commit | f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31 (patch) | |
| tree | 95a9a74e85ba4b09ef95e1c9dad2de5fba56076b /Lib/socket.py | |
| parent | b457ddaff2094a0ec02176184beb74f600178ed4 (diff) | |
| download | cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.zip cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.tar.gz cpython-f8cc64017ca0f33b1e21b1c9c9f5f30ebdc4da31.tar.bz2  | |
This should fix issue2632.  A long description of the two competing
problems is in the bug report (one old, one recently introduced trying
to fix the old one).  In short:
buffer data during socket._fileobject.read() and readline() within a
cStringIO object instead of a list of str objects returned from the
recv() calls.
This prevents excessive memory use when the size parameter passed to
recv() is grossly larger than the actual size of the data returned,
*and* prevents excessive CPU usage from looping in Python and calling
recv() with a very tiny size value, which is what happens when min()
is used, as the previous memory-use bug "fix" did.
It also documents what the socket._fileobject._rbufsize member is
actually used for.
This is a candidate for backporting to 2.5.
Diffstat (limited to 'Lib/socket.py')
| -rw-r--r-- | Lib/socket.py | 163 | 
1 file changed, 104 insertions, 59 deletions
diff --git a/Lib/socket.py b/Lib/socket.py index 2ca8ff6..f778f3b 100644 --- a/Lib/socket.py +++ b/Lib/socket.py @@ -79,6 +79,11 @@ else:  import os, sys, warnings  try: +    from cStringIO import StringIO +except ImportError: +    from StringIO import StringIO + +try:      from errno import EBADF  except ImportError:      EBADF = 9 @@ -234,6 +239,9 @@ class _fileobject(object):              bufsize = self.default_bufsize          self.bufsize = bufsize          self.softspace = False +        # _rbufsize is the suggested recv buffer size.  It is *strictly* +        # obeyed within readline() for recv calls.  If it is larger than +        # default_bufsize it will be used for recv calls within read().          if bufsize == 0:              self._rbufsize = 1          elif bufsize == 1: @@ -241,7 +249,11 @@ class _fileobject(object):          else:              self._rbufsize = bufsize          self._wbufsize = bufsize -        self._rbuf = "" # A string +        # We use StringIO for the read buffer to avoid holding a list +        # of variously sized string objects which have been known to +        # fragment the heap due to how they are malloc()ed and often +        # realloc()ed down much smaller than their original allocation. +        self._rbuf = StringIO()          self._wbuf = [] # A list of strings          self._close = close @@ -299,56 +311,86 @@ class _fileobject(object):          return buf_len      def read(self, size=-1): -        data = self._rbuf +        # Use max, disallow tiny reads in a loop as they are very inefficient. +        # We never leave read() with any leftover data in our internal buffer. +        rbufsize = max(self._rbufsize, self.default_bufsize) +        # Our use of StringIO rather than lists of string objects returned by +        # recv() minimizes memory usage and fragmentation that occurs when +        # rbufsize is large compared to the typical return value of recv(). 
+        buf = self._rbuf +        buf.seek(0, 2)  # seek end          if size < 0:              # Read until EOF -            buffers = [] -            if data: -                buffers.append(data) -            self._rbuf = "" -            if self._rbufsize <= 1: -                recv_size = self.default_bufsize -            else: -                recv_size = self._rbufsize +            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.              while True: -                data = self._sock.recv(recv_size) +                data = self._sock.recv(rbufsize)                  if not data:                      break -                buffers.append(data) -            return "".join(buffers) +                buf.write(data) +            return buf.getvalue()          else:              # Read until size bytes or EOF seen, whichever comes first -            buf_len = len(data) +            buf_len = buf.tell()              if buf_len >= size: -                self._rbuf = data[size:] -                return data[:size] -            buffers = [] -            if data: -                buffers.append(data) -            self._rbuf = "" +                # Already have size bytes in our buffer?  Extract and return. +                buf.seek(0) +                rv = buf.read(size) +                self._rbuf = StringIO() +                self._rbuf.write(buf.read()) +                return rv + +            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.              while True:                  left = size - buf_len -                recv_size = min(self._rbufsize, left) +                # Using max() here means that recv() can malloc a +                # large amount of memory even though recv may return +                # much less data than that.  But the returned data +                # string is short lived in that case as we copy it +                # into a StringIO and free it. 
+                recv_size = max(rbufsize, left)                  data = self._sock.recv(recv_size)                  if not data:                      break -                buffers.append(data)                  n = len(data) +                if n == size and not buf_len: +                    # Shortcut.  Avoid buffer data copies when: +                    # - We have no data in our buffer. +                    # AND +                    # - Our call to recv returned exactly the +                    #   number of bytes we were asked to read. +                    return data                  if n >= left: -                    self._rbuf = data[left:] -                    buffers[-1] = data[:left] +                    # avoids data copy of: buf.write(data[:left]) +                    buf.write(buffer(data, 0, left)) +                    # avoids data copy of: self._rbuf.write(data[left:]) +                    self._rbuf.write(buffer(data, left)) +                    del data  # explicit free                      break +                buf.write(data)                  buf_len += n -            return "".join(buffers) +                del data  # explicit free +                #assert buf_len == buf.tell() +            return buf.getvalue()      def readline(self, size=-1): -        data = self._rbuf +        buf = self._rbuf +        if self._rbufsize > 1: +            # if we're buffering, check if we already have it in our buffer +            buf.seek(0) +            bline = buf.readline(size) +            if bline.endswith('\n') or len(bline) == size: +                self._rbuf = StringIO() +                self._rbuf.write(buf.read()) +                return bline +            del bline +        buf.seek(0, 2)  # seek end          if size < 0:              # Read until \n or EOF, whichever comes first              if self._rbufsize <= 1:                  # Speed up unbuffered case -                assert data == "" +                assert buf.tell() == 0          
        buffers = [] +                data = None                  recv = self._sock.recv                  while data != "\n":                      data = recv(1) @@ -356,61 +398,64 @@ class _fileobject(object):                          break                      buffers.append(data)                  return "".join(buffers) -            nl = data.find('\n') -            if nl >= 0: -                nl += 1 -                self._rbuf = data[nl:] -                return data[:nl] -            buffers = [] -            if data: -                buffers.append(data) -            self._rbuf = "" + +            buf = self._rbuf +            buf.seek(0, 2)  # seek end +            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.              while True:                  data = self._sock.recv(self._rbufsize)                  if not data:                      break -                buffers.append(data)                  nl = data.find('\n')                  if nl >= 0:                      nl += 1 -                    self._rbuf = data[nl:] -                    buffers[-1] = data[:nl] +                    buf.write(buffer(data, 0, nl)) +                    self._rbuf.write(buffer(data, nl)) +                    del data                      break -            return "".join(buffers) +                buf.write(data) +            return buf.getvalue()          else:              # Read until size bytes or \n or EOF seen, whichever comes first -            nl = data.find('\n', 0, size) -            if nl >= 0: -                nl += 1 -                self._rbuf = data[nl:] -                return data[:nl] -            buf_len = len(data) +            buf_len = buf.tell()              if buf_len >= size: -                self._rbuf = data[size:] -                return data[:size] -            buffers = [] -            if data: -                buffers.append(data) -            self._rbuf = "" +                buf.seek(0) +                rv = buf.read(size) + 
               self._rbuf = StringIO() +                self._rbuf.write(buf.read()) +                return rv +            self._rbuf = StringIO()  # reset _rbuf.  we consume it via buf.              while True:                  data = self._sock.recv(self._rbufsize)                  if not data:                      break -                buffers.append(data)                  left = size - buf_len +                # did we just receive a newline?                  nl = data.find('\n', 0, left)                  if nl >= 0:                      nl += 1 -                    self._rbuf = data[nl:] -                    buffers[-1] = data[:nl] -                    break +                    # save the excess data to _rbuf +                    self._rbuf.write(buffer(data, nl)) +                    if buf_len: +                        buf.write(buffer(data, 0, nl)) +                        break +                    else: +                        # Shortcut.  Avoid data copy through buf when returning +                        # a substring of our first recv(). +                        return data[:nl]                  n = len(data) +                if n == size and not buf_len: +                    # Shortcut.  Avoid data copy through buf when +                    # returning exactly all of our first recv(). +                    return data                  if n >= left: -                    self._rbuf = data[left:] -                    buffers[-1] = data[:left] +                    buf.write(buffer(data, 0, left)) +                    self._rbuf.write(buffer(data, left))                      break +                buf.write(data)                  buf_len += n -            return "".join(buffers) +                #assert buf_len == buf.tell() +            return buf.getvalue()      def readlines(self, sizehint=0):          total = 0  | 
