SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument, chars, which specifies the maximum number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument, keepends; trailing "\n"s will be stripped from the lines if keepends is false. Added the C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
author: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
commit: 69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree: 088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib/codecs.py
parent: a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download: cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2
1 files changed, 69 insertions, 43 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 92c6fef..f831dd6 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -228,12 +228,22 @@ class StreamReader(Codec):
         """
         self.stream = stream
         self.errors = errors
+        self.bytebuffer = ""
+        self.charbuffer = u""
 
-    def read(self, size=-1):
+    def decode(self, input, errors='strict'):
+        raise NotImplementedError
+
+    def read(self, size=-1, chars=-1):
 
         """ Decodes data from the stream self.stream and returns the
             resulting object.
 
+            chars indicates the number of characters to read from the
+            stream. read() will never return more than chars
+            characters, but it might return less, if there are not enough
+            characters available.
+
             size indicates the approximate maximum number of bytes to
             read from the stream for decoding purposes. The decoder
             can modify this setting as appropriate. The default value
@@ -248,54 +258,70 @@ class StreamReader(Codec):
             on the stream, these should be read too.
 
         """
-        # Unsliced reading:
-        if size < 0:
-            return self.decode(self.stream.read(), self.errors)[0]
-
-        # Sliced reading:
-        read = self.stream.read
-        decode = self.decode
-        data = read(size)
-        i = 0
-        while 1:
-            try:
-                object, decodedbytes = decode(data, self.errors)
-            except ValueError, why:
-                # This method is slow but should work under pretty much
-                # all conditions; at most 10 tries are made
-                i = i + 1
-                newdata = read(1)
-                if not newdata or i > 10:
-                    raise
-                data = data + newdata
+        # read until we get the required number of characters (if available)
+        done = False
+        while True:
+            # can the request be satisfied from the character buffer?
+            if chars < 0:
+                if self.charbuffer:
+                    done = True
             else:
-                return object
-
-    def readline(self, size=None):
+                if len(self.charbuffer) >= chars:
+                    done = True
+            if done:
+                if chars < 0:
+                    result = self.charbuffer
+                    self.charbuffer = u""
+                    break
+                else:
+                    result = self.charbuffer[:chars]
+                    self.charbuffer = self.charbuffer[chars:]
+                    break
+            # we need more data
+            if size < 0:
+                newdata = self.stream.read()
+            else:
+                newdata = self.stream.read(size)
+            data = self.bytebuffer + newdata
+            object, decodedbytes = self.decode(data, self.errors)
+            # keep undecoded bytes until the next call
+            self.bytebuffer = data[decodedbytes:]
+            # put new characters in the character buffer
+            self.charbuffer += object
+            # there was no data available
+            if not newdata:
+                done = True
+        return result
+
+    def readline(self, size=None, keepends=True):
 
         """ Read one line from the input stream and return the
             decoded data.
 
-            Note: Unlike the .readlines() method, this method inherits
-            the line breaking knowledge from the underlying stream's
-            .readline() method -- there is currently no support for
-            line breaking using the codec decoder due to lack of line
-            buffering. Subclasses should however, if possible, try to
-            implement this method using their own knowledge of line
-            breaking.
-
-            size, if given, is passed as size argument to the stream's
-            .readline() method.
+            size, if given, is passed as size argument to the
+            read() method.
 
         """
         if size is None:
-            line = self.stream.readline()
-        else:
-            line = self.stream.readline(size)
-        return self.decode(line, self.errors)[0]
-
-
-    def readlines(self, sizehint=None):
+            size = 10
+        line = u""
+        while True:
+            data = self.read(size)
+            line += data
+            pos = line.find("\n")
+            if pos>=0:
+                self.charbuffer = line[pos+1:] + self.charbuffer
+                if keepends:
+                    line = line[:pos+1]
+                else:
+                    line = line[:pos]
+                return line
+            elif not data:
+                return line
+            if size<8000:
+                size *= 2
+
+    def readlines(self, sizehint=None, keepends=True):
 
         """ Read all lines available on the input stream
             and return them as list of lines.
@@ -307,8 +333,8 @@ class StreamReader(Codec):
             way to finding the true end-of-line.
 
         """
-        data = self.stream.read()
-        return self.decode(data, self.errors)[0].splitlines(1)
+        data = self.read()
+        return data.splitlines(keepends)
 
     def reset(self):
author	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 (GMT)
commit	69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree	088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib/codecs.py
parent	a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download	cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2