summaryrefslogtreecommitdiffstats
path: root/Lib/codecs.py
diff options
context:
space:
mode:
authorWalter Dörwald <walter@livinglogic.de>2004-09-07 20:24:22 (GMT)
committerWalter Dörwald <walter@livinglogic.de>2004-09-07 20:24:22 (GMT)
commit69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib/codecs.py
parenta708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
downloadcpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
Diffstat (limited to 'Lib/codecs.py')
-rw-r--r--Lib/codecs.py112
1 files changed, 69 insertions, 43 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 92c6fef..f831dd6 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -228,12 +228,22 @@ class StreamReader(Codec):
"""
self.stream = stream
self.errors = errors
+ self.bytebuffer = ""
+ self.charbuffer = u""
- def read(self, size=-1):
+ def decode(self, input, errors='strict'):
+ raise NotImplementedError
+
+ def read(self, size=-1, chars=-1):
""" Decodes data from the stream self.stream and returns the
resulting object.
+ chars indicates the number of characters to read from the
+ stream. read() will never return more than chars
+ characters, but it might return less, if there are not enough
+ characters available.
+
size indicates the approximate maximum number of bytes to
read from the stream for decoding purposes. The decoder
can modify this setting as appropriate. The default value
@@ -248,54 +258,70 @@ class StreamReader(Codec):
on the stream, these should be read too.
"""
- # Unsliced reading:
- if size < 0:
- return self.decode(self.stream.read(), self.errors)[0]
-
- # Sliced reading:
- read = self.stream.read
- decode = self.decode
- data = read(size)
- i = 0
- while 1:
- try:
- object, decodedbytes = decode(data, self.errors)
- except ValueError, why:
- # This method is slow but should work under pretty much
- # all conditions; at most 10 tries are made
- i = i + 1
- newdata = read(1)
- if not newdata or i > 10:
- raise
- data = data + newdata
+ # read until we get the required number of characters (if available)
+ done = False
+ while True:
+ # can the request can be satisfied from the character buffer?
+ if chars < 0:
+ if self.charbuffer:
+ done = True
else:
- return object
-
- def readline(self, size=None):
+ if len(self.charbuffer) >= chars:
+ done = True
+ if done:
+ if chars < 0:
+ result = self.charbuffer
+ self.charbuffer = u""
+ break
+ else:
+ result = self.charbuffer[:chars]
+ self.charbuffer = self.charbuffer[chars:]
+ break
+ # we need more data
+ if size < 0:
+ newdata = self.stream.read()
+ else:
+ newdata = self.stream.read(size)
+ data = self.bytebuffer + newdata
+ object, decodedbytes = self.decode(data, self.errors)
+ # keep undecoded bytes until the next call
+ self.bytebuffer = data[decodedbytes:]
+ # put new characters in the character buffer
+ self.charbuffer += object
+ # there was no data available
+ if not newdata:
+ done = True
+ return result
+
+ def readline(self, size=None, keepends=True):
""" Read one line from the input stream and return the
decoded data.
- Note: Unlike the .readlines() method, this method inherits
- the line breaking knowledge from the underlying stream's
- .readline() method -- there is currently no support for
- line breaking using the codec decoder due to lack of line
- buffering. Subclasses should however, if possible, try to
- implement this method using their own knowledge of line
- breaking.
-
- size, if given, is passed as size argument to the stream's
- .readline() method.
+ size, if given, is passed as size argument to the
+ read() method.
"""
if size is None:
- line = self.stream.readline()
- else:
- line = self.stream.readline(size)
- return self.decode(line, self.errors)[0]
-
-
- def readlines(self, sizehint=None):
+ size = 10
+ line = u""
+ while True:
+ data = self.read(size)
+ line += data
+ pos = line.find("\n")
+ if pos>=0:
+ self.charbuffer = line[pos+1:] + self.charbuffer
+ if keepends:
+ line = line[:pos+1]
+ else:
+ line = line[:pos]
+ return line
+ elif not data:
+ return line
+ if size<8000:
+ size *= 2
+
+ def readlines(self, sizehint=None, keepends=True):
""" Read all lines available on the input stream
and return them as list of lines.
@@ -307,8 +333,8 @@ class StreamReader(Codec):
way to finding the true end-of-line.
"""
- data = self.stream.read()
- return self.decode(data, self.errors)[0].splitlines(1)
+ data = self.read()
+ return self.splitlines(keepends)
def reset(self):