summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
authorWalter Dörwald <walter@livinglogic.de>2004-09-07 20:24:22 (GMT)
committerWalter Dörwald <walter@livinglogic.de>2004-09-07 20:24:22 (GMT)
commit69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib
parenta708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
downloadcpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
Diffstat (limited to 'Lib')
-rw-r--r--Lib/codecs.py112
-rw-r--r--Lib/encodings/utf_16.py64
-rw-r--r--Lib/encodings/utf_16_be.py20
-rw-r--r--Lib/encodings/utf_16_le.py19
-rw-r--r--Lib/encodings/utf_8.py18
-rw-r--r--Lib/test/test_codecs.py118
6 files changed, 235 insertions, 116 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 92c6fef..f831dd6 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -228,12 +228,22 @@ class StreamReader(Codec):
"""
self.stream = stream
self.errors = errors
+ self.bytebuffer = ""
+ self.charbuffer = u""
- def read(self, size=-1):
+ def decode(self, input, errors='strict'):
+ raise NotImplementedError
+
+ def read(self, size=-1, chars=-1):
""" Decodes data from the stream self.stream and returns the
resulting object.
+ chars indicates the number of characters to read from the
+ stream. read() will never return more than chars
+ characters, but it might return less, if there are not enough
+ characters available.
+
size indicates the approximate maximum number of bytes to
read from the stream for decoding purposes. The decoder
can modify this setting as appropriate. The default value
@@ -248,54 +258,70 @@ class StreamReader(Codec):
on the stream, these should be read too.
"""
- # Unsliced reading:
- if size < 0:
- return self.decode(self.stream.read(), self.errors)[0]
-
- # Sliced reading:
- read = self.stream.read
- decode = self.decode
- data = read(size)
- i = 0
- while 1:
- try:
- object, decodedbytes = decode(data, self.errors)
- except ValueError, why:
- # This method is slow but should work under pretty much
- # all conditions; at most 10 tries are made
- i = i + 1
- newdata = read(1)
- if not newdata or i > 10:
- raise
- data = data + newdata
+ # read until we get the required number of characters (if available)
+ done = False
+ while True:
+ # can the request be satisfied from the character buffer?
+ if chars < 0:
+ if self.charbuffer:
+ done = True
else:
- return object
-
- def readline(self, size=None):
+ if len(self.charbuffer) >= chars:
+ done = True
+ if done:
+ if chars < 0:
+ result = self.charbuffer
+ self.charbuffer = u""
+ break
+ else:
+ result = self.charbuffer[:chars]
+ self.charbuffer = self.charbuffer[chars:]
+ break
+ # we need more data
+ if size < 0:
+ newdata = self.stream.read()
+ else:
+ newdata = self.stream.read(size)
+ data = self.bytebuffer + newdata
+ object, decodedbytes = self.decode(data, self.errors)
+ # keep undecoded bytes until the next call
+ self.bytebuffer = data[decodedbytes:]
+ # put new characters in the character buffer
+ self.charbuffer += object
+ # there was no data available
+ if not newdata:
+ done = True
+ return result
+
+ def readline(self, size=None, keepends=True):
""" Read one line from the input stream and return the
decoded data.
- Note: Unlike the .readlines() method, this method inherits
- the line breaking knowledge from the underlying stream's
- .readline() method -- there is currently no support for
- line breaking using the codec decoder due to lack of line
- buffering. Subclasses should however, if possible, try to
- implement this method using their own knowledge of line
- breaking.
-
- size, if given, is passed as size argument to the stream's
- .readline() method.
+ size, if given, is passed as size argument to the
+ read() method.
"""
if size is None:
- line = self.stream.readline()
- else:
- line = self.stream.readline(size)
- return self.decode(line, self.errors)[0]
-
-
- def readlines(self, sizehint=None):
+ size = 10
+ line = u""
+ while True:
+ data = self.read(size)
+ line += data
+ pos = line.find("\n")
+ if pos>=0:
+ self.charbuffer = line[pos+1:] + self.charbuffer
+ if keepends:
+ line = line[:pos+1]
+ else:
+ line = line[:pos]
+ return line
+ elif not data:
+ return line
+ if size<8000:
+ size *= 2
+
+ def readlines(self, sizehint=None, keepends=True):
""" Read all lines available on the input stream
and return them as list of lines.
@@ -307,8 +333,8 @@ class StreamReader(Codec):
way to finding the true end-of-line.
"""
- data = self.stream.read()
- return self.decode(data, self.errors)[0].splitlines(1)
+ data = self.read()
+ return self.splitlines(keepends)
def reset(self):
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index 8c79c79..a33581c 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -10,54 +10,40 @@ import codecs, sys
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_16_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_16_encode
- decode = codecs.utf_16_decode
+def decode(input, errors='strict'):
+ return codecs.utf_16_decode(input, errors, True)
-class StreamWriter(Codec,codecs.StreamWriter):
+class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'):
- self.bom_written = 0
+ self.bom_written = False
codecs.StreamWriter.__init__(self, stream, errors)
- def write(self, data):
- result = codecs.StreamWriter.write(self, data)
- if not self.bom_written:
- self.bom_written = 1
- if sys.byteorder == 'little':
- self.encode = codecs.utf_16_le_encode
- else:
- self.encode = codecs.utf_16_be_encode
+ def encode(self, input, errors='strict'):
+ self.bom_written = True
+ result = codecs.utf_16_encode(input, errors)
+ if sys.byteorder == 'little':
+ self.encode = codecs.utf_16_le_encode
+ else:
+ self.encode = codecs.utf_16_be_encode
return result
-class StreamReader(Codec,codecs.StreamReader):
- def __init__(self, stream, errors='strict'):
- self.bom_read = 0
- codecs.StreamReader.__init__(self, stream, errors)
-
- def read(self, size=-1):
- if not self.bom_read:
- signature = self.stream.read(2)
- if signature == codecs.BOM_BE:
- self.decode = codecs.utf_16_be_decode
- elif signature == codecs.BOM_LE:
- self.decode = codecs.utf_16_le_decode
- else:
- raise UnicodeError,"UTF-16 stream does not start with BOM"
- if size > 2:
- size -= 2
- elif size >= 0:
- size = 0
- self.bom_read = 1
- return codecs.StreamReader.read(self, size)
-
- def readline(self, size=None):
- raise NotImplementedError, '.readline() is not implemented for UTF-16'
+class StreamReader(codecs.StreamReader):
+
+ def decode(self, input, errors='strict'):
+ (object, consumed, byteorder) = \
+ codecs.utf_16_ex_decode(input, errors, 0, False)
+ if byteorder == -1:
+ self.decode = codecs.utf_16_le_decode
+ elif byteorder == 1:
+ self.decode = codecs.utf_16_be_decode
+ elif consumed>=2:
+ raise UnicodeError,"UTF-16 stream does not start with BOM"
+ return (object, consumed)
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py
index dad540b..9a51f8c 100644
--- a/Lib/encodings/utf_16_be.py
+++ b/Lib/encodings/utf_16_be.py
@@ -10,23 +10,19 @@ import codecs
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_16_be_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_16_be_encode
- decode = codecs.utf_16_be_decode
-
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
+def decode(input, errors='strict'):
+ return codecs.utf_16_be_decode(input, errors, True)
-class StreamReader(Codec,codecs.StreamReader):
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_16_be_encode
- def readline(self, size=None):
- raise NotImplementedError, '.readline() is not implemented for UTF-16-BE'
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_16_be_decode
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py
index 8120d5b..95ca830 100644
--- a/Lib/encodings/utf_16_le.py
+++ b/Lib/encodings/utf_16_le.py
@@ -10,23 +10,20 @@ import codecs
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_16_le_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_16_le_encode
- decode = codecs.utf_16_le_decode
+def decode(input, errors='strict'):
+ return codecs.utf_16_le_decode(input, errors, True)
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_16_le_encode
-class StreamReader(Codec,codecs.StreamReader):
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_16_le_decode
- def readline(self, size=None):
- raise NotImplementedError, '.readline() is not implemented for UTF-16-LE'
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py
index 89249a9..9cb0b4b 100644
--- a/Lib/encodings/utf_8.py
+++ b/Lib/encodings/utf_8.py
@@ -10,21 +10,19 @@ import codecs
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_8_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_8_encode
- decode = codecs.utf_8_decode
+def decode(input, errors='strict'):
+ return codecs.utf_8_decode(input, errors, True)
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_8_encode
-class StreamReader(Codec,codecs.StreamReader):
- pass
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_8_decode
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index c428c61..524c247 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3,7 +3,45 @@ import unittest
import codecs
import StringIO
-class UTF16Test(unittest.TestCase):
+class Queue(object):
+ """
+ queue: write bytes at one end, read bytes from the other end
+ """
+ def __init__(self):
+ self._buffer = ""
+
+ def write(self, chars):
+ self._buffer += chars
+
+ def read(self, size=-1):
+ if size<0:
+ s = self._buffer
+ self._buffer = ""
+ return s
+ else:
+ s = self._buffer[:size]
+ self._buffer = self._buffer[size:]
+ return s
+
+class PartialReadTest(unittest.TestCase):
+ def check_partial(self, encoding, input, partialresults):
+ # get a StreamReader for the encoding and feed the bytestring version
+ # of input to the reader byte by byte. Read everything available from
+ # the StreamReader and check that the results equal the appropriate
+ # entries from partialresults.
+ q = Queue()
+ r = codecs.getreader(encoding)(q)
+ result = u""
+ for (c, partialresult) in zip(input.encode(encoding), partialresults):
+ q.write(c)
+ result += r.read()
+ self.assertEqual(result, partialresult)
+ # check that there's nothing left in the buffers
+ self.assertEqual(r.read(), u"")
+ self.assertEqual(r.bytebuffer, "")
+ self.assertEqual(r.charbuffer, u"")
+
+class UTF16Test(PartialReadTest):
spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@@ -23,6 +61,81 @@ class UTF16Test(unittest.TestCase):
f = reader(s)
self.assertEquals(f.read(), u"spamspam")
+ def test_partial(self):
+ self.check_partial(
+ "utf-16",
+ u"\x00\xff\u0100\uffff",
+ [
+ u"", # first byte of BOM read
+ u"", # second byte of BOM read => byteorder known
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+class UTF16LETest(PartialReadTest):
+
+ def test_partial(self):
+ self.check_partial(
+ "utf-16-le",
+ u"\x00\xff\u0100\uffff",
+ [
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+class UTF16BETest(PartialReadTest):
+
+ def test_partial(self):
+ self.check_partial(
+ "utf-16-be",
+ u"\x00\xff\u0100\uffff",
+ [
+ u"",
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100",
+ u"\x00\xff\u0100\uffff",
+ ]
+ )
+
+class UTF8Test(PartialReadTest):
+
+ def test_partial(self):
+ self.check_partial(
+ "utf-8",
+ u"\x00\xff\u07ff\u0800\uffff",
+ [
+ u"\x00",
+ u"\x00",
+ u"\x00\xff",
+ u"\x00\xff",
+ u"\x00\xff\u07ff",
+ u"\x00\xff\u07ff",
+ u"\x00\xff\u07ff",
+ u"\x00\xff\u07ff\u0800",
+ u"\x00\xff\u07ff\u0800",
+ u"\x00\xff\u07ff\u0800",
+ u"\x00\xff\u07ff\u0800\uffff",
+ ]
+ )
+
class EscapeDecodeTest(unittest.TestCase):
def test_empty_escape_decode(self):
self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -348,6 +461,9 @@ class CodecsModuleTest(unittest.TestCase):
def test_main():
test_support.run_unittest(
UTF16Test,
+ UTF16LETest,
+ UTF16BETest,
+ UTF8Test,
EscapeDecodeTest,
RecodingTest,
PunycodeTest,