From 183744d6b9d45e77c3bd2dc30a6eb41c9f1c58f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Mon, 19 Nov 2007 12:41:10 +0000 Subject: Fix for #1444: utf_8_sig.StreamReader was (indirectly through decode()) calling codecs.utf_8_decode() with final==True, which failed with incomplete byte sequences. Fix and test by James G. Sack. --- Lib/encodings/utf_8_sig.py | 16 +++++++++++----- Lib/test/test_codecs.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py index 92678d2..697ba95 100644 --- a/Lib/encodings/utf_8_sig.py +++ b/Lib/encodings/utf_8_sig.py @@ -84,12 +84,18 @@ class StreamReader(codecs.StreamReader): pass def decode(self, input, errors='strict'): - if len(input) < 3 and codecs.BOM_UTF8.startswith(input): - # not enough data to decide if this is a BOM - # => try again on the next call - return (u"", 0) + if len(input) < 3: + if codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this is a BOM + # => try again on the next call + return (u"", 0) + elif input[:3] == codecs.BOM_UTF8: + self.decode = codecs.utf_8_decode + (output, consumed) = codecs.utf_8_decode(input[3:],errors) + return (output, consumed+3) + # (else) no BOM present self.decode = codecs.utf_8_decode - return decode(input, errors) + return codecs.utf_8_decode(input, errors) ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 95dd432..bfb417c 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -565,6 +565,50 @@ class UTF8SigTest(ReadTest): s = u"spam" self.assertEqual(d.decode(s.encode("utf-8-sig")), s) + def test_stream_bom(self): + unistring = u"ABC\u00A1\u2200XYZ" + bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ" + + reader = codecs.getreader("utf-8-sig") + for sizehint in [None] + range(1, 11) + \ + [64, 128, 256, 512, 1024]: + istream = 
reader(StringIO.StringIO(bytestring)) + ostream = StringIO.StringIO() + while 1: + if sizehint is not None: + data = istream.read(sizehint) + else: + data = istream.read() + + if not data: + break + ostream.write(data) + + got = ostream.getvalue() + self.assertEqual(got, unistring) + + def test_stream_bare(self): + unistring = u"ABC\u00A1\u2200XYZ" + bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ" + + reader = codecs.getreader("utf-8-sig") + for sizehint in [None] + range(1, 11) + \ + [64, 128, 256, 512, 1024]: + istream = reader(StringIO.StringIO(bytestring)) + ostream = StringIO.StringIO() + while 1: + if sizehint is not None: + data = istream.read(sizehint) + else: + data = istream.read() + + if not data: + break + ostream.write(data) + + got = ostream.getvalue() + self.assertEqual(got, unistring) + class EscapeDecodeTest(unittest.TestCase): def test_empty(self): self.assertEquals(codecs.escape_decode(""), ("", 0)) -- cgit v0.12