From 183744d6b9d45e77c3bd2dc30a6eb41c9f1c58f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Walter=20D=C3=B6rwald?= <walter@livinglogic.de>
Date: Mon, 19 Nov 2007 12:41:10 +0000
Subject: Fix for #1444: utf_8_sig.StreamReader was (indirectly through
 decode()) calling codecs.utf_8_decode() with final==True, which falled with
 incomplete byte sequences. Fix and test by James G. Sack.

---
 Lib/encodings/utf_8_sig.py | 16 +++++++++++-----
 Lib/test/test_codecs.py    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index 92678d2..697ba95 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -84,12 +84,18 @@ class StreamReader(codecs.StreamReader):
             pass
 
     def decode(self, input, errors='strict'):
-        if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
-            # not enough data to decide if this is a BOM
-            # => try again on the next call
-            return (u"", 0)
+        if len(input) < 3:
+            if codecs.BOM_UTF8.startswith(input):
+                # not enough data to decide if this is a BOM
+                # => try again on the next call
+                return (u"", 0)
+        elif input[:3] == codecs.BOM_UTF8:
+            self.decode = codecs.utf_8_decode
+            (output, consumed) = codecs.utf_8_decode(input[3:],errors)
+            return (output, consumed+3)
+        # (else) no BOM present
         self.decode = codecs.utf_8_decode
-        return decode(input, errors)
+        return codecs.utf_8_decode(input, errors)
 
 ### encodings module API
 
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 95dd432..bfb417c 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -565,6 +565,50 @@ class UTF8SigTest(ReadTest):
         s = u"spam"
         self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
 
+    def test_stream_bom(self):
+        unistring = u"ABC\u00A1\u2200XYZ"
+        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
+
+        reader = codecs.getreader("utf-8-sig")
+        for sizehint in [None] + range(1, 11) + \
+                        [64, 128, 256, 512, 1024]:
+            istream = reader(StringIO.StringIO(bytestring))
+            ostream = StringIO.StringIO()
+            while 1:
+                if sizehint is not None:
+                    data = istream.read(sizehint)
+                else:
+                    data = istream.read()
+
+                if not data:
+                    break
+                ostream.write(data)
+
+            got = ostream.getvalue()
+            self.assertEqual(got, unistring)
+
+    def test_stream_bare(self):
+        unistring = u"ABC\u00A1\u2200XYZ"
+        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
+
+        reader = codecs.getreader("utf-8-sig")
+        for sizehint in [None] + range(1, 11) + \
+                        [64, 128, 256, 512, 1024]:
+            istream = reader(StringIO.StringIO(bytestring))
+            ostream = StringIO.StringIO()
+            while 1:
+                if sizehint is not None:
+                    data = istream.read(sizehint)
+                else:
+                    data = istream.read()
+
+                if not data:
+                    break
+                ostream.write(data)
+
+            got = ostream.getvalue()
+            self.assertEqual(got, unistring)
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
-- 
cgit v0.12