summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Walter Dörwald <walter@livinglogic.de> 2007-04-12 10:35:00 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2007-04-12 10:35:00 (GMT)
commit: 4234827e99eb73adbd387e15619cd6290f80f655 (patch)
tree: b4ab22283f7054598fc18a0f8c2168145c33a0a8
parent: 9aba6d6905bcf3f3009f71cfa70525eb8b0ad3dc (diff)
download: cpython-4234827e99eb73adbd387e15619cd6290f80f655.zip
cpython-4234827e99eb73adbd387e15619cd6290f80f655.tar.gz
cpython-4234827e99eb73adbd387e15619cd6290f80f655.tar.bz2
Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.
-rw-r--r-- Lib/encodings/utf_8_sig.py 19
-rw-r--r-- Lib/test/test_codecs.py 5
-rw-r--r-- Misc/NEWS 2
3 files changed, 19 insertions, 7 deletions
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index d751da6..92678d2 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -44,14 +44,19 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
self.first = True
def _buffer_decode(self, input, errors, final):
- if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
+ if self.first:
if len(input) < 3:
- # not enough data to decide if this really is a BOM
- # => try again on the next call
- return (u"", 0)
- (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
- self.first = False
- return (output, consumed+3)
+ if codecs.BOM_UTF8.startswith(input):
+ # not enough data to decide if this really is a BOM
+ # => try again on the next call
+ return (u"", 0)
+ else:
+ self.first = None
+ else:
+ self.first = None
+ if input[:3] == codecs.BOM_UTF8:
+ (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
+ return (output, consumed+3)
return codecs.utf_8_decode(input, errors, final)
def reset(self):
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 3c800f8..0389623 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -429,6 +429,11 @@ class UTF8SigTest(ReadTest):
# SF bug #1601501: check that the codec works with a buffer
unicode("\xef\xbb\xbf", "utf-8-sig")
+ def test_bom(self):
+ d = codecs.getincrementaldecoder("utf-8-sig")()
+ s = u"spam"
+ self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
self.assertEquals(codecs.escape_decode(""), ("", 0))
diff --git a/Misc/NEWS b/Misc/NEWS
index 4370030..db0c8dc 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -591,6 +591,8 @@ Library
- idle: Honor the "Cancel" action in the save dialog (Debian bug #299092).
+- Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
+ first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.
Extension Modules
-----------------