diff options
author | Walter Dörwald <walter@livinglogic.de> | 2007-04-21 10:31:43 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2007-04-21 10:31:43 (GMT) |
commit | 93a3603c671894c1b5479c36464c9bf52efa0aba (patch) | |
tree | eda94257f9cb0829f8316ffaab0964ac436d4858 | |
parent | 552ba11085cc56a53c51888cd86a30fa69491df3 (diff) | |
download | cpython-93a3603c671894c1b5479c36464c9bf52efa0aba.zip cpython-93a3603c671894c1b5479c36464c9bf52efa0aba.tar.gz cpython-93a3603c671894c1b5479c36464c9bf52efa0aba.tar.bz2 |
Backport r54786:
Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.
-rw-r--r-- | Lib/encodings/utf_8_sig.py | 19 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 5 | ||||
-rw-r--r-- | Misc/NEWS | 2 |
3 files changed, 19 insertions, 7 deletions
```diff
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index d751da6..92678d2 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -44,14 +44,19 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
         self.first = True
 
     def _buffer_decode(self, input, errors, final):
-        if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
+        if self.first:
             if len(input) < 3:
-                # not enough data to decide if this really is a BOM
-                # => try again on the next call
-                return (u"", 0)
-            (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
-            self.first = False
-            return (output, consumed+3)
+                if codecs.BOM_UTF8.startswith(input):
+                    # not enough data to decide if this really is a BOM
+                    # => try again on the next call
+                    return (u"", 0)
+                else:
+                    self.first = None
+            else:
+                self.first = None
+                if input[:3] == codecs.BOM_UTF8:
+                    (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
+                    return (output, consumed+3)
         return codecs.utf_8_decode(input, errors, final)
 
     def reset(self):
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 185670b..8c2a979 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -430,6 +430,11 @@ class UTF8SigTest(ReadTest):
         # SF bug #1601501: check that the codec works with a buffer
         unicode("\xef\xbb\xbf", "utf-8-sig")
 
+    def test_bom(self):
+        d = codecs.getincrementaldecoder("utf-8-sig")()
+        s = u"spam"
+        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -602,6 +602,8 @@ Tests
 - Fix bsddb test_basics.test06_Transactions to check the version number
   properly.
 
+- Fix utf-8-sig incremental decoder, which didn't recognise a BOM when the
+  first chunk fed to the decoder started with a BOM, but was longer than 3 bytes.
 
 Documentation
 -------------
```