diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2006-01-08 10:45:39 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2006-01-08 10:45:39 (GMT) |
commit | 412ed3b8a7388da4850d2a832679fe21804a1591 (patch) | |
tree | c11d68a17f938df03661e8d548ae7ccef6d54bd5 /Lib | |
parent | fd9a72ad8994fd90cec26ed1ca5f8044719e24e5 (diff) | |
download | cpython-412ed3b8a7388da4850d2a832679fe21804a1591.zip cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.gz cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.bz2 |
Patch #1177307: UTF-8-Sig codec.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/encodings/utf_8_sig.py | 57 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 28 |
2 files changed, 85 insertions, 0 deletions
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
new file mode 100644
index 0000000..fa437e6
--- /dev/null
+++ b/Lib/encodings/utf_8_sig.py
@@ -0,0 +1,57 @@
+""" Python 'utf-8-sig' Codec
+This work similar to UTF-8 with the following changes:
+
+* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
+  first three bytes.
+
+* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
+  bytes will be skipped.
+"""
+import codecs
+
+### Codec APIs
+
+def encode(input, errors='strict'):
+    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
+
+def decode(input, errors='strict'):
+    prefix = 0
+    if input.startswith(codecs.BOM_UTF8):
+        input = input[3:]
+        prefix = 3
+    (output, consumed) = codecs.utf_8_decode(input, errors, True)
+    return (output, consumed+prefix)
+
+class StreamWriter(codecs.StreamWriter):
+    def reset(self):
+        codecs.StreamWriter.reset(self)
+        try:
+            del self.encode
+        except AttributeError:
+            pass
+
+    def encode(self, input, errors='strict'):
+        self.encode = codecs.utf_8_encode
+        return encode(input, errors)
+
+class StreamReader(codecs.StreamReader):
+    def reset(self):
+        codecs.StreamReader.reset(self)
+        try:
+            del self.decode
+        except AttributeError:
+            pass
+
+    def decode(self, input, errors='strict'):
+        if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
+            # not enough data to decide if this is a BOM
+            # => try again on the next call
+            return (u"", 0)
+        self.decode = codecs.utf_8_decode
+        return decode(input, errors)
+
+### encodings module API
+
+def getregentry():
+
+    return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index b344f9a..ded5d19 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -367,6 +367,33 @@ class CharBufferTest(unittest.TestCase):
         self.assertRaises(TypeError, codecs.charbuffer_encode)
         self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
 
+class UTF8SigTest(ReadTest):
+    encoding = "utf-8-sig"
+
+    def test_partial(self):
+        self.check_partial(
+            u"\ufeff\x00\xff\u07ff\u0800\uffff",
+            [
+                u"",
+                u"",
+                u"", # First BOM has been read and skipped
+                u"",
+                u"",
+                u"\ufeff", # Second BOM has been read and emitted
+                u"\ufeff\x00", # "\x00" read and emitted
+                u"\ufeff\x00", # First byte of encoded u"\xff" read
+                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
+                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
+                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
+                u"\ufeff\x00\xff\u07ff",
+                u"\ufeff\x00\xff\u07ff",
+                u"\ufeff\x00\xff\u07ff\u0800",
+                u"\ufeff\x00\xff\u07ff\u0800",
+                u"\ufeff\x00\xff\u07ff\u0800",
+                u"\ufeff\x00\xff\u07ff\u0800\uffff",
+            ]
+        )
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -1044,6 +1071,7 @@ def test_main():
         UTF16LETest,
         UTF16BETest,
         UTF8Test,
+        UTF8SigTest,
         UTF7Test,
         UTF16ExTest,
         ReadBufferTest,