diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2006-01-08 10:45:39 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2006-01-08 10:45:39 (GMT) |
commit | 412ed3b8a7388da4850d2a832679fe21804a1591 (patch) | |
tree | c11d68a17f938df03661e8d548ae7ccef6d54bd5 /Lib/encodings | |
parent | fd9a72ad8994fd90cec26ed1ca5f8044719e24e5 (diff) | |
download | cpython-412ed3b8a7388da4850d2a832679fe21804a1591.zip cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.gz cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.bz2 |
Patch #1177307: UTF-8-Sig codec.
Diffstat (limited to 'Lib/encodings')
-rw-r--r-- | Lib/encodings/utf_8_sig.py | 57 |
1 files changed, 57 insertions, 0 deletions
# Lib/encodings/utf_8_sig.py
""" Python 'utf-8-sig' Codec

This works similarly to UTF-8, with the following changes:

* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Encode *input* as UTF-8 and prepend the UTF-8 encoded BOM.

    Returns a tuple (output, length consumed); the consumed length is
    len(input), i.e. the BOM is not counted.
    """
    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))

def decode(input, errors='strict'):
    """Decode the complete UTF-8 string *input*, skipping a leading BOM.

    This is the stateless API: *input* is treated as final, so an
    incomplete trailing sequence is handled according to *errors*.

    Returns a tuple (output, length consumed); a skipped BOM is included
    in the consumed length.
    """
    prefix = 0
    if input.startswith(codecs.BOM_UTF8):
        prefix = len(codecs.BOM_UTF8)
        input = input[prefix:]
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed+prefix)

class StreamWriter(codecs.StreamWriter):
    """Stream writer that emits the BOM in front of the first chunk only."""

    def reset(self):
        """Reset the writer so that the next chunk gets a BOM again."""
        codecs.StreamWriter.reset(self)
        # Remove the instance attribute installed by encode() so that the
        # BOM-writing method defined on the class becomes visible again.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM; later chunks are plain UTF-8."""
        # Shadow this method on the instance: every following call goes
        # straight to the plain UTF-8 encoder.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)

class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the very start of the stream."""

    def reset(self):
        """Reset the reader so that a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        # Remove the instance attribute installed by decode() so that the
        # BOM-detecting method defined on the class becomes visible again.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode the first chunk(s) of the stream, dropping a leading BOM.

        Returns a tuple (output, length consumed).  The data is decoded
        incrementally: a multi-byte sequence that is cut off at the end of
        *input* is left unconsumed instead of being reported as an error,
        because the rest of it may arrive with the next chunk.
        """
        bom = codecs.BOM_UTF8
        if len(input) < len(bom) and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)
        if input[:len(bom)] == bom:
            skipped = len(bom)
        else:
            skipped = 0
        # The BOM question is settled: every following call goes straight
        # to the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        # Do not use the module level decode() here: it decodes with
        # final=True, which is wrong for a chunk in the middle of a stream.
        (output, consumed) = codecs.utf_8_decode(input[skipped:], errors)
        return (output, consumed+skipped)

### encodings module API

def getregentry():
    """Return the (encode, decode, StreamReader, StreamWriter) entry."""
    return (encode,decode,StreamReader,StreamWriter)