Patch #1177307: UTF-8-Sig codec.

author: Martin v. Löwis <martin@v.loewis.de> 2006-01-08 10:45:39 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2006-01-08 10:45:39 (GMT)
commit: 412ed3b8a7388da4850d2a832679fe21804a1591 (patch)
tree: c11d68a17f938df03661e8d548ae7ccef6d54bd5 /Lib/encodings
parent: fd9a72ad8994fd90cec26ed1ca5f8044719e24e5 (diff)
download: cpython-412ed3b8a7388da4850d2a832679fe21804a1591.zip
cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.gz
cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.bz2
1 files changed, 57 insertions, 0 deletions
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
new file mode 100644
index 0000000..fa437e6
--- /dev/null
+++ b/Lib/encodings/utf_8_sig.py
@@ -0,0 +1,57 @@
+""" Python 'utf-8-sig' Codec
+This works similarly to UTF-8, with the following changes:
+
+* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
+  first three bytes.
+
+* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
+  bytes will be skipped.
+"""
+import codecs
+
+### Codec APIs
+
+def encode(input, errors='strict'):
+    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
+
+def decode(input, errors='strict'):
+    prefix = 0
+    if input.startswith(codecs.BOM_UTF8):
+        input = input[3:]
+        prefix = 3
+    (output, consumed) = codecs.utf_8_decode(input, errors, True)
+    return (output, consumed+prefix)
+
+# Stream writer that emits the UTF-8 BOM only with the first chunk it
+# encodes (and again with the first chunk after a reset()).
+class StreamWriter(codecs.StreamWriter):
+    def reset(self):
+        codecs.StreamWriter.reset(self)
+        # Remove the instance-level override installed by encode() below so
+        # that the class-level encode() (which writes the BOM) is used again.
+        try:
+            del self.encode
+        except AttributeError:
+            # encode() has not been called since the last reset; nothing
+            # to remove.
+            pass
+
+    def encode(self, input, errors='strict'):
+        # First call: shadow this method on the instance with the plain
+        # UTF-8 encoder, so every later call encodes without a BOM ...
+        self.encode = codecs.utf_8_encode
+        # ... then encode this first chunk with the BOM prepended, using
+        # the module-level encode().
+        return encode(input, errors)
+
+class StreamReader(codecs.StreamReader):
+    def reset(self):
+        codecs.StreamReader.reset(self)
+        try:
+            del self.decode
+        except AttributeError:
+            pass
+
+    def decode(self, input, errors='strict'):
+        if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
+            # not enough data to decide if this is a BOM
+            # => try again on the next call
+            return (u"", 0)
+        self.decode = codecs.utf_8_decode
+        return decode(input, errors)
+
+### encodings module API
+
+def getregentry():
+
+    return (encode,decode,StreamReader,StreamWriter)
author	Martin v. Löwis <martin@v.loewis.de>	2006-01-08 10:45:39 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2006-01-08 10:45:39 (GMT)
commit	412ed3b8a7388da4850d2a832679fe21804a1591 (patch)
tree	c11d68a17f938df03661e8d548ae7ccef6d54bd5 /Lib/encodings
parent	fd9a72ad8994fd90cec26ed1ca5f8044719e24e5 (diff)
download	cpython-412ed3b8a7388da4850d2a832679fe21804a1591.zip cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.gz cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.bz2