summary refs log tree commit diff stats
path: root/Lib
diff options
context:
space:
mode:
author Martin v. Löwis <martin@v.loewis.de> 2006-01-08 10:45:39 (GMT)
committer Martin v. Löwis <martin@v.loewis.de> 2006-01-08 10:45:39 (GMT)
commit 412ed3b8a7388da4850d2a832679fe21804a1591 (patch)
tree c11d68a17f938df03661e8d548ae7ccef6d54bd5 /Lib
parent fd9a72ad8994fd90cec26ed1ca5f8044719e24e5 (diff)
download cpython-412ed3b8a7388da4850d2a832679fe21804a1591.zip
cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.gz
cpython-412ed3b8a7388da4850d2a832679fe21804a1591.tar.bz2
Patch #1177307: UTF-8-Sig codec.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/encodings/utf_8_sig.py 57
-rw-r--r-- Lib/test/test_codecs.py 28
2 files changed, 85 insertions, 0 deletions
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
new file mode 100644
index 0000000..fa437e6
--- /dev/null
+++ b/Lib/encodings/utf_8_sig.py
@@ -0,0 +1,57 @@
+""" Python 'utf-8-sig' Codec
+This works similarly to UTF-8, with the following changes:
+
+* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
+ first three bytes.
+
+* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
+ bytes will be skipped.
+"""
+import codecs
+
+### Codec APIs
+
+def encode(input, errors='strict'):
+ return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
+
+def decode(input, errors='strict'):
+ prefix = 0
+ if input.startswith(codecs.BOM_UTF8):
+ input = input[3:]
+ prefix = 3
+ (output, consumed) = codecs.utf_8_decode(input, errors, True)
+ return (output, consumed+prefix)
+
+class StreamWriter(codecs.StreamWriter):
+ def reset(self):
+ codecs.StreamWriter.reset(self)
+ try:
+ del self.encode
+ except AttributeError:
+ pass
+
+ def encode(self, input, errors='strict'):
+ self.encode = codecs.utf_8_encode
+ return encode(input, errors)
+
+class StreamReader(codecs.StreamReader):
+ def reset(self):
+ codecs.StreamReader.reset(self)
+ try:
+ del self.decode
+ except AttributeError:
+ pass
+
+ def decode(self, input, errors='strict'):
+ if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
+ # not enough data to decide if this is a BOM
+ # => try again on the next call
+ return (u"", 0)
+ self.decode = codecs.utf_8_decode
+ return decode(input, errors)
+
+### encodings module API
+
+def getregentry():
+
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index b344f9a..ded5d19 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -367,6 +367,33 @@ class CharBufferTest(unittest.TestCase):
self.assertRaises(TypeError, codecs.charbuffer_encode)
self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
+class UTF8SigTest(ReadTest):
+ encoding = "utf-8-sig"
+
+ def test_partial(self):
+ self.check_partial(
+ u"\ufeff\x00\xff\u07ff\u0800\uffff",
+ [
+ u"",
+ u"",
+ u"", # First BOM has been read and skipped
+ u"",
+ u"",
+ u"\ufeff", # Second BOM has been read and emitted
+ u"\ufeff\x00", # "\x00" read and emitted
+ u"\ufeff\x00", # First byte of encoded u"\xff" read
+ u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
+ u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
+ u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
+ u"\ufeff\x00\xff\u07ff",
+ u"\ufeff\x00\xff\u07ff",
+ u"\ufeff\x00\xff\u07ff\u0800",
+ u"\ufeff\x00\xff\u07ff\u0800",
+ u"\ufeff\x00\xff\u07ff\u0800",
+ u"\ufeff\x00\xff\u07ff\u0800\uffff",
+ ]
+ )
+
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -1044,6 +1071,7 @@ def test_main():
UTF16LETest,
UTF16BETest,
UTF8Test,
+ UTF8SigTest,
UTF7Test,
UTF16ExTest,
ReadBufferTest,