1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
""" Python 'utf-8-sig' Codec
This work similar to UTF-8 with the following changes:
* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
first three bytes.
* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
bytes will be skipped.
"""
import codecs
### Codec APIs
def encode(input, errors='strict'):
return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
def decode(input, errors='strict'):
prefix = 0
if input.startswith(codecs.BOM_UTF8):
input = input[3:]
prefix = 3
(output, consumed) = codecs.utf_8_decode(input, errors, True)
return (output, consumed+prefix)
class StreamWriter(codecs.StreamWriter):
def reset(self):
codecs.StreamWriter.reset(self)
try:
del self.encode
except AttributeError:
pass
def encode(self, input, errors='strict'):
self.encode = codecs.utf_8_encode
return encode(input, errors)
class StreamReader(codecs.StreamReader):
def reset(self):
codecs.StreamReader.reset(self)
try:
del self.decode
except AttributeError:
pass
def decode(self, input, errors='strict'):
if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
# not enough data to decide if this is a BOM
# => try again on the next call
return (u"", 0)
self.decode = codecs.utf_8_decode
return decode(input, errors)
### encodings module API
def getregentry():
return (encode,decode,StreamReader,StreamWriter)
|