diff options
author | Marc-André Lemburg <mal@egenix.com> | 2001-06-19 20:07:51 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2001-06-19 20:07:51 (GMT) |
commit | 92b550cdd854f27ba49fbdab4fc001e9ab7717c2 (patch) | |
tree | 8f74b0feabc1de444a06df73fce34b71b25f4461 /Lib | |
parent | 8c78d3a5d1225e986805925e13508a5339271e2a (diff) | |
download | cpython-92b550cdd854f27ba49fbdab4fc001e9ab7717c2.zip cpython-92b550cdd854f27ba49fbdab4fc001e9ab7717c2.tar.gz cpython-92b550cdd854f27ba49fbdab4fc001e9ab7717c2.tar.bz2 |
This patch by Martin v. Loewis changes the UTF-16 codec to only
write a BOM at the start of the stream and also to only read it as
BOM at the start of a stream.
Any BOM read or written after that point is treated as a ZWNBSP
(zero-width no-break space) character. This is in sync with the Unicode
specifications.
Note that UTF-16 files will now *have* to start with a BOM
in order to be readable by the codec.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/encodings/utf_16.py | 36 |
1 files changed, 33 insertions, 3 deletions
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index c034a97..72be072 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -6,7 +6,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 
 """
-import codecs
+import codecs, sys
 
 ### Codec APIs
 
@@ -18,10 +18,40 @@ class Codec(codecs.Codec):
     decode = codecs.utf_16_decode
 
 class StreamWriter(Codec,codecs.StreamWriter):
-    pass
+    def __init__(self, stream, errors='strict'):
+        self.bom_written = 0
+        codecs.StreamWriter.__init__(self, stream, errors)
+
+    def write(self, data):
+        result = codecs.StreamWriter.write(self, data)
+        if not self.bom_written:
+            self.bom_written = 1
+            if sys.byteorder == 'little':
+                self.encode = codecs.utf_16_le_encode
+            else:
+                self.encode = codecs.utf_16_be_encode
+        return result
 
 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def __init__(self, stream, errors='strict'):
+        self.bom_read = 0
+        codecs.StreamReader.__init__(self, stream, errors)
+
+    def read(self, size=-1):
+        if not self.bom_read:
+            signature = self.stream.read(2)
+            if signature == codecs.BOM_BE:
+                self.decode = codecs.utf_16_be_decode
+            elif signature == codecs.BOM_LE:
+                self.decode = codecs.utf_16_le_decode
+            else:
+                raise UnicodeError,"UTF-16 stream does not start with BOM"
+            if size > 2:
+                size -= 2
+            elif size >= 0:
+                size = 0
+            self.bom_read = 1
+        return codecs.StreamReader.read(self, size)
 
 ### encodings module API
 
|