summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMarc-André Lemburg <mal@egenix.com>2001-06-19 20:07:51 (GMT)
committerMarc-André Lemburg <mal@egenix.com>2001-06-19 20:07:51 (GMT)
commit92b550cdd854f27ba49fbdab4fc001e9ab7717c2 (patch)
tree8f74b0feabc1de444a06df73fce34b71b25f4461
parent8c78d3a5d1225e986805925e13508a5339271e2a (diff)
downloadcpython-92b550cdd854f27ba49fbdab4fc001e9ab7717c2.zip
cpython-92b550cdd854f27ba49fbdab4fc001e9ab7717c2.tar.gz
cpython-92b550cdd854f27ba49fbdab4fc001e9ab7717c2.tar.bz2
This patch by Martin v. Loewis changes the UTF-16 codec to write a BOM
only at the start of a stream, and to interpret a BOM as a byte order mark only at the start of a stream. Any BOM read or written after that point is treated as a ZWNBSP character. This is in sync with the Unicode specifications. Note that UTF-16 files will now *have* to start with a BOM in order to be readable by the codec.
-rw-r--r--Lib/encodings/utf_16.py36
1 files changed, 33 insertions, 3 deletions
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index c034a97..72be072 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -6,7 +6,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""
-import codecs
+import codecs, sys
### Codec APIs
@@ -18,10 +18,40 @@ class Codec(codecs.Codec):
decode = codecs.utf_16_decode
class StreamWriter(Codec,codecs.StreamWriter):
- pass
+ def __init__(self, stream, errors='strict'):
+ self.bom_written = 0
+ codecs.StreamWriter.__init__(self, stream, errors)
+
+ def write(self, data):
+ result = codecs.StreamWriter.write(self, data)
+ if not self.bom_written:
+ self.bom_written = 1
+ if sys.byteorder == 'little':
+ self.encode = codecs.utf_16_le_encode
+ else:
+ self.encode = codecs.utf_16_be_encode
+ return result
class StreamReader(Codec,codecs.StreamReader):
- pass
+ def __init__(self, stream, errors='strict'):
+ self.bom_read = 0
+ codecs.StreamReader.__init__(self, stream, errors)
+
+ def read(self, size=-1):
+ if not self.bom_read:
+ signature = self.stream.read(2)
+ if signature == codecs.BOM_BE:
+ self.decode = codecs.utf_16_be_decode
+ elif signature == codecs.BOM_LE:
+ self.decode = codecs.utf_16_le_decode
+ else:
+ raise UnicodeError,"UTF-16 stream does not start with BOM"
+ if size > 2:
+ size -= 2
+ elif size >= 0:
+ size = 0
+ self.bom_read = 1
+ return codecs.StreamReader.read(self, size)
### encodings module API