diff options
author | Victor Stinner <victor.stinner@haypocalc.com> | 2010-07-28 01:58:41 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@haypocalc.com> | 2010-07-28 01:58:41 (GMT) |
commit | 8243ddb6ca5c0f78764a28f044f0c0284774d317 (patch) | |
tree | ca6b3fc1c3b76df1ebd7b0a4d575dd49470f4b56 | |
parent | 082a65ab1fd8c54df7a128d0e813061db9bfbba8 (diff) | |
download | cpython-8243ddb6ca5c0f78764a28f044f0c0284774d317.zip cpython-8243ddb6ca5c0f78764a28f044f0c0284774d317.tar.gz cpython-8243ddb6ca5c0f78764a28f044f0c0284774d317.tar.bz2 |
Issue #5006: Better handling of unicode byte-order marks (BOM) in the io
library. This means, for example, that opening an UTF-16 text file in append
mode doesn't add a BOM at the end of the file if the file isn't empty.
-rw-r--r-- | Lib/io.py | 20 |
-rw-r--r-- | Lib/test/test_io.py | 31 |
-rw-r--r-- | Misc/NEWS | 4 |
3 files changed, 55 insertions, 0 deletions
```diff
@@ -1440,6 +1440,15 @@ class TextIOWrapper(TextIOBase):
         self._snapshot = None  # info for reconstructing decoder state
         self._seekable = self._telling = self.buffer.seekable()
 
+        if self._seekable and self.writable():
+            position = self.buffer.tell()
+            if position != 0:
+                try:
+                    self._get_encoder().setstate(0)
+                except LookupError:
+                    # Sometimes the encoder doesn't exist
+                    pass
+
     # self._snapshot is either None, or a tuple (dec_flags, next_input)
     # where dec_flags is the second (integer) item of the decoder state
     # and next_input is the chunk of input bytes that comes next after the
@@ -1726,6 +1735,17 @@ class TextIOWrapper(TextIOBase):
                 raise IOError("can't restore logical file position")
             self._decoded_chars_used = chars_to_skip
 
+        # Finally, reset the encoder (merely useful for proper BOM handling)
+        try:
+            encoder = self._encoder or self._get_encoder()
+        except LookupError:
+            # Sometimes the encoder doesn't exist
+            pass
+        else:
+            if cookie != 0:
+                encoder.setstate(0)
+            else:
+                encoder.reset()
         return cookie
 
     def read(self, n=None):
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index aebe67b..5cfa472 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -799,6 +799,37 @@ class StatefulIncrementalDecoderTest(unittest.TestCase):
         self.assertEquals(d.decode(b'oiabcd'), '')
         self.assertEquals(d.decode(b'', 1), 'abcd.')
 
+    def test_append_bom(self):
+        # The BOM is not written again when appending to a non-empty file
+        filename = test_support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with io.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()
+            with io.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaa'.encode(charset))
+
+            with io.open(filename, 'a', encoding=charset) as f:
+                f.write('xxx')
+            with io.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
+
+    def test_seek_bom(self):
+        # Same test, but when seeking manually
+        filename = test_support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with io.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()
+            with io.open(filename, 'r+', encoding=charset) as f:
+                f.seek(pos)
+                f.write('zzz')
+                f.seek(0)
+                f.write('bbb')
+            with io.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
+
+
 class TextIOWrapperTest(unittest.TestCase):
 
     def setUp(self):
@@ -84,6 +84,10 @@ C-API
 Library
 -------
 
+- Issue #5006: Better handling of unicode byte-order marks (BOM) in the io
+  library. This means, for example, that opening an UTF-16 text file in append
+  mode doesn't add a BOM at the end of the file if the file isn't empty.
+
 - Issue #3704: cookielib was not properly handling URLs with a / in the
   parameters.
 
```