diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2009-05-14 18:55:55 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2009-05-14 18:55:55 (GMT) |
commit | e450185b4ad645d4f72cbd4b2139d6a987edc84d (patch) | |
tree | d588925c1710f0404f9ac61058a79a5b33382408 | |
parent | b565577aa722d8b39aa42da0384f776680c03c36 (diff) | |
download | cpython-e450185b4ad645d4f72cbd4b2139d6a987edc84d.zip cpython-e450185b4ad645d4f72cbd4b2139d6a987edc84d.tar.gz cpython-e450185b4ad645d4f72cbd4b2139d6a987edc84d.tar.bz2 |
Issue #5006: Better handling of unicode byte-order marks (BOM) in the io library.
This means, for example, that opening a UTF-16 text file in
append mode doesn't add a BOM at the end of the file if the file isn't
empty.
-rw-r--r-- | Lib/_pyio.py | 20 | ||||
-rw-r--r-- | Lib/test/test_io.py | 31 | ||||
-rw-r--r-- | Misc/NEWS | 5 | ||||
-rw-r--r-- | Modules/_io/_iomodule.c | 6 | ||||
-rw-r--r-- | Modules/_io/_iomodule.h | 2 | ||||
-rw-r--r-- | Modules/_io/textio.c | 126 |
6 files changed, 168 insertions, 22 deletions
diff --git a/Lib/_pyio.py b/Lib/_pyio.py index e3e7c3d..c9a7c5e 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1436,6 +1436,15 @@ class TextIOWrapper(TextIOBase): self._snapshot = None # info for reconstructing decoder state self._seekable = self._telling = self.buffer.seekable() + if self._seekable and self.writable(): + position = self.buffer.tell() + if position != 0: + try: + self._get_encoder().setstate(0) + except LookupError: + # Sometimes the encoder doesn't exist + pass + # self._snapshot is either None, or a tuple (dec_flags, next_input) # where dec_flags is the second (integer) item of the decoder state # and next_input is the chunk of input bytes that comes next after the @@ -1741,6 +1750,17 @@ class TextIOWrapper(TextIOBase): raise IOError("can't restore logical file position") self._decoded_chars_used = chars_to_skip + # Finally, reset the encoder (merely useful for proper BOM handling) + try: + encoder = self._encoder or self._get_encoder() + except LookupError: + # Sometimes the encoder doesn't exist + pass + else: + if cookie != 0: + encoder.setstate(0) + else: + encoder.reset() return cookie def read(self, n=None): diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 1a525dc..98dc711 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -1963,6 +1963,37 @@ class TextIOWrapperTest(unittest.TestCase): self.assertEqual(buffer.seekable(), txt.seekable()) + def test_append_bom(self): + # The BOM is not written again when appending to a non-empty file + filename = support.TESTFN + for charset in ('utf-8-sig', 'utf-16', 'utf-32'): + with self.open(filename, 'w', encoding=charset) as f: + f.write('aaa') + pos = f.tell() + with self.open(filename, 'rb') as f: + self.assertEquals(f.read(), 'aaa'.encode(charset)) + + with self.open(filename, 'a', encoding=charset) as f: + f.write('xxx') + with self.open(filename, 'rb') as f: + self.assertEquals(f.read(), 'aaaxxx'.encode(charset)) + + def test_seek_bom(self): + # Same test, but when seeking 
manually + filename = support.TESTFN + for charset in ('utf-8-sig', 'utf-16', 'utf-32'): + with self.open(filename, 'w', encoding=charset) as f: + f.write('aaa') + pos = f.tell() + with self.open(filename, 'r+', encoding=charset) as f: + f.seek(pos) + f.write('zzz') + f.seek(0) + f.write('bbb') + with self.open(filename, 'rb') as f: + self.assertEquals(f.read(), 'bbbzzz'.encode(charset)) + + class CTextIOWrapperTest(TextIOWrapperTest): def test_initialization(self): @@ -23,6 +23,11 @@ Core and Builtins Library ------- +- Issue #5006: Better handling of unicode byte-order marks (BOM) in the io + library. This means, for example, that opening an UTF-16 text file in + append mode doesn't add a BOM at the end of the file if the file isn't + empty. + - Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source' file is a binary. Patch by Brodie Rao, tests by Daniel Diniz. This fix corrects a pydoc regression. diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c index 1bba13a..ba653d6 100644 --- a/Modules/_io/_iomodule.c +++ b/Modules/_io/_iomodule.c @@ -41,6 +41,7 @@ PyObject *_PyIO_str_readline; PyObject *_PyIO_str_reset; PyObject *_PyIO_str_seek; PyObject *_PyIO_str_seekable; +PyObject *_PyIO_str_setstate; PyObject *_PyIO_str_tell; PyObject *_PyIO_str_truncate; PyObject *_PyIO_str_writable; @@ -48,6 +49,7 @@ PyObject *_PyIO_str_write; PyObject *_PyIO_empty_str; PyObject *_PyIO_empty_bytes; +PyObject *_PyIO_zero; PyDoc_STRVAR(module_doc, @@ -734,6 +736,8 @@ PyInit__io(void) goto fail; if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable"))) goto fail; + if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate"))) + goto fail; if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell"))) goto fail; if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate"))) @@ -747,6 +751,8 @@ PyInit__io(void) goto fail; if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0))) goto fail; + if (!(_PyIO_zero = 
PyLong_FromLong(0L))) + goto fail; state->initialized = 1; diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h index a44f127..ef7248a 100644 --- a/Modules/_io/_iomodule.h +++ b/Modules/_io/_iomodule.h @@ -141,6 +141,7 @@ extern PyObject *_PyIO_str_readline; extern PyObject *_PyIO_str_reset; extern PyObject *_PyIO_str_seek; extern PyObject *_PyIO_str_seekable; +extern PyObject *_PyIO_str_setstate; extern PyObject *_PyIO_str_tell; extern PyObject *_PyIO_str_truncate; extern PyObject *_PyIO_str_writable; @@ -148,3 +149,4 @@ extern PyObject *_PyIO_str_write; extern PyObject *_PyIO_empty_str; extern PyObject *_PyIO_empty_bytes; +extern PyObject *_PyIO_zero; diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index f201ba7..8d2a686 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -647,6 +647,8 @@ typedef struct char telling; /* Specialized encoding func (see below) */ encodefunc_t encodefunc; + /* Whether or not it's the start of the stream */ + char encoding_start_of_stream; /* Reads and writes are internally buffered in order to speed things up. However, any read will first flush the write buffer if itsn't empty. 
@@ -707,21 +709,50 @@ utf16le_encode(PyTextIOWrapperObject *self, PyObject *text) static PyObject * utf16_encode(PyTextIOWrapperObject *self, PyObject *text) { - PyObject *res; - res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), - PyBytes_AS_STRING(self->errors), 0); - if (res == NULL) - return NULL; - /* Next writes will skip the BOM and use native byte ordering */ + if (!self->encoding_start_of_stream) { + /* Skip the BOM and use native byte ordering */ #if defined(WORDS_BIGENDIAN) - self->encodefunc = (encodefunc_t) utf16be_encode; + return utf16be_encode(self, text); #else - self->encodefunc = (encodefunc_t) utf16le_encode; + return utf16le_encode(self, text); #endif - return res; + } + return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text), + PyUnicode_GET_SIZE(text), + PyBytes_AS_STRING(self->errors), 0); } +static PyObject * +utf32be_encode(PyTextIOWrapperObject *self, PyObject *text) +{ + return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text), + PyUnicode_GET_SIZE(text), + PyBytes_AS_STRING(self->errors), 1); +} + +static PyObject * +utf32le_encode(PyTextIOWrapperObject *self, PyObject *text) +{ + return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text), + PyUnicode_GET_SIZE(text), + PyBytes_AS_STRING(self->errors), -1); +} + +static PyObject * +utf32_encode(PyTextIOWrapperObject *self, PyObject *text) +{ + if (!self->encoding_start_of_stream) { + /* Skip the BOM and use native byte ordering */ +#if defined(WORDS_BIGENDIAN) + return utf32be_encode(self, text); +#else + return utf32le_encode(self, text); +#endif + } + return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text), + PyUnicode_GET_SIZE(text), + PyBytes_AS_STRING(self->errors), 0); +} static PyObject * utf8_encode(PyTextIOWrapperObject *self, PyObject *text) @@ -749,10 +780,13 @@ typedef struct { static encodefuncentry encodefuncs[] = { {"ascii", (encodefunc_t) ascii_encode}, {"iso8859-1", (encodefunc_t) latin1_encode}, + {"utf-8", (encodefunc_t) utf8_encode}, 
{"utf-16-be", (encodefunc_t) utf16be_encode}, {"utf-16-le", (encodefunc_t) utf16le_encode}, {"utf-16", (encodefunc_t) utf16_encode}, - {"utf-8", (encodefunc_t) utf8_encode}, + {"utf-32-be", (encodefunc_t) utf32be_encode}, + {"utf-32-le", (encodefunc_t) utf32le_encode}, + {"utf-32", (encodefunc_t) utf32_encode}, {NULL, NULL} }; @@ -978,6 +1012,33 @@ TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds) self->seekable = self->telling = PyObject_IsTrue(res); Py_DECREF(res); + self->encoding_start_of_stream = 0; + if (self->seekable && self->encoder) { + PyObject *cookieObj; + int cmp; + + self->encoding_start_of_stream = 1; + + cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL); + if (cookieObj == NULL) + goto error; + + cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ); + Py_DECREF(cookieObj); + if (cmp < 0) { + goto error; + } + + if (cmp == 0) { + self->encoding_start_of_stream = 0; + res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate, + _PyIO_zero, NULL); + if (res == NULL) + goto error; + Py_DECREF(res); + } + } + self->ok = 1; return 0; @@ -1192,8 +1253,10 @@ TextIOWrapper_write(PyTextIOWrapperObject *self, PyObject *args) needflush = 1; /* XXX What if we were just reading? */ - if (self->encodefunc != NULL) + if (self->encodefunc != NULL) { b = (*self->encodefunc)((PyObject *) self, text); + self->encoding_start_of_stream = 0; + } else b = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_encode, text, NULL); @@ -1847,24 +1910,38 @@ _TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self, return 0; } +static int +_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self, + CookieStruct *cookie) +{ + PyObject *res; + /* Same as _TextIOWrapper_decoder_setstate() above. 
*/ + if (cookie->start_pos == 0 && cookie->dec_flags == 0) { + res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL); + self->encoding_start_of_stream = 1; + } + else { + res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate, + _PyIO_zero, NULL); + self->encoding_start_of_stream = 0; + } + if (res == NULL) + return -1; + Py_DECREF(res); + return 0; +} + static PyObject * TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) { PyObject *cookieObj, *posobj; CookieStruct cookie; int whence = 0; - static PyObject *zero = NULL; PyObject *res; int cmp; CHECK_INITIALIZED(self); - if (zero == NULL) { - zero = PyLong_FromLong(0L); - if (zero == NULL) - return NULL; - } - if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence)) return NULL; CHECK_CLOSED(self); @@ -1879,7 +1956,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) if (whence == 1) { /* seek relative to current position */ - cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ); + cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ); if (cmp < 0) goto fail; @@ -1900,7 +1977,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) else if (whence == 2) { /* seek relative to end of file */ - cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ); + cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ); if (cmp < 0) goto fail; @@ -1934,7 +2011,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) goto fail; } - cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT); + cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT); if (cmp < 0) goto fail; @@ -2013,6 +2090,11 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) goto fail; } + /* Finally, reset the encoder (merely useful for proper BOM handling) */ + if (self->encoder) { + if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0) + goto fail; + } return cookieObj; fail: Py_XDECREF(cookieObj); |