summary refs log tree commit diff stats
path: root/Modules
diff options
context:
space:
mode:
author Antoine Pitrou <solipsis@pitrou.net> 2009-05-14 18:55:55 (GMT)
committer Antoine Pitrou <solipsis@pitrou.net> 2009-05-14 18:55:55 (GMT)
commit e450185b4ad645d4f72cbd4b2139d6a987edc84d (patch)
tree d588925c1710f0404f9ac61058a79a5b33382408 /Modules
parent b565577aa722d8b39aa42da0384f776680c03c36 (diff)
download cpython-e450185b4ad645d4f72cbd4b2139d6a987edc84d.zip
cpython-e450185b4ad645d4f72cbd4b2139d6a987edc84d.tar.gz
cpython-e450185b4ad645d4f72cbd4b2139d6a987edc84d.tar.bz2
Issue #5006: Better handling of unicode byte-order marks (BOM) in the io library.
This means, for example, that opening a UTF-16 text file in append mode doesn't add a BOM at the end of the file if the file isn't empty.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_io/_iomodule.c 6
-rw-r--r-- Modules/_io/_iomodule.h 2
-rw-r--r-- Modules/_io/textio.c 126
3 files changed, 112 insertions, 22 deletions
diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c
index 1bba13a..ba653d6 100644
--- a/Modules/_io/_iomodule.c
+++ b/Modules/_io/_iomodule.c
@@ -41,6 +41,7 @@ PyObject *_PyIO_str_readline;
PyObject *_PyIO_str_reset;
PyObject *_PyIO_str_seek;
PyObject *_PyIO_str_seekable;
+PyObject *_PyIO_str_setstate;
PyObject *_PyIO_str_tell;
PyObject *_PyIO_str_truncate;
PyObject *_PyIO_str_writable;
@@ -48,6 +49,7 @@ PyObject *_PyIO_str_write;
PyObject *_PyIO_empty_str;
PyObject *_PyIO_empty_bytes;
+PyObject *_PyIO_zero;
PyDoc_STRVAR(module_doc,
@@ -734,6 +736,8 @@ PyInit__io(void)
goto fail;
if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
goto fail;
+ if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
+ goto fail;
if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
goto fail;
if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
@@ -747,6 +751,8 @@ PyInit__io(void)
goto fail;
if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
goto fail;
+ if (!(_PyIO_zero = PyLong_FromLong(0L)))
+ goto fail;
state->initialized = 1;
diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h
index a44f127..ef7248a 100644
--- a/Modules/_io/_iomodule.h
+++ b/Modules/_io/_iomodule.h
@@ -141,6 +141,7 @@ extern PyObject *_PyIO_str_readline;
extern PyObject *_PyIO_str_reset;
extern PyObject *_PyIO_str_seek;
extern PyObject *_PyIO_str_seekable;
+extern PyObject *_PyIO_str_setstate;
extern PyObject *_PyIO_str_tell;
extern PyObject *_PyIO_str_truncate;
extern PyObject *_PyIO_str_writable;
@@ -148,3 +149,4 @@ extern PyObject *_PyIO_str_write;
extern PyObject *_PyIO_empty_str;
extern PyObject *_PyIO_empty_bytes;
+extern PyObject *_PyIO_zero;
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index f201ba7..8d2a686 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -647,6 +647,8 @@ typedef struct
char telling;
/* Specialized encoding func (see below) */
encodefunc_t encodefunc;
+ /* Whether or not it's the start of the stream */
+ char encoding_start_of_stream;
/* Reads and writes are internally buffered in order to speed things up.
However, any read will first flush the write buffer if it isn't empty.
@@ -707,21 +709,50 @@ utf16le_encode(PyTextIOWrapperObject *self, PyObject *text)
static PyObject *
utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
{
- PyObject *res;
- res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
- PyUnicode_GET_SIZE(text),
- PyBytes_AS_STRING(self->errors), 0);
- if (res == NULL)
- return NULL;
- /* Next writes will skip the BOM and use native byte ordering */
+ if (!self->encoding_start_of_stream) {
+ /* Skip the BOM and use native byte ordering */
#if defined(WORDS_BIGENDIAN)
- self->encodefunc = (encodefunc_t) utf16be_encode;
+ return utf16be_encode(self, text);
#else
- self->encodefunc = (encodefunc_t) utf16le_encode;
+ return utf16le_encode(self, text);
#endif
- return res;
+ }
+ return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), 0);
}
+static PyObject *
+utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+ return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), 1);
+}
+
+static PyObject *
+utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+ return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), -1);
+}
+
+static PyObject *
+utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+ if (!self->encoding_start_of_stream) {
+ /* Skip the BOM and use native byte ordering */
+#if defined(WORDS_BIGENDIAN)
+ return utf32be_encode(self, text);
+#else
+ return utf32le_encode(self, text);
+#endif
+ }
+ return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), 0);
+}
static PyObject *
utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
@@ -749,10 +780,13 @@ typedef struct {
static encodefuncentry encodefuncs[] = {
{"ascii", (encodefunc_t) ascii_encode},
{"iso8859-1", (encodefunc_t) latin1_encode},
+ {"utf-8", (encodefunc_t) utf8_encode},
{"utf-16-be", (encodefunc_t) utf16be_encode},
{"utf-16-le", (encodefunc_t) utf16le_encode},
{"utf-16", (encodefunc_t) utf16_encode},
- {"utf-8", (encodefunc_t) utf8_encode},
+ {"utf-32-be", (encodefunc_t) utf32be_encode},
+ {"utf-32-le", (encodefunc_t) utf32le_encode},
+ {"utf-32", (encodefunc_t) utf32_encode},
{NULL, NULL}
};
@@ -978,6 +1012,33 @@ TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds)
self->seekable = self->telling = PyObject_IsTrue(res);
Py_DECREF(res);
+ self->encoding_start_of_stream = 0;
+ if (self->seekable && self->encoder) {
+ PyObject *cookieObj;
+ int cmp;
+
+ self->encoding_start_of_stream = 1;
+
+ cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL);
+ if (cookieObj == NULL)
+ goto error;
+
+ cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
+ Py_DECREF(cookieObj);
+ if (cmp < 0) {
+ goto error;
+ }
+
+ if (cmp == 0) {
+ self->encoding_start_of_stream = 0;
+ res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+ _PyIO_zero, NULL);
+ if (res == NULL)
+ goto error;
+ Py_DECREF(res);
+ }
+ }
+
self->ok = 1;
return 0;
@@ -1192,8 +1253,10 @@ TextIOWrapper_write(PyTextIOWrapperObject *self, PyObject *args)
needflush = 1;
/* XXX What if we were just reading? */
- if (self->encodefunc != NULL)
+ if (self->encodefunc != NULL) {
b = (*self->encodefunc)((PyObject *) self, text);
+ self->encoding_start_of_stream = 0;
+ }
else
b = PyObject_CallMethodObjArgs(self->encoder,
_PyIO_str_encode, text, NULL);
@@ -1847,24 +1910,38 @@ _TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self,
return 0;
}
+static int
+_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
+ CookieStruct *cookie)
+{
+ PyObject *res;
+ /* Same as _TextIOWrapper_decoder_setstate() above. */
+ if (cookie->start_pos == 0 && cookie->dec_flags == 0) {
+ res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL);
+ self->encoding_start_of_stream = 1;
+ }
+ else {
+ res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+ _PyIO_zero, NULL);
+ self->encoding_start_of_stream = 0;
+ }
+ if (res == NULL)
+ return -1;
+ Py_DECREF(res);
+ return 0;
+}
+
static PyObject *
TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
{
PyObject *cookieObj, *posobj;
CookieStruct cookie;
int whence = 0;
- static PyObject *zero = NULL;
PyObject *res;
int cmp;
CHECK_INITIALIZED(self);
- if (zero == NULL) {
- zero = PyLong_FromLong(0L);
- if (zero == NULL)
- return NULL;
- }
-
if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
return NULL;
CHECK_CLOSED(self);
@@ -1879,7 +1956,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
if (whence == 1) {
/* seek relative to current position */
- cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+ cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
if (cmp < 0)
goto fail;
@@ -1900,7 +1977,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
else if (whence == 2) {
/* seek relative to end of file */
- cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+ cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
if (cmp < 0)
goto fail;
@@ -1934,7 +2011,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
goto fail;
}
- cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT);
+ cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
if (cmp < 0)
goto fail;
@@ -2013,6 +2090,11 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
goto fail;
}
+ /* Finally, reset the encoder (merely useful for proper BOM handling) */
+ if (self->encoder) {
+ if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
+ goto fail;
+ }
return cookieObj;
fail:
Py_XDECREF(cookieObj);