From b454e8e4df73bc73bc1a6f597431f171bfae8abd Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 9 Oct 2021 19:17:43 +0300 Subject: bpo-27580: Add support of null characters in the csv module. (GH-28808) --- Lib/test/test_csv.py | 44 +++++++++++++-- .../2021-10-07-21-11-48.bpo-27580.tGcBTH.rst | 1 + Modules/_csv.c | 63 +++++++++++----------- 3 files changed, 70 insertions(+), 38 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-10-07-21-11-48.bpo-27580.tGcBTH.rst diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 6e5dfc6..fb27ea3 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -217,6 +217,17 @@ class Test_Csv(unittest.TestCase): self._write_test(['C\\', '6', '7', 'X"'], 'C\\\\,6,7,"X"""', escapechar='\\', quoting=csv.QUOTE_MINIMAL) + def test_write_lineterminator(self): + for lineterminator in '\r\n', '\n', '\r', '!@#', '\0': + with self.subTest(lineterminator=lineterminator): + with StringIO() as sio: + writer = csv.writer(sio, lineterminator=lineterminator) + writer.writerow(['a', 'b']) + writer.writerow([1, 2]) + self.assertEqual(sio.getvalue(), + f'a,b{lineterminator}' + f'1,2{lineterminator}') + def test_write_iterable(self): self._write_test(iter(['a', 1, 'p,q']), 'a,1,"p,q"') self._write_test(iter(['a', 1, None]), 'a,1,') @@ -286,14 +297,10 @@ class Test_Csv(unittest.TestCase): self._read_test([''], [[]]) self.assertRaises(csv.Error, self._read_test, ['"ab"c'], None, strict = 1) - # cannot handle null bytes for the moment - self.assertRaises(csv.Error, self._read_test, - ['ab\0c'], None, strict = 1) self._read_test(['"ab"c'], [['abc']], doublequote = 0) self.assertRaises(csv.Error, self._read_test, - [b'ab\0c'], None) - + [b'abc'], None) def test_read_eol(self): self._read_test(['a,b'], [['a','b']]) @@ -313,6 +320,18 @@ class Test_Csv(unittest.TestCase): self.assertRaises(csv.Error, self._read_test, ['^'], [], escapechar='^', strict=True) + def test_read_nul(self): + self._read_test(['\0'], [['\0']]) + 
self._read_test(['a,\0b,c'], [['a', '\0b', 'c']]) + self._read_test(['a,b\0,c'], [['a', 'b\0', 'c']]) + self._read_test(['a,b\\\0,c'], [['a', 'b\0', 'c']], escapechar='\\') + self._read_test(['a,"\0b",c'], [['a', '\0b', 'c']]) + + def test_read_delimiter(self): + self._read_test(['a,b,c'], [['a', 'b', 'c']]) + self._read_test(['a;b;c'], [['a', 'b', 'c']], delimiter=';') + self._read_test(['a\0b\0c'], [['a', 'b', 'c']], delimiter='\0') + def test_read_escape(self): self._read_test(['a,\\b,c'], [['a', 'b', 'c']], escapechar='\\') self._read_test(['a,b\\,c'], [['a', 'b,c']], escapechar='\\') @@ -320,6 +339,11 @@ class Test_Csv(unittest.TestCase): self._read_test(['a,"b,\\c"'], [['a', 'b,c']], escapechar='\\') self._read_test(['a,"b,c\\""'], [['a', 'b,c"']], escapechar='\\') self._read_test(['a,"b,c"\\'], [['a', 'b,c\\']], escapechar='\\') + self._read_test(['a,^b,c'], [['a', 'b', 'c']], escapechar='^') + self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0') + self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None) + self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar='') + self._read_test(['a,\\b,c'], [['a', '\\b', 'c']]) def test_read_quoting(self): self._read_test(['1,",3,",5'], [['1', ',3,', '5']]) @@ -334,6 +358,8 @@ class Test_Csv(unittest.TestCase): self.assertRaises(ValueError, self._read_test, ['abc,3'], [[]], quoting=csv.QUOTE_NONNUMERIC) + self._read_test(['1,@,3,@,5'], [['1', ',3,', '5']], quotechar='@') + self._read_test(['1,\0,3,\0,5'], [['1', ',3,', '5']], quotechar='\0') def test_read_bigfield(self): # This exercises the buffer realloc functionality and field size @@ -1074,6 +1100,12 @@ Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back a,b """) + sample14 = """\ +abc\0def +ghijkl\0mno +ghi\0jkl +""" + def test_issue43625(self): sniffer = csv.Sniffer() self.assertTrue(sniffer.has_header(self.sample12)) @@ -1142,6 +1174,8 @@ Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back dialect = 
sniffer.sniff(self.sample9) self.assertEqual(dialect.delimiter, '+') self.assertEqual(dialect.quotechar, "'") + dialect = sniffer.sniff(self.sample14) + self.assertEqual(dialect.delimiter, '\0') def test_doublequote(self): sniffer = csv.Sniffer() diff --git a/Misc/NEWS.d/next/Library/2021-10-07-21-11-48.bpo-27580.tGcBTH.rst b/Misc/NEWS.d/next/Library/2021-10-07-21-11-48.bpo-27580.tGcBTH.rst new file mode 100644 index 0000000..15a8ff2 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-10-07-21-11-48.bpo-27580.tGcBTH.rst @@ -0,0 +1 @@ +Add support of null characters in :mod:`csv`. diff --git a/Modules/_csv.c b/Modules/_csv.c index 72f0791..469c1a1 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -14,6 +14,9 @@ module instead. #include "structmember.h" // PyMemberDef #include <stdbool.h> +#define NOT_SET ((Py_UCS4)-1) +#define EOL ((Py_UCS4)-2) + typedef struct { PyObject *error_obj; /* CSV exception */ @@ -153,9 +156,9 @@ get_dialect_from_registry(PyObject *name_obj, _csvstate *module_state) } static PyObject * -get_nullchar_as_None(Py_UCS4 c) +get_char_or_None(Py_UCS4 c) { - if (c == '\0') { + if (c == NOT_SET) { Py_RETURN_NONE; } else @@ -172,19 +175,19 @@ Dialect_get_lineterminator(DialectObj *self, void *Py_UNUSED(ignored)) static PyObject * Dialect_get_delimiter(DialectObj *self, void *Py_UNUSED(ignored)) { - return get_nullchar_as_None(self->delimiter); + return get_char_or_None(self->delimiter); } static PyObject * Dialect_get_escapechar(DialectObj *self, void *Py_UNUSED(ignored)) { - return get_nullchar_as_None(self->escapechar); + return get_char_or_None(self->escapechar); } static PyObject * Dialect_get_quotechar(DialectObj *self, void *Py_UNUSED(ignored)) { - return get_nullchar_as_None(self->quotechar); + return get_char_or_None(self->quotechar); } static PyObject * @@ -235,7 +238,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt *target = dflt; } else { - *target = '\0'; + *target = NOT_SET; if (src != Py_None) { if 
(!PyUnicode_Check(src)) { PyErr_Format(PyExc_TypeError, @@ -254,7 +257,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt return -1; } /* PyUnicode_READY() is called in PyUnicode_GetLength() */ - else { + else if (len > 0) { *target = PyUnicode_READ_CHAR(src, 0); } } @@ -269,7 +272,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt) *target = dflt; } else { - *target = '\0'; + *target = NOT_SET; if (!PyUnicode_Check(src)) { PyErr_Format(PyExc_TypeError, "\"%s\" must be string, not %.200s", name, @@ -287,7 +290,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt) return -1; } /* PyUnicode_READY() is called in PyUnicode_GetLength() */ - else { + else if (len > 0) { *target = PyUnicode_READ_CHAR(src, 0); } } @@ -481,7 +484,7 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) goto err DIASET(_set_char, "delimiter", &self->delimiter, delimiter, ','); DIASET(_set_bool, "doublequote", &self->doublequote, doublequote, true); - DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, 0); + DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, NOT_SET); DIASET(_set_str, "lineterminator", &self->lineterminator, lineterminator, "\r\n"); DIASET(_set_char_or_none, "quotechar", &self->quotechar, quotechar, '"'); DIASET(_set_int, "quoting", &self->quoting, quoting, QUOTE_MINIMAL); @@ -491,19 +494,19 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) /* validate options */ if (dialect_check_quoting(self->quoting)) goto err; - if (self->delimiter == 0) { + if (self->delimiter == NOT_SET) { PyErr_SetString(PyExc_TypeError, "\"delimiter\" must be a 1-character string"); goto err; } if (quotechar == Py_None && quoting == NULL) self->quoting = QUOTE_NONE; - if (self->quoting != QUOTE_NONE && self->quotechar == 0) { + if (self->quoting != QUOTE_NONE && self->quotechar == NOT_SET) { PyErr_SetString(PyExc_TypeError, "quotechar must be 
set if quoting enabled"); goto err; } - if (self->lineterminator == 0) { + if (self->lineterminator == NULL) { PyErr_SetString(PyExc_TypeError, "lineterminator must be set"); goto err; } @@ -670,7 +673,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) switch (self->state) { case START_RECORD: /* start of record */ - if (c == '\0') + if (c == EOL) /* empty line - return [] */ break; else if (c == '\n' || c == '\r') { @@ -682,11 +685,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) /* fallthru */ case START_FIELD: /* expecting field */ - if (c == '\n' || c == '\r' || c == '\0') { + if (c == '\n' || c == '\r' || c == EOL) { /* save empty field - return [fields] */ if (parse_save_field(self) < 0) return -1; - self->state = (c == '\0' ? START_RECORD : EAT_CRNL); + self->state = (c == EOL ? START_RECORD : EAT_CRNL); } else if (c == dialect->quotechar && dialect->quoting != QUOTE_NONE) { @@ -722,7 +725,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) self->state = AFTER_ESCAPED_CRNL; break; } - if (c == '\0') + if (c == EOL) c = '\n'; if (parse_add_char(self, module_state, c) < 0) return -1; @@ -730,17 +733,17 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) break; case AFTER_ESCAPED_CRNL: - if (c == '\0') + if (c == EOL) break; /*fallthru*/ case IN_FIELD: /* in unquoted field */ - if (c == '\n' || c == '\r' || c == '\0') { + if (c == '\n' || c == '\r' || c == EOL) { /* end of line - return [fields] */ if (parse_save_field(self) < 0) return -1; - self->state = (c == '\0' ? START_RECORD : EAT_CRNL); + self->state = (c == EOL ? 
START_RECORD : EAT_CRNL); } else if (c == dialect->escapechar) { /* possible escaped character */ @@ -761,7 +764,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) case IN_QUOTED_FIELD: /* in quoted field */ - if (c == '\0') + if (c == EOL) ; else if (c == dialect->escapechar) { /* Possible escape character */ @@ -786,7 +789,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) break; case ESCAPE_IN_QUOTED_FIELD: - if (c == '\0') + if (c == EOL) c = '\n'; if (parse_add_char(self, module_state, c) < 0) return -1; @@ -808,11 +811,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) return -1; self->state = START_FIELD; } - else if (c == '\n' || c == '\r' || c == '\0') { + else if (c == '\n' || c == '\r' || c == EOL) { /* end of line - return [fields] */ if (parse_save_field(self) < 0) return -1; - self->state = (c == '\0' ? START_RECORD : EAT_CRNL); + self->state = (c == EOL ? START_RECORD : EAT_CRNL); } else if (!dialect->strict) { if (parse_add_char(self, module_state, c) < 0) @@ -831,7 +834,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) case EAT_CRNL: if (c == '\n' || c == '\r') ; - else if (c == '\0') + else if (c == EOL) self->state = START_RECORD; else { PyErr_Format(module_state->error_obj, @@ -909,12 +912,6 @@ Reader_iternext(ReaderObj *self) linelen = PyUnicode_GET_LENGTH(lineobj); while (linelen--) { c = PyUnicode_READ(kind, data, pos); - if (c == '\0') { - Py_DECREF(lineobj); - PyErr_Format(module_state->error_obj, - "line contains NUL"); - goto err; - } if (parse_process_char(self, module_state, c) < 0) { Py_DECREF(lineobj); goto err; @@ -922,7 +919,7 @@ Reader_iternext(ReaderObj *self) pos++; } Py_DECREF(lineobj); - if (parse_process_char(self, module_state, 0) < 0) + if (parse_process_char(self, module_state, EOL) < 0) goto err; } while (self->state != START_RECORD); @@ -1127,7 +1124,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, 
const void *field_dat *quoted = 1; } if (want_escape) { - if (!dialect->escapechar) { + if (dialect->escapechar == NOT_SET) { PyErr_Format(self->error_obj, "need to escape, but no escapechar set"); return -1; -- cgit v0.12