summary | refs | log | tree | commit | diff | stats
path: root/Modules/_csv.c
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2021-10-09 16:17:43 (GMT)
committer: GitHub <noreply@github.com>, 2021-10-09 16:17:43 (GMT)
commit: b454e8e4df73bc73bc1a6f597431f171bfae8abd (patch)
tree: 26dbf6df9301332814c27eecb63d7abfdcbcf32a /Modules/_csv.c
parent: b4903afd4debbbd71dc49a2c8fefa74a3b6c6832 (diff)
download: cpython-b454e8e4df73bc73bc1a6f597431f171bfae8abd.zip
cpython-b454e8e4df73bc73bc1a6f597431f171bfae8abd.tar.gz
cpython-b454e8e4df73bc73bc1a6f597431f171bfae8abd.tar.bz2
bpo-27580: Add support of null characters in the csv module. (GH-28808)
Diffstat (limited to 'Modules/_csv.c')
-rw-r--r-- Modules/_csv.c 63
1 files changed, 30 insertions, 33 deletions
diff --git a/Modules/_csv.c b/Modules/_csv.c
index 72f0791..469c1a1 100644
--- a/Modules/_csv.c
+++ b/Modules/_csv.c
@@ -14,6 +14,9 @@ module instead.
#include "structmember.h" // PyMemberDef
#include <stdbool.h>
+#define NOT_SET ((Py_UCS4)-1)
+#define EOL ((Py_UCS4)-2)
+
typedef struct {
PyObject *error_obj; /* CSV exception */
@@ -153,9 +156,9 @@ get_dialect_from_registry(PyObject *name_obj, _csvstate *module_state)
}
static PyObject *
-get_nullchar_as_None(Py_UCS4 c)
+get_char_or_None(Py_UCS4 c)
{
- if (c == '\0') {
+ if (c == NOT_SET) {
Py_RETURN_NONE;
}
else
@@ -172,19 +175,19 @@ Dialect_get_lineterminator(DialectObj *self, void *Py_UNUSED(ignored))
static PyObject *
Dialect_get_delimiter(DialectObj *self, void *Py_UNUSED(ignored))
{
- return get_nullchar_as_None(self->delimiter);
+ return get_char_or_None(self->delimiter);
}
static PyObject *
Dialect_get_escapechar(DialectObj *self, void *Py_UNUSED(ignored))
{
- return get_nullchar_as_None(self->escapechar);
+ return get_char_or_None(self->escapechar);
}
static PyObject *
Dialect_get_quotechar(DialectObj *self, void *Py_UNUSED(ignored))
{
- return get_nullchar_as_None(self->quotechar);
+ return get_char_or_None(self->quotechar);
}
static PyObject *
@@ -235,7 +238,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
*target = dflt;
}
else {
- *target = '\0';
+ *target = NOT_SET;
if (src != Py_None) {
if (!PyUnicode_Check(src)) {
PyErr_Format(PyExc_TypeError,
@@ -254,7 +257,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
return -1;
}
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
- else {
+ else if (len > 0) {
*target = PyUnicode_READ_CHAR(src, 0);
}
}
@@ -269,7 +272,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
*target = dflt;
}
else {
- *target = '\0';
+ *target = NOT_SET;
if (!PyUnicode_Check(src)) {
PyErr_Format(PyExc_TypeError,
"\"%s\" must be string, not %.200s", name,
@@ -287,7 +290,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
return -1;
}
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
- else {
+ else if (len > 0) {
*target = PyUnicode_READ_CHAR(src, 0);
}
}
@@ -481,7 +484,7 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
goto err
DIASET(_set_char, "delimiter", &self->delimiter, delimiter, ',');
DIASET(_set_bool, "doublequote", &self->doublequote, doublequote, true);
- DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, 0);
+ DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, NOT_SET);
DIASET(_set_str, "lineterminator", &self->lineterminator, lineterminator, "\r\n");
DIASET(_set_char_or_none, "quotechar", &self->quotechar, quotechar, '"');
DIASET(_set_int, "quoting", &self->quoting, quoting, QUOTE_MINIMAL);
@@ -491,19 +494,19 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
/* validate options */
if (dialect_check_quoting(self->quoting))
goto err;
- if (self->delimiter == 0) {
+ if (self->delimiter == NOT_SET) {
PyErr_SetString(PyExc_TypeError,
"\"delimiter\" must be a 1-character string");
goto err;
}
if (quotechar == Py_None && quoting == NULL)
self->quoting = QUOTE_NONE;
- if (self->quoting != QUOTE_NONE && self->quotechar == 0) {
+ if (self->quoting != QUOTE_NONE && self->quotechar == NOT_SET) {
PyErr_SetString(PyExc_TypeError,
"quotechar must be set if quoting enabled");
goto err;
}
- if (self->lineterminator == 0) {
+ if (self->lineterminator == NULL) {
PyErr_SetString(PyExc_TypeError, "lineterminator must be set");
goto err;
}
@@ -670,7 +673,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
switch (self->state) {
case START_RECORD:
/* start of record */
- if (c == '\0')
+ if (c == EOL)
/* empty line - return [] */
break;
else if (c == '\n' || c == '\r') {
@@ -682,11 +685,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
/* fallthru */
case START_FIELD:
/* expecting field */
- if (c == '\n' || c == '\r' || c == '\0') {
+ if (c == '\n' || c == '\r' || c == EOL) {
/* save empty field - return [fields] */
if (parse_save_field(self) < 0)
return -1;
- self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
+ self->state = (c == EOL ? START_RECORD : EAT_CRNL);
}
else if (c == dialect->quotechar &&
dialect->quoting != QUOTE_NONE) {
@@ -722,7 +725,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
self->state = AFTER_ESCAPED_CRNL;
break;
}
- if (c == '\0')
+ if (c == EOL)
c = '\n';
if (parse_add_char(self, module_state, c) < 0)
return -1;
@@ -730,17 +733,17 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
break;
case AFTER_ESCAPED_CRNL:
- if (c == '\0')
+ if (c == EOL)
break;
/*fallthru*/
case IN_FIELD:
/* in unquoted field */
- if (c == '\n' || c == '\r' || c == '\0') {
+ if (c == '\n' || c == '\r' || c == EOL) {
/* end of line - return [fields] */
if (parse_save_field(self) < 0)
return -1;
- self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
+ self->state = (c == EOL ? START_RECORD : EAT_CRNL);
}
else if (c == dialect->escapechar) {
/* possible escaped character */
@@ -761,7 +764,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
case IN_QUOTED_FIELD:
/* in quoted field */
- if (c == '\0')
+ if (c == EOL)
;
else if (c == dialect->escapechar) {
/* Possible escape character */
@@ -786,7 +789,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
break;
case ESCAPE_IN_QUOTED_FIELD:
- if (c == '\0')
+ if (c == EOL)
c = '\n';
if (parse_add_char(self, module_state, c) < 0)
return -1;
@@ -808,11 +811,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
return -1;
self->state = START_FIELD;
}
- else if (c == '\n' || c == '\r' || c == '\0') {
+ else if (c == '\n' || c == '\r' || c == EOL) {
/* end of line - return [fields] */
if (parse_save_field(self) < 0)
return -1;
- self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
+ self->state = (c == EOL ? START_RECORD : EAT_CRNL);
}
else if (!dialect->strict) {
if (parse_add_char(self, module_state, c) < 0)
@@ -831,7 +834,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
case EAT_CRNL:
if (c == '\n' || c == '\r')
;
- else if (c == '\0')
+ else if (c == EOL)
self->state = START_RECORD;
else {
PyErr_Format(module_state->error_obj,
@@ -909,12 +912,6 @@ Reader_iternext(ReaderObj *self)
linelen = PyUnicode_GET_LENGTH(lineobj);
while (linelen--) {
c = PyUnicode_READ(kind, data, pos);
- if (c == '\0') {
- Py_DECREF(lineobj);
- PyErr_Format(module_state->error_obj,
- "line contains NUL");
- goto err;
- }
if (parse_process_char(self, module_state, c) < 0) {
Py_DECREF(lineobj);
goto err;
@@ -922,7 +919,7 @@ Reader_iternext(ReaderObj *self)
pos++;
}
Py_DECREF(lineobj);
- if (parse_process_char(self, module_state, 0) < 0)
+ if (parse_process_char(self, module_state, EOL) < 0)
goto err;
} while (self->state != START_RECORD);
@@ -1127,7 +1124,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, const void *field_dat
*quoted = 1;
}
if (want_escape) {
- if (!dialect->escapechar) {
+ if (dialect->escapechar == NOT_SET) {
PyErr_Format(self->error_obj,
"need to escape, but no escapechar set");
return -1;