Moved reader \r and \n processing from the iterator to the state machine -

this allows for better handling of newline characters in quoted fields (and hopefully resolves Bug 967934).
author: Andrew McNamara <andrewm@object-craft.com.au> 2005-01-13 11:30:54 (GMT)
committer: Andrew McNamara <andrewm@object-craft.com.au> 2005-01-13 11:30:54 (GMT)
commit: f69d94f6c0263975c2b197a47487472d20a0c92c (patch)
tree: d43ce89c2f6afcf9d6b3904c70bb127f54f11d86
parent: a1974c1459a424fdc9d8bbce55500f6006d0297d (diff)
download: cpython-f69d94f6c0263975c2b197a47487472d20a0c92c.zip
cpython-f69d94f6c0263975c2b197a47487472d20a0c92c.tar.gz
cpython-f69d94f6c0263975c2b197a47487472d20a0c92c.tar.bz2
2 files changed, 78 insertions, 99 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index 5ad39e2..6385157 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -48,10 +48,11 @@ Library
     dictates.
   + the parser now removes the escapechar prefix from escaped characters.
   + when quoting=QUOTE_NONNUMERIC, the writer now tests for numeric
-    objects, rather than attempting to cast to float, and using the
-    success of that as the determinator.
+    types, rather than any object than can be represented as a numeric.
   + when quoting=QUOTE_NONNUMERIC, the reader now casts unquoted fields
     to floats.
+  + reader now allows \r characters to be quoted (previously it only allowed
+    \n to be quoted).
   + writer doublequote handling improved.
   + Dialect classes passed to the module are no longer instantiated by
     the module before being parsed (the former validation scheme required
diff --git a/Modules/_csv.c b/Modules/_csv.c
index 8547d3c..6380792 100644
--- a/Modules/_csv.c
+++ b/Modules/_csv.c
@@ -48,7 +48,8 @@ static long field_limit = 128 * 1024;	/* max parsed field size */
 
 typedef enum {
 	START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, 
-	IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD
+	IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
+	EAT_CRNL
 } ParserState;
 
 typedef enum {
@@ -96,7 +97,6 @@ typedef struct {
 	char *field;		/* build current field in here */
 	int field_size;		/* size of allocated buffer */
 	int field_len;		/* length of current field */
-	int had_parse_error;	/* did we have a parse error? */
 	int numeric_field;	/* treat field as numeric */
 	unsigned long line_num;	/* Source-file line number */
 } ReaderObj;
@@ -497,6 +497,9 @@ _call_dialect(PyObject *dialect_inst, PyObject *kwargs)
 	return dialect;
 }
 
+/*
+ * READER
+ */
 static int
 parse_save_field(ReaderObj *self)
 {
@@ -544,22 +547,6 @@ parse_grow_buff(ReaderObj *self)
 }
 
 static int
-parse_reset(ReaderObj *self)
-{
-	if (self->fields) {
-		Py_DECREF(self->fields);
-	}
-	self->fields = PyList_New(0);
-	if (self->fields == NULL)
-		return -1;
-	self->field_len = 0;
-	self->state = START_RECORD;
-	self->had_parse_error = 0;
-	self->numeric_field = 0;
-	return 0;
-}
-
-static int
 parse_add_char(ReaderObj *self, char c)
 {
 	if (self->field_len >= field_limit) {
@@ -581,19 +568,23 @@ parse_process_char(ReaderObj *self, char c)
 	switch (self->state) {
 	case START_RECORD:
 		/* start of record */
-		if (c == '\n')
+		if (c == '\0')
 			/* empty line - return [] */
 			break;
+		else if (c == '\n' || c == '\r') {
+			self->state = EAT_CRNL;
+			break;
+		}
 		/* normal character - handle as START_FIELD */
 		self->state = START_FIELD;
 		/* fallthru */
 	case START_FIELD:
 		/* expecting field */
-		if (c == '\n') {
+		if (c == '\n' || c == '\r' || c == '\0') {
 			/* save empty field - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (c == dialect->quotechar && 
 			 dialect->quoting != QUOTE_NONE) {
@@ -623,6 +614,8 @@ parse_process_char(ReaderObj *self, char c)
 		break;
 
 	case ESCAPED_CHAR:
+		if (c == '\0')
+			c = '\n';
 		if (parse_add_char(self, c) < 0)
 			return -1;
 		self->state = IN_FIELD;
@@ -630,11 +623,11 @@ parse_process_char(ReaderObj *self, char c)
 
 	case IN_FIELD:
 		/* in unquoted field */
-		if (c == '\n') {
+		if (c == '\n' || c == '\r' || c == '\0') {
 			/* end of line - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (c == dialect->escapechar) {
 			/* possible escaped character */
@@ -655,11 +648,8 @@ parse_process_char(ReaderObj *self, char c)
 
 	case IN_QUOTED_FIELD:
 		/* in quoted field */
-		if (c == '\n') {
-			/* end of line - save '\n' in field */
-			if (parse_add_char(self, '\n') < 0)
-				return -1;
-		}
+		if (c == '\0')
+			;
 		else if (c == dialect->escapechar) {
 			/* Possible escape character */
 			self->state = ESCAPE_IN_QUOTED_FIELD;
@@ -683,6 +673,8 @@ parse_process_char(ReaderObj *self, char c)
 		break;
 
 	case ESCAPE_IN_QUOTED_FIELD:
+		if (c == '\0')
+			c = '\n';
 		if (parse_add_char(self, c) < 0)
 			return -1;
 		self->state = IN_QUOTED_FIELD;
@@ -703,11 +695,11 @@ parse_process_char(ReaderObj *self, char c)
 				return -1;
 			self->state = START_FIELD;
 		}
-		else if (c == '\n') {
+		else if (c == '\n' || c == '\r' || c == '\0') {
 			/* end of line - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (!dialect->strict) {
 			if (parse_add_char(self, c) < 0)
@@ -716,7 +708,6 @@ parse_process_char(ReaderObj *self, char c)
 		}
 		else {
 			/* illegal */
-			self->had_parse_error = 1;
 			PyErr_Format(error_obj, "'%c' expected after '%c'", 
 					dialect->delimiter, 
                                         dialect->quotechar);
@@ -724,104 +715,83 @@ parse_process_char(ReaderObj *self, char c)
 		}
 		break;
 
+	case EAT_CRNL:
+		if (c == '\n' || c == '\r')
+			;
+		else if (c == '\0')
+			self->state = START_RECORD;
+		else {
+			PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
+			return -1;
+		}
+		break;
+
 	}
 	return 0;
 }
 
-/*
- * READER
- */
-#define R_OFF(x) offsetof(ReaderObj, x)
-
-static struct PyMemberDef Reader_memberlist[] = {
-	{ "dialect", T_OBJECT, R_OFF(dialect), RO },
-	{ "line_num", T_ULONG, R_OFF(line_num), RO },
-	{ NULL }
-};
+static int
+parse_reset(ReaderObj *self)
+{
+	Py_XDECREF(self->fields);
+	self->fields = PyList_New(0);
+	if (self->fields == NULL)
+		return -1;
+	self->field_len = 0;
+	self->state = START_RECORD;
+	self->numeric_field = 0;
+	return 0;
+}
 
 static PyObject *
 Reader_iternext(ReaderObj *self)
 {
         PyObject *lineobj;
-        PyObject *fields;
-        char *line;
+        PyObject *fields = NULL;
+        char *line, c;
+	int linelen;
 
+	if (parse_reset(self) < 0)
+		return NULL;
         do {
                 lineobj = PyIter_Next(self->input_iter);
                 if (lineobj == NULL) {
                         /* End of input OR exception */
                         if (!PyErr_Occurred() && self->field_len != 0)
-                                return PyErr_Format(error_obj,
-                                                    "newline inside string");
+                                PyErr_Format(error_obj,
+					     "newline inside string");
                         return NULL;
                 }
 		++self->line_num;
 
-                if (self->had_parse_error)
-			if (parse_reset(self) < 0) {
-				Py_DECREF(lineobj);
-				return NULL;
-			}
                 line = PyString_AsString(lineobj);
+		linelen = PyString_Size(lineobj);
 
-                if (line == NULL) {
+                if (line == NULL || linelen < 0) {
                         Py_DECREF(lineobj);
                         return NULL;
                 }
-		if (strlen(line) < (size_t)PyString_GET_SIZE(lineobj)) {
-			self->had_parse_error = 1;
-			Py_DECREF(lineobj);
-			return PyErr_Format(error_obj,
-					    "string with NUL bytes");
-		}
-
-                /* Process line of text - send '\n' to processing code to
-                represent end of line.  End of line which is not at end of
-                string is an error. */
-                while (*line) {
-                        char c;
-
-                        c = *line++;
-                        if (c == '\r') {
-                                c = *line++;
-                                if (c == '\0')
-                                        /* macintosh end of line */
-                                        break;
-                                if (c == '\n') {
-                                        c = *line++;
-                                        if (c == '\0')
-                                                /* DOS end of line */
-                                                break;
-                                }
-                                self->had_parse_error = 1;
-                                Py_DECREF(lineobj);
-                                return PyErr_Format(error_obj,
-                                                    "newline inside string");
-                        }
-                        if (c == '\n') {
-                                c = *line++;
-                                if (c == '\0')
-                                        /* unix end of line */
-                                        break;
-                                self->had_parse_error = 1;
-                                Py_DECREF(lineobj);
-                                return PyErr_Format(error_obj, 
-                                                    "newline inside string");
-                        }
+                while (linelen--) {
+			c = *line++;
+			if (c == '\0') {
+				Py_DECREF(lineobj);
+				PyErr_Format(error_obj,
+					     "line contains NULL byte");
+				goto err;
+			}
 			if (parse_process_char(self, c) < 0) {
 				Py_DECREF(lineobj);
-				return NULL;
+				goto err;
 			}
 		}
-		if (parse_process_char(self, '\n') < 0) {
-			Py_DECREF(lineobj);
-			return NULL;
-		}
                 Py_DECREF(lineobj);
+		if (parse_process_char(self, 0) < 0)
+			goto err;
         } while (self->state != START_RECORD);
 
         fields = self->fields;
-        self->fields = PyList_New(0);
+        self->fields = NULL;
+err:
         return fields;
 }
 
@@ -875,6 +845,14 @@ PyDoc_STRVAR(Reader_Type_doc,
 static struct PyMethodDef Reader_methods[] = {
 	{ NULL, NULL }
 };
+#define R_OFF(x) offsetof(ReaderObj, x)
+
+static struct PyMemberDef Reader_memberlist[] = {
+	{ "dialect", T_OBJECT, R_OFF(dialect), RO },
+	{ "line_num", T_ULONG, R_OFF(line_num), RO },
+	{ NULL }
+};
+
 
 static PyTypeObject Reader_Type = {
 	PyObject_HEAD_INIT(NULL)
author	Andrew McNamara <andrewm@object-craft.com.au>	2005-01-13 11:30:54 (GMT)
committer	Andrew McNamara <andrewm@object-craft.com.au>	2005-01-13 11:30:54 (GMT)
commit	f69d94f6c0263975c2b197a47487472d20a0c92c (patch)
tree	d43ce89c2f6afcf9d6b3904c70bb127f54f11d86
parent	a1974c1459a424fdc9d8bbce55500f6006d0297d (diff)
download	cpython-f69d94f6c0263975c2b197a47487472d20a0c92c.zip cpython-f69d94f6c0263975c2b197a47487472d20a0c92c.tar.gz cpython-f69d94f6c0263975c2b197a47487472d20a0c92c.tar.bz2