Mass checkin of universal newline support.

Highlights: import and friends will understand any of \r, \n and \r\n as end of line. Python file input will do the same if you use mode 'U'. Everything can be disabled by configuring with --without-universal-newlines. See PEP278 for details.
author: Jack Jansen <jack.jansen@cwi.nl> 2002-04-14 20:12:41 (GMT)
committer: Jack Jansen <jack.jansen@cwi.nl> 2002-04-14 20:12:41 (GMT)
commit: 7b8c7546ebc1fc3688ef95768fa8b82f0f205490 (patch)
tree: 2e14b243347a9c38c82e2e774d4b201f23149916 /Objects
parent: dcd2dc2fffce8614c5d2b8d303a303a599b88a23 (diff)
download: cpython-7b8c7546ebc1fc3688ef95768fa8b82f0f205490.zip
cpython-7b8c7546ebc1fc3688ef95768fa8b82f0f205490.tar.gz
cpython-7b8c7546ebc1fc3688ef95768fa8b82f0f205490.tar.bz2
1 files changed, 294 insertions, 17 deletions
diff --git a/Objects/fileobject.c b/Objects/fileobject.c
index 54d040d..152ba1a 100644
--- a/Objects/fileobject.c
+++ b/Objects/fileobject.c
@@ -37,6 +37,23 @@
 #include <errno.h>
 #endif
 
+#ifdef HAVE_GETC_UNLOCKED
+#define GETC(f) getc_unlocked(f)
+#define FLOCKFILE(f) flockfile(f)
+#define FUNLOCKFILE(f) funlockfile(f)
+#else
+#define GETC(f) getc(f)
+#define FLOCKFILE(f)
+#define FUNLOCKFILE(f)
+#endif
+
+#ifdef WITH_UNIVERSAL_NEWLINES
+/* Bits in f_newlinetypes */
+#define NEWLINE_UNKNOWN	0	/* No newline seen, yet */
+#define NEWLINE_CR 1		/* \r newline seen */
+#define NEWLINE_LF 2		/* \n newline seen */
+#define NEWLINE_CRLF 4		/* \r\n newline seen */
+#endif
 
 FILE *
 PyFile_AsFile(PyObject *f)
@@ -99,6 +116,11 @@ fill_file_fields(PyFileObject *f, FILE *fp, char *name, char *mode,
 	f->f_close = close;
 	f->f_softspace = 0;
 	f->f_binary = strchr(mode,'b') != NULL;
+#ifdef WITH_UNIVERSAL_NEWLINES
+	f->f_univ_newline = (strchr(mode, 'U') != NULL);
+	f->f_newlinetypes = NEWLINE_UNKNOWN;
+	f->f_skipnextlf = 0;
+#endif
 
 	if (f->f_name == NULL || f->f_mode == NULL)
 		return NULL;
@@ -134,6 +156,17 @@ open_the_file(PyFileObject *f, char *name, char *mode)
 #endif
 	{
 		Py_BEGIN_ALLOW_THREADS
+#ifdef WITH_UNIVERSAL_NEWLINES
+		if (strcmp(mode, "U") == 0 || strcmp(mode, "rU") == 0)
+			mode = "rb";
+#else
+		/* Compatibility: specifying U in a Python without universal
+		** newlines is allowed, and the file is opened as a normal text
+		** file.
+		*/
+		if (strcmp(mode, "U") == 0 || strcmp(mode, "rU") == 0)
+			mode = "r";
+#endif
 		f->f_fp = fopen(name, mode);
 		Py_END_ALLOW_THREADS
 	}
@@ -394,6 +427,9 @@ file_seek(PyFileObject *f, PyObject *args)
 		clearerr(f->f_fp);
 		return NULL;
 	}
+#ifdef WITH_UNIVERSAL_NEWLINES
+	f->f_skipnextlf = 0;
+#endif
 	Py_INCREF(Py_None);
 	return Py_None;
 }
@@ -534,6 +570,16 @@ file_tell(PyFileObject *f)
 		clearerr(f->f_fp);
 		return NULL;
 	}
+#ifdef WITH_UNIVERSAL_NEWLINES
+	if (f->f_skipnextlf) {
+		int c;
+		c = GETC(f->f_fp);
+		if (c == '\n') {
+			pos++;
+			f->f_skipnextlf = 0;
+		} else if (c != EOF) ungetc(c, f->f_fp);
+	}
+#endif
 #if !defined(HAVE_LARGEFILE_SUPPORT)
 	return PyInt_FromLong(pos);
 #else
@@ -665,8 +711,8 @@ file_read(PyFileObject *f, PyObject *args)
 	for (;;) {
 		Py_BEGIN_ALLOW_THREADS
 		errno = 0;
-		chunksize = fread(BUF(v) + bytesread, 1,
-				  buffersize - bytesread, f->f_fp);
+		chunksize = Py_UniversalNewlineFread(BUF(v) + bytesread,
+				  buffersize - bytesread, f->f_fp, (PyObject *)f);
 		Py_END_ALLOW_THREADS
 		if (chunksize == 0) {
 			if (!ferror(f->f_fp))
@@ -705,7 +751,7 @@ file_readinto(PyFileObject *f, PyObject *args)
 	while (ntodo > 0) {
 		Py_BEGIN_ALLOW_THREADS
 		errno = 0;
-		nnow = fread(ptr+ndone, 1, ntodo, f->f_fp);
+		nnow = Py_UniversalNewlineFread(ptr+ndone, ntodo, f->f_fp, (PyObject *)f);
 		Py_END_ALLOW_THREADS
 		if (nnow == 0) {
 			if (!ferror(f->f_fp))
@@ -934,16 +980,6 @@ getline_via_fgets(FILE *fp)
    <= 0: read arbitrary line
 */
 
-#ifdef HAVE_GETC_UNLOCKED
-#define GETC(f) getc_unlocked(f)
-#define FLOCKFILE(f) flockfile(f)
-#define FUNLOCKFILE(f) funlockfile(f)
-#else
-#define GETC(f) getc(f)
-#define FLOCKFILE(f)
-#define FUNLOCKFILE(f)
-#endif
-
 static PyObject *
 get_line(PyFileObject *f, int n)
 {
@@ -954,9 +990,18 @@ get_line(PyFileObject *f, int n)
 	size_t used_v_size;	/* # used slots in buffer */
 	size_t increment;       /* amount to increment the buffer */
 	PyObject *v;
+#ifdef WITH_UNIVERSAL_NEWLINES
+	int newlinetypes = f->f_newlinetypes;
+	int skipnextlf = f->f_skipnextlf;
+	int univ_newline = f->f_univ_newline;
+#endif
 
-#ifdef USE_FGETS_IN_GETLINE
+#if defined(USE_FGETS_IN_GETLINE)
+#ifdef WITH_UNIVERSAL_NEWLINES
+	if (n <= 0 && !univ_newline )
+#else
 	if (n <= 0)
+#endif
 		return getline_via_fgets(fp);
 #endif
 	total_v_size = n > 0 ? n : 100;
@@ -969,12 +1014,45 @@ get_line(PyFileObject *f, int n)
 	for (;;) {
 		Py_BEGIN_ALLOW_THREADS
 		FLOCKFILE(fp);
+#ifdef WITH_UNIVERSAL_NEWLINES
+		if (univ_newline) {
+			c = 'x'; /* Shut up gcc warning */
+			while ( buf != end && (c = GETC(fp)) != EOF ) {
+				if (skipnextlf ) {
+					skipnextlf = 0;
+					if (c == '\n') {
+						/* Seeing a \n here with skipnextlf true
+						** means we saw a \r before.
+						*/
+						newlinetypes |= NEWLINE_CRLF;
+						c = GETC(fp);
+						if (c == EOF) break;
+					} else {
+						newlinetypes |= NEWLINE_CR;
+					}
+				}
+				if (c == '\r') {
+					skipnextlf = 1;
+					c = '\n';
+				} else if ( c == '\n')
+					newlinetypes |= NEWLINE_LF;
+				*buf++ = c;
+				if (c == '\n') break;
+			}
+			if ( c == EOF && skipnextlf )
+				newlinetypes |= NEWLINE_CR;
+		} else /* If not universal newlines use the normal loop */
+#endif
 		while ((c = GETC(fp)) != EOF &&
 		       (*buf++ = c) != '\n' &&
 			buf != end)
 			;
 		FUNLOCKFILE(fp);
 		Py_END_ALLOW_THREADS
+#ifdef WITH_UNIVERSAL_NEWLINES
+		f->f_newlinetypes = newlinetypes;
+		f->f_skipnextlf = skipnextlf;
+#endif
 		if (c == '\n')
 			break;
 		if (c == EOF) {
@@ -1150,8 +1228,8 @@ file_readlines(PyFileObject *f, PyObject *args)
 		else {
 			Py_BEGIN_ALLOW_THREADS
 			errno = 0;
-			nread = fread(buffer+nfilled, 1,
-				      buffersize-nfilled, f->f_fp);
+			nread = Py_UniversalNewlineFread(buffer+nfilled, 
+				buffersize-nfilled, f->f_fp, (PyObject *)f);
 			Py_END_ALLOW_THREADS
 			shortread = (nread < buffersize-nfilled);
 		}
@@ -1188,7 +1266,8 @@ file_readlines(PyFileObject *f, PyObject *args)
 			}
 			else {
 				/* Grow the big buffer */
-				_PyString_Resize(&big_buffer, buffersize);
+				if ( _PyString_Resize(&big_buffer, buffersize) < 0 )
+					goto error;
 				buffer = PyString_AS_STRING(big_buffer);
 			}
 			continue;
@@ -1503,9 +1582,40 @@ get_closed(PyFileObject *f, void *closure)
 {
 	return PyBool_FromLong((long)(f->f_fp == 0));
 }
+#ifdef WITH_UNIVERSAL_NEWLINES
+static PyObject *
+get_newlines(PyFileObject *f, void *closure)
+{
+	switch (f->f_newlinetypes) {
+	case NEWLINE_UNKNOWN:
+		Py_INCREF(Py_None);
+		return Py_None;
+	case NEWLINE_CR:
+		return PyString_FromString("\r");
+	case NEWLINE_LF:
+		return PyString_FromString("\n");
+	case NEWLINE_CR|NEWLINE_LF:
+		return Py_BuildValue("(ss)", "\r", "\n");
+	case NEWLINE_CRLF:
+		return PyString_FromString("\r\n");
+	case NEWLINE_CR|NEWLINE_CRLF:
+		return Py_BuildValue("(ss)", "\r", "\r\n");
+	case NEWLINE_LF|NEWLINE_CRLF:
+		return Py_BuildValue("(ss)", "\n", "\r\n");
+	case NEWLINE_CR|NEWLINE_LF|NEWLINE_CRLF:
+		return Py_BuildValue("(sss)", "\r", "\n", "\r\n");
+	default:
+		PyErr_Format(PyExc_SystemError, "Unknown newlines value 0x%x\n", f->f_newlinetypes);
+		return NULL;
+	}
+}
+#endif
 
 static PyGetSetDef file_getsetlist[] = {
 	{"closed", (getter)get_closed, NULL, "True if the file is closed"},
+#ifdef WITH_UNIVERSAL_NEWLINES
+	{"newlines", (getter)get_newlines, NULL, "end-of-line convention used in this file"},
+#endif
 	{0},
 };
 
@@ -1805,3 +1915,170 @@ int PyObject_AsFileDescriptor(PyObject *o)
 	}
 	return fd;
 }
+
+#ifdef WITH_UNIVERSAL_NEWLINES
+/* From here on we need access to the real fgets and fread */
+#undef fgets
+#undef fread
+
+/*
+** Py_UniversalNewlineFgets is an fgets variation that understands
+** all of \r, \n and \r\n conventions.
+** The stream should be opened in binary mode.
+** If fobj is NULL the routine always does newline conversion, and
+** it may peek one char ahead to gobble the second char in \r\n.
+** If fobj is non-NULL it must be a PyFileObject. In this case there
+** is no readahead but in stead a flag is used to skip a following
+** \n on the next read. Also, if the file is open in binary mode
+** the whole conversion is skipped. Finally, the routine keeps track of
+** the different types of newlines seen.
+** Note that we need no error handling: fgets() treats error and eof
+** identically.
+*/
+char *
+Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj)
+{
+	char *p = buf;
+	int c;
+	int newlinetypes = 0;
+	int skipnextlf = 0;
+	int univ_newline = 1;
+	
+	if (fobj) {
+		if (!PyFile_Check(fobj)) {
+			errno = ENXIO;	/* What can you do... */
+			return NULL;
+		}
+		univ_newline = ((PyFileObject *)fobj)->f_univ_newline;
+		if ( !univ_newline )
+			return fgets(buf, n, stream);
+		newlinetypes = ((PyFileObject *)fobj)->f_newlinetypes;
+		skipnextlf = ((PyFileObject *)fobj)->f_skipnextlf;
+	}
+	FLOCKFILE(stream);
+	c = 'x'; /* Shut up gcc warning */
+	while (--n > 0 && (c = GETC(stream)) != EOF ) {
+		if (skipnextlf ) {
+			skipnextlf = 0;
+			if (c == '\n') {
+				/* Seeing a \n here with skipnextlf true
+				** means we saw a \r before.
+				*/
+				newlinetypes |= NEWLINE_CRLF;
+				c = GETC(stream);
+				if (c == EOF) break;
+			} else {
+				/*
+				** Note that c == EOF also brings us here,
+				** so we're okay if the last char in the file
+				** is a CR.
+				*/
+				newlinetypes |= NEWLINE_CR;
+			}
+		}
+		if (c == '\r') {
+			/* A \r is translated into a \n, and we skip
+			** an adjacent \n, if any. We don't set the
+			** newlinetypes flag until we've seen the next char.
+			*/
+			skipnextlf = 1;
+			c = '\n';
+		} else if ( c == '\n') {
+			newlinetypes |= NEWLINE_LF;
+		}
+		*p++ = c;
+		if (c == '\n') break;
+	}
+	if ( c == EOF && skipnextlf )
+		newlinetypes |= NEWLINE_CR;
+	FUNLOCKFILE(stream);
+	*p = '\0';
+	if (fobj) {
+		((PyFileObject *)fobj)->f_newlinetypes = newlinetypes;
+		((PyFileObject *)fobj)->f_skipnextlf = skipnextlf;
+	} else if ( skipnextlf ) {
+		/* If we have no file object we cannot save the
+		** skipnextlf flag. We have to readahead, which
+		** will cause a pause if we're reading from an
+		** interactive stream, but that is very unlikely
+		** unless we're doing something silly like
+		** execfile("/dev/tty").
+		*/
+		c = GETC(stream);
+		if ( c != '\n' )
+			ungetc(c, stream);
+	}
+	if (p == buf)
+		return NULL;
+	return buf;
+}
+
+/*
+** Py_UniversalNewlineFread is an fread variation that understands
+** all of \r, \n and \r\n conventions.
+** The stream should be opened in binary mode.
+** fobj must be a PyFileObject. In this case there
+** is no readahead but in stead a flag is used to skip a following
+** \n on the next read. Also, if the file is open in binary mode
+** the whole conversion is skipped. Finally, the routine keeps track of
+** the different types of newlines seen.
+*/
+size_t
+Py_UniversalNewlineFread(void *buf, size_t n,
+			 FILE *stream, PyObject *fobj)
+{
+	char *src = buf, *dst = buf, c;
+	int nread, ntodo=n;
+	int newlinetypes, skipnextlf, univ_newline;
+	
+	if (!fobj || !PyFile_Check(fobj)) {
+		errno = ENXIO;	/* What can you do... */
+		return -1;
+	}
+	univ_newline = ((PyFileObject *)fobj)->f_univ_newline;
+	if ( !univ_newline )
+		return fread(buf, 1, n, stream);
+	newlinetypes = ((PyFileObject *)fobj)->f_newlinetypes;
+	skipnextlf = ((PyFileObject *)fobj)->f_skipnextlf;
+	while (ntodo > 0) {
+		if (ferror(stream))
+			break;
+		nread = fread(dst, 1, ntodo, stream);
+		src = dst;
+		if (nread <= 0) {
+			if (skipnextlf)
+				newlinetypes |= NEWLINE_CR;
+			break;
+		}
+		ntodo -= nread;
+		while ( nread-- ) {
+			c = *src++;
+			if (c == '\r') {
+				/* Save CR as LF and set flag to skip next newline
+				*/
+				*dst++ = '\n';
+				skipnextlf = 1;
+			} else if (skipnextlf && c == '\n') {
+				/* Skip an LF, and remember that we saw CR LF
+				*/
+				skipnextlf = 0;
+				newlinetypes |= NEWLINE_CRLF;
+			} else {
+				/* Normal char to be stored in buffer. Also update
+				** the newlinetypes flag if either this is an LF
+				** or the previous char was a CR.
+				*/
+				if (c == '\n')
+					newlinetypes |= NEWLINE_LF;
+				else if (skipnextlf)
+					newlinetypes |= NEWLINE_CR;
+				*dst++ = c;
+				skipnextlf = 0;
+			}
+		}
+	}
+	((PyFileObject *)fobj)->f_newlinetypes = newlinetypes;
+	((PyFileObject *)fobj)->f_skipnextlf = skipnextlf;
+	return dst - (char *)buf;
+}
+#endif
author	Jack Jansen <jack.jansen@cwi.nl>	2002-04-14 20:12:41 (GMT)
committer	Jack Jansen <jack.jansen@cwi.nl>	2002-04-14 20:12:41 (GMT)
commit	7b8c7546ebc1fc3688ef95768fa8b82f0f205490 (patch)
tree	2e14b243347a9c38c82e2e774d4b201f23149916 /Objects
parent	dcd2dc2fffce8614c5d2b8d303a303a599b88a23 (diff)
download	cpython-7b8c7546ebc1fc3688ef95768fa8b82f0f205490.zip cpython-7b8c7546ebc1fc3688ef95768fa8b82f0f205490.tar.gz cpython-7b8c7546ebc1fc3688ef95768fa8b82f0f205490.tar.bz2