4 files changed, 218 insertions, 81 deletions
diff --git a/Lib/test/test_traceback.py b/Lib/test/test_traceback.py
index 3f69e5e..c44e2b1 100644
--- a/Lib/test/test_traceback.py
+++ b/Lib/test/test_traceback.py
@@ -6,6 +6,7 @@ import sys
 import unittest
 import re
 from test.support import run_unittest, is_jython, Error, captured_output
+from test.support import TESTFN, unlink
 
 import traceback
 
@@ -90,6 +91,70 @@ class SyntaxTracebackCases(unittest.TestCase):
         err = traceback.format_exception_only(None, None)
         self.assertEqual(err, ['None\n'])
 
+    def test_encoded_file(self):
+        # Test that tracebacks are correctly printed for encoded source files:
+        # - correct line number (Issue2384)
+        # - respect file encoding (Issue3975)
+        import sys, subprocess
+
+        # The spawned subprocess has its stdout redirected to a PIPE, and its
+        # encoding may be different from the current interpreter, on Windows
+        # at least.
+        process = subprocess.Popen(
+            [sys.executable, "-c", "import sys; print(sys.stdout.encoding)"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        stdout, stderr = process.communicate()
+        output_encoding = str(stdout, 'ascii').splitlines()[0]
+
+        def do_test(firstlines, message, charset, lineno):
+            # Run a script that raises RuntimeError(message) in a subprocess
+            # and check line number, source line and message of its traceback.
+            try:
+                # 'with' closes the script even if writing it fails.
+                with open(TESTFN, "w", encoding=charset) as output:
+                    output.write("""{0}if 1:
+                    import traceback;
+                    raise RuntimeError('{1}')
+                    """.format(firstlines, message))
+                process = subprocess.Popen([sys.executable, TESTFN],
+                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+                stdout, stderr = process.communicate()
+                stdout = stdout.decode(output_encoding).splitlines()
+            finally:
+                unlink(TESTFN)
+
+            # The source lines are encoded with the 'backslashreplace' handler
+            encoded_message = message.encode(output_encoding,
+                                             'backslashreplace')
+            # and we just decoded them with the output_encoding.
+            message_ascii = encoded_message.decode(output_encoding)
+
+            err_line = "raise RuntimeError('{0}')".format(message_ascii)
+            err_msg = "RuntimeError: {0}".format(message_ascii)
+
+            self.assertTrue(("line %s" % lineno) in stdout[1],
+                "Invalid line number: {0!r} instead of {1}".format(
+                    stdout[1], lineno))
+            self.assertTrue(stdout[2].endswith(err_line),
+                "Invalid traceback line: {0!r} instead of {1!r}".format(
+                    stdout[2], err_line))
+            self.assertTrue(stdout[3] == err_msg,
+                "Invalid error message: {0!r} instead of {1!r}".format(
+                    stdout[3], err_msg))
+
+        do_test("", "foo", "ascii", 3)
+        for charset in ("ascii", "iso-8859-1", "utf-8", "GBK"):
+            if charset == "ascii":
+                text = "foo"
+            elif charset == "GBK":
+                text = "\u4E02\u5100"
+            else:
+                text = "h\xe9 ho"
+            do_test("# coding: {0}\n".format(charset),
+                    text, charset, 4)
+            do_test("#!shebang\n# coding: {0}\n".format(charset),
+                    text, charset, 5)
+
 
 class TracebackFormatTests(unittest.TestCase):
 
diff --git a/Misc/NEWS b/Misc/NEWS
index 2505bfa..7bad53ce 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -15,6 +15,10 @@ What's New in Python 3.0 beta 5
 Core and Builtins
 -----------------
 
+- Issues #2384 and #3975: Tracebacks were not correctly printed when the
+  source file contained a ``coding:`` header: the wrong line was displayed, and
+  the encoding was not respected.
+
 - Issue #3740: Null-initialize module state.
 
 - Issue #3946: PyObject_CheckReadBuffer crashed on a memoryview object.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 18815ae..4edf6d0 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -461,6 +461,14 @@ fp_setreadl(struct tok_state *tok, const char* enc)
 	readline = PyObject_GetAttrString(stream, "readline");
 	tok->decoding_readline = readline;
 
+	/* The file has been reopened; parsing will restart from
+	 * the beginning of the file, so we have to reset the line number.
+	 * But this function has been called from inside tok_nextc() which
+	 * will increment lineno before it returns. So we set it to -1 so that
+	 * the next call to tok_nextc() will start with tok->lineno == 0.
+	 */
+	tok->lineno = -1;
+
   cleanup:
 	Py_XDECREF(stream);
 	Py_XDECREF(io);
diff --git a/Python/traceback.c b/Python/traceback.c
index dffce35..63ecc3c 100644
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -8,9 +8,15 @@
 #include "structmember.h"
 #include "osdefs.h"
 #include "traceback.h"
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
 
 #define OFF(x) offsetof(PyTracebackObject, x)
 
+/* Method from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncoding(int);
+
 static PyObject *
 tb_dir(PyTracebackObject *self)
 {
@@ -128,102 +134,156 @@ PyTraceBack_Here(PyFrameObject *frame)
 	return 0;
 }
 
+/* Search every sys.path entry for the tail of filename.  Return an open
+ * fd and leave the path found in namebuf, or return -1 if none opens. */
+static int
+_Py_FindSourceFile(const char* filename, char* namebuf, size_t namelen, int open_flags)
+{
+	int i, npath, fd = -1;
+	PyObject *v, *syspath;
+	size_t taillen;
+	const char *path, *tail;
+	Py_ssize_t _npath, len;
+
+	/* Search tail of filename in sys.path before giving up */
+	tail = strrchr(filename, SEP);
+	if (tail == NULL)
+		tail = filename;
+	else
+		tail++;
+	taillen = strlen(tail);
+
+	syspath = PySys_GetObject("path");
+	if (syspath == NULL || !PyList_Check(syspath))
+		return -1;
+	_npath = PyList_Size(syspath);
+	npath = Py_SAFE_DOWNCAST(_npath, Py_ssize_t, int);
+
+	for (i = 0; i < npath; i++) {
+		v = PyList_GetItem(syspath, i);
+		if (v == NULL) {
+			PyErr_Clear();
+			break;
+		}
+		if (!PyUnicode_Check(v))
+			continue;
+		path = _PyUnicode_AsStringAndSize(v, &len);
+		if (path == NULL) { /* conversion failed, len is unset */
+			PyErr_Clear();
+			continue;
+		}
+		if (len + 1 + taillen >= (Py_ssize_t)namelen - 1)
+			continue; /* Too long */
+		strcpy(namebuf, path);
+		if (strlen(namebuf) != (size_t)len)
+			continue; /* v contains '\0' */
+		if (len > 0 && namebuf[len-1] != SEP)
+			namebuf[len++] = SEP;
+		strcpy(namebuf+len, tail);
+		Py_BEGIN_ALLOW_THREADS
+		fd = open(namebuf, open_flags);
+		Py_END_ALLOW_THREADS
+		if (fd >= 0)
+			return fd;
+	}
+	return -1;
+}
+
 int
 _Py_DisplaySourceLine(PyObject *f, const char *filename, int lineno, int indent)
 {
 	int err = 0;
-	FILE *xfp = NULL;
-	char linebuf[2000];
+	int fd;
 	int i;
-	char namebuf[MAXPATHLEN+1];
+	char *found_encoding;
+	char *encoding;
+	PyObject *fob = NULL;
+	PyObject *lineobj = NULL;
+#ifdef O_BINARY
+	const int open_flags = O_RDONLY | O_BINARY;   /* necessary for Windows */
+#else
+	const int open_flags = O_RDONLY;
+#endif
+	char buf[MAXPATHLEN+1]; /* path found via sys.path, later the indent spaces */
+	Py_UNICODE *u, *p;
+	Py_ssize_t len;
 
+	/* open the file, falling back to a sys.path search; give up silently */
 	if (filename == NULL)
-		return -1;
-	xfp = fopen(filename, "r" PY_STDIOTEXTMODE);
-	if (xfp == NULL) {
-		/* Search tail of filename in sys.path before giving up */
-		PyObject *path;
-		const char *tail = strrchr(filename, SEP);
-		if (tail == NULL)
-			tail = filename;
-		else
-			tail++;
-		path = PySys_GetObject("path");
-		if (path != NULL && PyList_Check(path)) {
-			Py_ssize_t _npath = PyList_Size(path);
-			int npath = Py_SAFE_DOWNCAST(_npath, Py_ssize_t, int);
-			size_t taillen = strlen(tail);
-			for (i = 0; i < npath; i++) {
-				PyObject *v = PyList_GetItem(path, i);
-				if (v == NULL) {
-					PyErr_Clear();
-					break;
-				}
-				if (PyBytes_Check(v)) {
-					size_t len;
-					len = PyBytes_GET_SIZE(v);
-					if (len + 1 + taillen >= MAXPATHLEN)
-						continue; /* Too long */
-					strcpy(namebuf, PyBytes_AsString(v));
-					if (strlen(namebuf) != len)
-						continue; /* v contains '\0' */
-					if (len > 0 && namebuf[len-1] != SEP)
-						namebuf[len++] = SEP;
-					strcpy(namebuf+len, tail);
-					xfp = fopen(namebuf, "r" PY_STDIOTEXTMODE);
-					if (xfp != NULL) {
-						filename = namebuf;
-						break;
-					}
-				}
-			}
-		}
+		return 0;
+	Py_BEGIN_ALLOW_THREADS
+	fd = open(filename, open_flags);
+	Py_END_ALLOW_THREADS
+	if (fd < 0) {
+		fd = _Py_FindSourceFile(filename, buf, sizeof(buf), open_flags);
+		if (fd < 0)
+			return 0;
+		filename = buf;
 	}
 
-        if (xfp == NULL)
-            return err;
-        if (err != 0) {
-            fclose(xfp);
-            return err;
-        }
+	/* decode with the encoding the tokenizer finds, else the default one */
+	found_encoding = PyTokenizer_FindEncoding(fd);
+	encoding = (found_encoding != NULL) ? found_encoding :
+		(char*)PyUnicode_GetDefaultEncoding();
+	lseek(fd, 0, 0); /* Reset position: offset 0, whence 0 */
+	fob = PyFile_FromFd(fd, (char*)filename, "r", -1, (char*)encoding,
+		NULL, NULL, 1);
+	PyMem_FREE(found_encoding);
+	if (fob == NULL) {
+		PyErr_Clear();
+		close(fd);
+		return 0;
+	}
 
+	/* read lineno lines; the last one read is the line to display */
 	for (i = 0; i < lineno; i++) {
-		char* pLastChar = &linebuf[sizeof(linebuf)-2];
-		do {
-			*pLastChar = '\0';
-			if (Py_UniversalNewlineFgets(linebuf, sizeof linebuf, xfp, NULL) == NULL)
-				break;
-			/* fgets read *something*; if it didn't get as
-			   far as pLastChar, it must have found a newline
-			   or hit the end of the file;	if pLastChar is \n,
-			   it obviously found a newline; else we haven't
-			   yet seen a newline, so must continue */
-		} while (*pLastChar != '\0' && *pLastChar != '\n');
+		Py_XDECREF(lineobj);
+		lineobj = PyFile_GetLine(fob, -1);
+		if (!lineobj) {
+			err = -1;
+			break;
+		}
 	}
-	if (i == lineno) {
-		char buf[11];
-		char *p = linebuf;
-		while (*p == ' ' || *p == '\t' || *p == '\014')
-			p++;
-
-		/* Write some spaces before the line */
-		strcpy(buf, "          ");
-		assert (strlen(buf) == 10);
-		while (indent > 0) {
-			if(indent < 10)
-				buf[indent] = '\0';
-			err = PyFile_WriteString(buf, f);
-			if (err != 0)
-				break;
-			indent -= 10;
+	Py_DECREF(fob);
+	if (!lineobj || !PyUnicode_Check(lineobj)) {
+		Py_XDECREF(lineobj);
+		return err;
+	}
+
+	/* remove the indentation of the line (spaces, tabs and form feeds) */
+	u = PyUnicode_AS_UNICODE(lineobj);
+	len = PyUnicode_GET_SIZE(lineobj);
+	for (p=u; *p == ' ' || *p == '\t' || *p == '\014'; p++)
+		len--;
+	if (u != p) {
+		PyObject *truncated;
+		truncated = PyUnicode_FromUnicode(p, len);
+		if (truncated) {
+			Py_DECREF(lineobj);
+			lineobj = truncated;
+		} else {
+			PyErr_Clear();
 		}
+	}
 
-		if (err == 0)
-			err = PyFile_WriteString(p, f);
-		if (err == 0 && strchr(p, '\n') == NULL)
-			err = PyFile_WriteString("\n", f);
+	/* Write indent spaces before the line, at most 10 per write */
+	strcpy(buf, "          ");
+	assert (strlen(buf) == 10);
+	while (indent > 0) {
+		if(indent < 10)
+			buf[indent] = '\0';
+		err = PyFile_WriteString(buf, f);
+		if (err != 0)
+			break;
+		indent -= 10;
 	}
-	fclose(xfp);
+
+	/* finally display the line, followed by a newline */
+	if (err == 0)
+		err = PyFile_WriteObject(lineobj, f, Py_PRINT_RAW);
+	Py_DECREF(lineobj);
+	if  (err == 0)
+		err = PyFile_WriteString("\n", f);
 	return err;
 }