summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/test/test_imp.py 6
-rw-r--r-- Misc/NEWS 2
-rw-r--r-- Parser/tokenizer.c 41
-rw-r--r-- Parser/tokenizer.h 1
-rw-r--r-- Python/import.c 10
-rw-r--r-- Python/traceback.c 6
6 files changed, 43 insertions, 23 deletions
diff --git a/Lib/test/test_imp.py b/Lib/test/test_imp.py
index 83e17d3..88d2a3e 100644
--- a/Lib/test/test_imp.py
+++ b/Lib/test/test_imp.py
@@ -58,6 +58,12 @@ class ImportTests(unittest.TestCase):
with imp.find_module('module_' + mod, self.test_path)[0] as fd:
self.assertEqual(fd.encoding, encoding)
+ path = [os.path.dirname(__file__)]
+ self.assertRaisesRegex(SyntaxError,
+ r"Non-UTF-8 code starting with '\\xf6'"
+ r" in file .*badsyntax_pep3120.py",
+ imp.find_module, 'badsyntax_pep3120', path)
+
def test_issue1267(self):
for mod, encoding, _ in self.test_strings:
fp, filename, info = imp.find_module('module_' + mod,
diff --git a/Misc/NEWS b/Misc/NEWS
index 30d7c50..ef274eb 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
+- Issue #9319: Include the filename in "Non-UTF-8 code ..." syntax error.
+
- Issue #10785: Store the filename as Unicode in the Python parser.
- Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5edd958..f4d7e3f 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+ the file starts with a BOM.
- PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
- the first or second line of the file (in which case the encoding
- should be assumed to be PyUnicode_GetDefaultEncoding()).
+ PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+ encoding in the first or second line of the file (in which case the encoding
+ should be assumed to be UTF-8).
+
+ The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+ by the caller. */
- The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
- by the caller.
-*/
char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
struct tok_state *tok;
FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
return NULL;
}
#ifndef PGEN
- tok->filename = PyUnicode_FromString("<string>");
- if (tok->filename == NULL)
- goto error;
+ if (filename != NULL) {
+ Py_INCREF(filename);
+ tok->filename = filename;
+ }
+ else {
+ tok->filename = PyUnicode_FromString("<string>");
+ if (tok->filename == NULL) {
+ fclose(fp);
+ PyTokenizer_Free(tok);
+ return encoding;
+ }
+ }
#endif
while (tok->lineno < 2 && tok->done == E_OK) {
PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@ PyTokenizer_FindEncoding(int fd)
if (encoding)
strcpy(encoding, tok->encoding);
}
-#ifndef PGEN
-error:
-#endif
PyTokenizer_Free(tok);
return encoding;
}
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+ return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
#ifdef Py_DEBUG
void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 3a0d3cb..ed1f3aa 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
int len, int *offset);
-extern char * PyTokenizer_FindEncoding(int);
#ifdef __cplusplus
}
diff --git a/Python/import.c b/Python/import.c
index b074b83..4159a8e 100644
--- a/Python/import.c
+++ b/Python/import.c
@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
/* See _PyImport_FixupExtensionObject() below */
static PyObject *extensions = NULL;
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
+
/* This table is defined in config.c: */
extern struct _inittab _PyImport_Inittab[];
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
-
struct _inittab *PyImport_Inittab = _PyImport_Inittab;
/* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
}
if (fd != -1) {
if (strchr(fdp->mode, 'b') == NULL) {
- /* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
+ /* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
memory. */
- found_encoding = PyTokenizer_FindEncoding(fd);
+ found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
lseek(fd, 0, 0); /* Reset position */
if (found_encoding == NULL && PyErr_Occurred()) {
Py_XDECREF(pathobj);
diff --git a/Python/traceback.c b/Python/traceback.c
index f0142da..e74a147 100644
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -18,8 +18,8 @@
#define MAX_FRAME_DEPTH 100
#define MAX_NTHREADS 100
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
static PyObject *
tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)
/* use the right encoding to decode the file as unicode */
fd = PyObject_AsFileDescriptor(binary);
- found_encoding = PyTokenizer_FindEncoding(fd);
+ found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
lseek(fd, 0, 0); /* Reset position */
fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);