From fe7c5b5bdf7c21551b56be563fc604f2d4d3c756 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Tue, 5 Apr 2011 01:48:03 +0200
Subject: Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.

---
 Lib/test/test_imp.py |  6 ++++++
 Misc/NEWS            |  2 ++
 Parser/tokenizer.c   | 41 +++++++++++++++++++++++++++--------------
 Parser/tokenizer.h   |  1 -
 Python/import.c      | 10 +++++-----
 Python/traceback.c   |  6 +++---
 6 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/Lib/test/test_imp.py b/Lib/test/test_imp.py
index 83e17d3..88d2a3e 100644
--- a/Lib/test/test_imp.py
+++ b/Lib/test/test_imp.py
@@ -58,6 +58,12 @@ class ImportTests(unittest.TestCase):
             with imp.find_module('module_' + mod, self.test_path)[0] as fd:
                 self.assertEqual(fd.encoding, encoding)
 
+        path = [os.path.dirname(__file__)]
+        self.assertRaisesRegex(SyntaxError,
+            r"Non-UTF-8 code starting with '\\xf6'"
+            r" in file .*badsyntax_pep3120.py",
+            imp.find_module, 'badsyntax_pep3120', path)
+
     def test_issue1267(self):
         for mod, encoding, _ in self.test_strings:
             fp, filename, info = imp.find_module('module_' + mod,
diff --git a/Misc/NEWS b/Misc/NEWS
index 30d7c50..ef274eb 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
 
+- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
+
 - Issue #10785: Store the filename as Unicode in the Python parser.
 
 - Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5edd958..f4d7e3f 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
     return result;
 }
 
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.
 
-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
+
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */
 
-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
     struct tok_state *tok;
     FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
         return NULL;
     }
 #ifndef PGEN
-    tok->filename = PyUnicode_FromString("<string>");
-    if (tok->filename == NULL)
-        goto error;
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
+        tok->filename = PyUnicode_FromString("<string>");
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
 #endif
     while (tok->lineno < 2 && tok->done == E_OK) {
         PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@
         if (encoding)
             strcpy(encoding, tok->encoding);
     }
-#ifndef PGEN
-error:
-#endif
     PyTokenizer_Free(tok);
     return encoding;
 }
 
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
 #ifdef Py_DEBUG
 
 void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 3a0d3cb..ed1f3aa 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
 extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, int len,
                                           int *offset);
-extern char * PyTokenizer_FindEncoding(int);
 
 #ifdef __cplusplus
 }
diff --git a/Python/import.c b/Python/import.c
index b074b83..4159a8e 100644
--- a/Python/import.c
+++ b/Python/import.c
@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
 /* See _PyImport_FixupExtensionObject() below */
 static PyObject *extensions = NULL;
 
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
+
 /* This table is defined in config.c: */
 extern struct _inittab _PyImport_Inittab[];
 
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
-
 struct _inittab *PyImport_Inittab = _PyImport_Inittab;
 
 /* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
     }
     if (fd != -1) {
         if (strchr(fdp->mode, 'b') == NULL) {
-            /* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
+            /* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
                memory. */
-            found_encoding = PyTokenizer_FindEncoding(fd);
+            found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
             lseek(fd, 0, 0); /* Reset position */
             if (found_encoding == NULL && PyErr_Occurred()) {
                 Py_XDECREF(pathobj);
diff --git a/Python/traceback.c b/Python/traceback.c
index f0142da..e74a147 100644
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -18,8 +18,8 @@
 #define MAX_FRAME_DEPTH 100
 #define MAX_NTHREADS 100
 
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
 
 static PyObject *
 tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)
 
     /* use the right encoding to decode the file as unicode */
     fd = PyObject_AsFileDescriptor(binary);
-    found_encoding = PyTokenizer_FindEncoding(fd);
+    found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
     encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
     lseek(fd, 0, 0); /* Reset position */
     fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);
--
cgit v0.12
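
As a quick illustration of the behaviour this patch adds, the snippet below reproduces the new error message outside of the test suite. It is a minimal sketch based on the assertRaisesRegex test added to Lib/test/test_imp.py above; it assumes a CPython checkout or installation that ships Lib/test/badsyntax_pep3120.py, and using the test package's directory instead of the test module's own __file__ is an adaptation for standalone use, not part of the patch.

    import imp
    import os
    import test

    # Directory that contains badsyntax_pep3120.py (a module whose source
    # bytes are not valid UTF-8 and which carries no coding cookie).
    path = [os.path.dirname(test.__file__)]

    try:
        imp.find_module('badsyntax_pep3120', path)
    except SyntaxError as exc:
        # With this patch the message names the offending file, e.g.
        # "Non-UTF-8 code starting with '\xf6' in file .../badsyntax_pep3120.py ..."
        print(exc)

Before the change, this SyntaxError did not include the real name of the file that contained the non-UTF-8 bytes, which is what issue #9319 asked to fix.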