From a5136196bce72c51c79a5f961223b4645c90255c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Tue, 4 Sep 2007 14:19:28 +0000 Subject: Patch #1031213: Decode source line in SyntaxErrors back to its original source encoding. Will backport to 2.5. --- Lib/test/test_compiler.py | 26 ++++++++++++++++++++ Misc/ACKS | 1 + Misc/NEWS | 3 +++ Parser/parsetok.c | 18 ++++++++++---- Parser/tokenizer.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++ Parser/tokenizer.h | 2 ++ 6 files changed, 107 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_compiler.py b/Lib/test/test_compiler.py index 229d8a3..606ed70 100644 --- a/Lib/test/test_compiler.py +++ b/Lib/test/test_compiler.py @@ -155,6 +155,32 @@ class CompilerTest(unittest.TestCase): self.assertEquals(dct.get('result'), 1) + def _testErrEnc(self, src, text, offset): + try: + compile(src, "", "exec") + except SyntaxError, e: + self.assertEquals(e.offset, offset) + self.assertEquals(e.text, text) + + def testSourceCodeEncodingsError(self): + # Test SyntaxError with encoding definition + sjis = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n" + ascii = "print '12345678', '\n" + encdef = "#! 
-*- coding: ShiftJIS -*-\n" + + # ascii source without encdef + self._testErrEnc(ascii, ascii, 19) + + # ascii source with encdef + self._testErrEnc(encdef+ascii, ascii, 19) + + # non-ascii source with encdef + self._testErrEnc(encdef+sjis, sjis, 19) + + # ShiftJIS source without encdef + self._testErrEnc(sjis, sjis, 19) + + NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard) ############################################################################### diff --git a/Misc/ACKS b/Misc/ACKS index 3d73388..4204678 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -320,6 +320,7 @@ Lars Immisch Tony Ingraldi John Interrante Bob Ippolito +Atsuo Ishimoto Ben Jackson Paul Jackson David Jacobs diff --git a/Misc/NEWS b/Misc/NEWS index f8a875c..d99e7bc 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,9 @@ What's New in Python 2.6 alpha 1? Core and builtins ----------------- +- Patch #1031213: Decode source line in SyntaxErrors back to its original source + encoding. + - Py_ssize_t fields work in structmember when HAVE_LONG_LONG is not defined. 
- PEP 3123: Provide forward compatibility with Python 3.0, while keeping diff --git a/Parser/parsetok.c b/Parser/parsetok.c index c951396..f3d8462 100644 --- a/Parser/parsetok.c +++ b/Parser/parsetok.c @@ -218,16 +218,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, err_ret->error = E_EOF; err_ret->lineno = tok->lineno; if (tok->buf != NULL) { + char *text = NULL; size_t len; assert(tok->cur - tok->buf < INT_MAX); err_ret->offset = (int)(tok->cur - tok->buf); len = tok->inp - tok->buf; - err_ret->text = (char *) PyObject_MALLOC(len + 1); - if (err_ret->text != NULL) { - if (len > 0) - strncpy(err_ret->text, tok->buf, len); - err_ret->text[len] = '\0'; +#ifdef Py_USING_UNICODE + text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset); + +#endif + if (text == NULL) { + text = (char *) PyObject_MALLOC(len + 1); + if (text != NULL) { + if (len > 0) + strncpy(text, tok->buf, len); + text[len] = '\0'; + } } + err_ret->text = text; } } else if (tok->encoding != NULL) { node* r = PyNode_New(encoding_decl); diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 8654356..28fcf3c 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) return result; } +/* This function is only called from parsetok. However, it cannot live + there, as it must be empty for PGEN, and we can check for PGEN only + in this file. 
*/
+
+#ifdef PGEN
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
+{
+	return NULL;
+}
+#else
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+	PyObject *ret = NULL;
+	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+	if (unicode_text) {
+		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+		Py_DECREF(unicode_text);
+	}
+	if (!ret) {
+		PyErr_Print();
+	}
+	return ret;
+}
+
+char *
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
+{
+	char *text = NULL;
+	if (tok->encoding) {
+		/* convert source to original encoding */
+		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+		if (lineobj != NULL) {
+			int linelen = PyString_Size(lineobj);
+			const char *line = PyString_AsString(lineobj);
+			text = PyObject_MALLOC(linelen + 1);
+			if (text != NULL && line != NULL) {
+				if (linelen)
+					strncpy(text, line, linelen);
+				text[linelen] = '\0';
+			}
+			Py_DECREF(lineobj);
+
+			/* adjust error offset */
+			if (*offset > 1) {
+				PyObject *offsetobj = dec_utf8(tok->encoding,
+							       tok->buf, *offset-1);
+				if (offsetobj) {
+					*offset = PyString_Size(offsetobj) + 1;
+					Py_DECREF(offsetobj);
+				}
+			}
+
+		}
+	}
+	return text;
+
+}
+#endif
+
+
+
 #ifdef Py_DEBUG
 
 void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 5e7ebf7..8482cdd 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *);
 extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
 extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
+extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
+					  int len, int *offset);
 
 #ifdef __cplusplus
 }
-- 
cgit v0.12