summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin v. Löwis <martin@v.loewis.de>2007-09-04 14:19:28 (GMT)
committerMartin v. Löwis <martin@v.loewis.de>2007-09-04 14:19:28 (GMT)
commita5136196bce72c51c79a5f961223b4645c90255c (patch)
tree552aefbadb426b866df79421bb0e7e953dec47c9
parent58bd49f5fec11751806d869a8479f59e13d2d558 (diff)
downloadcpython-a5136196bce72c51c79a5f961223b4645c90255c.zip
cpython-a5136196bce72c51c79a5f961223b4645c90255c.tar.gz
cpython-a5136196bce72c51c79a5f961223b4645c90255c.tar.bz2
Patch #1031213: Decode source line in SyntaxErrors back to its original
source encoding. Will backport to 2.5.
-rw-r--r--Lib/test/test_compiler.py26
-rw-r--r--Misc/ACKS1
-rw-r--r--Misc/NEWS3
-rw-r--r--Parser/parsetok.c18
-rw-r--r--Parser/tokenizer.c62
-rw-r--r--Parser/tokenizer.h2
6 files changed, 107 insertions, 5 deletions
diff --git a/Lib/test/test_compiler.py b/Lib/test/test_compiler.py
index 229d8a3..606ed70 100644
--- a/Lib/test/test_compiler.py
+++ b/Lib/test/test_compiler.py
@@ -155,6 +155,32 @@ class CompilerTest(unittest.TestCase):
self.assertEquals(dct.get('result'), 1)
+ def _testErrEnc(self, src, text, offset):
+ try:
+ compile(src, "", "exec")
+ except SyntaxError, e:
+ self.assertEquals(e.offset, offset)
+ self.assertEquals(e.text, text)
+
+ def testSourceCodeEncodingsError(self):
+ # Test SyntaxError with encoding definition
+ sjis = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n"
+ ascii = "print '12345678', '\n"
+ encdef = "#! -*- coding: ShiftJIS -*-\n"
+
+ # ascii source without encdef
+ self._testErrEnc(ascii, ascii, 19)
+
+ # ascii source with encdef
+ self._testErrEnc(encdef+ascii, ascii, 19)
+
+ # non-ascii source with encdef
+ self._testErrEnc(encdef+sjis, sjis, 19)
+
+ # ShiftJIS source without encdef
+ self._testErrEnc(sjis, sjis, 19)
+
+
NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard)
###############################################################################
diff --git a/Misc/ACKS b/Misc/ACKS
index 3d73388..4204678 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -320,6 +320,7 @@ Lars Immisch
Tony Ingraldi
John Interrante
Bob Ippolito
+Atsuo Ishimoto
Ben Jackson
Paul Jackson
David Jacobs
diff --git a/Misc/NEWS b/Misc/NEWS
index f8a875c..d99e7bc 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 2.6 alpha 1?
Core and builtins
-----------------
+- Patch #1031213: Decode source line in SyntaxErrors back to its original source
+ encoding.
+
- Py_ssize_t fields work in structmember when HAVE_LONG_LONG is not defined.
- PEP 3123: Provide forward compatibility with Python 3.0, while keeping
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index c951396..f3d8462 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -218,16 +218,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
err_ret->error = E_EOF;
err_ret->lineno = tok->lineno;
if (tok->buf != NULL) {
+ char *text = NULL;
size_t len;
assert(tok->cur - tok->buf < INT_MAX);
err_ret->offset = (int)(tok->cur - tok->buf);
len = tok->inp - tok->buf;
- err_ret->text = (char *) PyObject_MALLOC(len + 1);
- if (err_ret->text != NULL) {
- if (len > 0)
- strncpy(err_ret->text, tok->buf, len);
- err_ret->text[len] = '\0';
+#ifdef Py_USING_UNICODE
+ text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
+
+#endif
+ if (text == NULL) {
+ text = (char *) PyObject_MALLOC(len + 1);
+ if (text != NULL) {
+ if (len > 0)
+ strncpy(text, tok->buf, len);
+ text[len] = '\0';
+ }
}
+ err_ret->text = text;
}
} else if (tok->encoding != NULL) {
node* r = PyNode_New(encoding_decl);
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 8654356..28fcf3c 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
+/* This function is only called from parsetok. However, it cannot live
+ there, as it must be empty for PGEN, and we can check for PGEN only
+ in this file. */
+
+#ifdef PGEN
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
+{
+ return NULL;
+}
+#else
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+ PyObject *ret = NULL;
+ PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+ if (unicode_text) {
+ ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+ Py_DECREF(unicode_text);
+ }
+ if (!ret) {
+ PyErr_Print();
+ }
+ return ret;
+}
+
+char *
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
+{
+ char *text = NULL;
+ if (tok->encoding) {
+ /* convert source to original encoding */
+ PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+ if (lineobj != NULL) {
+ int linelen = PyString_Size(lineobj);
+ const char *line = PyString_AsString(lineobj);
+ text = PyObject_MALLOC(linelen + 1);
+ if (text != NULL && line != NULL) {
+ if (linelen)
+ strncpy(text, line, linelen);
+ text[linelen] = '\0';
+ }
+ Py_DECREF(lineobj);
+
+ /* adjust error offset */
+ if (*offset > 1) {
+ PyObject *offsetobj = dec_utf8(tok->encoding,
+ tok->buf, *offset-1);
+ if (offsetobj) {
+ *offset = PyString_Size(offsetobj) + 1;
+ Py_DECREF(offsetobj);
+ }
+ }
+
+ }
+ }
+ return text;
+
+}
+#endif
+
+
+
#ifdef Py_DEBUG
void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 5e7ebf7..8482cdd 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *);
extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
extern void PyTokenizer_Free(struct tok_state *);
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
+extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
+ int len, int *offset);
#ifdef __cplusplus
}