Patch #1031213: Decode source line in SyntaxErrors back to its original
source encoding. Will backport to 2.5.
author: Martin v. Löwis <martin@v.loewis.de> 2007-09-04 14:19:28 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2007-09-04 14:19:28 (GMT)
commit: a5136196bce72c51c79a5f961223b4645c90255c (patch)
tree: 552aefbadb426b866df79421bb0e7e953dec47c9 /Parser
parent: 58bd49f5fec11751806d869a8479f59e13d2d558 (diff)
download: cpython-a5136196bce72c51c79a5f961223b4645c90255c.zip
cpython-a5136196bce72c51c79a5f961223b4645c90255c.tar.gz
cpython-a5136196bce72c51c79a5f961223b4645c90255c.tar.bz2
3 files changed, 77 insertions, 5 deletions
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index c951396..f3d8462 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -218,16 +218,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 			err_ret->error = E_EOF;
 		err_ret->lineno = tok->lineno;
 		if (tok->buf != NULL) {
+			char *text = NULL;
 			size_t len;
 			assert(tok->cur - tok->buf < INT_MAX);
 			err_ret->offset = (int)(tok->cur - tok->buf);
 			len = tok->inp - tok->buf;
-			err_ret->text = (char *) PyObject_MALLOC(len + 1);
-			if (err_ret->text != NULL) {
-				if (len > 0)
-					strncpy(err_ret->text, tok->buf, len);
-				err_ret->text[len] = '\0';
+#ifdef Py_USING_UNICODE
+			text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
+
+#endif
+			if (text == NULL) {
+				text = (char *) PyObject_MALLOC(len + 1);
+				if (text != NULL) {
+					if (len > 0)
+						strncpy(text, tok->buf, len);
+					text[len] = '\0';
+				}
 			}
+			err_ret->text = text;
 		}
 	} else if (tok->encoding != NULL) {
 		node* r = PyNode_New(encoding_decl);
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 8654356..28fcf3c 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 	return result;
 }
 
+/* This function is only called from parsetok. However, it cannot live
+   there, as it must be empty for PGEN, and we can check for PGEN only
+   in this file. */
+
+#ifdef PGEN
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
+{
+	return NULL;	/* PGEN build: no conversion is attempted; *offset is left unchanged */
+}
+#else
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {	/* decode len bytes of UTF-8 text, re-encode as enc; NULL on failure */
+	PyObject *ret = NULL;	
+	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+	if (unicode_text) {
+		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+		Py_DECREF(unicode_text);	/* the intermediate unicode object is no longer needed */
+	}
+	if (!ret) {
+		PyErr_Print();	/* the failure is reported here; the caller only sees NULL */
+	}
+	return ret;
+}
+
+char *
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
+{	/* Returns a PyObject_MALLOC'ed, NUL-terminated copy of the first len bytes of tok->buf re-encoded to tok->encoding, or NULL. */
+	char *text = NULL;	/* stays NULL when tok->encoding is unset or when conversion or allocation fails */
+	if (tok->encoding) {
+		/* convert source to original encoding */
+		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+		if (lineobj != NULL) {
+			int linelen = PyString_Size(lineobj);
+			const char *line = PyString_AsString(lineobj);
+			text = PyObject_MALLOC(linelen + 1);
+			if (text != NULL && line != NULL) {	/* NOTE(review): if line is NULL while text is not, text is returned unfilled -- confirm this cannot happen */
+				if (linelen)
+					strncpy(text, line, linelen);
+				text[linelen] = '\0';
+			}
+			Py_DECREF(lineobj);
+					
+			/* adjust error offset: re-encode the *offset-1 bytes before the error and use that length + 1 */
+			if (*offset > 1) {
+				PyObject *offsetobj = dec_utf8(tok->encoding, 
+							       tok->buf, *offset-1);
+				if (offsetobj) {
+					*offset = PyString_Size(offsetobj) + 1;
+					Py_DECREF(offsetobj);
+				}
+			}
+			
+		}
+	}
+	return text;
+
+}
+#endif
+
+			   
+
 #ifdef Py_DEBUG
 
 void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 5e7ebf7..8482cdd 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *);
 extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
 extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
+extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 
+					  int len, int *offset);
 
 #ifdef __cplusplus
 }
author	Martin v. Löwis <martin@v.loewis.de>	2007-09-04 14:19:28 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2007-09-04 14:19:28 (GMT)
commit	a5136196bce72c51c79a5f961223b4645c90255c (patch)
tree	552aefbadb426b866df79421bb0e7e953dec47c9 /Parser
parent	58bd49f5fec11751806d869a8479f59e13d2d558 (diff)
download	cpython-a5136196bce72c51c79a5f961223b4645c90255c.zip cpython-a5136196bce72c51c79a5f961223b4645c90255c.tar.gz cpython-a5136196bce72c51c79a5f961223b4645c90255c.tar.bz2