Patch #534304: Implement phase 1 of PEP 263.

author: Martin v. Löwis <martin@v.loewis.de> 2002-08-04 17:29:52 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2002-08-04 17:29:52 (GMT)
commit: 00f1e3f5a54adb0a7159a446edeca2e36da4092e (patch)
tree: 86d731c4fd7c0141686044f5eefbb7f475da9647 /Python
parent: a729daf2e43f3ffa2d1b3b6cd31491c840091e66 (diff)
download: cpython-00f1e3f5a54adb0a7159a446edeca2e36da4092e.zip
cpython-00f1e3f5a54adb0a7159a446edeca2e36da4092e.tar.gz
cpython-00f1e3f5a54adb0a7159a446edeca2e36da4092e.tar.bz2
3 files changed, 143 insertions, 14 deletions
diff --git a/Python/compile.c b/Python/compile.c
index 3a0948e..512b5a3 100644
--- a/Python/compile.c
+++ b/Python/compile.c
@@ -485,6 +485,7 @@ struct compiling {
 	int c_closure;		/* Is nested w/freevars? */
 	struct symtable *c_symtable; /* pointer to module symbol table */
         PyFutureFeatures *c_future; /* pointer to module's __future__ */
+	char *c_encoding;	/* source encoding (a borrowed reference) */
 };
 
 static int
@@ -1182,6 +1183,23 @@ parsenumber(struct compiling *co, char *s)
 }
 
 static PyObject *
+decode_utf8(char **sPtr, char *end, char* encoding)	/* decode a run of high-bit bytes at *sPtr as UTF-8, then encode it with `encoding` */
+{
+	PyObject *u, *v;	/* u: result of the UTF-8 decode; v: result of the encode step */
+	char *s, *t;		/* s: scan position; t: start of the run */
+	t = s = *sPtr;
+	/* while (s < end && *s != '\\') s++; */ /* inefficient for u".." */
+	while (s < end && (*s & 0x80)) s++;	/* stop at `end` or at the first byte without the high bit set */
+	*sPtr = s;		/* the caller's pointer is advanced before decoding, so it moves even on failure */
+	u = PyUnicode_DecodeUTF8(t, s - t, NULL);
+	if (u == NULL)
+		return NULL;	/* decode failed; NULL is handed back to the caller */
+	v = PyUnicode_AsEncodedString(u, encoding, NULL);
+	Py_DECREF(u);		/* drop the intermediate object whether or not encoding succeeded */
+	return v;		/* returned as-is; the caller checks for NULL */
+}
+
+static PyObject *
 parsestr(struct compiling *com, char *s)
 {
 	PyObject *v;
@@ -1193,6 +1211,8 @@ parsestr(struct compiling *com, char *s)
 	int first = *s;
 	int quote = first;
 	int rawmode = 0;
+	char* encoding = ((com == NULL) ? NULL : com->c_encoding);
+	int need_encoding;
 	int unicode = 0;
 
 	if (isalpha(quote) || quote == '_') {
@@ -1230,28 +1250,101 @@ parsestr(struct compiling *com, char *s)
 	}
 #ifdef Py_USING_UNICODE
 	if (unicode || Py_UnicodeFlag) {
+		PyObject *u, *w;
+		if (encoding == NULL) {
+			buf = s;
+			u = NULL;
+		} else if (strcmp(encoding, "iso-8859-1") == 0) {
+			buf = s;
+			u = NULL;
+		} else {
+			/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
+			u = PyString_FromStringAndSize((char *)NULL, len * 4);
+			if (u == NULL)
+				return NULL;
+			p = buf = PyString_AsString(u);
+			end = s + len;
+			while (s < end) {
+				if (*s == '\\') {
+					*p++ = *s++;
+					if (*s & 0x80) {
+						strcpy(p, "u005c");
+						p += 5;
+					}
+				}
+				if (*s & 0x80) { /* XXX inefficient */
+					char *r;
+					int rn, i;
+					w = decode_utf8(&s, end, "utf-16-be");
+					if (w == NULL) {
+						Py_DECREF(u);
+						return NULL;
+					}
+					r = PyString_AsString(w);
+					rn = PyString_Size(w);
+					assert(rn % 2 == 0);
+					for (i = 0; i < rn; i += 2) {
+						sprintf(p, "\\u%02x%02x",
+							r[i + 0] & 0xFF,
+							r[i + 1] & 0xFF);
+						p += 6;
+					}
+					Py_DECREF(w);
+				} else {
+					*p++ = *s++;
+				}
+			}
+			len = p - buf;
+		}
 		if (rawmode)
-			v = PyUnicode_DecodeRawUnicodeEscape(
-				 s, len, NULL);
+			v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL);
 		else
-			v = PyUnicode_DecodeUnicodeEscape(
-				s, len, NULL);
+			v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL);
+		Py_XDECREF(u);
 		if (v == NULL)
 			PyErr_SyntaxLocation(com->c_filename, com->c_lineno);
 		return v;
 			
 	}
 #endif
-	if (rawmode || strchr(s, '\\') == NULL)
-		return PyString_FromStringAndSize(s, len);
-	v = PyString_FromStringAndSize((char *)NULL, len);
+	need_encoding = (encoding != NULL &&
+			 strcmp(encoding, "utf-8") != 0 &&
+			 strcmp(encoding, "iso-8859-1") != 0);
+	if (rawmode || strchr(s, '\\') == NULL) {
+		if (need_encoding) {
+			PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL);
+			if (u == NULL)
+				return NULL;
+			v = PyUnicode_AsEncodedString(u, encoding, NULL);
+			Py_DECREF(u);
+			return v;
+		} else {
+			return PyString_FromStringAndSize(s, len);
+		}
+	}
+	v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */
+				       need_encoding ? len * 4 : len);
 	if (v == NULL)
 		return NULL;
 	p = buf = PyString_AsString(v);
 	end = s + len;
 	while (s < end) {
 		if (*s != '\\') {
-			*p++ = *s++;
+		  ORDINAL: 
+			if (need_encoding && (*s & 0x80)) {
+				char *r;
+				int rn;
+				PyObject* w = decode_utf8(&s, end, encoding);
+				if (w == NULL)
+					return NULL;
+				r = PyString_AsString(w);
+				rn = PyString_Size(w);
+				memcpy(p, r, rn);
+				p += rn;
+				Py_DECREF(w);
+			} else {
+				*p++ = *s++;
+			}
 			continue;
 		}
 		s++;
@@ -1320,8 +1413,8 @@ parsestr(struct compiling *com, char *s)
 #endif
 		default:
 			*p++ = '\\';
-			*p++ = s[-1];
-			break;
+			s--;
+			goto ORDINAL;
 		}
 	}
 	_PyString_Resize(&v, (int)(p - buf));
@@ -4149,6 +4242,12 @@ jcompile(node *n, char *filename, struct compiling *base,
 	PyCodeObject *co;
 	if (!com_init(&sc, filename))
 		return NULL;
+	if (TYPE(n) == encoding_decl) {
+		sc.c_encoding = STR(n);
+		n = CHILD(n, 0);
+	} else {
+		sc.c_encoding = NULL;
+	}
 	if (base) {
 		sc.c_private = base->c_private;
 		sc.c_symtable = base->c_symtable;
@@ -4157,6 +4256,10 @@ jcompile(node *n, char *filename, struct compiling *base,
 		    || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
 			sc.c_nested = 1;
 		sc.c_flags |= base->c_flags & PyCF_MASK;
+		if (base->c_encoding != NULL) {
+			assert(sc.c_encoding == NULL);
+			sc.c_encoding = base->c_encoding;
+		}
 	} else {
 		sc.c_private = NULL;
 		sc.c_future = PyNode_Future(n, filename);
diff --git a/Python/graminit.c b/Python/graminit.c
index ef7d467..98bad94 100644
--- a/Python/graminit.c
+++ b/Python/graminit.c
@@ -1463,7 +1463,17 @@ static state states_66[2] = {
 	{1, arcs_66_0},
 	{2, arcs_66_1},
 };
-static dfa dfas[67] = {
+static arc arcs_67_0[1] = {	/* arc list used by states_67[0] */
+	{12, 1},
+};
+static arc arcs_67_1[1] = {	/* arc list used by states_67[1] */
+	{0, 1},
+};
+static state states_67[2] = {	/* state table referenced by the "encoding_decl" (323) entry in dfas[] */
+	{1, arcs_67_0},
+	{1, arcs_67_1},
+};
+static dfa dfas[68] = {
 	{256, "single_input", 0, 3, states_0,
 	 "\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"},
 	{257, "file_input", 0, 2, states_1,
@@ -1598,8 +1608,10 @@ static dfa dfas[67] = {
 	 "\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"},
 	{322, "testlist1", 0, 2, states_66,
 	 "\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"},
+	{323, "encoding_decl", 0, 2, states_67,
+	 "\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"},
 };
-static label labels[148] = {
+static label labels[149] = {
 	{0, "EMPTY"},
 	{256, 0},
 	{4, 0},
@@ -1748,10 +1760,11 @@ static label labels[148] = {
 	{318, 0},
 	{319, 0},
 	{321, 0},
+	{323, 0},
 };
 grammar _PyParser_Grammar = {
-	67,
+	68,	/* matches the declared size of dfas[] */
 	dfas,
-	{148, labels},
+	{149, labels},	/* matches the declared size of labels[] */
 	256
 };
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index b1fde29..006ff08 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -1221,6 +1221,7 @@ static void
 err_input(perrdetail *err)
 {
 	PyObject *v, *w, *errtype;
+	PyObject* u = NULL;
 	char *msg = NULL;
 	errtype = PyExc_SyntaxError;
 	v = Py_BuildValue("(ziiz)", err->filename,
@@ -1272,12 +1273,24 @@ err_input(perrdetail *err)
 		errtype = PyExc_IndentationError;
 		msg = "too many levels of indentation";
 		break;
+	case E_DECODE: {	/* XXX */
+		PyThreadState* tstate = PyThreadState_Get();
+		PyObject* value = tstate->curexc_value;
+		if (value != NULL) {
+			u = PyObject_Repr(value);
+			if (u != NULL) {
+				msg = PyString_AsString(u);
+				break;
+			}
+		}
+	}
 	default:
 		fprintf(stderr, "error=%d\n", err->error);
 		msg = "unknown parsing error";
 		break;
 	}
 	w = Py_BuildValue("(sO)", msg, v);
+	Py_XDECREF(u);
 	Py_XDECREF(v);
 	PyErr_SetObject(errtype, w);
 	Py_XDECREF(w);
author	Martin v. Löwis <martin@v.loewis.de>	2002-08-04 17:29:52 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2002-08-04 17:29:52 (GMT)
commit	00f1e3f5a54adb0a7159a446edeca2e36da4092e (patch)
tree	86d731c4fd7c0141686044f5eefbb7f475da9647 /Python
parent	a729daf2e43f3ffa2d1b3b6cd31491c840091e66 (diff)
download	cpython-00f1e3f5a54adb0a7159a446edeca2e36da4092e.zip cpython-00f1e3f5a54adb0a7159a446edeca2e36da4092e.tar.gz cpython-00f1e3f5a54adb0a7159a446edeca2e36da4092e.tar.bz2