From 447d33ead62b6dcd46d475a703f59940eb85428b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= <martin@v.loewis.de>
Date: Sun, 29 Jul 2007 18:10:01 +0000
Subject: Implement PEP 3120.

---
 Lib/test/badsyntax_pep3120.py |  1 +
 Lib/test/test_pep3120.py      | 30 ++++++++++++++++++++++++++++++
 Misc/NEWS                     |  2 ++
 Parser/tokenizer.c            | 39 ++++++++++++++++++++++++++++++++++-----
 Python/ast.c                  |  3 ++-
 5 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 Lib/test/badsyntax_pep3120.py
 create mode 100644 Lib/test/test_pep3120.py

diff --git a/Lib/test/badsyntax_pep3120.py b/Lib/test/badsyntax_pep3120.py
new file mode 100644
index 0000000..d14b4c9
--- /dev/null
+++ b/Lib/test/badsyntax_pep3120.py
@@ -0,0 +1 @@
+print("b鰏e")
diff --git a/Lib/test/test_pep3120.py b/Lib/test/test_pep3120.py
new file mode 100644
index 0000000..3f567bf
--- /dev/null
+++ b/Lib/test/test_pep3120.py
@@ -0,0 +1,30 @@
+# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
+
+import unittest
+from test import test_support
+
+class PEP3120Test(unittest.TestCase):
+    """Tests for PEP 3120: source with no coding declaration is UTF-8."""
+    def test_pep3120(self):
+        self.assertEqual(  # a Cyrillic literal must encode to its UTF-8 bytes
+            "Питон".encode("utf-8"),
+            b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd'
+        )
+        self.assertEqual(  # backslash before a non-ASCII char stays literal
+            "\П".encode("utf-8"),
+            b'\\\xd0\x9f'
+        )
+
+    def test_badsyntax(self):
+        try:  # the import is expected to fail with a SyntaxError
+            import test.badsyntax_pep3120
+        except SyntaxError as msg:
+            self.assert_(str(msg).find("Non-UTF-8 code starting with") >= 0)
+        else:
+            self.fail("expected exception didn't occur")
+
+def test_main():  # runs PEP3120Test through test_support.run_unittest
+    test_support.run_unittest(PEP3120Test)
+
+if __name__=="__main__":  # also allow running this file directly as a script
+    test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 8750b21..98b4fe1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,8 @@ TO DO
 Core and Builtins
 -----------------
 
+- PEP 3120: Change default source encoding to UTF-8.
+
 - PEP 3123: Use proper C inheritance for PyObject.
 
 - Removed the __oct__ and __hex__ special methods and added a bin()
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 9cbc8fe..00bb38a 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
 	ungetc(c, tok->fp);
 }
 
+/* Check whether the bytes at s (NUL-terminated) start a UTF-8 sequence:
+   a lead byte plus its continuation bytes; overlong forms and surrogates
+   are not rejected. Return the sequence length in bytes, or 0 if not.  */
+static int valid_utf8(const unsigned char* s)
+{
+	int expected = 0;
+	int i;
+	if (*s < 0x80)
+		/* single-byte code */
+		return 1;
+	if (*s < 0xc0)
+		/* following byte */
+		return 0;
+	if (*s < 0xE0)
+		expected = 1;
+	else if (*s < 0xF0)
+		expected = 2;
+	else if (*s < 0xF8)
+		expected = 3;
+	else
+		return 0;
+	/* Scan forward: a NUL fails the test before we can read past it. */
+	for (i = 1; i <= expected; i++)
+		if (s[i] < 0x80 || s[i] >= 0xC0)
+			return 0;
+	return expected + 1;
+}
+
 /* Read a line of input from TOK. Determine encoding
    if necessary.  */
 
@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
 		}
 	}
 #ifndef PGEN
-	/* The default encoding is ASCII, so make sure we don't have any
-           non-ASCII bytes in it. */
+	/* The default encoding is UTF-8, so make sure we don't have any
+           non-UTF-8 sequences in it. */
 	if (line && !tok->encoding) {
 		unsigned char *c;
-		for (c = (unsigned char *)line; *c; c++)
-			if (*c > 127) {
+		int length;
+		for (c = (unsigned char *)line; *c; c += length)
+			if (!(length = valid_utf8(c))) {
 				badchar = *c;
 				break;
 			}
@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
 		/* Need to add 1 to the line number, since this line
 		   has not been counted, yet.  */
 		sprintf(buf,
-			"Non-ASCII character '\\x%.2x' "
+			"Non-UTF-8 code starting with '\\x%.2x' "
 			"in file %.200s on line %i, "
 			"but no encoding declared; "
 			"see http://www.python.org/peps/pep-0263.html for details",
diff --git a/Python/ast.c b/Python/ast.c
index 146cd05..5426c02 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -203,7 +203,8 @@ PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
         c.c_encoding = STR(n);
         n = CHILD(n, 0);
     } else {
-        c.c_encoding = NULL;
+	/* PEP 3120 */
+        c.c_encoding = "utf-8";
     }
     c.c_arena = arena;
 
-- 
cgit v0.12