summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de>, 2007-07-29 18:10:01 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de>, 2007-07-29 18:10:01 (GMT)
commit: 447d33ead62b6dcd46d475a703f59940eb85428b (patch)
tree: fbae188f574406c408058b2924d98e764ff062e1
parent: 5de17db361d4cbf194c07b8ee5e037bb3bf3ae13 (diff)
download: cpython-447d33ead62b6dcd46d475a703f59940eb85428b.zip
cpython-447d33ead62b6dcd46d475a703f59940eb85428b.tar.gz
cpython-447d33ead62b6dcd46d475a703f59940eb85428b.tar.bz2
Implement PEP 3120.
-rw-r--r-- Lib/test/badsyntax_pep3120.py | 1
-rw-r--r-- Lib/test/test_pep3120.py | 30
-rw-r--r-- Misc/NEWS | 2
-rw-r--r-- Parser/tokenizer.c | 39
-rw-r--r-- Python/ast.c | 3
5 files changed, 69 insertions, 6 deletions
diff --git a/Lib/test/badsyntax_pep3120.py b/Lib/test/badsyntax_pep3120.py
new file mode 100644
index 0000000..d14b4c9
--- /dev/null
+++ b/Lib/test/badsyntax_pep3120.py
@@ -0,0 +1 @@
+print("böse")
diff --git a/Lib/test/test_pep3120.py b/Lib/test/test_pep3120.py
new file mode 100644
index 0000000..3f567bf
--- /dev/null
+++ b/Lib/test/test_pep3120.py
@@ -0,0 +1,30 @@
+# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
+
+import unittest
+from test import test_support
+
+class PEP3120Test(unittest.TestCase):
+
+    def test_pep3120(self):
+        # Non-ASCII literals are expected to encode to these UTF-8 bytes.
+        cyrillic = "Питон".encode("utf-8")
+        self.assertEqual(cyrillic, b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd')
+        # A backslash before a non-ASCII letter must survive as a backslash.
+        escaped = "\П".encode("utf-8")
+        self.assertEqual(escaped, b'\\\xd0\x9f')
+
+    def test_badsyntax(self):
+        # Importing the helper module must fail with a SyntaxError whose
+        # text mentions the non-UTF-8 input.
+        try:
+            import test.badsyntax_pep3120
+        except SyntaxError as msg:
+            self.assert_("Non-UTF-8 code starting with" in str(msg))
+        else:
+            self.fail("expected exception didn't occur")
+
+def test_main():
+    test_support.run_unittest(PEP3120Test)
+
+if __name__ == "__main__":
+    test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 8750b21..98b4fe1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,8 @@ TO DO
Core and Builtins
-----------------
+- PEP 3120: Change default source encoding to UTF-8.
+
- PEP 3123: Use proper C inheritance for PyObject.
- Removed the __oct__ and __hex__ special methods and added a bin()
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 9cbc8fe..00bb38a 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
ungetc(c, tok->fp);
}
+/* Check whether the bytes at s start a structurally valid
+   UTF-8 sequence.  Return the number of bytes forming
+   the sequence if yes, 0 if not.  s must be NUL-terminated. */
+static int valid_utf8(const unsigned char* s)
+{
+    int expected = 0;
+    int i;
+    if (*s < 0x80)
+        /* single-byte code */
+        return 1;
+    if (*s < 0xc0)
+        /* following byte */
+        return 0;
+    if (*s < 0xE0)
+        expected = 1;
+    else if (*s < 0xF0)
+        expected = 2;
+    else if (*s < 0xF8)
+        expected = 3;
+    else
+        return 0;
+    /* Scan forward so a truncated sequence stops at the NUL. */
+    for (i = 1; i <= expected; i++)
+        if (s[i] < 0x80 || s[i] >= 0xC0)
+            return 0;
+    return expected + 1;
+}
+
/* Read a line of input from TOK. Determine encoding
if necessary. */
@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
}
}
#ifndef PGEN
- /* The default encoding is ASCII, so make sure we don't have any
- non-ASCII bytes in it. */
+ /* The default encoding is UTF-8, so make sure we don't have any
+ non-UTF-8 sequences in it. */
if (line && !tok->encoding) {
unsigned char *c;
- for (c = (unsigned char *)line; *c; c++)
- if (*c > 127) {
+ int length;
+ for (c = (unsigned char *)line; *c; c += length)
+ if (!(length = valid_utf8(c))) {
badchar = *c;
break;
}
@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
sprintf(buf,
- "Non-ASCII character '\\x%.2x' "
+ "Non-UTF-8 code starting with '\\x%.2x' "
"in file %.200s on line %i, "
"but no encoding declared; "
"see http://www.python.org/peps/pep-0263.html for details",
diff --git a/Python/ast.c b/Python/ast.c
index 146cd05..5426c02 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -203,7 +203,8 @@ PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
c.c_encoding = STR(n);
n = CHILD(n, 0);
} else {
- c.c_encoding = NULL;
+ /* PEP 3120 */
+ c.c_encoding = "utf-8";
}
c.c_arena = arena;