From 447d33ead62b6dcd46d475a703f59940eb85428b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?=
Date: Sun, 29 Jul 2007 18:10:01 +0000
Subject: Implement PEP 3120.

---
 Lib/test/badsyntax_pep3120.py |  1 +
 Lib/test/test_pep3120.py      | 30 ++++++++++++++++++++++++++++++
 Misc/NEWS                     |  2 ++
 Parser/tokenizer.c            | 39 ++++++++++++++++++++++++++++++++++-----
 Python/ast.c                  |  3 ++-
 5 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 Lib/test/badsyntax_pep3120.py
 create mode 100644 Lib/test/test_pep3120.py

diff --git a/Lib/test/badsyntax_pep3120.py b/Lib/test/badsyntax_pep3120.py
new file mode 100644
index 0000000..d14b4c9
--- /dev/null
+++ b/Lib/test/badsyntax_pep3120.py
@@ -0,0 +1 @@
+print("böse")
diff --git a/Lib/test/test_pep3120.py b/Lib/test/test_pep3120.py
new file mode 100644
index 0000000..3f567bf
--- /dev/null
+++ b/Lib/test/test_pep3120.py
@@ -0,0 +1,30 @@
+# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
+
+import unittest
+from test import test_support
+
+class PEP3120Test(unittest.TestCase):
+
+    def test_pep3120(self):
+        self.assertEqual(
+            "Питон".encode("utf-8"),
+            b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd'
+        )
+        self.assertEqual(
+            "\П".encode("utf-8"),
+            b'\\\xd0\x9f'
+        )
+
+    def test_badsyntax(self):
+        try:
+            import test.badsyntax_pep3120
+        except SyntaxError as msg:
+            self.assert_(str(msg).find("Non-UTF-8 code starting with") >= 0)
+        else:
+            self.fail("expected exception didn't occur")
+
+def test_main():
+    test_support.run_unittest(PEP3120Test)
+
+if __name__=="__main__":
+    test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 8750b21..98b4fe1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,8 @@ TO DO
 Core and Builtins
 -----------------
 
+- PEP 3120: Change default encoding to UTF-8.
+
 - PEP 3123: Use proper C inheritance for PyObject.
 - Removed the __oct__ and __hex__ special methods and added a bin()
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 9cbc8fe..00bb38a 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
     ungetc(c, tok->fp);
 }
 
+/* Check whether the characters at s start a valid
+   UTF-8 sequence. Return the number of characters forming
+   the sequence if yes, 0 if not. */
+static int valid_utf8(const unsigned char* s)
+{
+    int expected = 0;
+    int length;
+    if (*s < 0x80)
+        /* single-byte code */
+        return 1;
+    if (*s < 0xc0)
+        /* following byte */
+        return 0;
+    if (*s < 0xE0)
+        expected = 1;
+    else if (*s < 0xF0)
+        expected = 2;
+    else if (*s < 0xF8)
+        expected = 3;
+    else
+        return 0;
+    length = expected + 1;
+    for (; expected; expected--)
+        if (s[expected] < 0x80 || s[expected] >= 0xC0)
+            return 0;
+    return length;
+}
+
 /* Read a line of input from TOK. Determine encoding
    if necessary. */
 
@@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
         }
     }
 #ifndef PGEN
-    /* The default encoding is ASCII, so make sure we don't have any
-       non-ASCII bytes in it. */
+    /* The default encoding is UTF-8, so make sure we don't have any
+       non-UTF-8 sequences in it. */
     if (line && !tok->encoding) {
         unsigned char *c;
-        for (c = (unsigned char *)line; *c; c++)
-            if (*c > 127) {
+        int length;
+        for (c = (unsigned char *)line; *c; c += length)
+            if (!(length = valid_utf8(c))) {
                 badchar = *c;
                 break;
             }
@@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
         /* Need to add 1 to the line number, since this line
            has not been counted, yet.
*/
         sprintf(buf,
-            "Non-ASCII character '\\x%.2x' "
+            "Non-UTF-8 code starting with '\\x%.2x' "
             "in file %.200s on line %i, "
             "but no encoding declared; "
             "see http://www.python.org/peps/pep-0263.html for details",
diff --git a/Python/ast.c b/Python/ast.c
index 146cd05..5426c02 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -203,7 +203,8 @@ PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
         c.c_encoding = STR(n);
         n = CHILD(n, 0);
     } else {
-        c.c_encoding = NULL;
+        /* PEP 3120 */
+        c.c_encoding = "utf-8";
     }
     c.c_arena = arena;
 
--
cgit v0.12