From da780432378e6298463889557ab43e0c156758cd Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Fri, 17 Oct 2008 03:38:50 +0000 Subject: Latin-1 source code was not being properly decoded when passed through compile(). This was due to left-over special-casing before UTF-8 became the default source encoding. Closes issue #3574. Thanks to Victor Stinner for help with the patch. --- Lib/test/test_pep3120.py | 18 +++++++++++++++++- Misc/NEWS | 2 ++ Parser/tokenizer.c | 4 ++-- Parser/tokenizer.h | 4 ++-- Python/ast.c | 6 +----- 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_pep3120.py b/Lib/test/test_pep3120.py index 3bb30ca..81d15bc 100644 --- a/Lib/test/test_pep3120.py +++ b/Lib/test/test_pep3120.py @@ -23,8 +23,24 @@ class PEP3120Test(unittest.TestCase): else: self.fail("expected exception didn't occur") + +class BuiltinCompileTests(unittest.TestCase): + + # Issue 3574. + def test_latin1(self): + # Allow compile() to read Latin-1 source. + source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1") + try: + code = compile(source_code, '', 'exec') + except SyntaxError: + self.fail("compile() cannot handle Latin-1 source") + ns = {} + exec(code, ns) + self.assertEqual('Ç', ns['u']) + + def test_main(): - support.run_unittest(PEP3120Test) + support.run_unittest(PEP3120Test, BuiltinCompileTests) if __name__=="__main__": test_main() diff --git a/Misc/NEWS b/Misc/NEWS index 0f47afb..ede8e52 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -15,6 +15,8 @@ What's New in Python 3.0 beta 5 Core and Builtins ----------------- +- Issue #3574: compile() incorrectly handled source code encoded as Latin-1. + - Issues #2384 and #3975: Tracebacks were not correctly printed when the source file contains a ``coding:`` header: the wrong line was displayed, and the encoding was not respected. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 4edf6d0..ce8129d 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -135,6 +135,7 @@ tok_new(void) tok->decoding_state = STATE_INIT; tok->decoding_erred = 0; tok->read_coding_spec = 0; + tok->enc = NULL; tok->encoding = NULL; tok->cont_line = 0; #ifndef PGEN @@ -274,8 +275,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, tok->read_coding_spec = 1; if (tok->encoding == NULL) { assert(tok->decoding_state == STATE_RAW); - if (strcmp(cs, "utf-8") == 0 || - strcmp(cs, "iso-8859-1") == 0) { + if (strcmp(cs, "utf-8") == 0) { tok->encoding = cs; } else { r = set_readline(tok, cs); diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index c45dea1..df9cbc7 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -49,14 +49,14 @@ struct tok_state { enum decoding_state decoding_state; int decoding_erred; /* whether erred in decoding */ int read_coding_spec; /* whether 'coding:...' has been read */ - char *encoding; + char *encoding; /* Source encoding. */ int cont_line; /* whether we are in a continuation line. */ const char* line_start; /* pointer to start of current line */ #ifndef PGEN PyObject *decoding_readline; /* codecs.open(...).readline */ PyObject *decoding_buffer; #endif - const char* enc; + const char* enc; /* Encoding for the current str. */ const char* str; }; diff --git a/Python/ast.c b/Python/ast.c index 6d2fa09..60906a1 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -3160,9 +3160,6 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons if (encoding == NULL) { buf = (char *)s; u = NULL; - } else if (strcmp(encoding, "iso-8859-1") == 0) { - buf = (char *)s; - u = NULL; } else { /* check for integer overflow */ if (len > PY_SIZE_MAX / 4) @@ -3275,8 +3272,7 @@ parsestr(struct compiling *c, const node *n, int *bytesmode) } } need_encoding = (!*bytesmode && c->c_encoding != NULL && - strcmp(c->c_encoding, "utf-8") != 0 && - strcmp(c->c_encoding, "iso-8859-1") != 0); + strcmp(c->c_encoding, "utf-8") != 0); if (rawmode || strchr(s, '\\') == NULL) { if (need_encoding) { PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL); -- cgit v0.12