summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Brett Cannon <bcannon@gmail.com> 2008-10-17 03:38:50 (GMT)
committer Brett Cannon <bcannon@gmail.com> 2008-10-17 03:38:50 (GMT)
commit da780432378e6298463889557ab43e0c156758cd (patch)
tree dc622a9b62874851f90abc45524d3d2653cab9ba
parent 9e9dcd6d4225faa6a8b19120f009e0253d16ab92 (diff)
download cpython-da780432378e6298463889557ab43e0c156758cd.zip
cpython-da780432378e6298463889557ab43e0c156758cd.tar.gz
cpython-da780432378e6298463889557ab43e0c156758cd.tar.bz2
Latin-1 source code was not being properly decoded when passed through compile().
This was due to left-over special-casing before UTF-8 became the default source encoding. Closes issue #3574. Thanks to Victor Stinner for help with the patch.
-rw-r--r-- Lib/test/test_pep3120.py 18
-rw-r--r-- Misc/NEWS 2
-rw-r--r-- Parser/tokenizer.c 4
-rw-r--r-- Parser/tokenizer.h 4
-rw-r--r-- Python/ast.c 6
5 files changed, 24 insertions, 10 deletions
diff --git a/Lib/test/test_pep3120.py b/Lib/test/test_pep3120.py
index 3bb30ca..81d15bc 100644
--- a/Lib/test/test_pep3120.py
+++ b/Lib/test/test_pep3120.py
@@ -23,8 +23,24 @@ class PEP3120Test(unittest.TestCase):
else:
self.fail("expected exception didn't occur")
+
+class BuiltinCompileTests(unittest.TestCase):
+
+ # Issue 3574.
+ def test_latin1(self):
+ # Allow compile() to read Latin-1 source.
+ source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1")
+ try:
+ code = compile(source_code, '<dummy>', 'exec')
+ except SyntaxError:
+ self.fail("compile() cannot handle Latin-1 source")
+ ns = {}
+ exec(code, ns)
+ self.assertEqual('Ç', ns['u'])
+
+
def test_main():
- support.run_unittest(PEP3120Test)
+ support.run_unittest(PEP3120Test, BuiltinCompileTests)
if __name__=="__main__":
test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 0f47afb..ede8e52 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -15,6 +15,8 @@ What's New in Python 3.0 beta 5
Core and Builtins
-----------------
+- Issue #3574: compile() incorrectly handled source code encoded as Latin-1.
+
- Issues #2384 and #3975: Tracebacks were not correctly printed when the
source file contains a ``coding:`` header: the wrong line was displayed, and
the encoding was not respected.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 4edf6d0..ce8129d 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -135,6 +135,7 @@ tok_new(void)
tok->decoding_state = STATE_INIT;
tok->decoding_erred = 0;
tok->read_coding_spec = 0;
+ tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
#ifndef PGEN
@@ -274,8 +275,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
tok->read_coding_spec = 1;
if (tok->encoding == NULL) {
assert(tok->decoding_state == STATE_RAW);
- if (strcmp(cs, "utf-8") == 0 ||
- strcmp(cs, "iso-8859-1") == 0) {
+ if (strcmp(cs, "utf-8") == 0) {
tok->encoding = cs;
} else {
r = set_readline(tok, cs);
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index c45dea1..df9cbc7 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -49,14 +49,14 @@ struct tok_state {
enum decoding_state decoding_state;
int decoding_erred; /* whether erred in decoding */
int read_coding_spec; /* whether 'coding:...' has been read */
- char *encoding;
+ char *encoding; /* Source encoding. */
int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */
#ifndef PGEN
PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer;
#endif
- const char* enc;
+ const char* enc; /* Encoding for the current str. */
const char* str;
};
diff --git a/Python/ast.c b/Python/ast.c
index 6d2fa09..60906a1 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -3160,9 +3160,6 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
if (encoding == NULL) {
buf = (char *)s;
u = NULL;
- } else if (strcmp(encoding, "iso-8859-1") == 0) {
- buf = (char *)s;
- u = NULL;
} else {
/* check for integer overflow */
if (len > PY_SIZE_MAX / 4)
@@ -3275,8 +3272,7 @@ parsestr(struct compiling *c, const node *n, int *bytesmode)
}
}
need_encoding = (!*bytesmode && c->c_encoding != NULL &&
- strcmp(c->c_encoding, "utf-8") != 0 &&
- strcmp(c->c_encoding, "iso-8859-1") != 0);
+ strcmp(c->c_encoding, "utf-8") != 0);
if (rawmode || strchr(s, '\\') == NULL) {
if (need_encoding) {
PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);