summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2002-09-03 11:52:44 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2002-09-03 11:52:44 (GMT)
commit: f62a89b1e09d84fbd60e0356b87430a6ff1e352d (patch)
tree: 005f3cbd5a4510a448022832dbdb8c0490de8fab
parent: 65b7282ef715ef0e9b8f9e8581715d2b4746f960 (diff)
download: cpython-f62a89b1e09d84fbd60e0356b87430a6ff1e352d.zip
cpython-f62a89b1e09d84fbd60e0356b87430a6ff1e352d.tar.gz
cpython-f62a89b1e09d84fbd60e0356b87430a6ff1e352d.tar.bz2
Ignore encoding declarations inside strings. Fixes #603509.
-rw-r--r-- Doc/ref/ref2.tex 2
-rw-r--r-- Parser/tokenizer.c 17
-rw-r--r-- Parser/tokenizer.h 1
3 files changed, 18 insertions, 2 deletions
diff --git a/Doc/ref/ref2.tex b/Doc/ref/ref2.tex
index 3319949..4947650 100644
--- a/Doc/ref/ref2.tex
+++ b/Doc/ref/ref2.tex
@@ -101,7 +101,7 @@ The encoding is used for all lexical analysis, in particular to find
the end of a string, and to interpret the contents of Unicode literals.
String literals are converted to Unicode for syntactical analysis,
then converted back to their original encoding before interpretation
-starts.
+starts. The encoding declaration must appear on a line of its own.
\subsection{Explicit line joining\label{explicit-joining}}
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 4119c43..c58aadb 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -128,6 +128,7 @@ tok_new(void)
tok->read_coding_spec = 0;
tok->issued_encoding_warning = 0;
tok->encoding = NULL;
+ tok->cont_line = 0;
#ifndef PGEN
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
@@ -207,7 +208,15 @@ static char *
get_coding_spec(const char *s, int size)
{
int i;
- for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
+ /* Coding spec must be in a comment, and that comment must be
+ * the only statement on the source code line. */
+ for (i = 0; i < size - 6; i++) {
+ if (s[i] == '#')
+ break;
+ if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
+ return NULL;
+ }
+ for (; i < size - 6; i++) { /* XXX inefficient search */
const char* t = s + i;
if (strncmp(t, "coding", 6) == 0) {
const char* begin = NULL;
@@ -247,6 +256,9 @@ check_coding_spec(const char* line, int size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *))
{
int r = 1;
+ if (tok->cont_line)
+ /* It's a continuation line, so it can't be a coding spec. */
+ return 1;
char* cs = get_coding_spec(line, size);
if (cs != NULL) {
tok->read_coding_spec = 1;
@@ -1158,6 +1170,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
goto nextline;
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
+ tok->cont_line = 0;
return NEWLINE;
}
@@ -1292,6 +1305,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
return ERRORTOKEN;
}
tripcount = 0;
+ tok->cont_line = 1; /* multiline string. */
}
else if (c == EOF) {
if (triple)
@@ -1340,6 +1354,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
tok->cur = tok->inp;
return ERRORTOKEN;
}
+ tok->cont_line = 1;
goto again; /* Read next line */
}
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index f3bac74..b3d456a 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -45,6 +45,7 @@ struct tok_state {
int read_coding_spec; /* whether 'coding:...' has been read */
int issued_encoding_warning; /* whether non-ASCII warning was issued */
char *encoding;
+ int cont_line; /* whether we are in a continuation line. */
#ifndef PGEN
PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer;