From f62a89b1e09d84fbd60e0356b87430a6ff1e352d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Tue, 3 Sep 2002 11:52:44 +0000 Subject: Ignore encoding declarations inside strings. Fixes #603509. --- Doc/ref/ref2.tex | 2 +- Parser/tokenizer.c | 17 ++++++++++++++++- Parser/tokenizer.h | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Doc/ref/ref2.tex b/Doc/ref/ref2.tex index 3319949..4947650 100644 --- a/Doc/ref/ref2.tex +++ b/Doc/ref/ref2.tex @@ -101,7 +101,7 @@ The encoding is used for all lexical analysis, in particular to find the end of a string, and to interpret the contents of Unicode literals. String literals are converted to Unicode for syntactical analysis, then converted back to their original encoding before interpretation -starts. +starts. The encoding declaration must appear on a line of its own. \subsection{Explicit line joining\label{explicit-joining}} diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 4119c43..c58aadb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -128,6 +128,7 @@ tok_new(void) tok->read_coding_spec = 0; tok->issued_encoding_warning = 0; tok->encoding = NULL; + tok->cont_line = 0; #ifndef PGEN tok->decoding_readline = NULL; tok->decoding_buffer = NULL; @@ -207,7 +208,15 @@ static char * get_coding_spec(const char *s, int size) { int i; - for (i = 0; i < size - 6; i++) { /* XXX inefficient search */ + /* Coding spec must be in a comment, and that comment must be + * the only statement on the source code line. */ + for (i = 0; i < size - 6; i++) { + if (s[i] == '#') + break; + if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') + return NULL; + } + for (; i < size - 6; i++) { /* XXX inefficient search */ const char* t = s + i; if (strncmp(t, "coding", 6) == 0) { const char* begin = NULL; @@ -247,6 +256,9 @@ check_coding_spec(const char* line, int size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)) { int r = 1; + if (tok->cont_line) + /* It's a continuation line, so it can't be a coding spec. */ + return 1; char* cs = get_coding_spec(line, size); if (cs != NULL) { tok->read_coding_spec = 1; @@ -1158,6 +1170,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) goto nextline; *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + tok->cont_line = 0; return NEWLINE; } @@ -1292,6 +1305,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } tripcount = 0; + tok->cont_line = 1; /* multiline string. */ } else if (c == EOF) { if (triple) @@ -1340,6 +1354,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) tok->cur = tok->inp; return ERRORTOKEN; } + tok->cont_line = 1; goto again; /* Read next line */ } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index f3bac74..b3d456a 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -45,6 +45,7 @@ struct tok_state { int read_coding_spec; /* whether 'coding:...' has been read */ int issued_encoding_warning; /* whether non-ASCII warning was issued */ char *encoding; + int cont_line; /* whether we are in a continuation line. */ #ifndef PGEN PyObject *decoding_readline; /* codecs.open(...).readline */ PyObject *decoding_buffer; -- cgit v0.12