summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/test/test_ast.py3
-rw-r--r--Lib/test/test_syntax.py30
-rw-r--r--Lib/test/test_tokenize.py100
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst2
-rw-r--r--Parser/tokenizer.c46
5 files changed, 165 insertions, 16 deletions
diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py
index 314b360..039d1c1 100644
--- a/Lib/test/test_ast.py
+++ b/Lib/test/test_ast.py
@@ -1078,8 +1078,7 @@ Module(
ast.literal_eval(node)
def test_literal_eval_syntax_errors(self):
- msg = "unexpected character after line continuation character"
- with self.assertRaisesRegex(SyntaxError, msg):
+ with self.assertRaisesRegex(SyntaxError, "unexpected indent"):
ast.literal_eval(r'''
\
(\
diff --git a/Lib/test/test_syntax.py b/Lib/test/test_syntax.py
index 968d348..a6ff319 100644
--- a/Lib/test/test_syntax.py
+++ b/Lib/test/test_syntax.py
@@ -1613,6 +1613,36 @@ pass
except SyntaxError:
self.fail("Empty line after a line continuation character is valid.")
+ # See issue-46091
+ s1 = r"""\
+def fib(n):
+ \
+'''Print a Fibonacci series up to n.'''
+ \
+a, b = 0, 1
+"""
+ s2 = r"""\
+def fib(n):
+ '''Print a Fibonacci series up to n.'''
+ a, b = 0, 1
+"""
+ try:
+ self.assertEqual(compile(s1, '<string>', 'exec'), compile(s2, '<string>', 'exec'))
+ except SyntaxError:
+ self.fail("Indented statement over multiple lines is valid")
+
+ def test_continuation_bad_indentation(self):
+ # Check that code that breaks indentation across multiple lines raises a syntax error
+
+ code = r"""\
+if x:
+ y = 1
+ \
+ foo = 1
+ """
+
+ self.assertRaises(IndentationError, exec, code)
+
@support.cpython_only
def test_nested_named_except_blocks(self):
code = ""
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ca2821d..334390a 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -6,6 +6,7 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
NEWLINE, _generate_tokens_from_c_tokenizer)
from io import BytesIO, StringIO
import unittest
+from textwrap import dedent
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
INVALID_UNDERSCORE_LITERALS)
@@ -44,7 +45,6 @@ class TokenizeTest(TestCase):
# The ENDMARKER and final NEWLINE are omitted.
f = BytesIO(s.encode('utf-8'))
result = stringify_tokens_from_source(tokenize(f.readline), s)
-
self.assertEqual(result,
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
expected.rstrip().splitlines())
@@ -2511,7 +2511,105 @@ async def f():
self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
self.assertRaises(SyntaxError, get_tokens, "]")
+
+ def test_continuation_lines_indentation(self):
+ def get_tokens(string):
+ return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+ code = dedent("""
+ def fib(n):
+ \\
+ '''Print a Fibonacci series up to n.'''
+ \\
+ a, b = 0, 1
+ """)
+
+ self.check_tokenize(code, """\
+ NAME 'def' (2, 0) (2, 3)
+ NAME 'fib' (2, 4) (2, 7)
+ LPAR '(' (2, 7) (2, 8)
+ NAME 'n' (2, 8) (2, 9)
+ RPAR ')' (2, 9) (2, 10)
+ COLON ':' (2, 10) (2, 11)
+ NEWLINE '' (2, 11) (2, 11)
+ INDENT '' (4, -1) (4, -1)
+ STRING "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39)
+ NEWLINE '' (4, 39) (4, 39)
+ NAME 'a' (6, 0) (6, 1)
+ COMMA ',' (6, 1) (6, 2)
+ NAME 'b' (6, 3) (6, 4)
+ EQUAL '=' (6, 5) (6, 6)
+ NUMBER '0' (6, 7) (6, 8)
+ COMMA ',' (6, 8) (6, 9)
+ NUMBER '1' (6, 10) (6, 11)
+ NEWLINE '' (6, 11) (6, 11)
+ DEDENT '' (6, -1) (6, -1)
+ """)
+
+ code_no_cont = dedent("""
+ def fib(n):
+ '''Print a Fibonacci series up to n.'''
+ a, b = 0, 1
+ """)
+
+ self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+ code = dedent("""
+ pass
+ \\
+
+ pass
+ """)
+
+ self.check_tokenize(code, """\
+ NAME 'pass' (2, 0) (2, 4)
+ NEWLINE '' (2, 4) (2, 4)
+ NAME 'pass' (5, 0) (5, 4)
+ NEWLINE '' (5, 4) (5, 4)
+ """)
+
+ code_no_cont = dedent("""
+ pass
+ pass
+ """)
+
+ self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+ code = dedent("""
+ if x:
+ y = 1
+ \\
+ \\
+ \\
+ \\
+ foo = 1
+ """)
+
+ self.check_tokenize(code, """\
+ NAME 'if' (2, 0) (2, 2)
+ NAME 'x' (2, 3) (2, 4)
+ COLON ':' (2, 4) (2, 5)
+ NEWLINE '' (2, 5) (2, 5)
+ INDENT '' (3, -1) (3, -1)
+ NAME 'y' (3, 4) (3, 5)
+ EQUAL '=' (3, 6) (3, 7)
+ NUMBER '1' (3, 8) (3, 9)
+ NEWLINE '' (3, 9) (3, 9)
+ NAME 'foo' (8, 4) (8, 7)
+ EQUAL '=' (8, 8) (8, 9)
+ NUMBER '1' (8, 10) (8, 11)
+ NEWLINE '' (8, 11) (8, 11)
+ DEDENT '' (8, -1) (8, -1)
+ """)
+
+ code_no_cont = dedent("""
+ if x:
+ y = 1
+ foo = 1
+ """)
+
+ self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
if __name__ == "__main__":
unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst
new file mode 100644
index 0000000..a2eee0f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst
@@ -0,0 +1,2 @@
+Correctly calculate indentation levels for lines with whitespace characters
+that are ended by line continuation characters. Patch by Pablo Galindo.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5e35d6f..cd4254f 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1347,6 +1347,24 @@ tok_decimal_tail(struct tok_state *tok)
/* Get next token, after space stripping etc. */
+static inline int  /* Handle what follows a line-continuation backslash: returns -1 with tok->done set on error, else the first char of the next line. */
+tok_continuation_line(struct tok_state *tok) {
+ int c = tok_nextc(tok);  /* character immediately after the backslash */
+ if (c != '\n') {
+ tok->done = E_LINECONT;  /* backslash was not directly followed by a newline */
+ return -1;
+ }
+ c = tok_nextc(tok);  /* read the first character of the following line */
+ if (c == EOF) {
+ tok->done = E_EOF;  /* input ended right after the continuation */
+ tok->cur = tok->inp;
+ return -1;
+ } else {
+ tok_backup(tok, c);  /* hand the character back (presumably so the caller re-reads it -- confirm against tok_backup) */
+ }
+ return c;  /* callers in this patch only compare the result against -1 */
+}
+
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
@@ -1363,6 +1381,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
int col = 0;
int altcol = 0;
tok->atbol = 0;
+ int cont_line_col = 0;
for (;;) {
c = tok_nextc(tok);
if (c == ' ') {
@@ -1375,14 +1394,23 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else if (c == '\014') {/* Control-L (formfeed) */
col = altcol = 0; /* For Emacs users */
}
+ else if (c == '\\') {
+ // Indentation cannot be split over multiple physical lines
+ // using backslashes. This means that if we found a backslash
+ // preceded by whitespace, **the first one we find** determines
+ // the level of indentation of whatever comes next.
+ cont_line_col = cont_line_col ? cont_line_col : col;
+ if ((c = tok_continuation_line(tok)) == -1) {
+ return ERRORTOKEN;
+ }
+ }
else {
break;
}
}
tok_backup(tok, c);
- if (c == '#' || c == '\n' || c == '\\') {
+ if (c == '#' || c == '\n') {
/* Lines with only whitespace and/or comments
- and/or a line continuation character
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
except *totally* empty lines in interactive
@@ -1403,6 +1431,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
may need to skip to the end of a comment */
}
if (!blankline && tok->level == 0) {
+ col = cont_line_col ? cont_line_col : col;
+ altcol = cont_line_col ? cont_line_col : altcol;
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
@@ -1964,19 +1994,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* Line continuation */
if (c == '\\') {
- c = tok_nextc(tok);
- if (c != '\n') {
- tok->done = E_LINECONT;
+ if ((c = tok_continuation_line(tok)) == -1) {
return ERRORTOKEN;
}
- c = tok_nextc(tok);
- if (c == EOF) {
- tok->done = E_EOF;
- tok->cur = tok->inp;
- return ERRORTOKEN;
- } else {
- tok_backup(tok, c);
- }
tok->cont_line = 1;
goto again; /* Read next line */
}