From 8fb307cd650511ba019c4493275cb6684ad308bc Mon Sep 17 00:00:00 2001 From: Yury Selivanov Date: Wed, 22 Jul 2015 13:33:45 +0300 Subject: Issue #24619: New approach for tokenizing async/await. This commit fixes how one-line async-defs and defs are tracked by the tokenizer. It makes it possible to correctly parse invalid code such as: >>> async def f(): ... def g(): pass ... async = 10 and valid code such as: >>> async def f(): ... async def g(): pass ... await z As a consequence, it is now possible to have one-line 'async def foo(): await ..' functions: >>> async def foo(): return await bar() --- Doc/reference/compound_stmts.rst | 4 +- Lib/lib2to3/pgen2/tokenize.py | 12 ++- Lib/test/badsyntax_async1.py | 5 +- Lib/test/badsyntax_async2.py | 5 +- Lib/test/badsyntax_async4.py | 2 +- Lib/test/badsyntax_async9.py | 2 - Lib/test/test_coroutines.py | 226 +++++++++++++++++++++++++++++++++++++-- Lib/test/test_grammar.py | 5 +- Lib/test/test_tokenize.py | 15 ++- Lib/tokenize.py | 7 +- Misc/NEWS | 3 + Parser/tokenizer.c | 105 ++++++++++++------ Parser/tokenizer.h | 21 ++-- 13 files changed, 343 insertions(+), 69 deletions(-) delete mode 100644 Lib/test/badsyntax_async9.py diff --git a/Doc/reference/compound_stmts.rst b/Doc/reference/compound_stmts.rst index 76b3850..71f240f 100644 --- a/Doc/reference/compound_stmts.rst +++ b/Doc/reference/compound_stmts.rst @@ -685,9 +685,7 @@ Execution of Python coroutines can be suspended and resumed at many points (see :term:`coroutine`). In the body of a coroutine, any ``await`` and ``async`` identifiers become reserved keywords; :keyword:`await` expressions, :keyword:`async for` and :keyword:`async with` can only be used in -coroutine bodies. However, to simplify the parser, these keywords cannot -be used on the same line as a function or coroutine (:keyword:`def` -statement) header. +coroutine bodies. Functions defined with ``async def`` syntax are always coroutine functions, even if they do not contain ``await`` or ``async`` keywords. 
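As an illustration of the rule the updated documentation describes, here is a minimal sketch of what is accepted and rejected after this change (assuming an interpreter with this patch applied; `bar` is a hypothetical awaitable, not part of the patch):

    >>> async def foo(): return await bar()   # one-line coroutine, now accepted
    >>> def spam(await): return await         # 'await' stays an ordinary name outside coroutine bodies
    >>> async def f(): await = 1              # still a SyntaxError: 'await' is reserved here
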
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py index 690fec4..896b0fa 100644 --- a/Lib/lib2to3/pgen2/tokenize.py +++ b/Lib/lib2to3/pgen2/tokenize.py @@ -369,6 +369,7 @@ def generate_tokens(readline): # 'stashed' and 'ctx' are used for async/await parsing stashed = None ctx = [('sync', 0)] + in_async = 0 while 1: # loop over lines in stream try: @@ -436,6 +437,14 @@ def generate_tokens(readline): "unindent does not match any outer indentation level", ("", lnum, pos, line)) indents = indents[:-1] + + cur_indent = indents[-1] + while len(ctx) > 1 and ctx[-1][1] >= cur_indent: + if ctx[-1][0] == 'async': + in_async -= 1 + assert in_async >= 0 + ctx.pop() + yield (DEDENT, '', (lnum, pos), (lnum, pos), line) else: # continued statement @@ -499,7 +508,7 @@ def generate_tokens(readline): yield (STRING, token, spos, epos, line) elif initial in namechars: # ordinary name if token in ('async', 'await'): - if ctx[-1][0] == 'async' and ctx[-1][1] < indents[-1]: + if in_async: yield (ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue @@ -515,6 +524,7 @@ def generate_tokens(readline): and stashed[1] == 'async'): ctx.append(('async', indents[-1])) + in_async += 1 yield (ASYNC, stashed[1], stashed[2], stashed[3], diff --git a/Lib/test/badsyntax_async1.py b/Lib/test/badsyntax_async1.py index 970445d..fb85e29 100644 --- a/Lib/test/badsyntax_async1.py +++ b/Lib/test/badsyntax_async1.py @@ -1,3 +1,2 @@ -async def foo(): - def foo(a=await something()): - pass +async def foo(a=await something()): + pass diff --git a/Lib/test/badsyntax_async2.py b/Lib/test/badsyntax_async2.py index 1e62a3e..6f6f4f5 100644 --- a/Lib/test/badsyntax_async2.py +++ b/Lib/test/badsyntax_async2.py @@ -1,3 +1,2 @@ -async def foo(): - def foo(a:await something()): - pass +async def foo(a:await something()): + pass diff --git a/Lib/test/badsyntax_async4.py b/Lib/test/badsyntax_async4.py index 4afda40..d033b28 100644 --- a/Lib/test/badsyntax_async4.py +++ b/Lib/test/badsyntax_async4.py @@ -1,2 +1,2 @@ async def foo(): - async def foo(): await something() + await diff --git a/Lib/test/badsyntax_async9.py b/Lib/test/badsyntax_async9.py deleted file mode 100644 index d033b28..0000000 --- a/Lib/test/badsyntax_async9.py +++ /dev/null @@ -1,2 +0,0 @@ -async def foo(): - await diff --git a/Lib/test/test_coroutines.py b/Lib/test/test_coroutines.py index 9d97123..3ba2f23 100644 --- a/Lib/test/test_coroutines.py +++ b/Lib/test/test_coroutines.py @@ -67,11 +67,11 @@ def silence_coro_gc(): class AsyncBadSyntaxTest(unittest.TestCase): def test_badsyntax_1(self): - with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): + with self.assertRaisesRegex(SyntaxError, "'await' outside"): import test.badsyntax_async1 def test_badsyntax_2(self): - with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): + with self.assertRaisesRegex(SyntaxError, "'await' outside"): import test.badsyntax_async2 def test_badsyntax_3(self): @@ -103,10 +103,6 @@ class AsyncBadSyntaxTest(unittest.TestCase): import test.badsyntax_async8 def test_badsyntax_9(self): - with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): - import test.badsyntax_async9 - - def test_badsyntax_10(self): ns = {} for comp in {'(await a for a in b)', '[await a for a in b]', @@ -116,6 +112,221 @@ class AsyncBadSyntaxTest(unittest.TestCase): with self.assertRaisesRegex(SyntaxError, 'await.*in comprehen'): exec('async def f():\n\t{}'.format(comp), ns, ns) + def test_badsyntax_10(self): + # Tests for issue 24619 + + samples = [ + """async def foo(): + 
def bar(): pass + await = 1 + """, + + """async def foo(): + + def bar(): pass + await = 1 + """, + + """async def foo(): + def bar(): pass + if 1: + await = 1 + """, + + """def foo(): + async def bar(): pass + if 1: + await a + """, + + """def foo(): + async def bar(): pass + await a + """, + + """def foo(): + def baz(): pass + async def bar(): pass + await a + """, + + """def foo(): + def baz(): pass + # 456 + async def bar(): pass + # 123 + await a + """, + + """async def foo(): + def baz(): pass + # 456 + async def bar(): pass + # 123 + await = 2 + """, + + """def foo(): + + def baz(): pass + + async def bar(): pass + + await a + """, + + """async def foo(): + + def baz(): pass + + async def bar(): pass + + await = 2 + """, + + """async def foo(): + def async(): pass + """, + + """async def foo(): + def await(): pass + """, + + """async def foo(): + def bar(): + await + """, + + """async def foo(): + return lambda async: await + """, + + """async def foo(): + return lambda a: await + """, + + """async def foo(a: await b): + pass + """, + + """def baz(): + async def foo(a: await b): + pass + """, + + """async def foo(async): + pass + """, + + """async def foo(): + def bar(): + def baz(): + async = 1 + """, + + """async def foo(): + def bar(): + def baz(): + pass + async = 1 + """, + + """def foo(): + async def bar(): + + async def baz(): + pass + + def baz(): + 42 + + async = 1 + """, + + """async def foo(): + def bar(): + def baz(): + pass\nawait foo() + """, + + """def foo(): + def bar(): + async def baz(): + pass\nawait foo() + """, + + """async def foo(await): + pass + """, + + """def foo(): + + async def bar(): pass + + await a + """, + + """def foo(): + async def bar(): + pass\nawait a + """] + + ns = {} + for code in samples: + with self.subTest(code=code), self.assertRaises(SyntaxError): + exec(code, ns, ns) + + def test_goodsyntax_1(self): + # Tests for issue 24619 + + def foo(await): + async def foo(): pass + async def foo(): + pass + return await + 1 + self.assertEqual(foo(10), 11) + + def foo(await): + async def foo(): pass + async def foo(): pass + return await + 2 + self.assertEqual(foo(20), 22) + + def foo(await): + + async def foo(): pass + + async def foo(): pass + + return await + 2 + self.assertEqual(foo(20), 22) + + def foo(await): + """spam""" + async def foo(): \ + pass + # 123 + async def foo(): pass + # 456 + return await + 2 + self.assertEqual(foo(20), 22) + + def foo(await): + def foo(): pass + def foo(): pass + async def bar(): return await_ + await_ = await + try: + bar().send(None) + except StopIteration as ex: + return ex.args[0] + self.assertEqual(foo(42), 42) + + async def f(): + async def g(): pass + await z + self.assertTrue(inspect.iscoroutinefunction(f)) + class TokenizerRegrTest(unittest.TestCase): @@ -461,8 +672,7 @@ class CoroutineTest(unittest.TestCase): class Awaitable: pass - async def foo(): - return (await Awaitable()) + async def foo(): return await Awaitable() with self.assertRaisesRegex( TypeError, "object Awaitable can't be used in 'await' expression"): diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 2af7390..ca6b5d0 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -1051,10 +1051,7 @@ class GrammarTests(unittest.TestCase): async def test(): def sum(): - async = 1 - await = 41 - return async + await - + pass if 1: await someobj() diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 42fc78f..e320562 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ 
-786,12 +786,12 @@ Async/await extension: NAME 'def' (2, 2) (2, 5) NAME 'foo' (2, 6) (2, 9) OP '(' (2, 9) (2, 10) - NAME 'await' (2, 10) (2, 15) + AWAIT 'await' (2, 10) (2, 15) OP ')' (2, 15) (2, 16) OP ':' (2, 16) (2, 17) NEWLINE '\\n' (2, 17) (2, 18) INDENT ' ' (3, 0) (3, 4) - NAME 'await' (3, 4) (3, 9) + AWAIT 'await' (3, 4) (3, 9) OP '=' (3, 10) (3, 11) NUMBER '1' (3, 12) (3, 13) NEWLINE '\\n' (3, 13) (3, 14) @@ -829,6 +829,17 @@ Async/await extension: OP ':' (2, 18) (2, 19) NAME 'pass' (2, 20) (2, 24) DEDENT '' (3, 0) (3, 0) + + >>> dump_tokens('''async def foo(async): await''') + ENCODING 'utf-8' (0, 0) (0, 0) + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + OP '(' (1, 13) (1, 14) + ASYNC 'async' (1, 14) (1, 19) + OP ')' (1, 19) (1, 20) + OP ':' (1, 20) (1, 21) + AWAIT 'await' (1, 22) (1, 27) """ from test import support diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 3ec9018..c3efdda 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -501,6 +501,7 @@ def _tokenize(readline, encoding): # 'stashed' and 'ctx' are used for async/await parsing stashed = None ctx = [('sync', 0)] + in_async = 0 if encoding is not None: if encoding == "utf-8-sig": @@ -580,6 +581,9 @@ def _tokenize(readline, encoding): cur_indent = indents[-1] while len(ctx) > 1 and ctx[-1][1] >= cur_indent: + if ctx[-1][0] == 'async': + in_async -= 1 + assert in_async >= 0 ctx.pop() yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) @@ -640,7 +644,7 @@ def _tokenize(readline, encoding): yield TokenInfo(STRING, token, spos, epos, line) elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): - if ctx[-1][0] == 'async' and ctx[-1][1] < indents[-1]: + if in_async: yield TokenInfo( ASYNC if token == 'async' else AWAIT, token, spos, epos, line) @@ -657,6 +661,7 @@ def _tokenize(readline, encoding): and stashed.string == 'async'): ctx.append(('async', indents[-1])) + in_async += 1 yield TokenInfo(ASYNC, stashed.string, stashed.start, stashed.end, diff --git a/Misc/NEWS b/Misc/NEWS index f69138e..ba27a55 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -19,6 +19,9 @@ Core and Builtins - Issue #24407: Fix crash when dict is mutated while being updated. +- Issue #24619: New approach for tokenizing async/await. As a consequence, + it is now possible to have one-line 'async def foo(): await ..' functions. + Library ------- diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index d4476ae..46c0580 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -31,6 +31,12 @@ || c == '_'\ || (c >= 128)) +/* The following DEFTYPE* flags are used in 'tok_state->deftypestack', and should be removed in 3.7, when async/await are regular keywords. 
*/ +#define DEFTYPE_ASYNC 1 +#define DEFTYPE_HAS_NL 2 + extern char *PyOS_Readline(FILE *, FILE *, const char *); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; @@ -130,6 +136,8 @@ tok_new(void) tok->def = 0; tok->defstack[0] = 0; tok->deftypestack[0] = 0; + tok->def_async_behind = 0; + tok->def_in_async = 0; tok->atbol = 1; tok->pendin = 0; @@ -1436,7 +1444,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->pendin++; while (tok->def && tok->defstack[tok->def] >= tok->indent) { + if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) { + tok->def_in_async--; + assert(tok->def_in_async >= 0); + } tok->def--; + assert(tok->def >= 0); } return DEDENT; @@ -1447,6 +1460,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } } + if (!blankline && tok->level == 0 + && tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL + && tok->defstack[tok->def] >= tok->indent) + { + /* The top function on the stack did have a NEWLINE + token, but didn't have an INDENT. That means that + it's a one-line function and it should now be removed + from the stack. */ + if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) { + tok->def_in_async--; + assert(tok->def_in_async >= 0); + } + tok->def--; + assert(tok->def >= 0); + } + again: tok->start = NULL; /* Skip spaces */ @@ -1501,59 +1530,58 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok_len = tok->cur - tok->start; if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) { - if (tok->def && tok->deftypestack[tok->def] == 3) { - tok->deftypestack[tok->def] = 2; + /* The current token is 'def'. */ + if (tok->def + 1 >= MAXINDENT) { + tok->done = E_TOODEEP; + tok->cur = tok->inp; + return ERRORTOKEN; } - else if (tok->defstack[tok->def] < tok->indent) { - /* We advance defs stack only when we see "def" *and* - the indentation level was increased relative to the - previous "def". */ - if (tok->def + 1 >= MAXINDENT) { - tok->done = E_TOODEEP; - tok->cur = tok->inp; - return ERRORTOKEN; - } + /* Advance defs stack. */ + tok->def++; + tok->defstack[tok->def] = tok->indent; - tok->def++; - tok->defstack[tok->def] = tok->indent; - tok->deftypestack[tok->def] = 1; + if (tok->def_async_behind) { + /* The previous token was 'async'. */ + tok->def_async_behind = 0; + tok->deftypestack[tok->def] = DEFTYPE_ASYNC; + tok->def_in_async++; + } + else { + /* This is a regular function (not async def). */ + tok->deftypestack[tok->def] = 0; } } else if (tok_len == 5) { if (memcmp(tok->start, "async", 5) == 0) { + /* The current token is 'async'. */ memcpy(&ahead_tok, tok, sizeof(ahead_tok)); + /* Try to look ahead one token. */ ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, &ahead_top_end); - if (ahead_tok_kind == NAME && - ahead_tok.cur - ahead_tok.start == 3 && - memcmp(ahead_tok.start, "def", 3) == 0) { - - if (tok->def + 1 >= MAXINDENT) { - tok->done = E_TOODEEP; - tok->cur = tok->inp; - return ERRORTOKEN; - } - - tok->def++; - tok->defstack[tok->def] = tok->indent; - tok->deftypestack[tok->def] = 3; - + if (ahead_tok_kind == NAME + && ahead_tok.cur - ahead_tok.start == 3 + && memcmp(ahead_tok.start, "def", 3) == 0) + { + /* The next token is going to be 'def', so instead of + returning an 'async' NAME token, we return ASYNC. 
*/ + tok->def_async_behind = 1; return ASYNC; } - else if (tok->def && tok->deftypestack[tok->def] == 2 - && tok->defstack[tok->def] < tok->indent) { - + else if (tok->def_in_async) + { + /* We're inside an 'async def' function, so we treat the + 'async' token as ASYNC instead of NAME. */ return ASYNC; } } - else if (memcmp(tok->start, "await", 5) == 0 - && tok->def && tok->deftypestack[tok->def] == 2 - && tok->defstack[tok->def] < tok->indent) { - + else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async) + { + /* We're inside an 'async def' function, so we treat the + 'await' token as AWAIT instead of NAME. */ return AWAIT; } } @@ -1569,6 +1597,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; + if (tok->def) { + /* Mark that the top function on the stack had + at least one NEWLINE. That will help us + distinguish one-line functions from functions + with multiple statements. */ + tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL; + } return NEWLINE; } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 3bcdad6..e198a0b 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -66,12 +66,21 @@ struct tok_state { const char* str; const char* input; /* Tokenizer's newline translated copy of the string. */ - int defstack[MAXINDENT]; /* stack if funcs & indents where they - were defined */ - int deftypestack[MAXINDENT]; /* stack of func types - (0 not func; 1: "def name"; - 2: "async def name") */ - int def; /* Length of stack of func types */ + /* `def*` fields are for parsing async/await in a backwards-compatible + way. They should be removed in 3.7, when async/await become + regular keywords. See PEP 492 for more details. */ + int defstack[MAXINDENT]; /* Stack of funcs & indents where they + were defined. */ + int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_* + constants. */ + int def; /* Length of stack of func types/flags. */ + int def_async_behind; /* 1 if there was an 'async' token before + a 'def' token. */ + int def_in_async; /* Counter of how deep 'async def's + are nested. If greater than 0, + we are somewhere in an 'async def' + body, so 'async' and 'await' should + be parsed as keywords. */ }; extern struct tok_state *PyTokenizer_FromString(const char *, int); -- cgit v0.12
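To see the effect at the Python level, here is a minimal sketch driving the patched Lib/tokenize.py through its public API (assuming a Python 3.5 interpreter with this change applied; `bar` is a hypothetical name, and the expected output noted in the comments is abridged):

    import io
    import tokenize

    # Tokenize a one-line coroutine, which this patch makes legal.
    src = "async def foo(): return await bar()\n"
    for tok in tokenize.tokenize(io.BytesIO(src.encode('utf-8')).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))

    # Among the NAME/OP tokens, the stream should now contain
    # ASYNC 'async' and AWAIT 'await' (instead of plain NAME tokens),
    # mirroring the dump_tokens() doctests added to
    # Lib/test/test_tokenize.py above.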