From 8fb307cd650511ba019c4493275cb6684ad308bc Mon Sep 17 00:00:00 2001 From: Yury Selivanov Date: Wed, 22 Jul 2015 13:33:45 +0300 Subject: Issue #24619: New approach for tokenizing async/await. This commit fixes how one-line async-defs and defs are tracked by the tokenizer. It makes it possible to correctly parse invalid code such as: >>> async def f(): ... def g(): pass ... async = 10 and valid code such as: >>> async def f(): ... async def g(): pass ... await z As a consequence, it is now possible to have one-line 'async def foo(): await ..' functions: >>> async def foo(): return await bar() --- Doc/reference/compound_stmts.rst | 4 +- Lib/lib2to3/pgen2/tokenize.py | 12 ++- Lib/test/badsyntax_async1.py | 5 +- Lib/test/badsyntax_async2.py | 5 +- Lib/test/badsyntax_async4.py | 2 +- Lib/test/badsyntax_async9.py | 2 - Lib/test/test_coroutines.py | 226 +++++++++++++++++++++++++++++++++++++-- Lib/test/test_grammar.py | 5 +- Lib/test/test_tokenize.py | 15 ++- Lib/tokenize.py | 7 +- Misc/NEWS | 3 + Parser/tokenizer.c | 105 ++++++++++++------ Parser/tokenizer.h | 21 ++-- 13 files changed, 343 insertions(+), 69 deletions(-) delete mode 100644 Lib/test/badsyntax_async9.py diff --git a/Doc/reference/compound_stmts.rst b/Doc/reference/compound_stmts.rst index 76b3850..71f240f 100644 --- a/Doc/reference/compound_stmts.rst +++ b/Doc/reference/compound_stmts.rst @@ -685,9 +685,7 @@ Execution of Python coroutines can be suspended and resumed at many points (see :term:`coroutine`). In the body of a coroutine, any ``await`` and ``async`` identifiers become reserved keywords; :keyword:`await` expressions, :keyword:`async for` and :keyword:`async with` can only be used in -coroutine bodies. However, to simplify the parser, these keywords cannot -be used on the same line as a function or coroutine (:keyword:`def` -statement) header. +coroutine bodies. Functions defined with ``async def`` syntax are always coroutine functions, even if they do not contain ``await`` or ``async`` keywords. 
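As an illustration of the rule the updated documentation describes, here is a minimal sketch of what is accepted and rejected after this change (assuming an interpreter with this patch applied; `bar` is a hypothetical awaitable, not part of the patch):

    >>> async def foo(): return await bar()   # one-line coroutine, now accepted
    >>> def spam(await): return await         # 'await' stays an ordinary name outside coroutine bodies
    >>> async def f(): await = 1              # still a SyntaxError: 'await' is reserved here
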
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py index 690fec4..896b0fa 100644 --- a/Lib/lib2to3/pgen2/tokenize.py +++ b/Lib/lib2to3/pgen2/tokenize.py @@ -369,6 +369,7 @@ def generate_tokens(readline): # 'stashed' and 'ctx' are used for async/await parsing stashed = None ctx = [('sync', 0)] + in_async = 0 while 1: # loop over lines in stream try: @@ -436,6 +437,14 @@ def generate_tokens(readline): "unindent does not match any outer indentation level", ("", lnum, pos, line)) indents = indents[:-1] + + cur_indent = indents[-1] + while len(ctx) > 1 and ctx[-1][1] >= cur_indent: + if ctx[-1][0] == 'async': + in_async -= 1 + assert in_async >= 0 + ctx.pop() + yield (DEDENT, '', (lnum, pos), (lnum, pos), line) else: # continued statement @@ -499,7 +508,7 @@ def generate_tokens(readline): yield (STRING, token, spos, epos, line) elif initial in namechars: # ordinary name if token in ('async', 'await'): - if ctx[-1][0] == 'async' and ctx[-1][1] < indents[-1]: + if in_async: yield (ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue @@ -515,6 +524,7 @@ def generate_tokens(readline): and stashed[1] == 'async'): ctx.append(('async', indents[-1])) + in_async += 1 yield (ASYNC, stashed[1], stashed[2], stashed[3], diff --git a/Lib/test/badsyntax_async1.py b/Lib/test/badsyntax_async1.py index 970445d..fb85e29 100644 --- a/Lib/test/badsyntax_async1.py +++ b/Lib/test/badsyntax_async1.py @@ -1,3 +1,2 @@ -async def foo(): - def foo(a=await something()): - pass +async def foo(a=await something()): + pass diff --git a/Lib/test/badsyntax_async2.py b/Lib/test/badsyntax_async2.py index 1e62a3e..6f6f4f5 100644 --- a/Lib/test/badsyntax_async2.py +++ b/Lib/test/badsyntax_async2.py @@ -1,3 +1,2 @@ -async def foo(): - def foo(a:await something()): - pass +async def foo(a:await something()): + pass diff --git a/Lib/test/badsyntax_async4.py b/Lib/test/badsyntax_async4.py index 4afda40..d033b28 100644 --- a/Lib/test/badsyntax_async4.py +++ b/Lib/test/badsyntax_async4.py @@ -1,2 +1,2 @@ async def foo(): - async def foo(): await something() + await diff --git a/Lib/test/badsyntax_async9.py b/Lib/test/badsyntax_async9.py deleted file mode 100644 index d033b28..0000000 --- a/Lib/test/badsyntax_async9.py +++ /dev/null @@ -1,2 +0,0 @@ -async def foo(): - await diff --git a/Lib/test/test_coroutines.py b/Lib/test/test_coroutines.py index 9d97123..3ba2f23 100644 --- a/Lib/test/test_coroutines.py +++ b/Lib/test/test_coroutines.py @@ -67,11 +67,11 @@ def silence_coro_gc(): class AsyncBadSyntaxTest(unittest.TestCase): def test_badsyntax_1(self): - with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): + with self.assertRaisesRegex(SyntaxError, "'await' outside"): import test.badsyntax_async1 def test_badsyntax_2(self): - with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): + with self.assertRaisesRegex(SyntaxError, "'await' outside"): import test.badsyntax_async2 def test_badsyntax_3(self): @@ -103,10 +103,6 @@ class AsyncBadSyntaxTest(unittest.TestCase): import test.badsyntax_async8 def test_badsyntax_9(self): - with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): - import test.badsyntax_async9 - - def test_badsyntax_10(self): ns = {} for comp in {'(await a for a in b)', '[await a for a in b]', @@ -116,6 +112,221 @@ class AsyncBadSyntaxTest(unittest.TestCase): with self.assertRaisesRegex(SyntaxError, 'await.*in comprehen'): exec('async def f():\n\t{}'.format(comp), ns, ns) + def test_badsyntax_10(self): + # Tests for issue 24619 + + samples = [ + """async def foo(): + 
def bar(): pass + await = 1 + """, + + """async def foo(): + + def bar(): pass + await = 1 + """, + + """async def foo(): + def bar(): pass + if 1: + await = 1 + """, + + """def foo(): + async def bar(): pass + if 1: + await a + """, + + """def foo(): + async def bar(): pass + await a + """, + + """def foo(): + def baz(): pass + async def bar(): pass + await a + """, + + """def foo(): + def baz(): pass + # 456 + async def bar(): pass + # 123 + await a + """, + + """async def foo(): + def baz(): pass + # 456 + async def bar(): pass + # 123 + await = 2 + """, + + """def foo(): + + def baz(): pass + + async def bar(): pass + + await a + """, + + """async def foo(): + + def baz(): pass + + async def bar(): pass + + await = 2 + """, + + """async def foo(): + def async(): pass + """, + + """async def foo(): + def await(): pass + """, + + """async def foo(): + def bar(): + await + """, + + """async def foo(): + return lambda async: await + """, + + """async def foo(): + return lambda a: await + """, + + """async def foo(a: await b): + pass + """, + + """def baz(): + async def foo(a: await b): + pass + """, + + """async def foo(async): + pass + """, + + """async def foo(): + def bar(): + def baz(): + async = 1 + """, + + """async def foo(): + def bar(): + def baz(): + pass + async = 1 + """, + + """def foo(): + async def bar(): + + async def baz(): + pass + + def baz(): + 42 + + async = 1 + """, + + """async def foo(): + def bar(): + def baz(): + pass\nawait foo() + """, + + """def foo(): + def bar(): + async def baz(): + pass\nawait foo() + """, + + """async def foo(await): + pass + """, + + """def foo(): + + async def bar(): pass + + await a + """, + + """def foo(): + async def bar(): + pass\nawait a + """] + + ns = {} + for code in samples: + with self.subTest(code=code), self.assertRaises(SyntaxError): + exec(code, ns, ns) + + def test_goodsyntax_1(self): + # Tests for issue 24619 + + def foo(await): + async def foo(): pass + async def foo(): + pass + return await + 1 + self.assertEqual(foo(10), 11) + + def foo(await): + async def foo(): pass + async def foo(): pass + return await + 2 + self.assertEqual(foo(20), 22) + + def foo(await): + + async def foo(): pass + + async def foo(): pass + + return await + 2 + self.assertEqual(foo(20), 22) + + def foo(await): + """spam""" + async def foo(): \ + pass + # 123 + async def foo(): pass + # 456 + return await + 2 + self.assertEqual(foo(20), 22) + + def foo(await): + def foo(): pass + def foo(): pass + async def bar(): return await_ + await_ = await + try: + bar().send(None) + except StopIteration as ex: + return ex.args[0] + self.assertEqual(foo(42), 42) + + async def f(): + async def g(): pass + await z + self.assertTrue(inspect.iscoroutinefunction(f)) + class TokenizerRegrTest(unittest.TestCase): @@ -461,8 +672,7 @@ class CoroutineTest(unittest.TestCase): class Awaitable: pass - async def foo(): - return (await Awaitable()) + async def foo(): return await Awaitable() with self.assertRaisesRegex( TypeError, "object Awaitable can't be used in 'await' expression"): diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 2af7390..ca6b5d0 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -1051,10 +1051,7 @@ class GrammarTests(unittest.TestCase): async def test(): def sum(): - async = 1 - await = 41 - return async + await - + pass if 1: await someobj() diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 42fc78f..e320562 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ 
-786,12 +786,12 @@ Async/await extension: NAME 'def' (2, 2) (2, 5) NAME 'foo' (2, 6) (2, 9) OP '(' (2, 9) (2, 10) - NAME 'await' (2, 10) (2, 15) + AWAIT 'await' (2, 10) (2, 15) OP ')' (2, 15) (2, 16) OP ':' (2, 16) (2, 17) NEWLINE '\\n' (2, 17) (2, 18) INDENT ' ' (3, 0) (3, 4) - NAME 'await' (3, 4) (3, 9) + AWAIT 'await' (3, 4) (3, 9) OP '=' (3, 10) (3, 11) NUMBER '1' (3, 12) (3, 13) NEWLINE '\\n' (3, 13) (3, 14) @@ -829,6 +829,17 @@ Async/await extension: OP ':' (2, 18) (2, 19) NAME 'pass' (2, 20) (2, 24) DEDENT '' (3, 0) (3, 0) + + >>> dump_tokens('''async def foo(async): await''') + ENCODING 'utf-8' (0, 0) (0, 0) + ASYNC 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + OP '(' (1, 13) (1, 14) + ASYNC 'async' (1, 14) (1, 19) + OP ')' (1, 19) (1, 20) + OP ':' (1, 20) (1, 21) + AWAIT 'await' (1, 22) (1, 27) """ from test import support diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 3ec9018..c3efdda 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -501,6 +501,7 @@ def _tokenize(readline, encoding): # 'stashed' and 'ctx' are used for async/await parsing stashed = None ctx = [('sync', 0)] + in_async = 0 if encoding is not None: if encoding == "utf-8-sig": @@ -580,6 +581,9 @@ def _tokenize(readline, encoding): cur_indent = indents[-1] while len(ctx) > 1 and ctx[-1][1] >= cur_indent: + if ctx[-1][0] == 'async': + in_async -= 1 + assert in_async >= 0 ctx.pop() yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) @@ -640,7 +644,7 @@ def _tokenize(readline, encoding): yield TokenInfo(STRING, token, spos, epos, line) elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): - if ctx[-1][0] == 'async' and ctx[-1][1] < indents[-1]: + if in_async: yield TokenInfo( ASYNC if token == 'async' else AWAIT, token, spos, epos, line) @@ -657,6 +661,7 @@ def _tokenize(readline, encoding): and stashed.string == 'async'): ctx.append(('async', indents[-1])) + in_async += 1 yield TokenInfo(ASYNC, stashed.string, stashed.start, stashed.end, diff --git a/Misc/NEWS b/Misc/NEWS index f69138e..ba27a55 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -19,6 +19,9 @@ Core and Builtins - Issue #24407: Fix crash when dict is mutated while being updated. +- Issue #24619: New approach for tokenizing async/await. As a consequence, + it is now possible to have one-line 'async def foo(): await ..' functions. + Library ------- diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index d4476ae..46c0580 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -31,6 +31,12 @@ || c == '_'\ || (c >= 128)) +/* The following DEFTYPE* flags are used in 'tok_state->deftypestack', and should be removed in 3.7, when async/await are regular keywords. 
*/ +#define DEFTYPE_ASYNC 1 +#define DEFTYPE_HAS_NL 2 + extern char *PyOS_Readline(FILE *, FILE *, const char *); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; @@ -130,6 +136,8 @@ tok_new(void) tok->def = 0; tok->defstack[0] = 0; tok->deftypestack[0] = 0; + tok->def_async_behind = 0; + tok->def_in_async = 0; tok->atbol = 1; tok->pendin = 0; @@ -1436,7 +1444,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->pendin++; while (tok->def && tok->defstack[tok->def] >= tok->indent) { + if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) { + tok->def_in_async--; + assert(tok->def_in_async >= 0); + } tok->def--; + assert(tok->def >= 0); } return DEDENT; @@ -1447,6 +1460,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } } + if (!blankline && tok->level == 0 + && tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL + && tok->defstack[tok->def] >= tok->indent) + { + /* The top function on the stack did have a NEWLINE + token, but didn't have an INDENT. That means that + it's a one-line function and it should now be removed + from the stack. */ + if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) { + tok->def_in_async--; + assert(tok->def_in_async >= 0); + } + tok->def--; + assert(tok->def >= 0); + } + again: tok->start = NULL; /* Skip spaces */ @@ -1501,59 +1530,58 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok_len = tok->cur - tok->start; if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) { - if (tok->def && tok->deftypestack[tok->def] == 3) { - tok->deftypestack[tok->def] = 2; + /* The current token is 'def'. */ + if (tok->def + 1 >= MAXINDENT) { + tok->done = E_TOODEEP; + tok->cur = tok->inp; + return ERRORTOKEN; } - else if (tok->defstack[tok->def] < tok->indent) { - /* We advance defs stack only when we see "def" *and* - the indentation level was increased relative to the - previous "def". */ - if (tok->def + 1 >= MAXINDENT) { - tok->done = E_TOODEEP; - tok->cur = tok->inp; - return ERRORTOKEN; - } + /* Advance defs stack. */ + tok->def++; + tok->defstack[tok->def] = tok->indent; - tok->def++; - tok->defstack[tok->def] = tok->indent; - tok->deftypestack[tok->def] = 1; + if (tok->def_async_behind) { + /* The previous token was 'async'. */ + tok->def_async_behind = 0; + tok->deftypestack[tok->def] = DEFTYPE_ASYNC; + tok->def_in_async++; + } + else { + /* This is a regular function (not async def). */ + tok->deftypestack[tok->def] = 0; } } else if (tok_len == 5) { if (memcmp(tok->start, "async", 5) == 0) { + /* The current token is 'async'. */ memcpy(&ahead_tok, tok, sizeof(ahead_tok)); + /* Try to look ahead one token. */ ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, &ahead_top_end); - if (ahead_tok_kind == NAME && - ahead_tok.cur - ahead_tok.start == 3 && - memcmp(ahead_tok.start, "def", 3) == 0) { - - if (tok->def + 1 >= MAXINDENT) { - tok->done = E_TOODEEP; - tok->cur = tok->inp; - return ERRORTOKEN; - } - - tok->def++; - tok->defstack[tok->def] = tok->indent; - tok->deftypestack[tok->def] = 3; - + if (ahead_tok_kind == NAME + && ahead_tok.cur - ahead_tok.start == 3 + && memcmp(ahead_tok.start, "def", 3) == 0) + { + /* The next token is going to be 'def', so instead of + returning an 'async' NAME token, we return ASYNC. 
*/ + tok->def_async_behind = 1; return ASYNC; } - else if (tok->def && tok->deftypestack[tok->def] == 2 - && tok->defstack[tok->def] < tok->indent) { - + else if (tok->def_in_async) + { + /* We're inside an 'async def' function, so we treat the + 'async' token as ASYNC instead of NAME. */ return ASYNC; } } - else if (memcmp(tok->start, "await", 5) == 0 - && tok->def && tok->deftypestack[tok->def] == 2 - && tok->defstack[tok->def] < tok->indent) { - + else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async) + { + /* We're inside an 'async def' function, so we treat the + 'await' token as AWAIT instead of NAME. */ return AWAIT; } } @@ -1569,6 +1597,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; + if (tok->def) { + /* Mark that the top function on the stack had + at least one NEWLINE. That will help us + distinguish one-line functions from functions + with multiple statements. */ + tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL; + } return NEWLINE; } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 3bcdad6..e198a0b 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -66,12 +66,21 @@ struct tok_state { const char* str; const char* input; /* Tokenizer's newline translated copy of the string. */ - int defstack[MAXINDENT]; /* stack if funcs & indents where they - were defined */ - int deftypestack[MAXINDENT]; /* stack of func types - (0 not func; 1: "def name"; - 2: "async def name") */ - int def; /* Length of stack of func types */ + /* `def*` fields are for parsing async/await in a backwards-compatible + way. They should be removed in 3.7, when async/await become + regular keywords. See PEP 492 for more details. */ + int defstack[MAXINDENT]; /* Stack of funcs & indents where they + were defined. */ + int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_* + constants. */ + int def; /* Length of stack of func types/flags. */ + int def_async_behind; /* 1 if there was an 'async' token before + a 'def' token. */ + int def_in_async; /* Counter of how deep 'async def's + are nested. If greater than 0, + we are somewhere in an 'async def' + body, so 'async' and 'await' should + be parsed as keywords. */ }; extern struct tok_state *PyTokenizer_FromString(const char *, int); -- cgit v0.12
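To see the effect at the Python level, here is a minimal sketch driving the patched Lib/tokenize.py through its public API (assuming a Python 3.5 interpreter with this change applied; `bar` is a hypothetical name, and the expected output noted in the comments is abridged):

    import io
    import tokenize

    # Tokenize a one-line coroutine, which this patch makes legal.
    src = "async def foo(): return await bar()\n"
    for tok in tokenize.tokenize(io.BytesIO(src.encode('utf-8')).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))

    # Among the NAME/OP tokens, the stream should now contain
    # ASYNC 'async' and AWAIT 'await' (instead of plain NAME tokens),
    # mirroring the dump_tokens() doctests added to
    # Lib/test/test_tokenize.py above.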