Issue #24619: Simplify async/await tokenization.

This commit simplifies async/await tokenization in tokenizer.c, tokenize.py & lib2to3/pgen2/tokenize.py. The previous solution was to keep a stack of async-def & def blocks, whereas the new approach is just to remember the position of the outermost async-def block. This change won't bring any parsing performance improvements, but it makes the code much easier to read and validate.
author: Yury Selivanov <yselivanov@sprymix.com> 2015-07-23 12:01:58 (GMT)
committer: Yury Selivanov <yselivanov@sprymix.com> 2015-07-23 12:01:58 (GMT)
commit: 96ec934e755355cfc5af036db8641646b7ddb45e (patch)
tree: a6fd6a4cbef1b75ab0cc10db01fd91ecf2e99976 /Lib
parent: f315c1c01676bfabb5b1c6628642668f1ef436a6 (diff)
download: cpython-96ec934e755355cfc5af036db8641646b7ddb45e.zip
cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.gz
cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.bz2
5 files changed, 138 insertions, 30 deletions
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index 896b0fa..1ff1c61 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -366,10 +366,11 @@ def generate_tokens(readline):
     contline = None
     indents = [0]
 
-    # 'stashed' and 'ctx' are used for async/await parsing
+    # 'stashed' and 'async_*' are used for async/await parsing
     stashed = None
-    ctx = [('sync', 0)]
-    in_async = 0
+    async_def = False
+    async_def_indent = 0
+    async_def_nl = False
 
     while 1:                                   # loop over lines in stream
         try:
@@ -438,15 +439,18 @@ def generate_tokens(readline):
                         ("<tokenize>", lnum, pos, line))
                 indents = indents[:-1]
 
-                cur_indent = indents[-1]
-                while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
-                    if ctx[-1][0] == 'async':
-                        in_async -= 1
-                        assert in_async >= 0
-                    ctx.pop()
+                if async_def and async_def_indent >= indents[-1]:
+                    async_def = False
+                    async_def_nl = False
+                    async_def_indent = 0
 
                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
 
+            if async_def and async_def_nl and async_def_indent >= indents[-1]:
+                async_def = False
+                async_def_nl = False
+                async_def_indent = 0
+
         else:                                  # continued statement
             if not line:
                 raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -466,10 +470,13 @@ def generate_tokens(readline):
                     newline = NEWLINE
                     if parenlev > 0:
                         newline = NL
+                    elif async_def:
+                        async_def_nl = True
                     if stashed:
                         yield stashed
                         stashed = None
                     yield (newline, token, spos, epos, line)
+
                 elif initial == '#':
                     assert not token.endswith("\n")
                     if stashed:
@@ -508,7 +515,7 @@ def generate_tokens(readline):
                         yield (STRING, token, spos, epos, line)
                 elif initial in namechars:                 # ordinary name
                     if token in ('async', 'await'):
-                        if in_async:
+                        if async_def:
                             yield (ASYNC if token == 'async' else AWAIT,
                                    token, spos, epos, line)
                             continue
@@ -523,15 +530,13 @@ def generate_tokens(readline):
                                 and stashed[0] == NAME
                                 and stashed[1] == 'async'):
 
-                            ctx.append(('async', indents[-1]))
-                            in_async += 1
+                            async_def = True
+                            async_def_indent = indents[-1]
 
                             yield (ASYNC, stashed[1],
                                    stashed[2], stashed[3],
                                    stashed[4])
                             stashed = None
-                        else:
-                            ctx.append(('sync', indents[-1]))
 
                     if stashed:
                         yield stashed
diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py
index 107b5ab..b533c01 100644
--- a/Lib/lib2to3/tests/test_parser.py
+++ b/Lib/lib2to3/tests/test_parser.py
@@ -67,10 +67,32 @@ class TestAsyncAwait(GrammarTest):
                              await x
                       """)
 
+        self.validate("""async def foo():
+
+            def foo(): pass
+
+            def foo(): pass
+
+            await x
+        """)
+
+        self.validate("""async def foo(): return await a""")
+
+        self.validate("""def foo():
+            def foo(): pass
+            async def foo(): await x
+        """)
+
         self.invalid_syntax("await x")
         self.invalid_syntax("""def foo():
                                    await x""")
 
+        self.invalid_syntax("""def foo():
+            def foo(): pass
+            async def foo(): pass
+            await x
+        """)
+
     def test_async_var(self):
         self.validate("""async = 1""")
         self.validate("""await = 1""")
diff --git a/Lib/test/test_coroutines.py b/Lib/test/test_coroutines.py
index 14682ca..10de856 100644
--- a/Lib/test/test_coroutines.py
+++ b/Lib/test/test_coroutines.py
@@ -330,6 +330,7 @@ class AsyncBadSyntaxTest(unittest.TestCase):
         async def f():
             async def g(): pass
             await z
+        await = 1
         self.assertTrue(inspect.iscoroutinefunction(f))
 
 
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index e320562..b7ca089 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -840,6 +840,79 @@ Async/await extension:
     OP         ')'           (1, 19) (1, 20)
     OP         ':'           (1, 20) (1, 21)
     AWAIT      'await'       (1, 22) (1, 27)
+
+    >>> dump_tokens('''def f():
+    ...
+    ...   def baz(): pass
+    ...   async def bar(): pass
+    ...
+    ...   await = 2''')
+    ENCODING   'utf-8'       (0, 0) (0, 0)
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'f'           (1, 4) (1, 5)
+    OP         '('           (1, 5) (1, 6)
+    OP         ')'           (1, 6) (1, 7)
+    OP         ':'           (1, 7) (1, 8)
+    NEWLINE    '\\n'          (1, 8) (1, 9)
+    NL         '\\n'          (2, 0) (2, 1)
+    INDENT     '  '          (3, 0) (3, 2)
+    NAME       'def'         (3, 2) (3, 5)
+    NAME       'baz'         (3, 6) (3, 9)
+    OP         '('           (3, 9) (3, 10)
+    OP         ')'           (3, 10) (3, 11)
+    OP         ':'           (3, 11) (3, 12)
+    NAME       'pass'        (3, 13) (3, 17)
+    NEWLINE    '\\n'          (3, 17) (3, 18)
+    ASYNC      'async'       (4, 2) (4, 7)
+    NAME       'def'         (4, 8) (4, 11)
+    NAME       'bar'         (4, 12) (4, 15)
+    OP         '('           (4, 15) (4, 16)
+    OP         ')'           (4, 16) (4, 17)
+    OP         ':'           (4, 17) (4, 18)
+    NAME       'pass'        (4, 19) (4, 23)
+    NEWLINE    '\\n'          (4, 23) (4, 24)
+    NL         '\\n'          (5, 0) (5, 1)
+    NAME       'await'       (6, 2) (6, 7)
+    OP         '='           (6, 8) (6, 9)
+    NUMBER     '2'           (6, 10) (6, 11)
+    DEDENT     ''            (7, 0) (7, 0)
+
+    >>> dump_tokens('''async def f():
+    ...
+    ...   def baz(): pass
+    ...   async def bar(): pass
+    ...
+    ...   await = 2''')
+    ENCODING   'utf-8'       (0, 0) (0, 0)
+    ASYNC      'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'f'           (1, 10) (1, 11)
+    OP         '('           (1, 11) (1, 12)
+    OP         ')'           (1, 12) (1, 13)
+    OP         ':'           (1, 13) (1, 14)
+    NEWLINE    '\\n'          (1, 14) (1, 15)
+    NL         '\\n'          (2, 0) (2, 1)
+    INDENT     '  '          (3, 0) (3, 2)
+    NAME       'def'         (3, 2) (3, 5)
+    NAME       'baz'         (3, 6) (3, 9)
+    OP         '('           (3, 9) (3, 10)
+    OP         ')'           (3, 10) (3, 11)
+    OP         ':'           (3, 11) (3, 12)
+    NAME       'pass'        (3, 13) (3, 17)
+    NEWLINE    '\\n'          (3, 17) (3, 18)
+    ASYNC      'async'       (4, 2) (4, 7)
+    NAME       'def'         (4, 8) (4, 11)
+    NAME       'bar'         (4, 12) (4, 15)
+    OP         '('           (4, 15) (4, 16)
+    OP         ')'           (4, 16) (4, 17)
+    OP         ':'           (4, 17) (4, 18)
+    NAME       'pass'        (4, 19) (4, 23)
+    NEWLINE    '\\n'          (4, 23) (4, 24)
+    NL         '\\n'          (5, 0) (5, 1)
+    AWAIT      'await'       (6, 2) (6, 7)
+    OP         '='           (6, 8) (6, 9)
+    NUMBER     '2'           (6, 10) (6, 11)
+    DEDENT     ''            (7, 0) (7, 0)
 """
 
 from test import support
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index c3efdda..65d06e5 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -498,10 +498,11 @@ def _tokenize(readline, encoding):
     contline = None
     indents = [0]
 
-    # 'stashed' and 'ctx' are used for async/await parsing
+    # 'stashed' and 'async_*' are used for async/await parsing
     stashed = None
-    ctx = [('sync', 0)]
-    in_async = 0
+    async_def = False
+    async_def_indent = 0
+    async_def_nl = False
 
     if encoding is not None:
         if encoding == "utf-8-sig":
@@ -579,15 +580,18 @@ def _tokenize(readline, encoding):
                         ("<tokenize>", lnum, pos, line))
                 indents = indents[:-1]
 
-                cur_indent = indents[-1]
-                while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
-                    if ctx[-1][0] == 'async':
-                        in_async -= 1
-                        assert in_async >= 0
-                    ctx.pop()
+                if async_def and async_def_indent >= indents[-1]:
+                    async_def = False
+                    async_def_nl = False
+                    async_def_indent = 0
 
                 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
 
+            if async_def and async_def_nl and async_def_indent >= indents[-1]:
+                async_def = False
+                async_def_nl = False
+                async_def_indent = 0
+
         else:                                  # continued statement
             if not line:
                 raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -609,8 +613,13 @@ def _tokenize(readline, encoding):
                     if stashed:
                         yield stashed
                         stashed = None
-                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
-                           token, spos, epos, line)
+                    if parenlev > 0:
+                        yield TokenInfo(NL, token, spos, epos, line)
+                    else:
+                        yield TokenInfo(NEWLINE, token, spos, epos, line)
+                        if async_def:
+                            async_def_nl = True
+
                 elif initial == '#':
                     assert not token.endswith("\n")
                     if stashed:
@@ -644,7 +653,7 @@ def _tokenize(readline, encoding):
                         yield TokenInfo(STRING, token, spos, epos, line)
                 elif initial.isidentifier():               # ordinary name
                     if token in ('async', 'await'):
-                        if in_async:
+                        if async_def:
                             yield TokenInfo(
                                 ASYNC if token == 'async' else AWAIT,
                                 token, spos, epos, line)
@@ -660,15 +669,13 @@ def _tokenize(readline, encoding):
                                 and stashed.type == NAME
                                 and stashed.string == 'async'):
 
-                            ctx.append(('async', indents[-1]))
-                            in_async += 1
+                            async_def = True
+                            async_def_indent = indents[-1]
 
                             yield TokenInfo(ASYNC, stashed.string,
                                             stashed.start, stashed.end,
                                             stashed.line)
                             stashed = None
-                        else:
-                            ctx.append(('sync', indents[-1]))
 
                     if stashed:
                         yield stashed
author	Yury Selivanov <yselivanov@sprymix.com>	2015-07-23 12:01:58 (GMT)
committer	Yury Selivanov <yselivanov@sprymix.com>	2015-07-23 12:01:58 (GMT)
commit	96ec934e755355cfc5af036db8641646b7ddb45e (patch)
tree	a6fd6a4cbef1b75ab0cc10db01fd91ecf2e99976 /Lib
parent	f315c1c01676bfabb5b1c6628642668f1ef436a6 (diff)
download	cpython-96ec934e755355cfc5af036db8641646b7ddb45e.zip cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.gz cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.bz2