summaryrefslogtreecommitdiffstats
path: root/Parser
diff options
context:
space:
mode:
authorYury Selivanov <yselivanov@sprymix.com>2015-07-23 12:01:58 (GMT)
committerYury Selivanov <yselivanov@sprymix.com>2015-07-23 12:01:58 (GMT)
commit96ec934e755355cfc5af036db8641646b7ddb45e (patch)
treea6fd6a4cbef1b75ab0cc10db01fd91ecf2e99976 /Parser
parentf315c1c01676bfabb5b1c6628642668f1ef436a6 (diff)
downloadcpython-96ec934e755355cfc5af036db8641646b7ddb45e.zip
cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.gz
cpython-96ec934e755355cfc5af036db8641646b7ddb45e.tar.bz2
Issue #24619: Simplify async/await tokenization.
This commit simplifies async/await tokenization in tokenizer.c, tokenize.py & lib2to3/tokenize.py. The previous solution was to keep a stack of async-def & def blocks, whereas the new approach is just to remember the position of the outermost async-def block. This change won't bring any parsing performance improvements, but it makes the code much easier to read and validate.
Diffstat (limited to 'Parser')
-rw-r--r--Parser/tokenizer.c126
-rw-r--r--Parser/tokenizer.h21
2 files changed, 45 insertions, 102 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 46c0580..04baeaf 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -31,12 +31,6 @@
|| c == '_'\
|| (c >= 128))
-/* The following DEFTYPE* flags are used in 'tok_state->deftypestack',
- and should be removed in 3.7, when async/await are regular
- keywords. */
-#define DEFTYPE_ASYNC 1
-#define DEFTYPE_HAS_NL 2
-
extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
@@ -133,12 +127,6 @@ tok_new(void)
tok->indent = 0;
tok->indstack[0] = 0;
- tok->def = 0;
- tok->defstack[0] = 0;
- tok->deftypestack[0] = 0;
- tok->def_async_behind = 0;
- tok->def_in_async = 0;
-
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
@@ -159,6 +147,11 @@ tok_new(void)
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
#endif
+
+ tok->async_def = 0;
+ tok->async_def_indent = 0;
+ tok->async_def_nl = 0;
+
return tok;
}
@@ -1350,11 +1343,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
int c;
int blankline, nonascii;
- int tok_len;
- struct tok_state ahead_tok;
- char *ahead_tok_start = NULL, *ahead_top_end = NULL;
- int ahead_tok_kind;
-
*p_start = *p_end = NULL;
nextline:
tok->start = NULL;
@@ -1442,16 +1430,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
-
- while (tok->def && tok->defstack[tok->def] >= tok->indent) {
- if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
- tok->def_in_async--;
- assert(tok->def_in_async >= 0);
- }
- tok->def--;
- assert(tok->def >= 0);
- }
-
return DEDENT;
}
else {
@@ -1460,20 +1438,19 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
}
- if (!blankline && tok->level == 0
- && tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL
- && tok->defstack[tok->def] >= tok->indent)
+ if (tok->async_def
+ && !blankline
+ && tok->level == 0
+ /* There was a NEWLINE after ASYNC DEF,
+ so we're past the signature. */
+ && tok->async_def_nl
+ /* Current indentation level is less than or equal to
+ that of the line where the async function was defined */
+ && tok->async_def_indent >= tok->indent)
{
- /* The top function on the stack did have a NEWLINE
- token, but didn't have an INDENT. That means that
- it's a one-line function and it should now be removed
- from the stack. */
- if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
- tok->def_in_async--;
- assert(tok->def_in_async >= 0);
- }
- tok->def--;
- assert(tok->def >= 0);
+ tok->async_def = 0;
+ tok->async_def_indent = 0;
+ tok->async_def_nl = 0;
}
again:
@@ -1528,38 +1505,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur;
- tok_len = tok->cur - tok->start;
- if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) {
- /* The current token is 'def'. */
- if (tok->def + 1 >= MAXINDENT) {
- tok->done = E_TOODEEP;
- tok->cur = tok->inp;
- return ERRORTOKEN;
+ /* async/await parsing block. */
+ if (tok->cur - tok->start == 5) {
+ /* Current token length is 5. */
+ if (tok->async_def) {
+ /* We're inside an 'async def' function. */
+ if (memcmp(tok->start, "async", 5) == 0)
+ return ASYNC;
+ if (memcmp(tok->start, "await", 5) == 0)
+ return AWAIT;
}
+ else if (memcmp(tok->start, "async", 5) == 0) {
+ /* The current token is 'async'.
+ Look ahead one token.*/
- /* Advance defs stack. */
- tok->def++;
- tok->defstack[tok->def] = tok->indent;
+ struct tok_state ahead_tok;
+ char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
+ int ahead_tok_kind;
- if (tok->def_async_behind) {
- /* The previous token was 'async'. */
- tok->def_async_behind = 0;
- tok->deftypestack[tok->def] = DEFTYPE_ASYNC;
- tok->def_in_async++;
- }
- else {
- /* This is a regular function (not async def). */
- tok->deftypestack[tok->def] = 0;
- }
- }
- else if (tok_len == 5) {
- if (memcmp(tok->start, "async", 5) == 0) {
- /* The current token is 'async'. */
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
-
- /* Try to look ahead one token. */
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_top_end);
+ &ahead_tok_end);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@@ -1567,22 +1533,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
/* The next token is going to be 'def', so instead of
returning 'async' NAME token, we return ASYNC. */
- tok->def_async_behind = 1;
+ tok->async_def_indent = tok->indent;
+ tok->async_def = 1;
return ASYNC;
}
- else if (tok->def_in_async)
- {
- /* We're inside an 'async def' function, so we treat
- 'async' token as ASYNC, instead of NAME. */
- return ASYNC;
- }
-
- }
- else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async)
- {
- /* We're inside an 'async def' function, so we treat
- 'await' token as AWAIT, instead of NAME. */
- return AWAIT;
}
}
@@ -1597,12 +1551,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
- if (tok->def) {
- /* Mark the top function on the stack that it had
- at least one NEWLINE. That will help us to
- distinguish one-line functions from functions
- with multiple statements. */
- tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL;
+ if (tok->async_def) {
+ /* We're somewhere inside an 'async def' function, and
+ we've encountered a NEWLINE after its signature. */
+ tok->async_def_nl = 1;
}
return NEWLINE;
}
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index e198a0b..af053e2 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -66,21 +66,12 @@ struct tok_state {
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */
- /* `def*` fields are for parsing async/await in a backwards compatible
- way. They should be removed in 3.7, when they will become
- regular constants. See PEP 492 for more details. */
- int defstack[MAXINDENT]; /* Stack of funcs & indents where they
- were defined. */
- int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_*
- constants. */
- int def; /* Length of stack of func types/flags. */
- int def_async_behind; /* 1 if there was an 'async' token before
- a 'def' token. */
- int def_in_async; /* Counter of how deep 'async def's
- are nested. If greater than 0,
- we are somewhere in an 'async def'
- body, so 'async' and 'await' should
- be parsed as keywords.*/
+ /* async/await related fields; can be removed in 3.7 when async and await
+ become normal keywords. */
+ int async_def; /* =1 if tokens are inside an 'async def' body. */
+ int async_def_indent; /* Indentation level of the outermost 'async def'. */
+ int async_def_nl; /* =1 if the outermost 'async def' had at least one
+ NEWLINE token after it. */
};
extern struct tok_state *PyTokenizer_FromString(const char *, int);