author     Marta Gómez Macías <mgmacias@google.com>   2023-05-21 00:03:02 (GMT)
committer  GitHub <noreply@github.com>                2023-05-21 00:03:02 (GMT)
commit     6715f91edcf6f379f666e18f57b8a0dcb724bf79 (patch)
tree       25724d6eb5b8ff5e713f7bfd8f6c33e5a6d87f62 /Parser
parent     3ed57e4995d9f8583083483f397ddc3131720953 (diff)
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with an
implementation that reuses the real C tokenizer via a private extension module.
The tokenize module now implements a compatibility layer that transforms tokens
from the C tokenizer into the tokens the old pure-Python tokenizer produced, for
backward compatibility.
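
A minimal sketch of the behaviour this compatibility layer preserves, using only
the public tokenize API (the private extension module is an internal detail):

    # The public API still yields the classic token stream, including
    # COMMENT tokens that the C tokenizer does not produce by default.
    import io
    import tokenize

    source = "x = 1  # a comment\n"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
    # -> NAME 'x', OP '=', NUMBER '1', COMMENT '# a comment',
    #    NEWLINE '\n', ENDMARKER ''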
As the C tokenizer does not emit some tokens that the Python tokenizer provides
(such as comments and non-semantic newlines), a new special mode has been added to
the C tokenizer; it is currently used only through the extension module that
exposes it to the Python layer. This new mode forces the C tokenizer to emit these
extra tokens and to attach the metadata needed to match the old Python
implementation.
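
These extra tokens are observable through the public API; a minimal sketch
(the expected token stream is shown in the trailing comments):

    # NL marks a non-semantic newline (a blank line, or the newline that
    # ends a comment-only line), while NEWLINE terminates a logical line.
    # COMMENT and NL are exactly the tokens the special mode synthesizes.
    import io
    import tokenize

    source = "# comment-only line\n\nx = 1\n"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
    # -> COMMENT '# comment-only line', NL '\n', NL '\n',
    #    NAME 'x', OP '=', NUMBER '1', NEWLINE '\n', ENDMARKER ''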
Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Diffstat (limited to 'Parser')
-rw-r--r--   Parser/pegen.c        |  4
-rw-r--r--   Parser/pegen_errors.c |  4
-rw-r--r--   Parser/token.c        |  4
-rw-r--r--   Parser/tokenizer.c    | 57
-rw-r--r--   Parser/tokenizer.h    |  4
5 files changed, 65 insertions, 8 deletions
diff --git a/Parser/pegen.c b/Parser/pegen.c
index da410ea..b031a6f 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -208,7 +208,7 @@ int
 _PyPegen_fill_token(Parser *p)
 {
     struct token new_token;
-    new_token.metadata = NULL;
+    _PyToken_Init(&new_token);
     int type = _PyTokenizer_Get(p->tok, &new_token);
 
     // Record and skip '# type: ignore' comments
@@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p)
     Token *t = p->tokens[p->fill];
     return initialize_token(p, t, &new_token, type);
 error:
-    Py_XDECREF(new_token.metadata);
+    _PyToken_Free(&new_token);
     return -1;
 }
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 1f227da..af52905 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
     int ret = 0;
     struct token new_token;
-    new_token.metadata = NULL;
+    _PyToken_Init(&new_token);
 
     for (;;) {
         switch (_PyTokenizer_Get(p->tok, &new_token)) {
@@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
 
 exit:
-    Py_XDECREF(new_token.metadata);
+    _PyToken_Free(&new_token);
     // If we're in an f-string, we want the syntax error in the expression part
     // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
     // do not swallow it.
diff --git a/Parser/token.c b/Parser/token.c
index 82267fb..2bc963a 100644
--- a/Parser/token.c
+++ b/Parser/token.c
@@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = {
     "FSTRING_START",
     "FSTRING_MIDDLE",
     "FSTRING_END",
+    "COMMENT",
+    "NL",
     "<ERRORTOKEN>",
-    "<COMMENT>",
-    "<NL>",
     "<ENCODING>",
     "<N_TOKENS>",
 };
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index c5dc9e7..fb94fbe 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -111,6 +111,8 @@ tok_new(void)
     tok->interactive_underflow = IUNDERFLOW_NORMAL;
     tok->str = NULL;
     tok->report_warnings = 1;
+    tok->tok_extra_tokens = 0;
+    tok->comment_newline = 0;
     tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
     tok->tok_mode_stack_index = 0;
     tok->tok_report_warnings = 1;
@@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
     PyMem_Free(tok);
 }
 
+void
+_PyToken_Free(struct token *token) {
+    Py_XDECREF(token->metadata);
+}
+
+void
+_PyToken_Init(struct token *token) {
+    token->metadata = NULL;
+}
+
 static int
 tok_readline_raw(struct tok_state *tok)
 {
@@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
     return type;
 }
 
+
 static int
 tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
 {
@@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     tok->starting_col_offset = -1;
     blankline = 0;
 
+
     /* Get indentation level */
     if (tok->atbol) {
         int col = 0;
@@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     tok->starting_col_offset = tok->col_offset;
 
     /* Return pending indents/dedents */
-    if (tok->pendin != 0) { 
+    if (tok->pendin != 0) {
         if (tok->pendin < 0) {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->cur;
+                p_end = tok->cur;
+            }
             tok->pendin++;
             return MAKE_TOKEN(DEDENT);
         }
         else {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->buf;
+                p_end = tok->cur;
+            }
             tok->pendin--;
             return MAKE_TOKEN(INDENT);
         }
@@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
         }
 
-        const char *prefix, *p, *type_start;
+        const char* p = NULL;
+        const char *prefix, *type_start;
         int current_starting_col_offset;
 
         while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }
 
+        if (tok->tok_extra_tokens) {
+            p = tok->start;
+        }
+
        if (tok->type_comments) {
            p = tok->start;
            current_starting_col_offset = tok->starting_col_offset;
@@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                 }
             }
         }
+        if (tok->tok_extra_tokens) {
+            tok_backup(tok, c); /* don't eat the newline or EOF */
+            p_start = p;
+            p_end = tok->cur;
+            tok->comment_newline = blankline;
+            return MAKE_TOKEN(COMMENT);
+        }
     }
 
     if (tok->done == E_INTERACT_STOP) {
@@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             struct tok_state ahead_tok;
             struct token ahead_token;
+            _PyToken_Init(&ahead_token);
             int ahead_tok_kind;
 
             memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                    returning a plain NAME token, return ASYNC. */
                 tok->async_def_indent = tok->indent;
                 tok->async_def = 1;
+                _PyToken_Free(&ahead_token);
                 return MAKE_TOKEN(ASYNC);
             }
+            _PyToken_Free(&ahead_token);
         }
     }
 
@@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     if (c == '\n') {
         tok->atbol = 1;
         if (blankline || tok->level > 0) {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->start;
+                p_end = tok->cur;
+                return MAKE_TOKEN(NL);
+            }
             goto nextline;
         }
+        if (tok->comment_newline && tok->tok_extra_tokens) {
+            tok->comment_newline = 0;
+            p_start = tok->start;
+            p_end = tok->cur;
+            return MAKE_TOKEN(NL);
+        }
         p_start = tok->start;
         p_end = tok->cur - 1; /* Leave '\n' out of the string */
         tok->cont_line = 0;
@@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
 
 f_string_middle:
 
+    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
+    // this.
+    tok->multi_line_start = tok->line_start;
     while (end_quote_size != current_tok->f_string_quote_size) {
         int c = tok_nextc(tok);
         if (tok->done == E_ERROR) {
@@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
     // if fetching the encoding shows a warning.
     tok->report_warnings = 0;
     while (tok->lineno < 2 && tok->done == E_OK) {
+        _PyToken_Init(&token);
         _PyTokenizer_Get(tok, &token);
+        _PyToken_Free(&token);
     }
     fclose(fp);
     if (tok->encoding) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index fd169cf..3f34763 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -128,6 +128,8 @@ struct tok_state {
     tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
     int tok_mode_stack_index;
     int tok_report_warnings;
+    int tok_extra_tokens;
+    int comment_newline;
 #ifdef Py_DEBUG
     int debug;
 #endif
@@ -138,6 +140,8 @@
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
+extern void _PyToken_Free(struct token *);
+extern void _PyToken_Init(struct token *);
 extern int _PyTokenizer_Get(struct tok_state *, struct token *);
 
 #define tok_dump _Py_tok_dump