author     Marta Gómez Macías <mgmacias@google.com>    2023-05-21 00:03:02 (GMT)
committer  GitHub <noreply@github.com>                 2023-05-21 00:03:02 (GMT)
commit     6715f91edcf6f379f666e18f57b8a0dcb724bf79 (patch)
tree       25724d6eb5b8ff5e713f7bfd8f6c33e5a6d87f62 /Parser
parent     3ed57e4995d9f8583083483f397ddc3131720953 (diff)
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with an implementation that reuses the real C tokenizer via a private extension module. The tokenize module now implements a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward compatibility.

As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer that is currently only used via the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these extra tokens and to add the metadata needed to match the old Python implementation.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
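For reference, below is a minimal sketch (not part of the patch) of the token lifecycle this change standardizes across the parser: callers now initialize and release struct token through the new _PyToken_Init()/_PyToken_Free() helpers instead of touching the metadata field directly. It assumes compilation inside the CPython source tree with Py_BUILD_CORE defined, so the internal Parser headers and the private _PyTokenizer_* API are available; dump_tokens and its loop are purely illustrative.

/* Sketch only: walk a source string with the internal C tokenizer, using the
 * init/free helpers introduced by this commit.  Private API, not stable. */
#include <stdio.h>
#include "Python.h"
#include "pycore_token.h"   /* token type constants and _PyParser_TokenNames */
#include "tokenizer.h"      /* struct tok_state, struct token, _PyTokenizer_* */

static int
dump_tokens(const char *source)
{
    /* Second argument: treat the text as exec input (as if read from a file). */
    struct tok_state *tok = _PyTokenizer_FromUTF8(source, 1);
    if (tok == NULL) {
        return -1;
    }

    struct token t;
    int type;
    do {
        _PyToken_Init(&t);                 /* metadata starts out NULL */
        type = _PyTokenizer_Get(tok, &t);
        printf("%s\n", _PyParser_TokenNames[type]);
        _PyToken_Free(&t);                 /* drops metadata, if any was set */
    } while (type != ENDMARKER && type != ERRORTOKEN);

    _PyTokenizer_Free(tok);
    return 0;
}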
Diffstat (limited to 'Parser')
-rw-r--r--  Parser/pegen.c          4
-rw-r--r--  Parser/pegen_errors.c   4
-rw-r--r--  Parser/token.c          4
-rw-r--r--  Parser/tokenizer.c     57
-rw-r--r--  Parser/tokenizer.h      4
5 files changed, 65 insertions, 8 deletions
diff --git a/Parser/pegen.c b/Parser/pegen.c
index da410ea..b031a6f 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -208,7 +208,7 @@ int
_PyPegen_fill_token(Parser *p)
{
struct token new_token;
- new_token.metadata = NULL;
+ _PyToken_Init(&new_token);
int type = _PyTokenizer_Get(p->tok, &new_token);
// Record and skip '# type: ignore' comments
@@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p)
Token *t = p->tokens[p->fill];
return initialize_token(p, t, &new_token, type);
error:
- Py_XDECREF(new_token.metadata);
+ _PyToken_Free(&new_token);
return -1;
}
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 1f227da..af52905 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
int ret = 0;
struct token new_token;
- new_token.metadata = NULL;
+ _PyToken_Init(&new_token);
for (;;) {
switch (_PyTokenizer_Get(p->tok, &new_token)) {
@@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
exit:
- Py_XDECREF(new_token.metadata);
+ _PyToken_Free(&new_token);
// If we're in an f-string, we want the syntax error in the expression part
// to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
// do not swallow it.
diff --git a/Parser/token.c b/Parser/token.c
index 82267fb..2bc963a 100644
--- a/Parser/token.c
+++ b/Parser/token.c
@@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = {
"FSTRING_START",
"FSTRING_MIDDLE",
"FSTRING_END",
+ "COMMENT",
+ "NL",
"<ERRORTOKEN>",
- "<COMMENT>",
- "<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index c5dc9e7..fb94fbe 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -111,6 +111,8 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
+ tok->tok_extra_tokens = 0;
+ tok->comment_newline = 0;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
tok->tok_mode_stack_index = 0;
tok->tok_report_warnings = 1;
@@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
PyMem_Free(tok);
}
+void
+_PyToken_Free(struct token *token) {
+ Py_XDECREF(token->metadata);
+}
+
+void
+_PyToken_Init(struct token *token) {
+ token->metadata = NULL;
+}
+
static int
tok_readline_raw(struct tok_state *tok)
{
@@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
return type;
}
+
static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
@@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = -1;
blankline = 0;
+
/* Get indentation level */
if (tok->atbol) {
int col = 0;
@@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = tok->col_offset;
/* Return pending indents/dedents */
- if (tok->pendin != 0) {
+ if (tok->pendin != 0) {
if (tok->pendin < 0) {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->cur;
+ p_end = tok->cur;
+ }
tok->pendin++;
return MAKE_TOKEN(DEDENT);
}
else {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->buf;
+ p_end = tok->cur;
+ }
tok->pendin--;
return MAKE_TOKEN(INDENT);
}
@@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
}
- const char *prefix, *p, *type_start;
+ const char* p = NULL;
+ const char *prefix, *type_start;
int current_starting_col_offset;
while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}
+ if (tok->tok_extra_tokens) {
+ p = tok->start;
+ }
+
if (tok->type_comments) {
p = tok->start;
current_starting_col_offset = tok->starting_col_offset;
@@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
}
+ if (tok->tok_extra_tokens) {
+ tok_backup(tok, c); /* don't eat the newline or EOF */
+ p_start = p;
+ p_end = tok->cur;
+ tok->comment_newline = blankline;
+ return MAKE_TOKEN(COMMENT);
+ }
}
if (tok->done == E_INTERACT_STOP) {
@@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
struct tok_state ahead_tok;
struct token ahead_token;
+ _PyToken_Init(&ahead_token);
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
+ _PyToken_Free(&ahead_token);
return MAKE_TOKEN(ASYNC);
}
+ _PyToken_Free(&ahead_token);
}
}
@@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '\n') {
tok->atbol = 1;
if (blankline || tok->level > 0) {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NL);
+ }
goto nextline;
}
+ if (tok->comment_newline && tok->tok_extra_tokens) {
+ tok->comment_newline = 0;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NL);
+ }
p_start = tok->start;
p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
@@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
f_string_middle:
+ // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
+ // this.
+ tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {
@@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
// if fetching the encoding shows a warning.
tok->report_warnings = 0;
while (tok->lineno < 2 && tok->done == E_OK) {
+ _PyToken_Init(&token);
_PyTokenizer_Get(tok, &token);
+ _PyToken_Free(&token);
}
fclose(fp);
if (tok->encoding) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index fd169cf..3f34763 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -128,6 +128,8 @@ struct tok_state {
tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
int tok_mode_stack_index;
int tok_report_warnings;
+ int tok_extra_tokens;
+ int comment_newline;
#ifdef Py_DEBUG
int debug;
#endif
@@ -138,6 +140,8 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
+extern void _PyToken_Free(struct token *);
+extern void _PyToken_Init(struct token *);
extern int _PyTokenizer_Get(struct tok_state *, struct token *);
#define tok_dump _Py_tok_dump
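As a rough, hypothetical illustration of the new mode (in the actual change the flag is only set by the private extension module that exposes the C tokenizer to the Python layer, which lives outside this Parser/ diff), setting the new tok_extra_tokens field makes _PyTokenizer_Get() also emit the COMMENT and NL tokens that the default mode discards. The sketch assumes the same includes and core build setup as the earlier one; demo_extra_tokens is an illustrative name.

/* Hypothetical usage: tok_extra_tokens is normally set only by the private
 * extension module, not by C callers; shown here just to illustrate the flag. */
static void
demo_extra_tokens(void)
{
    struct tok_state *tok = _PyTokenizer_FromUTF8("# just a comment\nx = 1\n", 1);
    if (tok == NULL) {
        return;
    }
    tok->tok_extra_tokens = 1;             /* new field introduced by this commit */

    struct token t;
    int type;
    do {
        _PyToken_Init(&t);
        type = _PyTokenizer_Get(tok, &t);
        /* With the flag set, the comment-only line yields COMMENT and NL
         * tokens that the default mode skips entirely. */
        _PyToken_Free(&t);
    } while (type != ENDMARKER && type != ERRORTOKEN);

    _PyTokenizer_Free(tok);
}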