author    Pablo Galindo Salgado <Pablogsal@gmail.com>    2023-04-19 16:18:16 (GMT)
committer GitHub <noreply@github.com>    2023-04-19 16:18:16 (GMT)
commit    1ef61cf71a218c71860ff6aecf0fd51edb8b65dc (patch)
tree      d0c4995cac9cb660b66498419d528254f26baf54 /Parser/tokenizer.c
parent    a6b07b5a345f7f54ee9f6d75e81d2fb55971b35c (diff)
gh-102856: Initial implementation of PEP 701 (#102855)
Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Batuhan Taskaya <isidentical@gmail.com>
Co-authored-by: Marta Gómez Macías <mgmacias@google.com>
Co-authored-by: sunmy2019 <59365878+sunmy2019@users.noreply.github.com>
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r--   Parser/tokenizer.c   489
1 file changed, 483 insertions(+), 6 deletions(-)
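
For context, the sketch below shows the user-visible effect of this patch on an interpreter that includes it (CPython 3.12 or later is assumed): instead of a single STRING token, an f-string is emitted as FSTRING_START, FSTRING_MIDDLE and FSTRING_END tokens, with the replacement fields tokenized as ordinary tokens in between. The snippet is illustrative only and not part of the patch.

    import io
    import tokenize

    # Illustrative sketch (assumes CPython 3.12+, where the tokenize module is
    # backed by this C tokenizer): print the token stream of an f-string.
    src = 'f"hello {name!r:>10} world"\n'
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
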
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 463c0e0..1dfd2d6 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -43,6 +43,28 @@
tok->lineno++; \
tok->col_offset = 0;
+#ifdef Py_DEBUG
+static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
+ assert(tok->tok_mode_stack_index >= 0);
+ assert(tok->tok_mode_stack_index < MAXLEVEL);
+ return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
+}
+static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
+ assert(tok->tok_mode_stack_index >= 0);
+ assert(tok->tok_mode_stack_index < MAXLEVEL);
+ return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
+}
+static inline int *TOK_GET_BRACKET_MARK(tokenizer_mode* mode) {
+ assert(mode->bracket_mark_index >= 0);
+ assert(mode->bracket_mark_index < MAX_EXPR_NESTING);
+ return &(mode->bracket_mark[mode->bracket_mark_index]);
+}
+#else
+#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
+#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
+#define TOK_GET_BRACKET_MARK(mode) (&(mode->bracket_mark[mode->bracket_mark_index]))
+#endif
+
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
@@ -98,6 +120,9 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
+ tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0};
+ tok->tok_mode_stack_index = 0;
+ tok->tok_report_warnings = 1;
#ifdef Py_DEBUG
tok->debug = _Py_GetConfig()->parser_debug;
#endif
@@ -346,6 +371,92 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
}
+/* Traverse and update all f-string buffers with the value */
+static void
+update_fstring_buffers(struct tok_state *tok, char value, int regular, int multiline)
+{
+ int index;
+ tokenizer_mode *mode;
+
+ for (index = tok->tok_mode_stack_index; index >= 0; --index) {
+ mode = &(tok->tok_mode_stack[index]);
+ if (regular && mode->f_string_start != NULL) {
+ mode->f_string_start += value;
+ }
+ if (multiline && mode->f_string_multi_line_start != NULL) {
+ mode->f_string_multi_line_start += value;
+ }
+ }
+}
+
+static int
+update_fstring_expr(struct tok_state *tok, char cur)
+{
+ assert(tok->cur != NULL);
+
+ Py_ssize_t size = strlen(tok->cur);
+ tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
+
+ switch (cur) {
+ case '{':
+ if (tok_mode->last_expr_buffer != NULL) {
+ PyMem_Free(tok_mode->last_expr_buffer);
+ }
+ tok_mode->last_expr_buffer = PyMem_Malloc(size);
+ if (tok_mode->last_expr_buffer == NULL) {
+ tok->done = E_NOMEM;
+ return 0;
+ }
+ tok_mode->last_expr_size = size;
+ tok_mode->last_expr_end = -1;
+ strncpy(tok_mode->last_expr_buffer, tok->cur, size);
+ break;
+ case 0:
+ if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
+ return 1;
+ }
+ char *new_buffer = PyMem_Realloc(
+ tok_mode->last_expr_buffer,
+ tok_mode->last_expr_size + size
+ );
+ if (new_buffer == NULL) {
+ PyMem_Free(tok_mode->last_expr_buffer);
+ tok->done = E_NOMEM;
+ return 0;
+ }
+ tok_mode->last_expr_buffer = new_buffer;
+ strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
+ tok_mode->last_expr_size += size;
+ break;
+ case '}':
+ case '!':
+ case ':':
+ if (tok_mode->last_expr_end == -1) {
+ tok_mode->last_expr_end = strlen(tok->start);
+ }
+ break;
+ }
+
+ return 1;
+}
+
+static void
+free_fstring_expressions(struct tok_state *tok)
+{
+ int index;
+ tokenizer_mode *mode;
+
+ for (index = tok->tok_mode_stack_index; index >= 0; --index) {
+ mode = &(tok->tok_mode_stack[index]);
+ if (mode->last_expr_buffer != NULL) {
+ PyMem_Free(mode->last_expr_buffer);
+ mode->last_expr_buffer = NULL;
+ mode->last_expr_size = 0;
+ mode->last_expr_end = -1;
+ }
+ }
+}
+
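
The helpers above keep a copy of the raw text of the replacement field currently being tokenized and release it when the tokenizer state is freed. PEP 701 lists precise error reporting inside f-strings among its goals; as a rough, hedged point of reference, the Python snippet below shows the kind of error this machinery is concerned with (exact message text and offsets vary between versions):

    # Rough illustration only: a broken expression inside a replacement field
    # is reported as a SyntaxError that points into the f-string itself.
    try:
        compile('f"{1 + }"', "<example>", "eval")
    except SyntaxError as exc:
        print(exc.msg, exc.lineno, exc.offset)
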
/* Read a line of text from TOK into S, using the stream in TOK.
Return NULL on failure, else S.
@@ -372,6 +483,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
+ update_fstring_buffers(tok, -*tok->buf, /*regular=*/1, /*multiline=*/1);
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
if (newbuf == NULL) {
tok->done = E_NOMEM;
@@ -384,6 +496,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
tok->start = start < 0 ? NULL : tok->buf + start;
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
+ update_fstring_buffers(tok, *tok->buf, /*regular=*/1, /*multiline=*/1);
}
return 1;
}
@@ -838,6 +951,7 @@ _PyTokenizer_Free(struct tok_state *tok)
if (tok->interactive_src_start != NULL) {
PyMem_Free(tok->interactive_src_start);
}
+ free_fstring_expressions(tok);
PyMem_Free(tok);
}
@@ -854,6 +968,9 @@ tok_readline_raw(struct tok_state *tok)
if (line == NULL) {
return 1;
}
+ if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
+ return 0;
+ }
if (tok->fp_interactive &&
tok_concatenate_interactive_new_line(tok, line) == -1) {
return 0;
@@ -941,6 +1058,7 @@ tok_underflow_interactive(struct tok_state *tok) {
}
else if (tok->start != NULL) {
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
+ update_fstring_buffers(tok, -*tok->buf, /*regular=*/0, /*multiline=*/1);
size_t size = strlen(newtok);
ADVANCE_LINENO();
if (!tok_reserve_buf(tok, size + 1)) {
@@ -953,6 +1071,7 @@ tok_underflow_interactive(struct tok_state *tok) {
PyMem_Free(newtok);
tok->inp += size;
tok->multi_line_start = tok->buf + cur_multi_line_start;
+ update_fstring_buffers(tok, *tok->buf, /*regular=*/0, /*multiline=*/1);
}
else {
ADVANCE_LINENO();
@@ -969,6 +1088,10 @@ tok_underflow_interactive(struct tok_state *tok) {
}
return 0;
}
+
+ if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
+ return 0;
+ }
return 1;
}
@@ -1073,7 +1196,7 @@ tok_nextc(struct tok_state *tok)
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK) {
- return EOF;
+ return EOF;
}
if (tok->fp == NULL) {
rc = tok_underflow_string(tok);
@@ -1115,7 +1238,7 @@ tok_backup(struct tok_state *tok, int c)
if (--tok->cur < tok->buf) {
Py_FatalError("tokenizer beginning of buffer");
}
- if ((int)(unsigned char)*tok->cur != c) {
+ if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
Py_FatalError("tok_backup: wrong character");
}
tok->col_offset--;
@@ -1172,6 +1295,7 @@ error:
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
+ // These errors are cleaned up on startup. TODO: Fix it.
va_list vargs;
va_start(vargs, format);
int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
@@ -1235,6 +1359,41 @@ error:
}
static int
+warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
+{
+
+ if (!tok->tok_report_warnings) {
+ return 0;
+ }
+
+ PyObject *msg = PyUnicode_FromFormat(
+ "invalid escape sequence '\\%c'",
+ (char) first_invalid_escape_char
+ );
+
+ if (msg == NULL) {
+ return -1;
+ }
+
+ if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, tok->filename,
+ tok->lineno, NULL, NULL) < 0) {
+ Py_DECREF(msg);
+
+ if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
+ /* Replace the DeprecationWarning exception with a SyntaxError
+ to get a more accurate error report */
+ PyErr_Clear();
+ return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char);
+ }
+
+ return -1;
+ }
+
+ Py_DECREF(msg);
+ return 0;
+}
+
+static int
lookahead(struct tok_state *tok, const char *test)
{
const char *s = test;
@@ -1389,7 +1548,6 @@ tok_decimal_tail(struct tok_state *tok)
return c;
}
-/* Get next token, after space stripping etc. */
static inline int
tok_continuation_line(struct tok_state *tok) {
@@ -1427,7 +1585,12 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
{
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
token->level = tok->level;
- token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
+ if (ISSTRINGLIT(type)) {
+ token->lineno = tok->first_lineno;
+ }
+ else {
+ token->lineno = tok->lineno;
+ }
token->end_lineno = tok->lineno;
token->col_offset = token->end_col_offset = -1;
token->start = start;
@@ -1441,7 +1604,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
}
static int
-tok_get(struct tok_state *tok, struct token *token)
+tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
int c;
int blankline, nonascii;
@@ -1602,6 +1765,11 @@ tok_get(struct tok_state *tok, struct token *token)
/* Skip comment, unless it's a type comment */
if (c == '#') {
+
+ if (tok->tok_mode_stack_index > 0) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
+ }
+
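
The check above makes the tokenizer itself reject '#' inside the expression part of an f-string (previously this restriction was enforced by the separate f-string parser). A small illustrative Python snippet of the resulting error; the exact wording may differ between versions:

    # Illustration only: '#' cannot appear in the expression part of an f-string.
    try:
        compile('f"{1 + 1  # not a comment}"', "<example>", "eval")
    except SyntaxError as exc:
        print(exc.msg)  # e.g. "f-string expression part cannot include '#'"
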
const char *prefix, *p, *type_start;
int current_starting_col_offset;
@@ -1703,6 +1871,9 @@ tok_get(struct tok_state *tok, struct token *token)
}
c = tok_nextc(tok);
if (c == '"' || c == '\'') {
+ if (saw_f) {
+ goto f_string_quote;
+ }
goto letter_quote;
}
}
@@ -1748,7 +1919,9 @@ tok_get(struct tok_state *tok, struct token *token)
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_token);
+ ahead_tok_kind = tok_get_normal_mode(&ahead_tok,
+ current_tok,
+ &ahead_token);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@@ -2003,6 +2176,67 @@ tok_get(struct tok_state *tok, struct token *token)
return MAKE_TOKEN(NUMBER);
}
+ f_string_quote:
+ if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
+ int quote = c;
+ int quote_size = 1; /* 1 or 3 */
+
+ /* Nodes of type STRING, especially multi line strings
+ must be handled differently in order to get both
+ the starting line number and the column offset right.
+ (cf. issue 16806) */
+ tok->first_lineno = tok->lineno;
+ tok->multi_line_start = tok->line_start;
+
+ /* Find the quote size and start of string */
+ int after_quote = tok_nextc(tok);
+ if (after_quote == quote) {
+ int after_after_quote = tok_nextc(tok);
+ if (after_after_quote == quote) {
+ quote_size = 3;
+ }
+ else {
+ // TODO: Check this
+ tok_backup(tok, after_after_quote);
+ tok_backup(tok, after_quote);
+ }
+ }
+ if (after_quote != quote) {
+ tok_backup(tok, after_quote);
+ }
+
+
+ p_start = tok->start;
+ p_end = tok->cur;
+ tokenizer_mode *current_tok = TOK_NEXT_MODE(tok);
+ current_tok->kind = TOK_FSTRING_MODE;
+ current_tok->f_string_quote = quote;
+ current_tok->f_string_quote_size = quote_size;
+ current_tok->f_string_start = tok->start;
+ current_tok->f_string_multi_line_start = tok->line_start;
+ current_tok->last_expr_buffer = NULL;
+ current_tok->last_expr_size = 0;
+ current_tok->last_expr_end = -1;
+
+ switch (*tok->start) {
+ case 'F':
+ case 'f':
+ current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
+ break;
+ case 'R':
+ case 'r':
+ current_tok->f_string_raw = 1;
+ break;
+ default:
+ Py_UNREACHABLE();
+ }
+
+ current_tok->bracket_stack = 0;
+ current_tok->bracket_mark[0] = 0;
+ current_tok->bracket_mark_index = -1;
+ return MAKE_TOKEN(FSTRING_START);
+ }
+
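
Because each f-string start pushes a fresh tokenizer mode here (TOK_NEXT_MODE), quote characters can be reused inside the replacement field and f-strings can be nested using the same quotes. The Python sketch below only parses on CPython 3.12+ with this change; the variable names are purely illustrative:

    # Sketch: both lines are rejected by earlier Python versions.
    names = ["PEP", "701"]
    print(f"{", ".join(names)}")              # quote reuse inside the field
    print(f"outer {f"inner {1 + 1}"}")        # nested f-string, same quotes
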
letter_quote:
/* String */
if (c == '\'' || c == '"') {
@@ -2047,6 +2281,20 @@ tok_get(struct tok_state *tok, struct token *token)
tok->line_start = tok->multi_line_start;
int start = tok->lineno;
tok->lineno = tok->first_lineno;
+
+ if (tok->tok_mode_stack_index > 0) {
+ /* When we are in an f-string, before raising the
+ * unterminated string literal error, check whether
+ * the initial quote matches the f-string's quote;
+ * if it does, this must be a missing '}' token,
+ * so raise the proper error. */
+ tokenizer_mode *current_tok = TOK_GET_MODE(tok);
+ if (current_tok->f_string_quote == quote &&
+ current_tok->f_string_quote_size == quote_size) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start));
+ }
+ }
+
if (quote_size == 3) {
syntaxerror(tok, "unterminated triple-quoted string literal"
" (detected at line %d)", start);
@@ -2089,6 +2337,27 @@ tok_get(struct tok_state *tok, struct token *token)
goto again; /* Read next line */
}
+ /* Punctuation character */
+ int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
+ if (is_punctuation && tok->tok_mode_stack_index > 0 && current_tok->bracket_mark_index >= 0) {
+ int mark = *TOK_GET_BRACKET_MARK(current_tok);
+ /* This code block gets executed before the bracket_stack is incremented
+ * by the `{` case, so for ensuring that we are on the 0th level, we need
+ * to adjust it manually */
+ int cursor = current_tok->bracket_stack - (c != '{');
+
+ if (cursor == 0 && !update_fstring_expr(tok, c)) {
+ return MAKE_TOKEN(ENDMARKER);
+ }
+
+ if (c == ':' && cursor == mark) {
+ current_tok->kind = TOK_FSTRING_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(_PyToken_OneChar(c));
+ }
+ }
+
/* Check for two-character token */
{
int c2 = tok_nextc(tok);
@@ -2121,11 +2390,18 @@ tok_get(struct tok_state *tok, struct token *token)
tok->parenlinenostack[tok->level] = tok->lineno;
tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
tok->level++;
+
+ if (tok->tok_mode_stack_index > 0) {
+ current_tok->bracket_stack++;
+ }
break;
case ')':
case ']':
case '}':
if (!tok->level) {
+ if (tok->tok_mode_stack_index > 0 && !current_tok->bracket_stack && c == '}') {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
+ }
return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
}
tok->level--;
@@ -2134,6 +2410,18 @@ tok_get(struct tok_state *tok, struct token *token)
(opening == '[' && c == ']') ||
(opening == '{' && c == '}')))
{
+ /* If the opening bracket belongs to an f-string's expression
+ part (e.g. f"{)}") and the closing bracket comes from an
+ arbitrarily nested expression, then instead of matching it
+ against a different syntactical construct, we throw an
+ unmatched-parentheses error. */
+ if (tok->tok_mode_stack_index > 0 && opening == '{') {
+ assert(current_tok->bracket_stack >= 0);
+ int previous_bracket = current_tok->bracket_stack - 1;
+ if (previous_bracket == *TOK_GET_BRACKET_MARK(current_tok)) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
+ }
+ }
if (tok->parenlinenostack[tok->level] != tok->lineno) {
return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
@@ -2147,6 +2435,14 @@ tok_get(struct tok_state *tok, struct token *token)
c, opening));
}
}
+
+ if (tok->tok_mode_stack_index > 0) {
+ current_tok->bracket_stack--;
+ if (c == '}' && current_tok->bracket_stack == *TOK_GET_BRACKET_MARK(current_tok)) {
+ current_tok->bracket_mark_index--;
+ current_tok->kind = TOK_FSTRING_MODE;
+ }
+ }
break;
}
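
The bracket bookkeeping above lets the tokenizer itself report a lone '}' or a bracket that does not match anything within the replacement field. A rough Python illustration of the errors involved (message wording is version-dependent):

    # Rough illustration only: both inputs raise SyntaxError.
    for src in ('f"hi }"', 'f"{)}"'):
        try:
            compile(src, "<example>", "eval")
        except SyntaxError as exc:
            print(src, "->", exc.msg)
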
@@ -2162,6 +2458,187 @@ tok_get(struct tok_state *tok, struct token *token)
return MAKE_TOKEN(_PyToken_OneChar(c));
}
+static int
+tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
+{
+ const char *p_start = NULL;
+ const char *p_end = NULL;
+ int end_quote_size = 0;
+ int unicode_escape = 0;
+
+ tok->start = tok->cur;
+ tok->first_lineno = tok->lineno;
+ tok->starting_col_offset = tok->col_offset;
+
+ // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
+ // before it.
+ int start_char = tok_nextc(tok);
+ int peek1 = tok_nextc(tok);
+ tok_backup(tok, peek1);
+ tok_backup(tok, start_char);
+
+ if ((start_char == '{' && peek1 != '{') || (start_char == '}' && peek1 != '}')) {
+ if (start_char == '{') {
+ current_tok->bracket_mark_index++;
+ if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
+ }
+ *TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack;
+ }
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ return tok_get_normal_mode(tok, current_tok, token);
+ }
+
+ // Check if we are at the end of the string
+ for (int i = 0; i < current_tok->f_string_quote_size; i++) {
+ int quote = tok_nextc(tok);
+ if (quote != current_tok->f_string_quote) {
+ tok_backup(tok, quote);
+ goto f_string_middle;
+ }
+ }
+
+ if (current_tok->last_expr_buffer != NULL) {
+ PyMem_Free(current_tok->last_expr_buffer);
+ current_tok->last_expr_buffer = NULL;
+ current_tok->last_expr_size = 0;
+ current_tok->last_expr_end = -1;
+ }
+
+ p_start = tok->start;
+ p_end = tok->cur;
+ tok->tok_mode_stack_index--;
+ return MAKE_TOKEN(FSTRING_END);
+
+f_string_middle:
+
+ while (end_quote_size != current_tok->f_string_quote_size) {
+ int c = tok_nextc(tok);
+ if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) {
+ assert(tok->multi_line_start != NULL);
+ // shift the tok_state's location to
+ // the start of the string and report the error
+ // from the initial quote character
+ tok->cur = (char *)current_tok->f_string_start;
+ tok->cur++;
+ tok->line_start = current_tok->f_string_multi_line_start;
+ int start = tok->lineno;
+ tok->lineno = tok->first_lineno;
+
+ if (current_tok->f_string_quote_size == 3) {
+ return MAKE_TOKEN(syntaxerror(tok,
+ "unterminated triple-quoted f-string literal"
+ " (detected at line %d)", start));
+ }
+ else {
+ return MAKE_TOKEN(syntaxerror(tok,
+ "unterminated f-string literal (detected at"
+ " line %d)", start));
+ }
+ }
+
+ if (c == current_tok->f_string_quote) {
+ end_quote_size += 1;
+ continue;
+ } else {
+ end_quote_size = 0;
+ }
+
+ int in_format_spec = current_tok->last_expr_end != -1 && current_tok->bracket_mark_index >= 0;
+ if (c == '{') {
+ int peek = tok_nextc(tok);
+ if (peek != '{' || in_format_spec) {
+ tok_backup(tok, peek);
+ tok_backup(tok, c);
+ current_tok->bracket_mark_index++;
+ if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) {
+ return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
+ }
+ *TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack;
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ } else {
+ p_start = tok->start;
+ p_end = tok->cur - 1;
+ }
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ } else if (c == '}') {
+ if (unicode_escape) {
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ }
+ int peek = tok_nextc(tok);
+
+ // The tokenizer can only be in the format spec if we have already completed the expression
+ // scanning (indicated by the end of the expression being set) and we are not at the top level
+ // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
+ // brackets, we can bypass it here.
+ if (peek == '}' && !in_format_spec) {
+ p_start = tok->start;
+ p_end = tok->cur - 1;
+ } else {
+ tok_backup(tok, peek);
+ tok_backup(tok, c);
+ TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ p_start = tok->start;
+ p_end = tok->cur;
+ }
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+ } else if (c == '\\') {
+ int peek = tok_nextc(tok);
+ // Special case when the backslash is right before a curly
+ // brace. We have to restore and return control back
+ // to the loop for the next iteration.
+ if (peek == '{' || peek == '}') {
+ if (!current_tok->f_string_raw) {
+ if (warn_invalid_escape_sequence(tok, peek)) {
+ return MAKE_TOKEN(ERRORTOKEN);
+ }
+ }
+ tok_backup(tok, peek);
+ continue;
+ }
+
+ if (!current_tok->f_string_raw) {
+ if (peek == 'N') {
+ /* Handle named unicode escapes (\N{BULLET}) */
+ peek = tok_nextc(tok);
+ if (peek == '{') {
+ unicode_escape = 1;
+ } else {
+ tok_backup(tok, peek);
+ }
+ }
+ } /* else {
+ skip the escaped character
+ }*/
+ }
+ }
+
+ // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
+ // add the quotes to the FSTRING_END in the next tokenizer iteration.
+ for (int i = 0; i < current_tok->f_string_quote_size; i++) {
+ tok_backup(tok, current_tok->f_string_quote);
+ }
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(FSTRING_MIDDLE);
+}
+
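
tok_get_fstring_mode emits the literal pieces of the f-string, including the format spec after a top-level ':', as FSTRING_MIDDLE tokens, and switches back to normal mode whenever a replacement field opens. The Python sketch below (assumes CPython 3.12+; the names value/width/precision are illustrative) prints the resulting token stream for a format spec that contains nested fields:

    import io
    import tokenize

    # Sketch: the format spec is emitted as FSTRING_MIDDLE pieces, while the
    # nested {width} and {precision} fields re-enter normal tokenization.
    src = 'f"{value:{width}.{precision}f}"\n'
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
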
+
+static int
+tok_get(struct tok_state *tok, struct token *token)
+{
+ tokenizer_mode *current_tok = TOK_GET_MODE(tok);
+ if (current_tok->kind == TOK_REGULAR_MODE) {
+ return tok_get_normal_mode(tok, current_tok, token);
+ } else {
+ return tok_get_fstring_mode(tok, current_tok, token);
+ }
+}
+
int
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{