7 files changed, 127 insertions, 60 deletions
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index 55c0f6f..0aaaed6 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -965,17 +965,43 @@ _PyPegen_check_legacy_stmt(Parser *p, expr_ty name) {
     return 0;
 }
 
-expr_ty
-_PyPegen_check_fstring_conversion(Parser *p, Token* symbol, expr_ty conv) {
-    if (symbol->lineno != conv->lineno || symbol->end_col_offset != conv->col_offset) {
+static ResultTokenWithMetadata *
+result_token_with_metadata(Parser *p, void *result, PyObject *metadata)
+{
+    ResultTokenWithMetadata *res = _PyArena_Malloc(p->arena, sizeof(ResultTokenWithMetadata));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->metadata = metadata;
+    res->result = result;
+    return res;
+}
+
+ResultTokenWithMetadata *
+_PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
+{
+    if (conv_token->lineno != conv->lineno || conv_token->end_col_offset != conv->col_offset) {
         return RAISE_SYNTAX_ERROR_KNOWN_RANGE(
-            symbol, conv,
+            conv_token, conv,
             "f-string: conversion type must come right after the exclamanation mark"
         );
     }
-    return conv;
+    return result_token_with_metadata(p, conv, conv_token->metadata);
 }
 
+ResultTokenWithMetadata *
+_PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset,
+                                int end_lineno, int end_col_offset, PyArena *arena)
+{
+    if (!spec) {
+        return NULL;
+    }
+    expr_ty res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno, end_col_offset, p->arena);
+    if (!res) {
+        return NULL;
+    }
+    return result_token_with_metadata(p, res, colon->metadata);
+}
 
 const char *
 _PyPegen_get_expr_name(expr_ty e)
@@ -1198,27 +1224,6 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq
 // Fstring stuff
 
 static expr_ty
-decode_fstring_buffer(Parser *p, int lineno, int col_offset, int end_lineno,
-                      int end_col_offset)
-{
-    tokenizer_mode *tok_mode = &(p->tok->tok_mode_stack[p->tok->tok_mode_stack_index]);
-    assert(tok_mode->last_expr_buffer != NULL);
-    assert(tok_mode->last_expr_size >= 0 && tok_mode->last_expr_end >= 0);
-
-    PyObject *res = PyUnicode_DecodeUTF8(
-        tok_mode->last_expr_buffer,
-        tok_mode->last_expr_size - tok_mode->last_expr_end,
-        NULL
-    );
-    if (!res || _PyArena_AddPyObject(p->arena, res) < 0) {
-        Py_XDECREF(res);
-        return NULL;
-    }
-
-    return _PyAST_Constant(res, NULL, lineno, col_offset, end_lineno, end_col_offset, p->arena);
-}
-
-static expr_ty
 _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant) {
     assert(PyUnicode_CheckExact(constant->v.Constant.value));
 
@@ -1386,19 +1391,20 @@ expr_ty _PyPegen_constant_from_string(Parser* p, Token* tok) {
     return _PyAST_Constant(s, kind, tok->lineno, tok->col_offset, tok->end_lineno, tok->end_col_offset, p->arena);
 }
 
-expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, expr_ty conversion,
-                                 expr_ty format, int lineno, int col_offset, int end_lineno, int end_col_offset,
-                                 PyArena *arena) {
+expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ResultTokenWithMetadata *conversion,
+                                 ResultTokenWithMetadata *format, Token *closing_brace, int lineno, int col_offset,
+                                 int end_lineno, int end_col_offset, PyArena *arena) {
     int conversion_val = -1;
     if (conversion != NULL) {
-        assert(conversion->kind == Name_kind);
-        Py_UCS4 first = PyUnicode_READ_CHAR(conversion->v.Name.id, 0);
+        expr_ty conversion_expr = (expr_ty) conversion->result;
+        assert(conversion_expr->kind == Name_kind);
+        Py_UCS4 first = PyUnicode_READ_CHAR(conversion_expr->v.Name.id, 0);
 
-        if (PyUnicode_GET_LENGTH(conversion->v.Name.id) > 1 ||
+        if (PyUnicode_GET_LENGTH(conversion_expr->v.Name.id) > 1 ||
             !(first == 's' || first == 'r' || first == 'a')) {
-            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conversion,
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conversion_expr,
                                               "f-string: invalid conversion character %R: expected 's', 'r', or 'a'",
-                                              conversion->v.Name.id);
+                                              conversion_expr->v.Name.id);
             return NULL;
         }
 
@@ -1410,7 +1416,7 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ex
     }
 
     expr_ty formatted_value = _PyAST_FormattedValue(
-        expression, conversion_val, format,
+        expression, conversion_val, format ? (expr_ty) format->result : NULL,
         lineno, col_offset, end_lineno,
         end_col_offset, arena
     );
@@ -1418,22 +1424,26 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ex
     if (debug) {
         /* Find the non whitespace token after the "=" */
         int debug_end_line, debug_end_offset;
+        PyObject *debug_metadata;
 
         if (conversion) {
-            debug_end_line = conversion->lineno;
-            debug_end_offset = conversion->col_offset;
+            debug_end_line = ((expr_ty) conversion->result)->lineno;
+            debug_end_offset = ((expr_ty) conversion->result)->col_offset;
+            debug_metadata = conversion->metadata;
         }
         else if (format) {
-            debug_end_line = format->lineno;
-            debug_end_offset = format->col_offset + 1; // HACK: ??
+            debug_end_line = ((expr_ty) format->result)->lineno;
+            debug_end_offset = ((expr_ty) format->result)->col_offset + 1;
+            debug_metadata = format->metadata;
         }
         else {
             debug_end_line = end_lineno;
             debug_end_offset = end_col_offset;
+            debug_metadata = closing_brace->metadata;
         }
 
-        expr_ty debug_text = decode_fstring_buffer(p, lineno, col_offset + 1,
-                                                   debug_end_line, debug_end_offset - 1);
+        expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line,
+                                             debug_end_offset - 1, p->arena);
         if (!debug_text) {
             return NULL;
         }
diff --git a/Parser/parser.c b/Parser/parser.c
index 7713668..6eb985a 100644
--- a/Parser/parser.c
+++ b/Parser/parser.c
@@ -738,8 +738,8 @@ static NameDefaultPair* lambda_param_maybe_default_rule(Parser *p);
 static arg_ty lambda_param_rule(Parser *p);
 static expr_ty fstring_middle_rule(Parser *p);
 static expr_ty fstring_replacement_field_rule(Parser *p);
-static expr_ty fstring_conversion_rule(Parser *p);
-static expr_ty fstring_full_format_spec_rule(Parser *p);
+static ResultTokenWithMetadata* fstring_conversion_rule(Parser *p);
+static ResultTokenWithMetadata* fstring_full_format_spec_rule(Parser *p);
 static expr_ty fstring_format_spec_rule(Parser *p);
 static expr_ty string_rule(Parser *p);
 static expr_ty strings_rule(Parser *p);
@@ -15639,11 +15639,11 @@ fstring_replacement_field_rule(Parser *p)
         }
         D(fprintf(stderr, "%*c> fstring_replacement_field[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "'{' (yield_expr | star_expressions) \"=\"? fstring_conversion? fstring_full_format_spec? '}'"));
         Token * _literal;
-        Token * _literal_1;
         void *a;
         void *conversion;
         void *debug_expr;
         void *format;
+        Token * rbrace;
         if (
             (_literal = _PyPegen_expect_token(p, 25))  // token='{'
             &&
@@ -15655,7 +15655,7 @@ fstring_replacement_field_rule(Parser *p)
             &&
             (format = fstring_full_format_spec_rule(p), !p->error_indicator)  // fstring_full_format_spec?
             &&
-            (_literal_1 = _PyPegen_expect_token(p, 26))  // token='}'
+            (rbrace = _PyPegen_expect_token(p, 26))  // token='}'
         )
         {
             D(fprintf(stderr, "%*c+ fstring_replacement_field[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "'{' (yield_expr | star_expressions) \"=\"? fstring_conversion? fstring_full_format_spec? '}'"));
@@ -15668,7 +15668,7 @@ fstring_replacement_field_rule(Parser *p)
             UNUSED(_end_lineno); // Only used by EXTRA macro
             int _end_col_offset = _token->end_col_offset;
             UNUSED(_end_col_offset); // Only used by EXTRA macro
-            _res = _PyPegen_formatted_value ( p , a , debug_expr , conversion , format , EXTRA );
+            _res = _PyPegen_formatted_value ( p , a , debug_expr , conversion , format , rbrace , EXTRA );
             if (_res == NULL && PyErr_Occurred()) {
                 p->error_indicator = 1;
                 p->level--;
@@ -15706,7 +15706,7 @@ fstring_replacement_field_rule(Parser *p)
 }
 
 // fstring_conversion: "!" NAME
-static expr_ty
+static ResultTokenWithMetadata*
 fstring_conversion_rule(Parser *p)
 {
     if (p->level++ == MAXSTACK) {
@@ -15717,7 +15717,7 @@ fstring_conversion_rule(Parser *p)
         p->level--;
         return NULL;
     }
-    expr_ty _res = NULL;
+    ResultTokenWithMetadata* _res = NULL;
     int _mark = p->mark;
     { // "!" NAME
         if (p->error_indicator) {
@@ -15753,7 +15753,7 @@ fstring_conversion_rule(Parser *p)
 }
 
 // fstring_full_format_spec: ':' fstring_format_spec*
-static expr_ty
+static ResultTokenWithMetadata*
 fstring_full_format_spec_rule(Parser *p)
 {
     if (p->level++ == MAXSTACK) {
@@ -15764,7 +15764,7 @@ fstring_full_format_spec_rule(Parser *p)
         p->level--;
         return NULL;
     }
-    expr_ty _res = NULL;
+    ResultTokenWithMetadata* _res = NULL;
     int _mark = p->mark;
     if (p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
         p->error_indicator = 1;
@@ -15781,10 +15781,10 @@ fstring_full_format_spec_rule(Parser *p)
             return NULL;
         }
         D(fprintf(stderr, "%*c> fstring_full_format_spec[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "':' fstring_format_spec*"));
-        Token * _literal;
+        Token * colon;
         asdl_seq * spec;
         if (
-            (_literal = _PyPegen_expect_token(p, 11))  // token=':'
+            (colon = _PyPegen_expect_token(p, 11))  // token=':'
             &&
             (spec = _loop0_112_rule(p))  // fstring_format_spec*
         )
@@ -15799,7 +15799,7 @@ fstring_full_format_spec_rule(Parser *p)
             UNUSED(_end_lineno); // Only used by EXTRA macro
             int _end_col_offset = _token->end_col_offset;
             UNUSED(_end_col_offset); // Only used by EXTRA macro
-            _res = spec ? _PyAST_JoinedStr ( ( asdl_expr_seq* ) spec , EXTRA ) : NULL;
+            _res = _PyPegen_setup_full_format_spec ( p , colon , ( asdl_expr_seq* ) spec , EXTRA );
             if (_res == NULL && PyErr_Occurred()) {
                 p->error_indicator = 1;
                 p->level--;
diff --git a/Parser/pegen.c b/Parser/pegen.c
index 262bfabf..da410ea 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -155,6 +155,16 @@ initialize_token(Parser *p, Token *parser_token, struct token *new_token, int to
         return -1;
     }
 
+    parser_token->metadata = NULL;
+    if (new_token->metadata != NULL) {
+        if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
+            Py_DECREF(parser_token->metadata);
+            return -1;
+        }
+        parser_token->metadata = new_token->metadata;
+        new_token->metadata = NULL;
+    }
+
     parser_token->level = new_token->level;
     parser_token->lineno = new_token->lineno;
     parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
@@ -198,6 +208,7 @@ int
 _PyPegen_fill_token(Parser *p)
 {
     struct token new_token;
+    new_token.metadata = NULL;
     int type = _PyTokenizer_Get(p->tok, &new_token);
 
     // Record and skip '# type: ignore' comments
@@ -206,14 +217,14 @@ _PyPegen_fill_token(Parser *p)
         char *tag = PyMem_Malloc(len + 1);
         if (tag == NULL) {
             PyErr_NoMemory();
-            return -1;
+            goto error;
         }
         strncpy(tag, new_token.start, len);
         tag[len] = '\0';
         // Ownership of tag passes to the growable array
         if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
             PyErr_NoMemory();
-            return -1;
+            goto error;
         }
         type = _PyTokenizer_Get(p->tok, &new_token);
     }
@@ -234,11 +245,14 @@ _PyPegen_fill_token(Parser *p)
 
     // Check if we are at the limit of the token array capacity and resize if needed
     if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
-        return -1;
+        goto error;
     }
 
     Token *t = p->tokens[p->fill];
     return initialize_token(p, t, &new_token, type);
+error:
+    Py_XDECREF(new_token.metadata);
+    return -1;
 }
 
 #if defined(Py_DEBUG)
diff --git a/Parser/pegen.h b/Parser/pegen.h
index 6962013..8800e9f 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -39,6 +39,7 @@ typedef struct {
     int level;
     int lineno, col_offset, end_lineno, end_col_offset;
     Memo *memo;
+    PyObject *metadata;
 } Token;
 
 typedef struct {
@@ -118,6 +119,11 @@ typedef struct {
     int is_keyword;
 } KeywordOrStarred;
 
+typedef struct {
+    void *result;
+    PyObject *metadata;
+} ResultTokenWithMetadata;
+
 // Internal parser functions
 #if defined(Py_DEBUG)
 void _PyPegen_clear_memo_statistics(void);
@@ -310,7 +316,8 @@ StarEtc *_PyPegen_star_etc(Parser *, arg_ty, asdl_seq *, arg_ty);
 arguments_ty _PyPegen_make_arguments(Parser *, asdl_arg_seq *, SlashWithDefault *,
                                      asdl_arg_seq *, asdl_seq *, StarEtc *);
 arguments_ty _PyPegen_empty_arguments(Parser *);
-expr_ty _PyPegen_formatted_value(Parser *, expr_ty, Token *, expr_ty, expr_ty, int, int, int, int, PyArena *);
+expr_ty _PyPegen_formatted_value(Parser *, expr_ty, Token *, ResultTokenWithMetadata *, ResultTokenWithMetadata *, Token *,
+                                 int, int, int, int, PyArena *);
 AugOperator *_PyPegen_augoperator(Parser*, operator_ty type);
 stmt_ty _PyPegen_function_def_decorators(Parser *, asdl_expr_seq *, stmt_ty);
 stmt_ty _PyPegen_class_def_decorators(Parser *, asdl_expr_seq *, stmt_ty);
@@ -329,7 +336,9 @@ expr_ty _PyPegen_ensure_real(Parser *p, expr_ty);
 asdl_seq *_PyPegen_join_sequences(Parser *, asdl_seq *, asdl_seq *);
 int _PyPegen_check_barry_as_flufl(Parser *, Token *);
 int _PyPegen_check_legacy_stmt(Parser *p, expr_ty t);
-expr_ty _PyPegen_check_fstring_conversion(Parser *p, Token *, expr_ty t);
+ResultTokenWithMetadata *_PyPegen_check_fstring_conversion(Parser *p, Token *, expr_ty t);
+ResultTokenWithMetadata *_PyPegen_setup_full_format_spec(Parser *, Token *, asdl_expr_seq *, int, int,
+                                                         int, int, PyArena *);
 mod_ty _PyPegen_make_module(Parser *, asdl_stmt_seq *);
 void *_PyPegen_arguments_parsing_error(Parser *, expr_ty);
 expr_ty _PyPegen_get_last_comprehension_item(comprehension_ty comprehension);
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index e26bad2..1f227da 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -165,6 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
 
     int ret = 0;
     struct token new_token;
+    new_token.metadata = NULL;
 
     for (;;) {
         switch (_PyTokenizer_Get(p->tok, &new_token)) {
@@ -192,6 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
 
 
 exit:
+    Py_XDECREF(new_token.metadata);
     // If we're in an f-string, we want the syntax error in the expression part
     // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
     // do not swallow it.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index a8649b8..8de0572 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -111,7 +111,7 @@ tok_new(void)
     tok->interactive_underflow = IUNDERFLOW_NORMAL;
     tok->str = NULL;
     tok->report_warnings = 1;
-    tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0};
+    tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
     tok->tok_mode_stack_index = 0;
     tok->tok_report_warnings = 1;
 #ifdef Py_DEBUG
@@ -391,6 +391,28 @@ restore_fstring_buffers(struct tok_state *tok)
 }
 
 static int
+set_fstring_expr(struct tok_state* tok, struct token *token, char c) {
+    assert(token != NULL);
+    assert(c == '}' || c == ':' || c == '!');
+    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
+
+    if (!tok_mode->f_string_debug || token->metadata) {
+        return 0;
+    }
+
+    PyObject *res = PyUnicode_DecodeUTF8(
+        tok_mode->last_expr_buffer,
+        tok_mode->last_expr_size - tok_mode->last_expr_end,
+        NULL
+    );
+    if (!res) {
+        return -1;
+    }
+    token->metadata = res;
+    return 0;
+}
+
+static int
 update_fstring_expr(struct tok_state *tok, char cur)
 {
     assert(tok->cur != NULL);
@@ -2224,6 +2246,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         the_current_tok->last_expr_buffer = NULL;
         the_current_tok->last_expr_size = 0;
         the_current_tok->last_expr_end = -1;
+        the_current_tok->f_string_debug = 0;
 
         switch (*tok->start) {
             case 'F':
@@ -2350,10 +2373,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
          * by the `{` case, so for ensuring that we are on the 0th level, we need
          * to adjust it manually */
         int cursor = current_tok->curly_bracket_depth - (c != '{');
-
         if (cursor == 0 && !update_fstring_expr(tok, c)) {
             return MAKE_TOKEN(ENDMARKER);
         }
+        if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
+            return MAKE_TOKEN(ERRORTOKEN);
+        }
 
         if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
             current_tok->kind = TOK_FSTRING_MODE;
@@ -2445,6 +2470,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
                 current_tok->curly_bracket_expr_start_depth--;
                 current_tok->kind = TOK_FSTRING_MODE;
+                current_tok->f_string_debug = 0;
             }
         }
         break;
@@ -2458,6 +2484,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
     }
 
+    if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
+        current_tok->f_string_debug = 1;
+    }
+
     /* Punctuation character */
     p_start = tok->start;
     p_end = tok->cur;
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 2b94aec..8b4213c 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -31,6 +31,7 @@ struct token {
     int level;
     int lineno, col_offset, end_lineno, end_col_offset;
     const char *start, *end;
+    PyObject *metadata;
 };
 
 enum tokenizer_mode_kind_t {
@@ -58,6 +59,7 @@ typedef struct _tokenizer_mode {
     Py_ssize_t last_expr_size;
     Py_ssize_t last_expr_end;
     char* last_expr_buffer;
+    int f_string_debug;
 } tokenizer_mode;
 
 /* Tokenizer state */