author    Pablo Galindo Salgado <Pablogsal@gmail.com>  2024-07-16 18:57:22 (GMT)
committer GitHub <noreply@github.com>  2024-07-16 18:57:22 (GMT)
commit    c46d64e0ef8e92a6b4ab4805d813d7e4d6663380 (patch)
tree      69bf2c0e394c896e05576baa94c7d750f0d27501 /Parser
parent    69c68de43aef03dd52fabd21f99cb3b0f9329201 (diff)
gh-121130: Fix f-string format specifiers with debug expressions (#121150)
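For context, the bug concerns f-strings that combine a debug expression (a
trailing '=') with a format specifier. A minimal illustration of the kind of
construct involved (a hypothetical example, not taken from the commit's tests):

    width = 10
    x = 42
    # A debug expression ('x=') whose format spec itself contains a
    # nested replacement field; tokenizing this combination is what
    # this commit fixes.
    print(f"{x=:{width}}")   # prints: x=        42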
Diffstat (limited to 'Parser')
-rw-r--r--  Parser/action_helpers.c | 63
-rw-r--r--  Parser/lexer/lexer.c    | 24
-rw-r--r--  Parser/lexer/state.c    |  1
-rw-r--r--  Parser/lexer/state.h    |  1
4 files changed, 62 insertions(+), 27 deletions(-)
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index 44bf87d..0307a08 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -969,6 +969,8 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
return result_token_with_metadata(p, conv, conv_token->metadata);
}
+static asdl_expr_seq *
+unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions);
ResultTokenWithMetadata *
_PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset,
int end_lineno, int end_col_offset, PyArena *arena)
@@ -1007,8 +1009,15 @@ _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, in
assert(j == non_empty_count);
spec = resized_spec;
}
- expr_ty res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
- end_col_offset, p->arena);
+ expr_ty res;
+ if (asdl_seq_LEN(spec) == 0) {
+ res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
+ end_col_offset, p->arena);
+ } else {
+ res = _PyPegen_concatenate_strings(p, spec,
+ lineno, col_offset, end_lineno,
+ end_col_offset, arena);
+ }
if (!res) {
return NULL;
}
@@ -1308,6 +1317,7 @@ unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions)
expr_ty
_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) {
+
asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions);
Py_ssize_t n_items = asdl_seq_LEN(expr);
@@ -1472,7 +1482,6 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, Re
debug_end_offset = end_col_offset;
debug_metadata = closing_brace->metadata;
}
-
expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line,
debug_end_offset - 1, p->arena);
if (!debug_text) {
@@ -1505,16 +1514,23 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
Py_ssize_t n_flattened_elements = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
- if (elem->kind == Constant_kind) {
- if (PyBytes_CheckExact(elem->v.Constant.value)) {
- bytes_found = 1;
- } else {
- unicode_string_found = 1;
- }
- n_flattened_elements++;
- } else {
- n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
- f_string_found = 1;
+ switch(elem->kind) {
+ case Constant_kind:
+ if (PyBytes_CheckExact(elem->v.Constant.value)) {
+ bytes_found = 1;
+ } else {
+ unicode_string_found = 1;
+ }
+ n_flattened_elements++;
+ break;
+ case JoinedStr_kind:
+ n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
+ f_string_found = 1;
+ break;
+ default:
+ n_flattened_elements++;
+ f_string_found = 1;
+ break;
}
}
@@ -1556,16 +1572,19 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
Py_ssize_t j = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
- if (elem->kind == Constant_kind) {
- asdl_seq_SET(flattened, current_pos++, elem);
- } else {
- for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
- expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
- if (subvalue == NULL) {
- return NULL;
+ switch(elem->kind) {
+ case JoinedStr_kind:
+ for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
+ expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
+ if (subvalue == NULL) {
+ return NULL;
+ }
+ asdl_seq_SET(flattened, current_pos++, subvalue);
}
- asdl_seq_SET(flattened, current_pos++, subvalue);
- }
+ break;
+ default:
+ asdl_seq_SET(flattened, current_pos++, elem);
+ break;
}
}
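Note on the action_helpers.c changes above: any non-empty format spec is now
routed through _PyPegen_concatenate_strings, so a spec assembled from literal
text and nested replacement fields is flattened like ordinary implicit string
concatenation, and the new switch statements tolerate elements that are
neither Constant_kind nor JoinedStr_kind. A sketch of the kind of spec this
covers (illustrative values only):

    fill, width = "*", 12
    value = 3.14159
    # The spec '{fill}>{width}.3f' mixes two replacement fields with
    # literal text; the parser must join those pieces into a single
    # JoinedStr node for the spec.
    print(f"{value=:{fill}>{width}.3f}")   # value=*******3.142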
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 82b0e4e..93b5fbd 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -989,6 +989,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
the_current_tok->last_expr_buffer = NULL;
the_current_tok->last_expr_size = 0;
the_current_tok->last_expr_end = -1;
+ the_current_tok->in_format_spec = 0;
the_current_tok->f_string_debug = 0;
switch (*tok->start) {
@@ -1137,15 +1138,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
* by the `{` case, so for ensuring that we are on the 0th level, we need
* to adjust it manually */
int cursor = current_tok->curly_bracket_depth - (c != '{');
- if (cursor == 0 && !_PyLexer_update_fstring_expr(tok, c)) {
+ int in_format_spec = current_tok->in_format_spec;
+ int cursor_in_format_with_debug =
+ cursor == 1 && (current_tok->f_string_debug || in_format_spec);
+ int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
+ if ((cursor_valid) && !_PyLexer_update_fstring_expr(tok, c)) {
return MAKE_TOKEN(ENDMARKER);
}
- if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
+ if ((cursor_valid) && c != '{' && set_fstring_expr(tok, token, c)) {
return MAKE_TOKEN(ERRORTOKEN);
}
if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
current_tok->kind = TOK_FSTRING_MODE;
+ current_tok->in_format_spec = 1;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(_PyToken_OneChar(c));
@@ -1235,6 +1241,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
current_tok->curly_bracket_expr_start_depth--;
current_tok->kind = TOK_FSTRING_MODE;
+ current_tok->in_format_spec = 0;
current_tok->f_string_debug = 0;
}
}
@@ -1317,11 +1324,11 @@ f_string_middle:
tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
- if (tok->done == E_ERROR) {
+ if (tok->done == E_ERROR || tok->done == E_DECODE) {
return MAKE_TOKEN(ERRORTOKEN);
}
int in_format_spec = (
- current_tok->last_expr_end != -1
+ current_tok->in_format_spec
&&
INSIDE_FSTRING_EXPR(current_tok)
);
@@ -1337,6 +1344,7 @@ f_string_middle:
if (in_format_spec && c == '\n') {
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(FSTRING_MIDDLE);
@@ -1378,6 +1386,9 @@ f_string_middle:
}
if (c == '{') {
+ if (!_PyLexer_update_fstring_expr(tok, c)) {
+ return MAKE_TOKEN(ENDMARKER);
+ }
int peek = tok_nextc(tok);
if (peek != '{' || in_format_spec) {
tok_backup(tok, peek);
@@ -1387,6 +1398,7 @@ f_string_middle:
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
}
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
} else {
@@ -1406,13 +1418,15 @@ f_string_middle:
// scanning (indicated by the end of the expression being set) and we are not at the top level
// of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
// brackets, we can bypass it here.
- if (peek == '}' && !in_format_spec) {
+ int cursor = current_tok->curly_bracket_depth;
+ if (peek == '}' && !in_format_spec && cursor == 0) {
p_start = tok->start;
p_end = tok->cur - 1;
} else {
tok_backup(tok, peek);
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+ current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
}
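Note on the lexer.c changes above: the tokenizer now records "inside a format
spec" explicitly in in_format_spec rather than inferring it from last_expr_end,
which matters once a debug expression re-enters regular mode inside a spec.
One way to observe the resulting token stream (assumes Python 3.12+, where
PEP 701 exposes the FSTRING_* tokens; illustrative only):

    import io
    import tokenize

    src = 'f"{x=:{width}}"\n'
    # FSTRING_START / FSTRING_MIDDLE / OP tokens show where the lexer
    # switches between f-string mode and regular mode.
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))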
diff --git a/Parser/lexer/state.c b/Parser/lexer/state.c
index 653ddaf..647f291 100644
--- a/Parser/lexer/state.c
+++ b/Parser/lexer/state.c
@@ -74,6 +74,7 @@ free_fstring_expressions(struct tok_state *tok)
mode->last_expr_buffer = NULL;
mode->last_expr_size = 0;
mode->last_expr_end = -1;
+ mode->in_format_spec = 0;
}
}
}
diff --git a/Parser/lexer/state.h b/Parser/lexer/state.h
index 61d090d..9ed3bab 100644
--- a/Parser/lexer/state.h
+++ b/Parser/lexer/state.h
@@ -58,6 +58,7 @@ typedef struct _tokenizer_mode {
Py_ssize_t last_expr_end;
char* last_expr_buffer;
int f_string_debug;
+ int in_format_spec;
} tokenizer_mode;
/* Tokenizer state */
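With the new tokenizer_mode field in place, the lexer resets in_format_spec
explicitly at every point where it leaves a spec (see the in_format_spec = 0
assignments above) instead of deducing that state. The net effect, sketched
with a hypothetical nested example:

    x = 7
    # A format spec built entirely from nested replacement fields,
    # combined with the debug '='; constructs like this previously
    # confused the tokenizer's mode tracking.
    print(f"{x=:{'>'}{4}}")   # x=   7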