summary | refs | log | tree | commit | diff | stats
path: root/Parser/string_parser.c
diff options
context:
space:
mode:
author: Pablo Galindo Salgado <Pablogsal@gmail.com> 2023-04-19 16:18:16 (GMT)
committer: GitHub <noreply@github.com> 2023-04-19 16:18:16 (GMT)
commit: 1ef61cf71a218c71860ff6aecf0fd51edb8b65dc (patch)
tree: d0c4995cac9cb660b66498419d528254f26baf54 /Parser/string_parser.c
parent: a6b07b5a345f7f54ee9f6d75e81d2fb55971b35c (diff)
download: cpython-1ef61cf71a218c71860ff6aecf0fd51edb8b65dc.zip
          cpython-1ef61cf71a218c71860ff6aecf0fd51edb8b65dc.tar.gz
          cpython-1ef61cf71a218c71860ff6aecf0fd51edb8b65dc.tar.bz2
gh-102856: Initial implementation of PEP 701 (#102855)
Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Batuhan Taskaya <isidentical@gmail.com>
Co-authored-by: Marta Gómez Macías <mgmacias@google.com>
Co-authored-by: sunmy2019 <59365878+sunmy2019@users.noreply.github.com>
Diffstat (limited to 'Parser/string_parser.c')
-rw-r--r--  Parser/string_parser.c  1089
1 file changed, 35 insertions, 1054 deletions
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index c096bea..be5f0c4 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -135,7 +135,9 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
const char *first_invalid_escape;
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
- if (v != NULL && first_invalid_escape != NULL) {
+ // HACK: later we can simply pass the line no, since we don't preserve the tokens
+ // when we are decoding the string but we preserve the line numbers.
+ if (v != NULL && first_invalid_escape != NULL && t != NULL) {
if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
/* We have not decref u before because first_invalid_escape points
inside u. */
@@ -166,43 +168,43 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
return result;
}
-/* s must include the bracketing quote characters, and r, b, u,
- &/or f prefixes (if any), and embedded escape sequences (if any).
- _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
- If the string is an f-string, set *fstr and *fstrlen to the unparsed
- string object. Return 0 if no errors occurred. */
-int
-_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
- const char **fstr, Py_ssize_t *fstrlen, Token *t)
+PyObject *
+_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
+{
+ if (raw) {
+ return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
+ }
+ return decode_unicode_with_escapes(p, s, len, t);
+}
+
+/* s must include the bracketing quote characters, and r, b &/or f prefixes
+ (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
+ _PyPegen_parse_string parses it, and returns the decoded Python string object. */
+PyObject *
+_PyPegen_parse_string(Parser *p, Token *t)
{
const char *s = PyBytes_AsString(t->bytes);
if (s == NULL) {
- return -1;
+ return NULL;
}
size_t len;
int quote = Py_CHARMASK(*s);
- int fmode = 0;
- *bytesmode = 0;
- *rawmode = 0;
- *result = NULL;
- *fstr = NULL;
+ int bytesmode = 0;
+ int rawmode = 0;
+
if (Py_ISALPHA(quote)) {
- while (!*bytesmode || !*rawmode) {
+ while (!bytesmode || !rawmode) {
if (quote == 'b' || quote == 'B') {
quote =(unsigned char)*++s;
- *bytesmode = 1;
+ bytesmode = 1;
}
else if (quote == 'u' || quote == 'U') {
quote = (unsigned char)*++s;
}
else if (quote == 'r' || quote == 'R') {
quote = (unsigned char)*++s;
- *rawmode = 1;
- }
- else if (quote == 'f' || quote == 'F') {
- quote = (unsigned char)*++s;
- fmode = 1;
+ rawmode = 1;
}
else {
break;
@@ -210,32 +212,21 @@ _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
}
}
- /* fstrings are only allowed in Python 3.6 and greater */
- if (fmode && p->feature_version < 6) {
- p->error_indicator = 1;
- RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
- return -1;
- }
-
- if (fmode && *bytesmode) {
- PyErr_BadInternalCall();
- return -1;
- }
if (quote != '\'' && quote != '\"') {
PyErr_BadInternalCall();
- return -1;
+ return NULL;
}
/* Skip the leading quote char. */
s++;
len = strlen(s);
if (len > INT_MAX) {
PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
- return -1;
+ return NULL;
}
if (s[--len] != quote) {
/* Last quote char must match the first. */
PyErr_BadInternalCall();
- return -1;
+ return NULL;
}
if (len >= 4 && s[0] == quote && s[1] == quote) {
/* A triple quoted string. We've already skipped one quote at
@@ -246,22 +237,13 @@ _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
/* And check that the last two match. */
if (s[--len] != quote || s[--len] != quote) {
PyErr_BadInternalCall();
- return -1;
+ return NULL;
}
}
- if (fmode) {
- /* Just return the bytes. The caller will parse the resulting
- string. */
- *fstr = s;
- *fstrlen = len;
- return 0;
- }
-
- /* Not an f-string. */
/* Avoid invoking escape decoding routines if possible. */
- *rawmode = *rawmode || strchr(s, '\\') == NULL;
- if (*bytesmode) {
+ rawmode = rawmode || strchr(s, '\\') == NULL;
+ if (bytesmode) {
/* Disallow non-ASCII characters. */
const char *ch;
for (ch = s; *ch; ch++) {
@@ -269,1014 +251,13 @@ _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
RAISE_SYNTAX_ERROR(
"bytes can only contain ASCII "
"literal characters");
- return -1;
- }
- }
- if (*rawmode) {
- *result = PyBytes_FromStringAndSize(s, len);
- }
- else {
- *result = decode_bytes_with_escapes(p, s, len, t);
- }
- }
- else {
- if (*rawmode) {
- *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
- }
- else {
- *result = decode_unicode_with_escapes(p, s, len, t);
- }
- }
- return *result == NULL ? -1 : 0;
-}
-
-
-
-// FSTRING STUFF
-
-/* Fix locations for the given node and its children.
-
- `parent` is the enclosing node.
- `expr_start` is the starting position of the expression (pointing to the open brace).
- `n` is the node which locations are going to be fixed relative to parent.
- `expr_str` is the child node's string representation, including braces.
-*/
-static bool
-fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
-{
- *p_lines = 0;
- *p_cols = 0;
- assert(expr_start != NULL && *expr_start == '{');
- if (parent && parent->bytes) {
- const char *parent_str = PyBytes_AsString(parent->bytes);
- if (!parent_str) {
- return false;
- }
- // The following is needed, in order to correctly shift the column
- // offset, in the case that (disregarding any whitespace) a newline
- // immediately follows the opening curly brace of the fstring expression.
- bool newline_after_brace = 1;
- const char *start = expr_start + 1;
- while (start && *start != '}' && *start != '\n') {
- if (*start != ' ' && *start != '\t' && *start != '\f') {
- newline_after_brace = 0;
- break;
- }
- start++;
- }
-
- // Account for the characters from the last newline character to our
- // left until the beginning of expr_start.
- if (!newline_after_brace) {
- start = expr_start;
- while (start > parent_str && *start != '\n') {
- start--;
- }
- *p_cols += (int)(expr_start - start);
- if (*start == '\n') {
- *p_cols -= 1;
- }
- }
- /* adjust the start based on the number of newlines encountered
- before the f-string expression */
- for (const char *p = parent_str; p < expr_start; p++) {
- if (*p == '\n') {
- (*p_lines)++;
- }
- }
- }
- return true;
-}
-
-
-/* Compile this expression in to an expr_ty. Add parens around the
- expression, in order to allow leading spaces in the expression. */
-static expr_ty
-fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
- Token *t)
-{
- expr_ty expr = NULL;
- char *str;
- Py_ssize_t len;
- const char *s;
- expr_ty result = NULL;
-
- assert(expr_end >= expr_start);
- assert(*(expr_start-1) == '{');
- assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
- *expr_end == '=');
-
- /* If the substring is all whitespace, it's an error. We need to catch this
- here, and not when we call PyParser_SimpleParseStringFlagsFilename,
- because turning the expression '' in to '()' would go from being invalid
- to valid. */
- for (s = expr_start; s != expr_end; s++) {
- char c = *s;
- /* The Python parser ignores only the following whitespace
- characters (\r already is converted to \n). */
- if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
- break;
- }
- }
-
- if (s == expr_end) {
- if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
- RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
- return NULL;
- }
- RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
- return NULL;
- }
-
- len = expr_end - expr_start;
- /* Allocate 3 extra bytes: open paren, close paren, null byte. */
- str = PyMem_Calloc(len + 3, sizeof(char));
- if (str == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
-
- // The call to fstring_find_expr_location is responsible for finding the column offset
- // the generated AST nodes need to be shifted to the right, which is equal to the number
- // of the f-string characters before the expression starts.
- memcpy(str+1, expr_start, len);
- int lines, cols;
- if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
- PyMem_Free(str);
- return NULL;
- }
-
- // The parentheses are needed in order to allow for leading whitespace within
- // the f-string expression. This consequently gets parsed as a group (see the
- // group rule in python.gram).
- str[0] = '(';
- str[len+1] = ')';
-
- struct tok_state* tok = _PyTokenizer_FromString(str, 1);
- if (tok == NULL) {
- PyMem_Free(str);
- return NULL;
- }
- tok->filename = Py_NewRef(p->tok->filename);
- tok->lineno = t->lineno + lines - 1;
-
- Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
- NULL, p->arena);
-
- p2->starting_lineno = t->lineno + lines;
- p2->starting_col_offset = lines != 0 ? cols : t->col_offset + cols;
-
- expr = _PyPegen_run_parser(p2);
-
- if (expr == NULL) {
- goto exit;
- }
- result = expr;
-
-exit:
- PyMem_Free(str);
- _PyPegen_Parser_Free(p2);
- _PyTokenizer_Free(tok);
- return result;
-}
-
-/* Return -1 on error.
-
- Return 0 if we reached the end of the literal.
-
- Return 1 if we haven't reached the end of the literal, but we want
- the caller to process the literal up to this point. Used for
- doubled braces.
-*/
-static int
-fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
- PyObject **literal, int recurse_lvl, Token *t)
-{
- /* Get any literal string. It ends when we hit an un-doubled left
- brace (which isn't part of a unicode name escape such as
- "\N{EULER CONSTANT}"), or the end of the string. */
-
- const char *s = *str;
- const char *literal_start = s;
- int result = 0;
-
- assert(*literal == NULL);
- while (s < end) {
- char ch = *s++;
- if (!raw && ch == '\\' && s < end) {
- ch = *s++;
- if (ch == 'N') {
- /* We need to look at and skip matching braces for "\N{name}"
- sequences because otherwise we'll think the opening '{'
- starts an expression, which is not the case with "\N".
- Keep looking for either a matched '{' '}' pair, or the end
- of the string. */
-
- if (s < end && *s++ == '{') {
- while (s < end && *s++ != '}') {
- }
- continue;
- }
-
- /* This is an invalid "\N" sequence, since it's a "\N" not
- followed by a "{". Just keep parsing this literal. This
- error will be caught later by
- decode_unicode_with_escapes(). */
- continue;
- }
- if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
- return -1;
- }
- }
- if (ch == '{' || ch == '}') {
- /* Check for doubled braces, but only at the top level. If
- we checked at every level, then f'{0:{3}}' would fail
- with the two closing braces. */
- if (recurse_lvl == 0) {
- if (s < end && *s == ch) {
- /* We're going to tell the caller that the literal ends
- here, but that they should continue scanning. But also
- skip over the second brace when we resume scanning. */
- *str = s + 1;
- result = 1;
- goto done;
- }
-
- /* Where a single '{' is the start of a new expression, a
- single '}' is not allowed. */
- if (ch == '}') {
- *str = s - 1;
- RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
- return -1;
- }
- }
- /* We're either at a '{', which means we're starting another
- expression; or a '}', which means we're at the end of this
- f-string (for a nested format_spec). */
- s--;
- break;
- }
- }
- *str = s;
- assert(s <= end);
- assert(s == end || *s == '{' || *s == '}');
-done:
- if (literal_start != s) {
- if (raw) {
- *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
- s - literal_start,
- NULL, NULL);
- }
- else {
- *literal = decode_unicode_with_escapes(p, literal_start,
- s - literal_start, t);
- }
- if (!*literal) {
- return -1;
- }
- }
- return result;
-}
-
-/* Forward declaration because parsing is recursive. */
-static expr_ty
-fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
- Token *first_token, Token* t, Token *last_token);
-
-/* Parse the f-string at *str, ending at end. We know *str starts an
- expression (so it must be a '{'). Returns the FormattedValue node, which
- includes the expression, conversion character, format_spec expression, and
- optionally the text of the expression (if = is used).
-
- Note that I don't do a perfect job here: I don't make sure that a
- closing brace doesn't match an opening paren, for example. It
- doesn't need to error on all invalid expressions, just correctly
- find the end of all valid ones. Any errors inside the expression
- will be caught when we parse it later.
-
- *expression is set to the expression. For an '=' "debug" expression,
- *expr_text is set to the debug text (the original text of the expression,
- including the '=' and any whitespace around it, as a string object). If
- not a debug expression, *expr_text set to NULL. */
-static int
-fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
- PyObject **expr_text, expr_ty *expression, Token *first_token,
- Token *t, Token *last_token)
-{
- /* Return -1 on error, else 0. */
-
- const char *expr_start;
- const char *expr_end;
- expr_ty simple_expression;
- expr_ty format_spec = NULL; /* Optional format specifier. */
- int conversion = -1; /* The conversion char. Use default if not
- specified, or !r if using = and no format
- spec. */
-
- /* 0 if we're not in a string, else the quote char we're trying to
- match (single or double quote). */
- char quote_char = 0;
-
- /* If we're inside a string, 1=normal, 3=triple-quoted. */
- int string_type = 0;
-
- /* Keep track of nesting level for braces/parens/brackets in
- expressions. */
- Py_ssize_t nested_depth = 0;
- char parenstack[MAXLEVEL];
-
- *expr_text = NULL;
-
- /* Can only nest one level deep. */
- if (recurse_lvl >= 2) {
- RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
- goto error;
- }
-
- /* The first char must be a left brace, or we wouldn't have gotten
- here. Skip over it. */
- assert(**str == '{');
- *str += 1;
-
- expr_start = *str;
- for (; *str < end; (*str)++) {
- char ch;
-
- /* Loop invariants. */
- assert(nested_depth >= 0);
- assert(*str >= expr_start && *str < end);
- if (quote_char) {
- assert(string_type == 1 || string_type == 3);
- } else {
- assert(string_type == 0);
- }
-
- ch = **str;
- /* Nowhere inside an expression is a backslash allowed. */
- if (ch == '\\') {
- /* Error: can't include a backslash character, inside
- parens or strings or not. */
- RAISE_SYNTAX_ERROR(
- "f-string expression part "
- "cannot include a backslash");
- goto error;
- }
- if (quote_char) {
- /* We're inside a string. See if we're at the end. */
- /* This code needs to implement the same non-error logic
- as tok_get from tokenizer.c, at the letter_quote
- label. To actually share that code would be a
- nightmare. But, it's unlikely to change and is small,
- so duplicate it here. Note we don't need to catch all
- of the errors, since they'll be caught when parsing the
- expression. We just need to match the non-error
- cases. Thus we can ignore \n in single-quoted strings,
- for example. Or non-terminated strings. */
- if (ch == quote_char) {
- /* Does this match the string_type (single or triple
- quoted)? */
- if (string_type == 3) {
- if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
- /* We're at the end of a triple quoted string. */
- *str += 2;
- string_type = 0;
- quote_char = 0;
- continue;
- }
- } else {
- /* We're at the end of a normal string. */
- quote_char = 0;
- string_type = 0;
- continue;
- }
- }
- } else if (ch == '\'' || ch == '"') {
- /* Is this a triple quoted string? */
- if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
- string_type = 3;
- *str += 2;
- } else {
- /* Start of a normal string. */
- string_type = 1;
- }
- /* Start looking for the end of the string. */
- quote_char = ch;
- } else if (ch == '[' || ch == '{' || ch == '(') {
- if (nested_depth >= MAXLEVEL) {
- RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
- goto error;
- }
- parenstack[nested_depth] = ch;
- nested_depth++;
- } else if (ch == '#') {
- /* Error: can't include a comment character, inside parens
- or not. */
- RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
- goto error;
- } else if (nested_depth == 0 &&
- (ch == '!' || ch == ':' || ch == '}' ||
- ch == '=' || ch == '>' || ch == '<')) {
- /* See if there's a next character. */
- if (*str+1 < end) {
- char next = *(*str+1);
-
- /* For "!=". since '=' is not an allowed conversion character,
- nothing is lost in this test. */
- if ((ch == '!' && next == '=') || /* != */
- (ch == '=' && next == '=') || /* == */
- (ch == '<' && next == '=') || /* <= */
- (ch == '>' && next == '=') /* >= */
- ) {
- *str += 1;
- continue;
- }
- }
- /* Don't get out of the loop for these, if they're single
- chars (not part of 2-char tokens). If by themselves, they
- don't end an expression (unlike say '!'). */
- if (ch == '>' || ch == '<') {
- continue;
- }
-
- /* Normal way out of this loop. */
- break;
- } else if (ch == ']' || ch == '}' || ch == ')') {
- if (!nested_depth) {
- RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
- goto error;
- }
- nested_depth--;
- int opening = (unsigned char)parenstack[nested_depth];
- if (!((opening == '(' && ch == ')') ||
- (opening == '[' && ch == ']') ||
- (opening == '{' && ch == '}')))
- {
- RAISE_SYNTAX_ERROR(
- "f-string: closing parenthesis '%c' "
- "does not match opening parenthesis '%c'",
- ch, opening);
- goto error;
- }
- } else {
- /* Just consume this char and loop around. */
- }
- }
- expr_end = *str;
- /* If we leave the above loop in a string or with mismatched parens, we
- don't really care. We'll get a syntax error when compiling the
- expression. But, we can produce a better error message, so let's just
- do that.*/
- if (quote_char) {
- RAISE_SYNTAX_ERROR("f-string: unterminated string");
- goto error;
- }
- if (nested_depth) {
- int opening = (unsigned char)parenstack[nested_depth - 1];
- RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
- goto error;
- }
-
- if (*str >= end) {
- goto unexpected_end_of_string;
- }
-
- /* Compile the expression as soon as possible, so we show errors
- related to the expression before errors related to the
- conversion or format_spec. */
- simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
- if (!simple_expression) {
- goto error;
- }
-
- /* Check for =, which puts the text value of the expression in
- expr_text. */
- if (**str == '=') {
- if (p->feature_version < 8) {
- RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
- "only supported in Python 3.8 and greater");
- goto error;
- }
- *str += 1;
-
- /* Skip over ASCII whitespace. No need to test for end of string
- here, since we know there's at least a trailing quote somewhere
- ahead. */
- while (Py_ISSPACE(**str)) {
- *str += 1;
- }
- if (*str >= end) {
- goto unexpected_end_of_string;
- }
- /* Set *expr_text to the text of the expression. */
- *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
- if (!*expr_text) {
- goto error;
- }
- }
-
- /* Check for a conversion char, if present. */
- if (**str == '!') {
- *str += 1;
- const char *conv_start = *str;
- while (1) {
- if (*str >= end) {
- goto unexpected_end_of_string;
- }
- if (**str == '}' || **str == ':') {
- break;
- }
- *str += 1;
- }
- if (*str == conv_start) {
- RAISE_SYNTAX_ERROR(
- "f-string: missed conversion character");
- goto error;
- }
-
- conversion = (unsigned char)*conv_start;
- /* Validate the conversion. */
- if ((*str != conv_start + 1) ||
- !(conversion == 's' || conversion == 'r' || conversion == 'a'))
- {
- PyObject *conv_obj = PyUnicode_FromStringAndSize(conv_start,
- *str-conv_start);
- if (conv_obj) {
- RAISE_SYNTAX_ERROR(
- "f-string: invalid conversion character %R: "
- "expected 's', 'r', or 'a'",
- conv_obj);
- Py_DECREF(conv_obj);
- }
- goto error;
- }
-
- }
-
- /* Check for the format spec, if present. */
- assert(*str < end);
- if (**str == ':') {
- *str += 1;
- if (*str >= end) {
- goto unexpected_end_of_string;
- }
-
- /* Parse the format spec. */
- format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
- first_token, t, last_token);
- if (!format_spec) {
- goto error;
- }
- }
-
- if (*str >= end || **str != '}') {
- goto unexpected_end_of_string;
- }
-
- /* We're at a right brace. Consume it. */
- assert(*str < end);
- assert(**str == '}');
- *str += 1;
-
- /* If we're in = mode (detected by non-NULL expr_text), and have no format
- spec and no explicit conversion, set the conversion to 'r'. */
- if (*expr_text && format_spec == NULL && conversion == -1) {
- conversion = 'r';
- }
-
- /* And now create the FormattedValue node that represents this
- entire expression with the conversion and format spec. */
- //TODO: Fix this
- *expression = _PyAST_FormattedValue(simple_expression, conversion,
- format_spec, first_token->lineno,
- first_token->col_offset,
- last_token->end_lineno,
- last_token->end_col_offset, p->arena);
- if (!*expression) {
- goto error;
- }
-
- return 0;
-
-unexpected_end_of_string:
- RAISE_SYNTAX_ERROR("f-string: expecting '}'");
- /* Falls through to error. */
-
-error:
- Py_XDECREF(*expr_text);
- return -1;
-
-}
-
-/* Return -1 on error.
-
- Return 0 if we have a literal (possible zero length) and an
- expression (zero length if at the end of the string.
-
- Return 1 if we have a literal, but no expression, and we want the
- caller to call us again. This is used to deal with doubled
- braces.
-
- When called multiple times on the string 'a{{b{0}c', this function
- will return:
-
- 1. the literal 'a{' with no expression, and a return value
- of 1. Despite the fact that there's no expression, the return
- value of 1 means we're not finished yet.
-
- 2. the literal 'b' and the expression '0', with a return value of
- 0. The fact that there's an expression means we're not finished.
-
- 3. literal 'c' with no expression and a return value of 0. The
- combination of the return value of 0 with no expression means
- we're finished.
-*/
-static int
-fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
- int recurse_lvl, PyObject **literal,
- PyObject **expr_text, expr_ty *expression,
- Token *first_token, Token *t, Token *last_token)
-{
- int result;
-
- assert(*literal == NULL && *expression == NULL);
-
- /* Get any literal string. */
- result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
- if (result < 0) {
- goto error;
- }
-
- assert(result == 0 || result == 1);
-
- if (result == 1) {
- /* We have a literal, but don't look at the expression. */
- return 1;
- }
-
- if (*str >= end || **str == '}') {
- /* We're at the end of the string or the end of a nested
- f-string: no expression. The top-level error case where we
- expect to be at the end of the string but we're at a '}' is
- handled later. */
- return 0;
- }
-
- /* We must now be the start of an expression, on a '{'. */
- assert(**str == '{');
-
- if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
- expression, first_token, t, last_token) < 0) {
- goto error;
- }
-
- return 0;
-
-error:
- Py_CLEAR(*literal);
- return -1;
-}
-
-#ifdef NDEBUG
-#define ExprList_check_invariants(l)
-#else
-static void
-ExprList_check_invariants(ExprList *l)
-{
- /* Check our invariants. Make sure this object is "live", and
- hasn't been deallocated. */
- assert(l->size >= 0);
- assert(l->p != NULL);
- if (l->size <= EXPRLIST_N_CACHED) {
- assert(l->data == l->p);
- }
-}
-#endif
-
-static void
-ExprList_Init(ExprList *l)
-{
- l->allocated = EXPRLIST_N_CACHED;
- l->size = 0;
-
- /* Until we start allocating dynamically, p points to data. */
- l->p = l->data;
-
- ExprList_check_invariants(l);
-}
-
-static int
-ExprList_Append(ExprList *l, expr_ty exp)
-{
- ExprList_check_invariants(l);
- if (l->size >= l->allocated) {
- /* We need to alloc (or realloc) the memory. */
- Py_ssize_t new_size = l->allocated * 2;
-
- /* See if we've ever allocated anything dynamically. */
- if (l->p == l->data) {
- Py_ssize_t i;
- /* We're still using the cached data. Switch to
- alloc-ing. */
- l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
- if (!l->p) {
- return -1;
- }
- /* Copy the cached data into the new buffer. */
- for (i = 0; i < l->size; i++) {
- l->p[i] = l->data[i];
- }
- } else {
- /* Just realloc. */
- expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
- if (!tmp) {
- PyMem_Free(l->p);
- l->p = NULL;
- return -1;
- }
- l->p = tmp;
- }
-
- l->allocated = new_size;
- assert(l->allocated == 2 * l->size);
- }
-
- l->p[l->size++] = exp;
-
- ExprList_check_invariants(l);
- return 0;
-}
-
-static void
-ExprList_Dealloc(ExprList *l)
-{
- ExprList_check_invariants(l);
-
- /* If there's been an error, or we've never dynamically allocated,
- do nothing. */
- if (!l->p || l->p == l->data) {
- /* Do nothing. */
- } else {
- /* We have dynamically allocated. Free the memory. */
- PyMem_Free(l->p);
- }
- l->p = NULL;
- l->size = -1;
-}
-
-static asdl_expr_seq *
-ExprList_Finish(ExprList *l, PyArena *arena)
-{
- asdl_expr_seq *seq;
-
- ExprList_check_invariants(l);
-
- /* Allocate the asdl_seq and copy the expressions in to it. */
- seq = _Py_asdl_expr_seq_new(l->size, arena);
- if (seq) {
- Py_ssize_t i;
- for (i = 0; i < l->size; i++) {
- asdl_seq_SET(seq, i, l->p[i]);
- }
- }
- ExprList_Dealloc(l);
- return seq;
-}
-
-#ifdef NDEBUG
-#define FstringParser_check_invariants(state)
-#else
-static void
-FstringParser_check_invariants(FstringParser *state)
-{
- if (state->last_str) {
- assert(PyUnicode_CheckExact(state->last_str));
- }
- ExprList_check_invariants(&state->expr_list);
-}
-#endif
-
-void
-_PyPegen_FstringParser_Init(FstringParser *state)
-{
- state->last_str = NULL;
- state->fmode = 0;
- ExprList_Init(&state->expr_list);
- FstringParser_check_invariants(state);
-}
-
-void
-_PyPegen_FstringParser_Dealloc(FstringParser *state)
-{
- FstringParser_check_invariants(state);
-
- Py_XDECREF(state->last_str);
- ExprList_Dealloc(&state->expr_list);
-}
-
-/* Make a Constant node, but decref the PyUnicode object being added. */
-static expr_ty
-make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
-{
- PyObject *s = *str;
- PyObject *kind = NULL;
- *str = NULL;
- assert(PyUnicode_CheckExact(s));
- if (_PyArena_AddPyObject(p->arena, s) < 0) {
- Py_DECREF(s);
- return NULL;
- }
- const char* the_str = PyBytes_AsString(first_token->bytes);
- if (the_str && the_str[0] == 'u') {
- kind = _PyPegen_new_identifier(p, "u");
- }
-
- if (kind == NULL && PyErr_Occurred()) {
- return NULL;
- }
-
- return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
- last_token->end_lineno, last_token->end_col_offset,
- p->arena);
-
-}
-
-
-/* Add a non-f-string (that is, a regular literal string). str is
- decref'd. */
-int
-_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
-{
- FstringParser_check_invariants(state);
-
- assert(PyUnicode_CheckExact(str));
-
- if (PyUnicode_GET_LENGTH(str) == 0) {
- Py_DECREF(str);
- return 0;
- }
-
- if (!state->last_str) {
- /* We didn't have a string before, so just remember this one. */
- state->last_str = str;
- } else {
- /* Concatenate this with the previous string. */
- PyUnicode_AppendAndDel(&state->last_str, str);
- if (!state->last_str) {
- return -1;
- }
- }
- FstringParser_check_invariants(state);
- return 0;
-}
-
-/* Parse an f-string. The f-string is in *str to end, with no
- 'f' or quotes. */
-int
-_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
- const char *end, int raw, int recurse_lvl,
- Token *first_token, Token* t, Token *last_token)
-{
- FstringParser_check_invariants(state);
- state->fmode = 1;
-
- /* Parse the f-string. */
- while (1) {
- PyObject *literal = NULL;
- PyObject *expr_text = NULL;
- expr_ty expression = NULL;
-
- /* If there's a zero length literal in front of the
- expression, literal will be NULL. If we're at the end of
- the f-string, expression will be NULL (unless result == 1,
- see below). */
- int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
- &literal, &expr_text,
- &expression, first_token, t, last_token);
- if (result < 0) {
- return -1;
- }
-
- /* Add the literal, if any. */
- if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
- Py_XDECREF(expr_text);
- return -1;
- }
- /* Add the expr_text, if any. */
- if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
- return -1;
- }
-
- /* We've dealt with the literal and expr_text, their ownership has
- been transferred to the state object. Don't look at them again. */
-
- /* See if we should just loop around to get the next literal
- and expression, while ignoring the expression this
- time. This is used for un-doubling braces, as an
- optimization. */
- if (result == 1) {
- continue;
- }
-
- if (!expression) {
- /* We're done with this f-string. */
- break;
- }
-
- /* We know we have an expression. Convert any existing string
- to a Constant node. */
- if (state->last_str) {
- /* Convert the existing last_str literal to a Constant node. */
- expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
- if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
- return -1;
- }
- }
-
- if (ExprList_Append(&state->expr_list, expression) < 0) {
- return -1;
- }
- }
-
- /* If recurse_lvl is zero, then we must be at the end of the
- string. Otherwise, we must be at a right brace. */
-
- if (recurse_lvl == 0 && *str < end-1) {
- RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
- return -1;
- }
- if (recurse_lvl != 0 && **str != '}') {
- RAISE_SYNTAX_ERROR("f-string: expecting '}'");
- return -1;
- }
-
- FstringParser_check_invariants(state);
- return 0;
-}
-
-/* Convert the partial state reflected in last_str and expr_list to an
- expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
-expr_ty
-_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
- Token *last_token)
-{
- asdl_expr_seq *seq;
-
- FstringParser_check_invariants(state);
-
- /* If we're just a constant string with no expressions, return
- that. */
- if (!state->fmode) {
- assert(!state->expr_list.size);
- if (!state->last_str) {
- /* Create a zero length string. */
- state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
- if (!state->last_str) {
- goto error;
+ return NULL;
}
}
- return make_str_node_and_del(p, &state->last_str, first_token, last_token);
- }
-
- /* Create a Constant node out of last_str, if needed. It will be the
- last node in our expression list. */
- if (state->last_str) {
- expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
- if (!str || ExprList_Append(&state->expr_list, str) < 0) {
- goto error;
+ if (rawmode) {
+ return PyBytes_FromStringAndSize(s, len);
}
+ return decode_bytes_with_escapes(p, s, len, t);
}
- /* This has already been freed. */
- assert(state->last_str == NULL);
-
- seq = ExprList_Finish(&state->expr_list, p->arena);
- if (!seq) {
- goto error;
- }
-
- return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
- last_token->end_lineno, last_token->end_col_offset,
- p->arena);
-
-error:
- _PyPegen_FstringParser_Dealloc(state);
- return NULL;
-}
-
-/* Given an f-string (with no 'f' or quotes) that's in *str and ends
- at end, parse it into an expr_ty. Return NULL on error. Adjust
- str to point past the parsed portion. */
-static expr_ty
-fstring_parse(Parser *p, const char **str, const char *end, int raw,
- int recurse_lvl, Token *first_token, Token* t, Token *last_token)
-{
- FstringParser state;
-
- _PyPegen_FstringParser_Init(&state);
- if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
- first_token, t, last_token) < 0) {
- _PyPegen_FstringParser_Dealloc(&state);
- return NULL;
- }
-
- return _PyPegen_FstringParser_Finish(p, &state, t, t);
+ return _PyPegen_decode_string(p, rawmode, s, len, t);
}