From c9c4444d9f11ae80c2c4cc7d40b6718419d81a97 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 21 Nov 2021 01:08:50 +0000 Subject: Refactor parser compilation units into specific components (GH-29676) --- Makefile.pre.in | 2 + PCbuild/_freeze_module.vcxproj | 2 + PCbuild/pythoncore.vcxproj | 2 + PCbuild/pythoncore.vcxproj.filters | 6 + Parser/action_helpers.c | 1289 ++++++++++++++++++++++++ Parser/pegen.c | 1936 +++--------------------------------- Parser/pegen.h | 112 ++- Parser/pegen_errors.c | 425 ++++++++ Tools/peg_generator/Makefile | 2 +- Tools/peg_generator/pegen/build.py | 2 + 10 files changed, 1909 insertions(+), 1869 deletions(-) create mode 100644 Parser/action_helpers.c create mode 100644 Parser/pegen_errors.c diff --git a/Makefile.pre.in b/Makefile.pre.in index 11ffdaa..fc4def8 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -331,6 +331,8 @@ LIBFFI_INCLUDEDIR= @LIBFFI_INCLUDEDIR@ PEGEN_OBJS= \ Parser/pegen.o \ + Parser/pegen_errors.o \ + Parser/action_helpers.o \ Parser/parser.o \ Parser/string_parser.o \ Parser/peg_api.o diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index d33e07c..6a91776 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -163,6 +163,8 @@ + + diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index b659981..70f0556 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -433,6 +433,8 @@ + + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 62aab5b..b19f027 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -1205,6 +1205,12 @@ Parser + + Parser + + + Parser + Parser diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c new file mode 100644 index 0000000..e5d7b66 --- /dev/null +++ b/Parser/action_helpers.c @@ -0,0 +1,1289 @@ +#include + +#include "pegen.h" +#include "string_parser.h" + +static PyObject * +_create_dummy_identifier(Parser *p) +{ + return _PyPegen_new_identifier(p, ""); +} + +void * +_PyPegen_dummy_name(Parser *p, ...) 
+{ + static void *cache = NULL; + + if (cache != NULL) { + return cache; + } + + PyObject *id = _create_dummy_identifier(p); + if (!id) { + return NULL; + } + cache = _PyAST_Name(id, Load, 1, 0, 1, 0, p->arena); + return cache; +} + +/* Creates a single-element asdl_seq* that contains a */ +asdl_seq * +_PyPegen_singleton_seq(Parser *p, void *a) +{ + assert(a != NULL); + asdl_seq *seq = (asdl_seq*)_Py_asdl_generic_seq_new(1, p->arena); + if (!seq) { + return NULL; + } + asdl_seq_SET_UNTYPED(seq, 0, a); + return seq; +} + +/* Creates a copy of seq and prepends a to it */ +asdl_seq * +_PyPegen_seq_insert_in_front(Parser *p, void *a, asdl_seq *seq) +{ + assert(a != NULL); + if (!seq) { + return _PyPegen_singleton_seq(p, a); + } + + asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena); + if (!new_seq) { + return NULL; + } + + asdl_seq_SET_UNTYPED(new_seq, 0, a); + for (Py_ssize_t i = 1, l = asdl_seq_LEN(new_seq); i < l; i++) { + asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i - 1)); + } + return new_seq; +} + +/* Creates a copy of seq and appends a to it */ +asdl_seq * +_PyPegen_seq_append_to_end(Parser *p, asdl_seq *seq, void *a) +{ + assert(a != NULL); + if (!seq) { + return _PyPegen_singleton_seq(p, a); + } + + asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena); + if (!new_seq) { + return NULL; + } + + for (Py_ssize_t i = 0, l = asdl_seq_LEN(new_seq); i + 1 < l; i++) { + asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i)); + } + asdl_seq_SET_UNTYPED(new_seq, asdl_seq_LEN(new_seq) - 1, a); + return new_seq; +} + +static Py_ssize_t +_get_flattened_seq_size(asdl_seq *seqs) +{ + Py_ssize_t size = 0; + for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) { + asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i); + size += asdl_seq_LEN(inner_seq); + } + return size; +} + +/* Flattens an asdl_seq* of asdl_seq*s */ +asdl_seq * +_PyPegen_seq_flatten(Parser *p, asdl_seq *seqs) +{ + Py_ssize_t flattened_seq_size = _get_flattened_seq_size(seqs); + assert(flattened_seq_size > 0); + + asdl_seq *flattened_seq = (asdl_seq*)_Py_asdl_generic_seq_new(flattened_seq_size, p->arena); + if (!flattened_seq) { + return NULL; + } + + int flattened_seq_idx = 0; + for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) { + asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i); + for (Py_ssize_t j = 0, li = asdl_seq_LEN(inner_seq); j < li; j++) { + asdl_seq_SET_UNTYPED(flattened_seq, flattened_seq_idx++, asdl_seq_GET_UNTYPED(inner_seq, j)); + } + } + assert(flattened_seq_idx == flattened_seq_size); + + return flattened_seq; +} + +void * +_PyPegen_seq_last_item(asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + return asdl_seq_GET_UNTYPED(seq, len - 1); +} + +void * +_PyPegen_seq_first_item(asdl_seq *seq) +{ + return asdl_seq_GET_UNTYPED(seq, 0); +} + +/* Creates a new name of the form . 
*/ +expr_ty +_PyPegen_join_names_with_dot(Parser *p, expr_ty first_name, expr_ty second_name) +{ + assert(first_name != NULL && second_name != NULL); + PyObject *first_identifier = first_name->v.Name.id; + PyObject *second_identifier = second_name->v.Name.id; + + if (PyUnicode_READY(first_identifier) == -1) { + return NULL; + } + if (PyUnicode_READY(second_identifier) == -1) { + return NULL; + } + const char *first_str = PyUnicode_AsUTF8(first_identifier); + if (!first_str) { + return NULL; + } + const char *second_str = PyUnicode_AsUTF8(second_identifier); + if (!second_str) { + return NULL; + } + Py_ssize_t len = strlen(first_str) + strlen(second_str) + 1; // +1 for the dot + + PyObject *str = PyBytes_FromStringAndSize(NULL, len); + if (!str) { + return NULL; + } + + char *s = PyBytes_AS_STRING(str); + if (!s) { + return NULL; + } + + strcpy(s, first_str); + s += strlen(first_str); + *s++ = '.'; + strcpy(s, second_str); + s += strlen(second_str); + *s = '\0'; + + PyObject *uni = PyUnicode_DecodeUTF8(PyBytes_AS_STRING(str), PyBytes_GET_SIZE(str), NULL); + Py_DECREF(str); + if (!uni) { + return NULL; + } + PyUnicode_InternInPlace(&uni); + if (_PyArena_AddPyObject(p->arena, uni) < 0) { + Py_DECREF(uni); + return NULL; + } + + return _PyAST_Name(uni, Load, EXTRA_EXPR(first_name, second_name)); +} + +/* Counts the total number of dots in seq's tokens */ +int +_PyPegen_seq_count_dots(asdl_seq *seq) +{ + int number_of_dots = 0; + for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) { + Token *current_expr = asdl_seq_GET_UNTYPED(seq, i); + switch (current_expr->type) { + case ELLIPSIS: + number_of_dots += 3; + break; + case DOT: + number_of_dots += 1; + break; + default: + Py_UNREACHABLE(); + } + } + + return number_of_dots; +} + +/* Creates an alias with '*' as the identifier name */ +alias_ty +_PyPegen_alias_for_star(Parser *p, int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena) { + PyObject *str = PyUnicode_InternFromString("*"); + if (!str) { + return NULL; + } + if (_PyArena_AddPyObject(p->arena, str) < 0) { + Py_DECREF(str); + return NULL; + } + return _PyAST_alias(str, NULL, lineno, col_offset, end_lineno, end_col_offset, arena); +} + +/* Creates a new asdl_seq* with the identifiers of all the names in seq */ +asdl_identifier_seq * +_PyPegen_map_names_to_ids(Parser *p, asdl_expr_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + assert(len > 0); + + asdl_identifier_seq *new_seq = _Py_asdl_identifier_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + expr_ty e = asdl_seq_GET(seq, i); + asdl_seq_SET(new_seq, i, e->v.Name.id); + } + return new_seq; +} + +/* Constructs a CmpopExprPair */ +CmpopExprPair * +_PyPegen_cmpop_expr_pair(Parser *p, cmpop_ty cmpop, expr_ty expr) +{ + assert(expr != NULL); + CmpopExprPair *a = _PyArena_Malloc(p->arena, sizeof(CmpopExprPair)); + if (!a) { + return NULL; + } + a->cmpop = cmpop; + a->expr = expr; + return a; +} + +asdl_int_seq * +_PyPegen_get_cmpops(Parser *p, asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + assert(len > 0); + + asdl_int_seq *new_seq = _Py_asdl_int_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i); + asdl_seq_SET(new_seq, i, pair->cmpop); + } + return new_seq; +} + +asdl_expr_seq * +_PyPegen_get_exprs(Parser *p, asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + assert(len > 0); + + asdl_expr_seq *new_seq = 
_Py_asdl_expr_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i); + asdl_seq_SET(new_seq, i, pair->expr); + } + return new_seq; +} + +/* Creates an asdl_seq* where all the elements have been changed to have ctx as context */ +static asdl_expr_seq * +_set_seq_context(Parser *p, asdl_expr_seq *seq, expr_context_ty ctx) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + if (len == 0) { + return NULL; + } + + asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + expr_ty e = asdl_seq_GET(seq, i); + asdl_seq_SET(new_seq, i, _PyPegen_set_expr_context(p, e, ctx)); + } + return new_seq; +} + +static expr_ty +_set_name_context(Parser *p, expr_ty e, expr_context_ty ctx) +{ + return _PyAST_Name(e->v.Name.id, ctx, EXTRA_EXPR(e, e)); +} + +static expr_ty +_set_tuple_context(Parser *p, expr_ty e, expr_context_ty ctx) +{ + return _PyAST_Tuple( + _set_seq_context(p, e->v.Tuple.elts, ctx), + ctx, + EXTRA_EXPR(e, e)); +} + +static expr_ty +_set_list_context(Parser *p, expr_ty e, expr_context_ty ctx) +{ + return _PyAST_List( + _set_seq_context(p, e->v.List.elts, ctx), + ctx, + EXTRA_EXPR(e, e)); +} + +static expr_ty +_set_subscript_context(Parser *p, expr_ty e, expr_context_ty ctx) +{ + return _PyAST_Subscript(e->v.Subscript.value, e->v.Subscript.slice, + ctx, EXTRA_EXPR(e, e)); +} + +static expr_ty +_set_attribute_context(Parser *p, expr_ty e, expr_context_ty ctx) +{ + return _PyAST_Attribute(e->v.Attribute.value, e->v.Attribute.attr, + ctx, EXTRA_EXPR(e, e)); +} + +static expr_ty +_set_starred_context(Parser *p, expr_ty e, expr_context_ty ctx) +{ + return _PyAST_Starred(_PyPegen_set_expr_context(p, e->v.Starred.value, ctx), + ctx, EXTRA_EXPR(e, e)); +} + +/* Creates an `expr_ty` equivalent to `expr` but with `ctx` as context */ +expr_ty +_PyPegen_set_expr_context(Parser *p, expr_ty expr, expr_context_ty ctx) +{ + assert(expr != NULL); + + expr_ty new = NULL; + switch (expr->kind) { + case Name_kind: + new = _set_name_context(p, expr, ctx); + break; + case Tuple_kind: + new = _set_tuple_context(p, expr, ctx); + break; + case List_kind: + new = _set_list_context(p, expr, ctx); + break; + case Subscript_kind: + new = _set_subscript_context(p, expr, ctx); + break; + case Attribute_kind: + new = _set_attribute_context(p, expr, ctx); + break; + case Starred_kind: + new = _set_starred_context(p, expr, ctx); + break; + default: + new = expr; + } + return new; +} + +/* Constructs a KeyValuePair that is used when parsing a dict's key value pairs */ +KeyValuePair * +_PyPegen_key_value_pair(Parser *p, expr_ty key, expr_ty value) +{ + KeyValuePair *a = _PyArena_Malloc(p->arena, sizeof(KeyValuePair)); + if (!a) { + return NULL; + } + a->key = key; + a->value = value; + return a; +} + +/* Extracts all keys from an asdl_seq* of KeyValuePair*'s */ +asdl_expr_seq * +_PyPegen_get_keys(Parser *p, asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i); + asdl_seq_SET(new_seq, i, pair->key); + } + return new_seq; +} + +/* Extracts all values from an asdl_seq* of KeyValuePair*'s */ +asdl_expr_seq * +_PyPegen_get_values(Parser *p, asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); + if 
(!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i); + asdl_seq_SET(new_seq, i, pair->value); + } + return new_seq; +} + +/* Constructs a KeyPatternPair that is used when parsing mapping & class patterns */ +KeyPatternPair * +_PyPegen_key_pattern_pair(Parser *p, expr_ty key, pattern_ty pattern) +{ + KeyPatternPair *a = _PyArena_Malloc(p->arena, sizeof(KeyPatternPair)); + if (!a) { + return NULL; + } + a->key = key; + a->pattern = pattern; + return a; +} + +/* Extracts all keys from an asdl_seq* of KeyPatternPair*'s */ +asdl_expr_seq * +_PyPegen_get_pattern_keys(Parser *p, asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i); + asdl_seq_SET(new_seq, i, pair->key); + } + return new_seq; +} + +/* Extracts all patterns from an asdl_seq* of KeyPatternPair*'s */ +asdl_pattern_seq * +_PyPegen_get_patterns(Parser *p, asdl_seq *seq) +{ + Py_ssize_t len = asdl_seq_LEN(seq); + asdl_pattern_seq *new_seq = _Py_asdl_pattern_seq_new(len, p->arena); + if (!new_seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i); + asdl_seq_SET(new_seq, i, pair->pattern); + } + return new_seq; +} + +/* Constructs a NameDefaultPair */ +NameDefaultPair * +_PyPegen_name_default_pair(Parser *p, arg_ty arg, expr_ty value, Token *tc) +{ + NameDefaultPair *a = _PyArena_Malloc(p->arena, sizeof(NameDefaultPair)); + if (!a) { + return NULL; + } + a->arg = _PyPegen_add_type_comment_to_arg(p, arg, tc); + a->value = value; + return a; +} + +/* Constructs a SlashWithDefault */ +SlashWithDefault * +_PyPegen_slash_with_default(Parser *p, asdl_arg_seq *plain_names, asdl_seq *names_with_defaults) +{ + SlashWithDefault *a = _PyArena_Malloc(p->arena, sizeof(SlashWithDefault)); + if (!a) { + return NULL; + } + a->plain_names = plain_names; + a->names_with_defaults = names_with_defaults; + return a; +} + +/* Constructs a StarEtc */ +StarEtc * +_PyPegen_star_etc(Parser *p, arg_ty vararg, asdl_seq *kwonlyargs, arg_ty kwarg) +{ + StarEtc *a = _PyArena_Malloc(p->arena, sizeof(StarEtc)); + if (!a) { + return NULL; + } + a->vararg = vararg; + a->kwonlyargs = kwonlyargs; + a->kwarg = kwarg; + return a; +} + +asdl_seq * +_PyPegen_join_sequences(Parser *p, asdl_seq *a, asdl_seq *b) +{ + Py_ssize_t first_len = asdl_seq_LEN(a); + Py_ssize_t second_len = asdl_seq_LEN(b); + asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(first_len + second_len, p->arena); + if (!new_seq) { + return NULL; + } + + int k = 0; + for (Py_ssize_t i = 0; i < first_len; i++) { + asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(a, i)); + } + for (Py_ssize_t i = 0; i < second_len; i++) { + asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(b, i)); + } + + return new_seq; +} + +static asdl_arg_seq* +_get_names(Parser *p, asdl_seq *names_with_defaults) +{ + Py_ssize_t len = asdl_seq_LEN(names_with_defaults); + asdl_arg_seq *seq = _Py_asdl_arg_seq_new(len, p->arena); + if (!seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i); + asdl_seq_SET(seq, i, pair->arg); + } + return seq; +} + +static asdl_expr_seq * +_get_defaults(Parser *p, asdl_seq *names_with_defaults) +{ + Py_ssize_t len = asdl_seq_LEN(names_with_defaults); + asdl_expr_seq *seq = 
_Py_asdl_expr_seq_new(len, p->arena); + if (!seq) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i); + asdl_seq_SET(seq, i, pair->value); + } + return seq; +} + +static int +_make_posonlyargs(Parser *p, + asdl_arg_seq *slash_without_default, + SlashWithDefault *slash_with_default, + asdl_arg_seq **posonlyargs) { + if (slash_without_default != NULL) { + *posonlyargs = slash_without_default; + } + else if (slash_with_default != NULL) { + asdl_arg_seq *slash_with_default_names = + _get_names(p, slash_with_default->names_with_defaults); + if (!slash_with_default_names) { + return -1; + } + *posonlyargs = (asdl_arg_seq*)_PyPegen_join_sequences( + p, + (asdl_seq*)slash_with_default->plain_names, + (asdl_seq*)slash_with_default_names); + } + else { + *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena); + } + return *posonlyargs == NULL ? -1 : 0; +} + +static int +_make_posargs(Parser *p, + asdl_arg_seq *plain_names, + asdl_seq *names_with_default, + asdl_arg_seq **posargs) { + if (plain_names != NULL && names_with_default != NULL) { + asdl_arg_seq *names_with_default_names = _get_names(p, names_with_default); + if (!names_with_default_names) { + return -1; + } + *posargs = (asdl_arg_seq*)_PyPegen_join_sequences( + p,(asdl_seq*)plain_names, (asdl_seq*)names_with_default_names); + } + else if (plain_names == NULL && names_with_default != NULL) { + *posargs = _get_names(p, names_with_default); + } + else if (plain_names != NULL && names_with_default == NULL) { + *posargs = plain_names; + } + else { + *posargs = _Py_asdl_arg_seq_new(0, p->arena); + } + return *posargs == NULL ? -1 : 0; +} + +static int +_make_posdefaults(Parser *p, + SlashWithDefault *slash_with_default, + asdl_seq *names_with_default, + asdl_expr_seq **posdefaults) { + if (slash_with_default != NULL && names_with_default != NULL) { + asdl_expr_seq *slash_with_default_values = + _get_defaults(p, slash_with_default->names_with_defaults); + if (!slash_with_default_values) { + return -1; + } + asdl_expr_seq *names_with_default_values = _get_defaults(p, names_with_default); + if (!names_with_default_values) { + return -1; + } + *posdefaults = (asdl_expr_seq*)_PyPegen_join_sequences( + p, + (asdl_seq*)slash_with_default_values, + (asdl_seq*)names_with_default_values); + } + else if (slash_with_default == NULL && names_with_default != NULL) { + *posdefaults = _get_defaults(p, names_with_default); + } + else if (slash_with_default != NULL && names_with_default == NULL) { + *posdefaults = _get_defaults(p, slash_with_default->names_with_defaults); + } + else { + *posdefaults = _Py_asdl_expr_seq_new(0, p->arena); + } + return *posdefaults == NULL ? 
-1 : 0; +} + +static int +_make_kwargs(Parser *p, StarEtc *star_etc, + asdl_arg_seq **kwonlyargs, + asdl_expr_seq **kwdefaults) { + if (star_etc != NULL && star_etc->kwonlyargs != NULL) { + *kwonlyargs = _get_names(p, star_etc->kwonlyargs); + } + else { + *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena); + } + + if (*kwonlyargs == NULL) { + return -1; + } + + if (star_etc != NULL && star_etc->kwonlyargs != NULL) { + *kwdefaults = _get_defaults(p, star_etc->kwonlyargs); + } + else { + *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena); + } + + if (*kwdefaults == NULL) { + return -1; + } + + return 0; +} + +/* Constructs an arguments_ty object out of all the parsed constructs in the parameters rule */ +arguments_ty +_PyPegen_make_arguments(Parser *p, asdl_arg_seq *slash_without_default, + SlashWithDefault *slash_with_default, asdl_arg_seq *plain_names, + asdl_seq *names_with_default, StarEtc *star_etc) +{ + asdl_arg_seq *posonlyargs; + if (_make_posonlyargs(p, slash_without_default, slash_with_default, &posonlyargs) == -1) { + return NULL; + } + + asdl_arg_seq *posargs; + if (_make_posargs(p, plain_names, names_with_default, &posargs) == -1) { + return NULL; + } + + asdl_expr_seq *posdefaults; + if (_make_posdefaults(p,slash_with_default, names_with_default, &posdefaults) == -1) { + return NULL; + } + + arg_ty vararg = NULL; + if (star_etc != NULL && star_etc->vararg != NULL) { + vararg = star_etc->vararg; + } + + asdl_arg_seq *kwonlyargs; + asdl_expr_seq *kwdefaults; + if (_make_kwargs(p, star_etc, &kwonlyargs, &kwdefaults) == -1) { + return NULL; + } + + arg_ty kwarg = NULL; + if (star_etc != NULL && star_etc->kwarg != NULL) { + kwarg = star_etc->kwarg; + } + + return _PyAST_arguments(posonlyargs, posargs, vararg, kwonlyargs, + kwdefaults, kwarg, posdefaults, p->arena); +} + + +/* Constructs an empty arguments_ty object, that gets used when a function accepts no + * arguments. 
*/ +arguments_ty +_PyPegen_empty_arguments(Parser *p) +{ + asdl_arg_seq *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena); + if (!posonlyargs) { + return NULL; + } + asdl_arg_seq *posargs = _Py_asdl_arg_seq_new(0, p->arena); + if (!posargs) { + return NULL; + } + asdl_expr_seq *posdefaults = _Py_asdl_expr_seq_new(0, p->arena); + if (!posdefaults) { + return NULL; + } + asdl_arg_seq *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena); + if (!kwonlyargs) { + return NULL; + } + asdl_expr_seq *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena); + if (!kwdefaults) { + return NULL; + } + + return _PyAST_arguments(posonlyargs, posargs, NULL, kwonlyargs, + kwdefaults, NULL, posdefaults, p->arena); +} + +/* Encapsulates the value of an operator_ty into an AugOperator struct */ +AugOperator * +_PyPegen_augoperator(Parser *p, operator_ty kind) +{ + AugOperator *a = _PyArena_Malloc(p->arena, sizeof(AugOperator)); + if (!a) { + return NULL; + } + a->kind = kind; + return a; +} + +/* Construct a FunctionDef equivalent to function_def, but with decorators */ +stmt_ty +_PyPegen_function_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty function_def) +{ + assert(function_def != NULL); + if (function_def->kind == AsyncFunctionDef_kind) { + return _PyAST_AsyncFunctionDef( + function_def->v.FunctionDef.name, function_def->v.FunctionDef.args, + function_def->v.FunctionDef.body, decorators, function_def->v.FunctionDef.returns, + function_def->v.FunctionDef.type_comment, function_def->lineno, + function_def->col_offset, function_def->end_lineno, function_def->end_col_offset, + p->arena); + } + + return _PyAST_FunctionDef( + function_def->v.FunctionDef.name, function_def->v.FunctionDef.args, + function_def->v.FunctionDef.body, decorators, + function_def->v.FunctionDef.returns, + function_def->v.FunctionDef.type_comment, function_def->lineno, + function_def->col_offset, function_def->end_lineno, + function_def->end_col_offset, p->arena); +} + +/* Construct a ClassDef equivalent to class_def, but with decorators */ +stmt_ty +_PyPegen_class_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty class_def) +{ + assert(class_def != NULL); + return _PyAST_ClassDef( + class_def->v.ClassDef.name, class_def->v.ClassDef.bases, + class_def->v.ClassDef.keywords, class_def->v.ClassDef.body, decorators, + class_def->lineno, class_def->col_offset, class_def->end_lineno, + class_def->end_col_offset, p->arena); +} + +/* Construct a KeywordOrStarred */ +KeywordOrStarred * +_PyPegen_keyword_or_starred(Parser *p, void *element, int is_keyword) +{ + KeywordOrStarred *a = _PyArena_Malloc(p->arena, sizeof(KeywordOrStarred)); + if (!a) { + return NULL; + } + a->element = element; + a->is_keyword = is_keyword; + return a; +} + +/* Get the number of starred expressions in an asdl_seq* of KeywordOrStarred*s */ +static int +_seq_number_of_starred_exprs(asdl_seq *seq) +{ + int n = 0; + for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) { + KeywordOrStarred *k = asdl_seq_GET_UNTYPED(seq, i); + if (!k->is_keyword) { + n++; + } + } + return n; +} + +/* Extract the starred expressions of an asdl_seq* of KeywordOrStarred*s */ +asdl_expr_seq * +_PyPegen_seq_extract_starred_exprs(Parser *p, asdl_seq *kwargs) +{ + int new_len = _seq_number_of_starred_exprs(kwargs); + if (new_len == 0) { + return NULL; + } + asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(new_len, p->arena); + if (!new_seq) { + return NULL; + } + + int idx = 0; + for (Py_ssize_t i = 0, len = asdl_seq_LEN(kwargs); i < len; i++) { + KeywordOrStarred *k = 
asdl_seq_GET_UNTYPED(kwargs, i); + if (!k->is_keyword) { + asdl_seq_SET(new_seq, idx++, k->element); + } + } + return new_seq; +} + +/* Return a new asdl_seq* with only the keywords in kwargs */ +asdl_keyword_seq* +_PyPegen_seq_delete_starred_exprs(Parser *p, asdl_seq *kwargs) +{ + Py_ssize_t len = asdl_seq_LEN(kwargs); + Py_ssize_t new_len = len - _seq_number_of_starred_exprs(kwargs); + if (new_len == 0) { + return NULL; + } + asdl_keyword_seq *new_seq = _Py_asdl_keyword_seq_new(new_len, p->arena); + if (!new_seq) { + return NULL; + } + + int idx = 0; + for (Py_ssize_t i = 0; i < len; i++) { + KeywordOrStarred *k = asdl_seq_GET_UNTYPED(kwargs, i); + if (k->is_keyword) { + asdl_seq_SET(new_seq, idx++, k->element); + } + } + return new_seq; +} + +expr_ty +_PyPegen_concatenate_strings(Parser *p, asdl_seq *strings) +{ + Py_ssize_t len = asdl_seq_LEN(strings); + assert(len > 0); + + Token *first = asdl_seq_GET_UNTYPED(strings, 0); + Token *last = asdl_seq_GET_UNTYPED(strings, len - 1); + + int bytesmode = 0; + PyObject *bytes_str = NULL; + + FstringParser state; + _PyPegen_FstringParser_Init(&state); + + for (Py_ssize_t i = 0; i < len; i++) { + Token *t = asdl_seq_GET_UNTYPED(strings, i); + + int this_bytesmode; + int this_rawmode; + PyObject *s; + const char *fstr; + Py_ssize_t fstrlen = -1; + + if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) { + goto error; + } + + /* Check that we are not mixing bytes with unicode. */ + if (i != 0 && bytesmode != this_bytesmode) { + RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals"); + Py_XDECREF(s); + goto error; + } + bytesmode = this_bytesmode; + + if (fstr != NULL) { + assert(s == NULL && !bytesmode); + + int result = _PyPegen_FstringParser_ConcatFstring(p, &state, &fstr, fstr + fstrlen, + this_rawmode, 0, first, t, last); + if (result < 0) { + goto error; + } + } + else { + /* String or byte string. */ + assert(s != NULL && fstr == NULL); + assert(bytesmode ? PyBytes_CheckExact(s) : PyUnicode_CheckExact(s)); + + if (bytesmode) { + if (i == 0) { + bytes_str = s; + } + else { + PyBytes_ConcatAndDel(&bytes_str, s); + if (!bytes_str) { + goto error; + } + } + } + else { + /* This is a regular string. Concatenate it. 
*/ + if (_PyPegen_FstringParser_ConcatAndDel(&state, s) < 0) { + goto error; + } + } + } + } + + if (bytesmode) { + if (_PyArena_AddPyObject(p->arena, bytes_str) < 0) { + goto error; + } + return _PyAST_Constant(bytes_str, NULL, first->lineno, + first->col_offset, last->end_lineno, + last->end_col_offset, p->arena); + } + + return _PyPegen_FstringParser_Finish(p, &state, first, last); + +error: + Py_XDECREF(bytes_str); + _PyPegen_FstringParser_Dealloc(&state); + if (PyErr_Occurred()) { + _Pypegen_raise_decode_error(p); + } + return NULL; +} + +expr_ty +_PyPegen_ensure_imaginary(Parser *p, expr_ty exp) +{ + if (exp->kind != Constant_kind || !PyComplex_CheckExact(exp->v.Constant.value)) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "imaginary number required in complex literal"); + return NULL; + } + return exp; +} + +expr_ty +_PyPegen_ensure_real(Parser *p, expr_ty exp) +{ + if (exp->kind != Constant_kind || PyComplex_CheckExact(exp->v.Constant.value)) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "real number required in complex literal"); + return NULL; + } + return exp; +} + +mod_ty +_PyPegen_make_module(Parser *p, asdl_stmt_seq *a) { + asdl_type_ignore_seq *type_ignores = NULL; + Py_ssize_t num = p->type_ignore_comments.num_items; + if (num > 0) { + // Turn the raw (comment, lineno) pairs into TypeIgnore objects in the arena + type_ignores = _Py_asdl_type_ignore_seq_new(num, p->arena); + if (type_ignores == NULL) { + return NULL; + } + for (int i = 0; i < num; i++) { + PyObject *tag = _PyPegen_new_type_comment(p, p->type_ignore_comments.items[i].comment); + if (tag == NULL) { + return NULL; + } + type_ignore_ty ti = _PyAST_TypeIgnore(p->type_ignore_comments.items[i].lineno, + tag, p->arena); + if (ti == NULL) { + return NULL; + } + asdl_seq_SET(type_ignores, i, ti); + } + } + return _PyAST_Module(a, type_ignores, p->arena); +} + +PyObject * +_PyPegen_new_type_comment(Parser *p, const char *s) +{ + PyObject *res = PyUnicode_DecodeUTF8(s, strlen(s), NULL); + if (res == NULL) { + return NULL; + } + if (_PyArena_AddPyObject(p->arena, res) < 0) { + Py_DECREF(res); + return NULL; + } + return res; +} + +arg_ty +_PyPegen_add_type_comment_to_arg(Parser *p, arg_ty a, Token *tc) +{ + if (tc == NULL) { + return a; + } + const char *bytes = PyBytes_AsString(tc->bytes); + if (bytes == NULL) { + return NULL; + } + PyObject *tco = _PyPegen_new_type_comment(p, bytes); + if (tco == NULL) { + return NULL; + } + return _PyAST_arg(a->arg, a->annotation, tco, + a->lineno, a->col_offset, a->end_lineno, a->end_col_offset, + p->arena); +} + +/* Checks if the NOTEQUAL token is valid given the current parser flags +0 indicates success and nonzero indicates failure (an exception may be set) */ +int +_PyPegen_check_barry_as_flufl(Parser *p, Token* t) { + assert(t->bytes != NULL); + assert(t->type == NOTEQUAL); + + const char* tok_str = PyBytes_AS_STRING(t->bytes); + if (p->flags & PyPARSE_BARRY_AS_BDFL && strcmp(tok_str, "<>") != 0) { + RAISE_SYNTAX_ERROR("with Barry as BDFL, use '<>' instead of '!='"); + return -1; + } + if (!(p->flags & PyPARSE_BARRY_AS_BDFL)) { + return strcmp(tok_str, "!="); + } + return 0; +} + +int +_PyPegen_check_legacy_stmt(Parser *p, expr_ty name) { + if (name->kind != Name_kind) { + return 0; + } + const char* candidates[2] = {"print", "exec"}; + for (int i=0; i<2; i++) { + if (PyUnicode_CompareWithASCIIString(name->v.Name.id, candidates[i]) == 0) { + return 1; + } + } + return 0; +} + +const char * +_PyPegen_get_expr_name(expr_ty e) +{ + assert(e != NULL); + switch (e->kind) { + case 
Attribute_kind: + return "attribute"; + case Subscript_kind: + return "subscript"; + case Starred_kind: + return "starred"; + case Name_kind: + return "name"; + case List_kind: + return "list"; + case Tuple_kind: + return "tuple"; + case Lambda_kind: + return "lambda"; + case Call_kind: + return "function call"; + case BoolOp_kind: + case BinOp_kind: + case UnaryOp_kind: + return "expression"; + case GeneratorExp_kind: + return "generator expression"; + case Yield_kind: + case YieldFrom_kind: + return "yield expression"; + case Await_kind: + return "await expression"; + case ListComp_kind: + return "list comprehension"; + case SetComp_kind: + return "set comprehension"; + case DictComp_kind: + return "dict comprehension"; + case Dict_kind: + return "dict literal"; + case Set_kind: + return "set display"; + case JoinedStr_kind: + case FormattedValue_kind: + return "f-string expression"; + case Constant_kind: { + PyObject *value = e->v.Constant.value; + if (value == Py_None) { + return "None"; + } + if (value == Py_False) { + return "False"; + } + if (value == Py_True) { + return "True"; + } + if (value == Py_Ellipsis) { + return "ellipsis"; + } + return "literal"; + } + case Compare_kind: + return "comparison"; + case IfExp_kind: + return "conditional expression"; + case NamedExpr_kind: + return "named expression"; + default: + PyErr_Format(PyExc_SystemError, + "unexpected expression in assignment %d (line %d)", + e->kind, e->lineno); + return NULL; + } +} + +static inline expr_ty +_PyPegen_get_last_comprehension_item(comprehension_ty comprehension) { + if (comprehension->ifs == NULL || asdl_seq_LEN(comprehension->ifs) == 0) { + return comprehension->iter; + } + return PyPegen_last_item(comprehension->ifs, expr_ty); +} + +expr_ty _PyPegen_collect_call_seqs(Parser *p, asdl_expr_seq *a, asdl_seq *b, + int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena) { + Py_ssize_t args_len = asdl_seq_LEN(a); + Py_ssize_t total_len = args_len; + + if (b == NULL) { + return _PyAST_Call(_PyPegen_dummy_name(p), a, NULL, lineno, col_offset, + end_lineno, end_col_offset, arena); + + } + + asdl_expr_seq *starreds = _PyPegen_seq_extract_starred_exprs(p, b); + asdl_keyword_seq *keywords = _PyPegen_seq_delete_starred_exprs(p, b); + + if (starreds) { + total_len += asdl_seq_LEN(starreds); + } + + asdl_expr_seq *args = _Py_asdl_expr_seq_new(total_len, arena); + + Py_ssize_t i = 0; + for (i = 0; i < args_len; i++) { + asdl_seq_SET(args, i, asdl_seq_GET(a, i)); + } + for (; i < total_len; i++) { + asdl_seq_SET(args, i, asdl_seq_GET(starreds, i - args_len)); + } + + return _PyAST_Call(_PyPegen_dummy_name(p), args, keywords, lineno, + col_offset, end_lineno, end_col_offset, arena); +} + +// AST Error reporting helpers + +expr_ty +_PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type) +{ + if (e == NULL) { + return NULL; + } + +#define VISIT_CONTAINER(CONTAINER, TYPE) do { \ + Py_ssize_t len = asdl_seq_LEN((CONTAINER)->v.TYPE.elts);\ + for (Py_ssize_t i = 0; i < len; i++) {\ + expr_ty other = asdl_seq_GET((CONTAINER)->v.TYPE.elts, i);\ + expr_ty child = _PyPegen_get_invalid_target(other, targets_type);\ + if (child != NULL) {\ + return child;\ + }\ + }\ + } while (0) + + // We only need to visit List and Tuple nodes recursively as those + // are the only ones that can contain valid names in targets when + // they are parsed as expressions. Any other kind of expression + // that is a container (like Sets or Dicts) is directly invalid and + // we don't need to visit it recursively. 
+ + switch (e->kind) { + case List_kind: + VISIT_CONTAINER(e, List); + return NULL; + case Tuple_kind: + VISIT_CONTAINER(e, Tuple); + return NULL; + case Starred_kind: + if (targets_type == DEL_TARGETS) { + return e; + } + return _PyPegen_get_invalid_target(e->v.Starred.value, targets_type); + case Compare_kind: + // This is needed, because the `a in b` in `for a in b` gets parsed + // as a comparison, and so we need to search the left side of the comparison + // for invalid targets. + if (targets_type == FOR_TARGETS) { + cmpop_ty cmpop = (cmpop_ty) asdl_seq_GET(e->v.Compare.ops, 0); + if (cmpop == In) { + return _PyPegen_get_invalid_target(e->v.Compare.left, targets_type); + } + return NULL; + } + return e; + case Name_kind: + case Subscript_kind: + case Attribute_kind: + return NULL; + default: + return e; + } +} + +void *_PyPegen_arguments_parsing_error(Parser *p, expr_ty e) { + int kwarg_unpacking = 0; + for (Py_ssize_t i = 0, l = asdl_seq_LEN(e->v.Call.keywords); i < l; i++) { + keyword_ty keyword = asdl_seq_GET(e->v.Call.keywords, i); + if (!keyword->arg) { + kwarg_unpacking = 1; + } + } + + const char *msg = NULL; + if (kwarg_unpacking) { + msg = "positional argument follows keyword argument unpacking"; + } else { + msg = "positional argument follows keyword argument"; + } + + return RAISE_SYNTAX_ERROR(msg); +} + +void * +_PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq *comprehensions) +{ + /* The rule that calls this function is 'args for_if_clauses'. + For the input f(L, x for x in y), L and x are in args and + the for is parsed as a for_if_clause. We have to check if + len <= 1, so that input like dict((a, b) for a, b in x) + gets successfully parsed and then we pass the last + argument (x in the above example) as the location of the + error */ + Py_ssize_t len = asdl_seq_LEN(args->v.Call.args); + if (len <= 1) { + return NULL; + } + + comprehension_ty last_comprehension = PyPegen_last_item(comprehensions, comprehension_ty); + + return RAISE_SYNTAX_ERROR_KNOWN_RANGE( + (expr_ty) asdl_seq_GET(args->v.Call.args, len - 1), + _PyPegen_get_last_comprehension_item(last_comprehension), + "Generator expression must be parenthesized" + ); +} \ No newline at end of file diff --git a/Parser/pegen.c b/Parser/pegen.c index b760730..4f51c63 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -1,432 +1,21 @@ #include #include "pycore_ast.h" // _PyAST_Validate(), #include -#include "tokenizer.h" +#include "tokenizer.h" #include "pegen.h" -#include "string_parser.h" - -PyObject * -_PyPegen_new_type_comment(Parser *p, const char *s) -{ - PyObject *res = PyUnicode_DecodeUTF8(s, strlen(s), NULL); - if (res == NULL) { - return NULL; - } - if (_PyArena_AddPyObject(p->arena, res) < 0) { - Py_DECREF(res); - return NULL; - } - return res; -} - -arg_ty -_PyPegen_add_type_comment_to_arg(Parser *p, arg_ty a, Token *tc) -{ - if (tc == NULL) { - return a; - } - const char *bytes = PyBytes_AsString(tc->bytes); - if (bytes == NULL) { - return NULL; - } - PyObject *tco = _PyPegen_new_type_comment(p, bytes); - if (tco == NULL) { - return NULL; - } - return _PyAST_arg(a->arg, a->annotation, tco, - a->lineno, a->col_offset, a->end_lineno, a->end_col_offset, - p->arena); -} - -static int -init_normalization(Parser *p) -{ - if (p->normalize) { - return 1; - } - PyObject *m = PyImport_ImportModuleNoBlock("unicodedata"); - if (!m) - { - return 0; - } - p->normalize = PyObject_GetAttrString(m, "normalize"); - Py_DECREF(m); - if (!p->normalize) - { - return 0; - } - return 1; -} - -/* 
Checks if the NOTEQUAL token is valid given the current parser flags -0 indicates success and nonzero indicates failure (an exception may be set) */ -int -_PyPegen_check_barry_as_flufl(Parser *p, Token* t) { - assert(t->bytes != NULL); - assert(t->type == NOTEQUAL); - - const char* tok_str = PyBytes_AS_STRING(t->bytes); - if (p->flags & PyPARSE_BARRY_AS_BDFL && strcmp(tok_str, "<>") != 0) { - RAISE_SYNTAX_ERROR("with Barry as BDFL, use '<>' instead of '!='"); - return -1; - } - if (!(p->flags & PyPARSE_BARRY_AS_BDFL)) { - return strcmp(tok_str, "!="); - } - return 0; -} - -int -_PyPegen_check_legacy_stmt(Parser *p, expr_ty name) { - if (name->kind != Name_kind) { - return 0; - } - const char* candidates[2] = {"print", "exec"}; - for (int i=0; i<2; i++) { - if (PyUnicode_CompareWithASCIIString(name->v.Name.id, candidates[i]) == 0) { - return 1; - } - } - return 0; -} - -PyObject * -_PyPegen_new_identifier(Parser *p, const char *n) -{ - PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL); - if (!id) { - goto error; - } - /* PyUnicode_DecodeUTF8 should always return a ready string. */ - assert(PyUnicode_IS_READY(id)); - /* Check whether there are non-ASCII characters in the - identifier; if so, normalize to NFKC. */ - if (!PyUnicode_IS_ASCII(id)) - { - PyObject *id2; - if (!init_normalization(p)) - { - Py_DECREF(id); - goto error; - } - PyObject *form = PyUnicode_InternFromString("NFKC"); - if (form == NULL) - { - Py_DECREF(id); - goto error; - } - PyObject *args[2] = {form, id}; - id2 = _PyObject_FastCall(p->normalize, args, 2); - Py_DECREF(id); - Py_DECREF(form); - if (!id2) { - goto error; - } - if (!PyUnicode_Check(id2)) - { - PyErr_Format(PyExc_TypeError, - "unicodedata.normalize() must return a string, not " - "%.200s", - _PyType_Name(Py_TYPE(id2))); - Py_DECREF(id2); - goto error; - } - id = id2; - } - PyUnicode_InternInPlace(&id); - if (_PyArena_AddPyObject(p->arena, id) < 0) - { - Py_DECREF(id); - goto error; - } - return id; - -error: - p->error_indicator = 1; - return NULL; -} - -static PyObject * -_create_dummy_identifier(Parser *p) -{ - return _PyPegen_new_identifier(p, ""); -} - -const char * -_PyPegen_get_expr_name(expr_ty e) -{ - assert(e != NULL); - switch (e->kind) { - case Attribute_kind: - return "attribute"; - case Subscript_kind: - return "subscript"; - case Starred_kind: - return "starred"; - case Name_kind: - return "name"; - case List_kind: - return "list"; - case Tuple_kind: - return "tuple"; - case Lambda_kind: - return "lambda"; - case Call_kind: - return "function call"; - case BoolOp_kind: - case BinOp_kind: - case UnaryOp_kind: - return "expression"; - case GeneratorExp_kind: - return "generator expression"; - case Yield_kind: - case YieldFrom_kind: - return "yield expression"; - case Await_kind: - return "await expression"; - case ListComp_kind: - return "list comprehension"; - case SetComp_kind: - return "set comprehension"; - case DictComp_kind: - return "dict comprehension"; - case Dict_kind: - return "dict literal"; - case Set_kind: - return "set display"; - case JoinedStr_kind: - case FormattedValue_kind: - return "f-string expression"; - case Constant_kind: { - PyObject *value = e->v.Constant.value; - if (value == Py_None) { - return "None"; - } - if (value == Py_False) { - return "False"; - } - if (value == Py_True) { - return "True"; - } - if (value == Py_Ellipsis) { - return "ellipsis"; - } - return "literal"; - } - case Compare_kind: - return "comparison"; - case IfExp_kind: - return "conditional expression"; - case NamedExpr_kind: - return "named 
expression"; - default: - PyErr_Format(PyExc_SystemError, - "unexpected expression in assignment %d (line %d)", - e->kind, e->lineno); - return NULL; - } -} - -static int -raise_decode_error(Parser *p) -{ - assert(PyErr_Occurred()); - const char *errtype = NULL; - if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { - errtype = "unicode error"; - } - else if (PyErr_ExceptionMatches(PyExc_ValueError)) { - errtype = "value error"; - } - if (errtype) { - PyObject *type; - PyObject *value; - PyObject *tback; - PyObject *errstr; - PyErr_Fetch(&type, &value, &tback); - errstr = PyObject_Str(value); - if (errstr) { - RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); - Py_DECREF(errstr); - } - else { - PyErr_Clear(); - RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); - } - Py_XDECREF(type); - Py_XDECREF(value); - Py_XDECREF(tback); - } - - return -1; -} - -static inline void -raise_unclosed_parentheses_error(Parser *p) { - int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; - int error_col = p->tok->parencolstack[p->tok->level-1]; - RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, - error_lineno, error_col, error_lineno, -1, - "'%c' was never closed", - p->tok->parenstack[p->tok->level-1]); -} - -static void -raise_tokenizer_init_error(PyObject *filename) -{ - if (!(PyErr_ExceptionMatches(PyExc_LookupError) - || PyErr_ExceptionMatches(PyExc_SyntaxError) - || PyErr_ExceptionMatches(PyExc_ValueError) - || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { - return; - } - PyObject *errstr = NULL; - PyObject *tuple = NULL; - PyObject *type; - PyObject *value; - PyObject *tback; - PyErr_Fetch(&type, &value, &tback); - errstr = PyObject_Str(value); - if (!errstr) { - goto error; - } - - PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); - if (!tmp) { - goto error; - } - - tuple = PyTuple_Pack(2, errstr, tmp); - Py_DECREF(tmp); - if (!value) { - goto error; - } - PyErr_SetObject(PyExc_SyntaxError, tuple); - -error: - Py_XDECREF(type); - Py_XDECREF(value); - Py_XDECREF(tback); - Py_XDECREF(errstr); - Py_XDECREF(tuple); -} - -static int -tokenizer_error(Parser *p) -{ - if (PyErr_Occurred()) { - return -1; - } - - const char *msg = NULL; - PyObject* errtype = PyExc_SyntaxError; - Py_ssize_t col_offset = -1; - switch (p->tok->done) { - case E_TOKEN: - msg = "invalid token"; - break; - case E_EOF: - if (p->tok->level) { - raise_unclosed_parentheses_error(p); - } else { - RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); - } - return -1; - case E_DEDENT: - RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); - return -1; - case E_INTR: - if (!PyErr_Occurred()) { - PyErr_SetNone(PyExc_KeyboardInterrupt); - } - return -1; - case E_NOMEM: - PyErr_NoMemory(); - return -1; - case E_TABSPACE: - errtype = PyExc_TabError; - msg = "inconsistent use of tabs and spaces in indentation"; - break; - case E_TOODEEP: - errtype = PyExc_IndentationError; - msg = "too many levels of indentation"; - break; - case E_LINECONT: { - col_offset = p->tok->cur - p->tok->buf - 1; - msg = "unexpected character after line continuation character"; - break; - } - default: - msg = "unknown parsing error"; - } - RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, - col_offset >= 0 ? col_offset : 0, - p->tok->lineno, -1, msg); - return -1; -} +// Internal parser functions -void * -_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) 
+asdl_stmt_seq* +_PyPegen_interactive_exit(Parser *p) { - if (p->fill == 0) { - va_list va; - va_start(va, errmsg); - _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); - va_end(va); - return NULL; - } - - Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; - Py_ssize_t col_offset; - Py_ssize_t end_col_offset = -1; - if (t->col_offset == -1) { - if (p->tok->cur == p->tok->buf) { - col_offset = 0; - } else { - const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; - col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); - } - } else { - col_offset = t->col_offset + 1; - } - - if (t->end_col_offset != -1) { - end_col_offset = t->end_col_offset + 1; + if (p->errcode) { + *(p->errcode) = E_EOF; } - - va_list va; - va_start(va, errmsg); - _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); - va_end(va); - return NULL; } -static PyObject * -get_error_line(Parser *p, Py_ssize_t lineno) -{ - /* If the file descriptor is interactive, the source lines of the current - * (multi-line) statement are stored in p->tok->interactive_src_start. - * If not, we're parsing from a string, which means that the whole source - * is stored in p->tok->str. */ - assert(p->tok->fp == NULL || p->tok->fp == stdin); - - char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; - assert(cur_line != NULL); - - for (int i = 0; i < lineno - 1; i++) { - cur_line = strchr(cur_line, '\n') + 1; - } - - char *next_newline; - if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line - next_newline = cur_line + strlen(cur_line); - } - return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); -} - Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset) { @@ -448,127 +37,6 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset) return size; } -void * -_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, - Py_ssize_t lineno, Py_ssize_t col_offset, - Py_ssize_t end_lineno, Py_ssize_t end_col_offset, - const char *errmsg, va_list va) -{ - PyObject *value = NULL; - PyObject *errstr = NULL; - PyObject *error_line = NULL; - PyObject *tmp = NULL; - p->error_indicator = 1; - - if (end_lineno == CURRENT_POS) { - end_lineno = p->tok->lineno; - } - if (end_col_offset == CURRENT_POS) { - end_col_offset = p->tok->cur - p->tok->line_start; - } - - if (p->start_rule == Py_fstring_input) { - const char *fstring_msg = "f-string: "; - Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg); - - char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character - if (!new_errmsg) { - return (void *) PyErr_NoMemory(); - } - - // Copy both strings into new buffer - memcpy(new_errmsg, fstring_msg, strlen(fstring_msg)); - memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg)); - new_errmsg[len] = 0; - errmsg = new_errmsg; - } - errstr = PyUnicode_FromFormatV(errmsg, va); - if (!errstr) { - goto error; - } - - if (p->tok->fp_interactive) { - error_line = get_error_line(p, lineno); - } - else if (p->start_rule == Py_file_input) { - error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, - (int) lineno, p->tok->encoding); - } - - if (!error_line) { - /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, - then we need to find the error line from some other source, because - p->start_rule != Py_file_input. 
If it returned NULL, then it either unexpectedly - failed or we're parsing from a string or the REPL. There's a third edge case where - we're actually parsing from a file, which has an E_EOF SyntaxError and in that case - `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which - does not physically exist */ - assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); - - if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { - Py_ssize_t size = p->tok->inp - p->tok->buf; - error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); - } - else if (p->tok->fp == NULL || p->tok->fp == stdin) { - error_line = get_error_line(p, lineno); - } - else { - error_line = PyUnicode_FromStringAndSize("", 0); - } - if (!error_line) { - goto error; - } - } - - if (p->start_rule == Py_fstring_input) { - col_offset -= p->starting_col_offset; - end_col_offset -= p->starting_col_offset; - } - - Py_ssize_t col_number = col_offset; - Py_ssize_t end_col_number = end_col_offset; - - if (p->tok->encoding != NULL) { - col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); - if (col_number < 0) { - goto error; - } - if (end_col_number > 0) { - Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); - if (end_col_offset < 0) { - goto error; - } else { - end_col_number = end_col_offset; - } - } - } - tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); - if (!tmp) { - goto error; - } - value = PyTuple_Pack(2, errstr, tmp); - Py_DECREF(tmp); - if (!value) { - goto error; - } - PyErr_SetObject(errtype, value); - - Py_DECREF(errstr); - Py_DECREF(value); - if (p->start_rule == Py_fstring_input) { - PyMem_Free((void *)errmsg); - } - return NULL; - -error: - Py_XDECREF(errstr); - Py_XDECREF(error_line); - if (p->start_rule == Py_fstring_input) { - PyMem_Free((void *)errmsg); - } - return NULL; -} - #if 0 static const char * token_name(int type) @@ -614,39 +82,24 @@ _PyPegen_update_memo(Parser *p, int mark, int type, void *node) return _PyPegen_insert_memo(p, mark, type, node); } -// Return dummy NAME. -void * -_PyPegen_dummy_name(Parser *p, ...) 
+static int +init_normalization(Parser *p) { - static void *cache = NULL; - - if (cache != NULL) { - return cache; + if (p->normalize) { + return 1; } - - PyObject *id = _create_dummy_identifier(p); - if (!id) { - return NULL; + PyObject *m = PyImport_ImportModuleNoBlock("unicodedata"); + if (!m) + { + return 0; } - cache = _PyAST_Name(id, Load, 1, 0, 1, 0, p->arena); - return cache; -} - -static int -_get_keyword_or_name_type(Parser *p, const char *name, int name_len) -{ - assert(name_len > 0); - if (name_len >= p->n_keyword_lists || - p->keywords[name_len] == NULL || - p->keywords[name_len]->type == -1) { - return NAME; - } - for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) { - if (strncmp(k->str, name, name_len) == 0) { - return k->type; - } + p->normalize = PyObject_GetAttrString(m, "normalize"); + Py_DECREF(m); + if (!p->normalize) + { + return 0; } - return NAME; + return 1; } static int @@ -686,6 +139,23 @@ growable_comment_array_deallocate(growable_comment_array *arr) { } static int +_get_keyword_or_name_type(Parser *p, const char *name, int name_len) +{ + assert(name_len > 0); + if (name_len >= p->n_keyword_lists || + p->keywords[name_len] == NULL || + p->keywords[name_len]->type == -1) { + return NAME; + } + for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) { + if (strncmp(k->str, name, name_len) == 0) { + return k->type; + } + } + return NAME; +} + +static int initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) { assert(token != NULL); @@ -715,10 +185,10 @@ initialize_token(Parser *p, Token *token, const char *start, const char *end, in p->fill += 1; if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) { - return raise_decode_error(p); + return _Pypegen_raise_decode_error(p); } - return (token_type == ERRORTOKEN ? tokenizer_error(p) : 0); + return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0); } static int @@ -791,7 +261,6 @@ _PyPegen_fill_token(Parser *p) return initialize_token(p, t, start, end, type); } - #if defined(Py_DEBUG) // Instrumentation to count the effectiveness of memoization. // The array counts the number of tokens skipped by memoization, @@ -989,6 +458,62 @@ _PyPegen_get_last_nonnwhitespace_token(Parser *p) return token; } +PyObject * +_PyPegen_new_identifier(Parser *p, const char *n) +{ + PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL); + if (!id) { + goto error; + } + /* PyUnicode_DecodeUTF8 should always return a ready string. */ + assert(PyUnicode_IS_READY(id)); + /* Check whether there are non-ASCII characters in the + identifier; if so, normalize to NFKC. 
*/ + if (!PyUnicode_IS_ASCII(id)) + { + PyObject *id2; + if (!init_normalization(p)) + { + Py_DECREF(id); + goto error; + } + PyObject *form = PyUnicode_InternFromString("NFKC"); + if (form == NULL) + { + Py_DECREF(id); + goto error; + } + PyObject *args[2] = {form, id}; + id2 = _PyObject_FastCall(p->normalize, args, 2); + Py_DECREF(id); + Py_DECREF(form); + if (!id2) { + goto error; + } + if (!PyUnicode_Check(id2)) + { + PyErr_Format(PyExc_TypeError, + "unicodedata.normalize() must return a string, not " + "%.200s", + _PyType_Name(Py_TYPE(id2))); + Py_DECREF(id2); + goto error; + } + id = id2; + } + PyUnicode_InternInPlace(&id); + if (_PyArena_AddPyObject(p->arena, id) < 0) + { + Py_DECREF(id); + goto error; + } + return id; + +error: + p->error_indicator = 1; + return NULL; +} + static expr_ty _PyPegen_name_from_token(Parser *p, Token* t) { @@ -1009,7 +534,6 @@ _PyPegen_name_from_token(Parser *p, Token* t) t->end_col_offset, p->arena); } - expr_ty _PyPegen_name_token(Parser *p) { @@ -1023,7 +547,6 @@ _PyPegen_string_token(Parser *p) return _PyPegen_expect_token(p, STRING); } - expr_ty _PyPegen_soft_keyword_token(Parser *p) { Token *t = _PyPegen_expect_token(p, NAME); if (t == NULL) { @@ -1197,18 +720,6 @@ bad_single_statement(Parser *p) } } -void -_PyPegen_Parser_Free(Parser *p) -{ - Py_XDECREF(p->normalize); - for (int i = 0; i < p->size; i++) { - PyMem_Free(p->tokens[i]); - } - PyMem_Free(p->tokens); - growable_comment_array_deallocate(&p->type_ignore_comments); - PyMem_Free(p); -} - static int compute_parser_flags(PyCompilerFlags *flags) { @@ -1234,6 +745,8 @@ compute_parser_flags(PyCompilerFlags *flags) return parser_flags; } +// Parser API + Parser * _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags, int feature_version, int *errcode, PyArena *arena) @@ -1289,8 +802,20 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags, return p; } +void +_PyPegen_Parser_Free(Parser *p) +{ + Py_XDECREF(p->normalize); + for (int i = 0; i < p->size; i++) { + PyMem_Free(p->tokens[i]); + } + PyMem_Free(p->tokens); + growable_comment_array_deallocate(&p->type_ignore_comments); + PyMem_Free(p); +} + static void -reset_parser_state(Parser *p) +reset_parser_state_for_error_pass(Parser *p) { for (int i = 0; i < p->fill; i++) { p->tokens[i]->memo = NULL; @@ -1302,60 +827,6 @@ reset_parser_state(Parser *p) p->tok->interactive_underflow = IUNDERFLOW_STOP; } -static int -_PyPegen_check_tokenizer_errors(Parser *p) { - // Tokenize the whole input to see if there are any tokenization - // errors such as mistmatching parentheses. These will get priority - // over generic syntax errors only if the line number of the error is - // before the one that we had for the generic error. - - // We don't want to tokenize to the end for interactive input - if (p->tok->prompt != NULL) { - return 0; - } - - PyObject *type, *value, *traceback; - PyErr_Fetch(&type, &value, &traceback); - - Token *current_token = p->known_err_token != NULL ? 
p->known_err_token : p->tokens[p->fill - 1]; - Py_ssize_t current_err_line = current_token->lineno; - - int ret = 0; - - for (;;) { - const char *start; - const char *end; - switch (_PyTokenizer_Get(p->tok, &start, &end)) { - case ERRORTOKEN: - if (p->tok->level != 0) { - int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; - if (current_err_line > error_lineno) { - raise_unclosed_parentheses_error(p); - ret = -1; - goto exit; - } - } - break; - case ENDMARKER: - break; - default: - continue; - } - break; - } - - -exit: - if (PyErr_Occurred()) { - Py_XDECREF(value); - Py_XDECREF(type); - Py_XDECREF(traceback); - } else { - PyErr_Restore(type, value, traceback); - } - return ret; -} - void * _PyPegen_run_parser(Parser *p) { @@ -1364,46 +835,17 @@ _PyPegen_run_parser(Parser *p) if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) { return NULL; } + // Make a second parser pass. In this pass we activate heavier and slower checks + // to produce better error messages and more complete diagnostics. Extra "invalid_*" + // rules will be active during parsing. Token *last_token = p->tokens[p->fill - 1]; - reset_parser_state(p); + reset_parser_state_for_error_pass(p); _PyPegen_parse(p); - if (PyErr_Occurred()) { - // Prioritize tokenizer errors to custom syntax errors raised - // on the second phase only if the errors come from the parser. - if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) { - _PyPegen_check_tokenizer_errors(p); - } - return NULL; - } - if (p->fill == 0) { - RAISE_SYNTAX_ERROR("error at start before reading any input"); - } - else if (p->tok->done == E_EOF) { - if (p->tok->level) { - raise_unclosed_parentheses_error(p); - } else { - RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); - } - } - else { - if (p->tokens[p->fill-1]->type == INDENT) { - RAISE_INDENTATION_ERROR("unexpected indent"); - } - else if (p->tokens[p->fill-1]->type == DEDENT) { - RAISE_INDENTATION_ERROR("unexpected unindent"); - } - else { - // Use the last token we found on the first pass to avoid reporting - // incorrect locations for generic syntax errors just because we reached - // further away when trying to find specific syntax errors in the second - // pass. - RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); - // _PyPegen_check_tokenizer_errors will override the existing - // generic SyntaxError we just raised if errors are found. - _PyPegen_check_tokenizer_errors(p); - } - } - return NULL; + + // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure + // point. 
+ _Pypegen_set_syntax_error(p, last_token); + return NULL; } if (p->start_rule == Py_single_input && bad_single_statement(p)) { @@ -1433,7 +875,7 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2); if (tok == NULL) { if (PyErr_Occurred()) { - raise_tokenizer_init_error(filename_ob); + _PyPegen_raise_tokenizer_init_error(filename_ob); return NULL; } return NULL; @@ -1478,7 +920,7 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen } if (tok == NULL) { if (PyErr_Occurred()) { - raise_tokenizer_init_error(filename_ob); + _PyPegen_raise_tokenizer_init_error(filename_ob); } return NULL; } @@ -1504,1138 +946,4 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen error: _PyTokenizer_Free(tok); return result; -} - -asdl_stmt_seq* -_PyPegen_interactive_exit(Parser *p) -{ - if (p->errcode) { - *(p->errcode) = E_EOF; - } - return NULL; -} - -/* Creates a single-element asdl_seq* that contains a */ -asdl_seq * -_PyPegen_singleton_seq(Parser *p, void *a) -{ - assert(a != NULL); - asdl_seq *seq = (asdl_seq*)_Py_asdl_generic_seq_new(1, p->arena); - if (!seq) { - return NULL; - } - asdl_seq_SET_UNTYPED(seq, 0, a); - return seq; -} - -/* Creates a copy of seq and prepends a to it */ -asdl_seq * -_PyPegen_seq_insert_in_front(Parser *p, void *a, asdl_seq *seq) -{ - assert(a != NULL); - if (!seq) { - return _PyPegen_singleton_seq(p, a); - } - - asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena); - if (!new_seq) { - return NULL; - } - - asdl_seq_SET_UNTYPED(new_seq, 0, a); - for (Py_ssize_t i = 1, l = asdl_seq_LEN(new_seq); i < l; i++) { - asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i - 1)); - } - return new_seq; -} - -/* Creates a copy of seq and appends a to it */ -asdl_seq * -_PyPegen_seq_append_to_end(Parser *p, asdl_seq *seq, void *a) -{ - assert(a != NULL); - if (!seq) { - return _PyPegen_singleton_seq(p, a); - } - - asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(asdl_seq_LEN(seq) + 1, p->arena); - if (!new_seq) { - return NULL; - } - - for (Py_ssize_t i = 0, l = asdl_seq_LEN(new_seq); i + 1 < l; i++) { - asdl_seq_SET_UNTYPED(new_seq, i, asdl_seq_GET_UNTYPED(seq, i)); - } - asdl_seq_SET_UNTYPED(new_seq, asdl_seq_LEN(new_seq) - 1, a); - return new_seq; -} - -static Py_ssize_t -_get_flattened_seq_size(asdl_seq *seqs) -{ - Py_ssize_t size = 0; - for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) { - asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i); - size += asdl_seq_LEN(inner_seq); - } - return size; -} - -/* Flattens an asdl_seq* of asdl_seq*s */ -asdl_seq * -_PyPegen_seq_flatten(Parser *p, asdl_seq *seqs) -{ - Py_ssize_t flattened_seq_size = _get_flattened_seq_size(seqs); - assert(flattened_seq_size > 0); - - asdl_seq *flattened_seq = (asdl_seq*)_Py_asdl_generic_seq_new(flattened_seq_size, p->arena); - if (!flattened_seq) { - return NULL; - } - - int flattened_seq_idx = 0; - for (Py_ssize_t i = 0, l = asdl_seq_LEN(seqs); i < l; i++) { - asdl_seq *inner_seq = asdl_seq_GET_UNTYPED(seqs, i); - for (Py_ssize_t j = 0, li = asdl_seq_LEN(inner_seq); j < li; j++) { - asdl_seq_SET_UNTYPED(flattened_seq, flattened_seq_idx++, asdl_seq_GET_UNTYPED(inner_seq, j)); - } - } - assert(flattened_seq_idx == flattened_seq_size); - - return flattened_seq; -} - -void * -_PyPegen_seq_last_item(asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - return asdl_seq_GET_UNTYPED(seq, len - 1); -} - 
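The comment in the hunk above is the heart of this refactor: `_PyPegen_run_parser` makes a fast first pass, and only when that pass fails does it call `reset_parser_state_for_error_pass` and re-run the grammar with the `invalid_*` rules enabled, leaving `_Pypegen_set_syntax_error` to turn the parser/tokenizer state into a concrete exception. A minimal sketch of how that behaviour surfaces from ordinary Python code follows; it is not part of the patch, it assumes a CPython build that ships this parser, and the exact message text may differ between versions.

# Minimal sketch (not part of the patch): observing the two-pass error
# strategy from Python. Message wording is an assumption and may differ
# between CPython versions.
def report(source):
    try:
        compile(source, "<demo>", "exec")
    except SyntaxError as exc:
        print(f"{type(exc).__name__}: {exc.msg} (line {exc.lineno})")

# The second pass, with the "invalid_*" rules active, can attach a specific
# message such as "Generator expression must be parenthesized".
report("f(x for x in y, 1)")

# For tokenizer-level failures, _Pypegen_set_syntax_error falls back to the
# tokenizer state, e.g. "'(' was never closed" for an unclosed bracket.
report("(1 + 2")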
-void * -_PyPegen_seq_first_item(asdl_seq *seq) -{ - return asdl_seq_GET_UNTYPED(seq, 0); -} - - -/* Creates a new name of the form . */ -expr_ty -_PyPegen_join_names_with_dot(Parser *p, expr_ty first_name, expr_ty second_name) -{ - assert(first_name != NULL && second_name != NULL); - PyObject *first_identifier = first_name->v.Name.id; - PyObject *second_identifier = second_name->v.Name.id; - - if (PyUnicode_READY(first_identifier) == -1) { - return NULL; - } - if (PyUnicode_READY(second_identifier) == -1) { - return NULL; - } - const char *first_str = PyUnicode_AsUTF8(first_identifier); - if (!first_str) { - return NULL; - } - const char *second_str = PyUnicode_AsUTF8(second_identifier); - if (!second_str) { - return NULL; - } - Py_ssize_t len = strlen(first_str) + strlen(second_str) + 1; // +1 for the dot - - PyObject *str = PyBytes_FromStringAndSize(NULL, len); - if (!str) { - return NULL; - } - - char *s = PyBytes_AS_STRING(str); - if (!s) { - return NULL; - } - - strcpy(s, first_str); - s += strlen(first_str); - *s++ = '.'; - strcpy(s, second_str); - s += strlen(second_str); - *s = '\0'; - - PyObject *uni = PyUnicode_DecodeUTF8(PyBytes_AS_STRING(str), PyBytes_GET_SIZE(str), NULL); - Py_DECREF(str); - if (!uni) { - return NULL; - } - PyUnicode_InternInPlace(&uni); - if (_PyArena_AddPyObject(p->arena, uni) < 0) { - Py_DECREF(uni); - return NULL; - } - - return _PyAST_Name(uni, Load, EXTRA_EXPR(first_name, second_name)); -} - -/* Counts the total number of dots in seq's tokens */ -int -_PyPegen_seq_count_dots(asdl_seq *seq) -{ - int number_of_dots = 0; - for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) { - Token *current_expr = asdl_seq_GET_UNTYPED(seq, i); - switch (current_expr->type) { - case ELLIPSIS: - number_of_dots += 3; - break; - case DOT: - number_of_dots += 1; - break; - default: - Py_UNREACHABLE(); - } - } - - return number_of_dots; -} - -/* Creates an alias with '*' as the identifier name */ -alias_ty -_PyPegen_alias_for_star(Parser *p, int lineno, int col_offset, int end_lineno, - int end_col_offset, PyArena *arena) { - PyObject *str = PyUnicode_InternFromString("*"); - if (!str) { - return NULL; - } - if (_PyArena_AddPyObject(p->arena, str) < 0) { - Py_DECREF(str); - return NULL; - } - return _PyAST_alias(str, NULL, lineno, col_offset, end_lineno, end_col_offset, arena); -} - -/* Creates a new asdl_seq* with the identifiers of all the names in seq */ -asdl_identifier_seq * -_PyPegen_map_names_to_ids(Parser *p, asdl_expr_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - assert(len > 0); - - asdl_identifier_seq *new_seq = _Py_asdl_identifier_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - expr_ty e = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, e->v.Name.id); - } - return new_seq; -} - -/* Constructs a CmpopExprPair */ -CmpopExprPair * -_PyPegen_cmpop_expr_pair(Parser *p, cmpop_ty cmpop, expr_ty expr) -{ - assert(expr != NULL); - CmpopExprPair *a = _PyArena_Malloc(p->arena, sizeof(CmpopExprPair)); - if (!a) { - return NULL; - } - a->cmpop = cmpop; - a->expr = expr; - return a; -} - -asdl_int_seq * -_PyPegen_get_cmpops(Parser *p, asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - assert(len > 0); - - asdl_int_seq *new_seq = _Py_asdl_int_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i); - asdl_seq_SET(new_seq, i, pair->cmpop); - } - return new_seq; -} - -asdl_expr_seq * 
-_PyPegen_get_exprs(Parser *p, asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - assert(len > 0); - - asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - CmpopExprPair *pair = asdl_seq_GET_UNTYPED(seq, i); - asdl_seq_SET(new_seq, i, pair->expr); - } - return new_seq; -} - -/* Creates an asdl_seq* where all the elements have been changed to have ctx as context */ -static asdl_expr_seq * -_set_seq_context(Parser *p, asdl_expr_seq *seq, expr_context_ty ctx) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - if (len == 0) { - return NULL; - } - - asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - expr_ty e = asdl_seq_GET(seq, i); - asdl_seq_SET(new_seq, i, _PyPegen_set_expr_context(p, e, ctx)); - } - return new_seq; -} - -static expr_ty -_set_name_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _PyAST_Name(e->v.Name.id, ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_tuple_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _PyAST_Tuple( - _set_seq_context(p, e->v.Tuple.elts, ctx), - ctx, - EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_list_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _PyAST_List( - _set_seq_context(p, e->v.List.elts, ctx), - ctx, - EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_subscript_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _PyAST_Subscript(e->v.Subscript.value, e->v.Subscript.slice, - ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_attribute_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _PyAST_Attribute(e->v.Attribute.value, e->v.Attribute.attr, - ctx, EXTRA_EXPR(e, e)); -} - -static expr_ty -_set_starred_context(Parser *p, expr_ty e, expr_context_ty ctx) -{ - return _PyAST_Starred(_PyPegen_set_expr_context(p, e->v.Starred.value, ctx), - ctx, EXTRA_EXPR(e, e)); -} - -/* Creates an `expr_ty` equivalent to `expr` but with `ctx` as context */ -expr_ty -_PyPegen_set_expr_context(Parser *p, expr_ty expr, expr_context_ty ctx) -{ - assert(expr != NULL); - - expr_ty new = NULL; - switch (expr->kind) { - case Name_kind: - new = _set_name_context(p, expr, ctx); - break; - case Tuple_kind: - new = _set_tuple_context(p, expr, ctx); - break; - case List_kind: - new = _set_list_context(p, expr, ctx); - break; - case Subscript_kind: - new = _set_subscript_context(p, expr, ctx); - break; - case Attribute_kind: - new = _set_attribute_context(p, expr, ctx); - break; - case Starred_kind: - new = _set_starred_context(p, expr, ctx); - break; - default: - new = expr; - } - return new; -} - -/* Constructs a KeyValuePair that is used when parsing a dict's key value pairs */ -KeyValuePair * -_PyPegen_key_value_pair(Parser *p, expr_ty key, expr_ty value) -{ - KeyValuePair *a = _PyArena_Malloc(p->arena, sizeof(KeyValuePair)); - if (!a) { - return NULL; - } - a->key = key; - a->value = value; - return a; -} - -/* Extracts all keys from an asdl_seq* of KeyValuePair*'s */ -asdl_expr_seq * -_PyPegen_get_keys(Parser *p, asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i); - asdl_seq_SET(new_seq, i, pair->key); - } - return new_seq; -} - -/* Extracts all values from an asdl_seq* of KeyValuePair*'s */ -asdl_expr_seq * 
-_PyPegen_get_values(Parser *p, asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - KeyValuePair *pair = asdl_seq_GET_UNTYPED(seq, i); - asdl_seq_SET(new_seq, i, pair->value); - } - return new_seq; -} - -/* Constructs a KeyPatternPair that is used when parsing mapping & class patterns */ -KeyPatternPair * -_PyPegen_key_pattern_pair(Parser *p, expr_ty key, pattern_ty pattern) -{ - KeyPatternPair *a = _PyArena_Malloc(p->arena, sizeof(KeyPatternPair)); - if (!a) { - return NULL; - } - a->key = key; - a->pattern = pattern; - return a; -} - -/* Extracts all keys from an asdl_seq* of KeyPatternPair*'s */ -asdl_expr_seq * -_PyPegen_get_pattern_keys(Parser *p, asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i); - asdl_seq_SET(new_seq, i, pair->key); - } - return new_seq; -} - -/* Extracts all patterns from an asdl_seq* of KeyPatternPair*'s */ -asdl_pattern_seq * -_PyPegen_get_patterns(Parser *p, asdl_seq *seq) -{ - Py_ssize_t len = asdl_seq_LEN(seq); - asdl_pattern_seq *new_seq = _Py_asdl_pattern_seq_new(len, p->arena); - if (!new_seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - KeyPatternPair *pair = asdl_seq_GET_UNTYPED(seq, i); - asdl_seq_SET(new_seq, i, pair->pattern); - } - return new_seq; -} - -/* Constructs a NameDefaultPair */ -NameDefaultPair * -_PyPegen_name_default_pair(Parser *p, arg_ty arg, expr_ty value, Token *tc) -{ - NameDefaultPair *a = _PyArena_Malloc(p->arena, sizeof(NameDefaultPair)); - if (!a) { - return NULL; - } - a->arg = _PyPegen_add_type_comment_to_arg(p, arg, tc); - a->value = value; - return a; -} - -/* Constructs a SlashWithDefault */ -SlashWithDefault * -_PyPegen_slash_with_default(Parser *p, asdl_arg_seq *plain_names, asdl_seq *names_with_defaults) -{ - SlashWithDefault *a = _PyArena_Malloc(p->arena, sizeof(SlashWithDefault)); - if (!a) { - return NULL; - } - a->plain_names = plain_names; - a->names_with_defaults = names_with_defaults; - return a; -} - -/* Constructs a StarEtc */ -StarEtc * -_PyPegen_star_etc(Parser *p, arg_ty vararg, asdl_seq *kwonlyargs, arg_ty kwarg) -{ - StarEtc *a = _PyArena_Malloc(p->arena, sizeof(StarEtc)); - if (!a) { - return NULL; - } - a->vararg = vararg; - a->kwonlyargs = kwonlyargs; - a->kwarg = kwarg; - return a; -} - -asdl_seq * -_PyPegen_join_sequences(Parser *p, asdl_seq *a, asdl_seq *b) -{ - Py_ssize_t first_len = asdl_seq_LEN(a); - Py_ssize_t second_len = asdl_seq_LEN(b); - asdl_seq *new_seq = (asdl_seq*)_Py_asdl_generic_seq_new(first_len + second_len, p->arena); - if (!new_seq) { - return NULL; - } - - int k = 0; - for (Py_ssize_t i = 0; i < first_len; i++) { - asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(a, i)); - } - for (Py_ssize_t i = 0; i < second_len; i++) { - asdl_seq_SET_UNTYPED(new_seq, k++, asdl_seq_GET_UNTYPED(b, i)); - } - - return new_seq; -} - -static asdl_arg_seq* -_get_names(Parser *p, asdl_seq *names_with_defaults) -{ - Py_ssize_t len = asdl_seq_LEN(names_with_defaults); - asdl_arg_seq *seq = _Py_asdl_arg_seq_new(len, p->arena); - if (!seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i); - asdl_seq_SET(seq, i, pair->arg); - } - return seq; -} - -static 
asdl_expr_seq * -_get_defaults(Parser *p, asdl_seq *names_with_defaults) -{ - Py_ssize_t len = asdl_seq_LEN(names_with_defaults); - asdl_expr_seq *seq = _Py_asdl_expr_seq_new(len, p->arena); - if (!seq) { - return NULL; - } - for (Py_ssize_t i = 0; i < len; i++) { - NameDefaultPair *pair = asdl_seq_GET_UNTYPED(names_with_defaults, i); - asdl_seq_SET(seq, i, pair->value); - } - return seq; -} - -static int -_make_posonlyargs(Parser *p, - asdl_arg_seq *slash_without_default, - SlashWithDefault *slash_with_default, - asdl_arg_seq **posonlyargs) { - if (slash_without_default != NULL) { - *posonlyargs = slash_without_default; - } - else if (slash_with_default != NULL) { - asdl_arg_seq *slash_with_default_names = - _get_names(p, slash_with_default->names_with_defaults); - if (!slash_with_default_names) { - return -1; - } - *posonlyargs = (asdl_arg_seq*)_PyPegen_join_sequences( - p, - (asdl_seq*)slash_with_default->plain_names, - (asdl_seq*)slash_with_default_names); - } - else { - *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena); - } - return *posonlyargs == NULL ? -1 : 0; -} - -static int -_make_posargs(Parser *p, - asdl_arg_seq *plain_names, - asdl_seq *names_with_default, - asdl_arg_seq **posargs) { - if (plain_names != NULL && names_with_default != NULL) { - asdl_arg_seq *names_with_default_names = _get_names(p, names_with_default); - if (!names_with_default_names) { - return -1; - } - *posargs = (asdl_arg_seq*)_PyPegen_join_sequences( - p,(asdl_seq*)plain_names, (asdl_seq*)names_with_default_names); - } - else if (plain_names == NULL && names_with_default != NULL) { - *posargs = _get_names(p, names_with_default); - } - else if (plain_names != NULL && names_with_default == NULL) { - *posargs = plain_names; - } - else { - *posargs = _Py_asdl_arg_seq_new(0, p->arena); - } - return *posargs == NULL ? -1 : 0; -} - -static int -_make_posdefaults(Parser *p, - SlashWithDefault *slash_with_default, - asdl_seq *names_with_default, - asdl_expr_seq **posdefaults) { - if (slash_with_default != NULL && names_with_default != NULL) { - asdl_expr_seq *slash_with_default_values = - _get_defaults(p, slash_with_default->names_with_defaults); - if (!slash_with_default_values) { - return -1; - } - asdl_expr_seq *names_with_default_values = _get_defaults(p, names_with_default); - if (!names_with_default_values) { - return -1; - } - *posdefaults = (asdl_expr_seq*)_PyPegen_join_sequences( - p, - (asdl_seq*)slash_with_default_values, - (asdl_seq*)names_with_default_values); - } - else if (slash_with_default == NULL && names_with_default != NULL) { - *posdefaults = _get_defaults(p, names_with_default); - } - else if (slash_with_default != NULL && names_with_default == NULL) { - *posdefaults = _get_defaults(p, slash_with_default->names_with_defaults); - } - else { - *posdefaults = _Py_asdl_expr_seq_new(0, p->arena); - } - return *posdefaults == NULL ? 
-1 : 0; -} - -static int -_make_kwargs(Parser *p, StarEtc *star_etc, - asdl_arg_seq **kwonlyargs, - asdl_expr_seq **kwdefaults) { - if (star_etc != NULL && star_etc->kwonlyargs != NULL) { - *kwonlyargs = _get_names(p, star_etc->kwonlyargs); - } - else { - *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena); - } - - if (*kwonlyargs == NULL) { - return -1; - } - - if (star_etc != NULL && star_etc->kwonlyargs != NULL) { - *kwdefaults = _get_defaults(p, star_etc->kwonlyargs); - } - else { - *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena); - } - - if (*kwdefaults == NULL) { - return -1; - } - - return 0; -} - -/* Constructs an arguments_ty object out of all the parsed constructs in the parameters rule */ -arguments_ty -_PyPegen_make_arguments(Parser *p, asdl_arg_seq *slash_without_default, - SlashWithDefault *slash_with_default, asdl_arg_seq *plain_names, - asdl_seq *names_with_default, StarEtc *star_etc) -{ - asdl_arg_seq *posonlyargs; - if (_make_posonlyargs(p, slash_without_default, slash_with_default, &posonlyargs) == -1) { - return NULL; - } - - asdl_arg_seq *posargs; - if (_make_posargs(p, plain_names, names_with_default, &posargs) == -1) { - return NULL; - } - - asdl_expr_seq *posdefaults; - if (_make_posdefaults(p,slash_with_default, names_with_default, &posdefaults) == -1) { - return NULL; - } - - arg_ty vararg = NULL; - if (star_etc != NULL && star_etc->vararg != NULL) { - vararg = star_etc->vararg; - } - - asdl_arg_seq *kwonlyargs; - asdl_expr_seq *kwdefaults; - if (_make_kwargs(p, star_etc, &kwonlyargs, &kwdefaults) == -1) { - return NULL; - } - - arg_ty kwarg = NULL; - if (star_etc != NULL && star_etc->kwarg != NULL) { - kwarg = star_etc->kwarg; - } - - return _PyAST_arguments(posonlyargs, posargs, vararg, kwonlyargs, - kwdefaults, kwarg, posdefaults, p->arena); -} - - -/* Constructs an empty arguments_ty object, that gets used when a function accepts no - * arguments. 
*/ -arguments_ty -_PyPegen_empty_arguments(Parser *p) -{ - asdl_arg_seq *posonlyargs = _Py_asdl_arg_seq_new(0, p->arena); - if (!posonlyargs) { - return NULL; - } - asdl_arg_seq *posargs = _Py_asdl_arg_seq_new(0, p->arena); - if (!posargs) { - return NULL; - } - asdl_expr_seq *posdefaults = _Py_asdl_expr_seq_new(0, p->arena); - if (!posdefaults) { - return NULL; - } - asdl_arg_seq *kwonlyargs = _Py_asdl_arg_seq_new(0, p->arena); - if (!kwonlyargs) { - return NULL; - } - asdl_expr_seq *kwdefaults = _Py_asdl_expr_seq_new(0, p->arena); - if (!kwdefaults) { - return NULL; - } - - return _PyAST_arguments(posonlyargs, posargs, NULL, kwonlyargs, - kwdefaults, NULL, posdefaults, p->arena); -} - -/* Encapsulates the value of an operator_ty into an AugOperator struct */ -AugOperator * -_PyPegen_augoperator(Parser *p, operator_ty kind) -{ - AugOperator *a = _PyArena_Malloc(p->arena, sizeof(AugOperator)); - if (!a) { - return NULL; - } - a->kind = kind; - return a; -} - -/* Construct a FunctionDef equivalent to function_def, but with decorators */ -stmt_ty -_PyPegen_function_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty function_def) -{ - assert(function_def != NULL); - if (function_def->kind == AsyncFunctionDef_kind) { - return _PyAST_AsyncFunctionDef( - function_def->v.FunctionDef.name, function_def->v.FunctionDef.args, - function_def->v.FunctionDef.body, decorators, function_def->v.FunctionDef.returns, - function_def->v.FunctionDef.type_comment, function_def->lineno, - function_def->col_offset, function_def->end_lineno, function_def->end_col_offset, - p->arena); - } - - return _PyAST_FunctionDef( - function_def->v.FunctionDef.name, function_def->v.FunctionDef.args, - function_def->v.FunctionDef.body, decorators, - function_def->v.FunctionDef.returns, - function_def->v.FunctionDef.type_comment, function_def->lineno, - function_def->col_offset, function_def->end_lineno, - function_def->end_col_offset, p->arena); -} - -/* Construct a ClassDef equivalent to class_def, but with decorators */ -stmt_ty -_PyPegen_class_def_decorators(Parser *p, asdl_expr_seq *decorators, stmt_ty class_def) -{ - assert(class_def != NULL); - return _PyAST_ClassDef( - class_def->v.ClassDef.name, class_def->v.ClassDef.bases, - class_def->v.ClassDef.keywords, class_def->v.ClassDef.body, decorators, - class_def->lineno, class_def->col_offset, class_def->end_lineno, - class_def->end_col_offset, p->arena); -} - -/* Construct a KeywordOrStarred */ -KeywordOrStarred * -_PyPegen_keyword_or_starred(Parser *p, void *element, int is_keyword) -{ - KeywordOrStarred *a = _PyArena_Malloc(p->arena, sizeof(KeywordOrStarred)); - if (!a) { - return NULL; - } - a->element = element; - a->is_keyword = is_keyword; - return a; -} - -/* Get the number of starred expressions in an asdl_seq* of KeywordOrStarred*s */ -static int -_seq_number_of_starred_exprs(asdl_seq *seq) -{ - int n = 0; - for (Py_ssize_t i = 0, l = asdl_seq_LEN(seq); i < l; i++) { - KeywordOrStarred *k = asdl_seq_GET_UNTYPED(seq, i); - if (!k->is_keyword) { - n++; - } - } - return n; -} - -/* Extract the starred expressions of an asdl_seq* of KeywordOrStarred*s */ -asdl_expr_seq * -_PyPegen_seq_extract_starred_exprs(Parser *p, asdl_seq *kwargs) -{ - int new_len = _seq_number_of_starred_exprs(kwargs); - if (new_len == 0) { - return NULL; - } - asdl_expr_seq *new_seq = _Py_asdl_expr_seq_new(new_len, p->arena); - if (!new_seq) { - return NULL; - } - - int idx = 0; - for (Py_ssize_t i = 0, len = asdl_seq_LEN(kwargs); i < len; i++) { - KeywordOrStarred *k = 
asdl_seq_GET_UNTYPED(kwargs, i); - if (!k->is_keyword) { - asdl_seq_SET(new_seq, idx++, k->element); - } - } - return new_seq; -} - -/* Return a new asdl_seq* with only the keywords in kwargs */ -asdl_keyword_seq* -_PyPegen_seq_delete_starred_exprs(Parser *p, asdl_seq *kwargs) -{ - Py_ssize_t len = asdl_seq_LEN(kwargs); - Py_ssize_t new_len = len - _seq_number_of_starred_exprs(kwargs); - if (new_len == 0) { - return NULL; - } - asdl_keyword_seq *new_seq = _Py_asdl_keyword_seq_new(new_len, p->arena); - if (!new_seq) { - return NULL; - } - - int idx = 0; - for (Py_ssize_t i = 0; i < len; i++) { - KeywordOrStarred *k = asdl_seq_GET_UNTYPED(kwargs, i); - if (k->is_keyword) { - asdl_seq_SET(new_seq, idx++, k->element); - } - } - return new_seq; -} - -expr_ty -_PyPegen_concatenate_strings(Parser *p, asdl_seq *strings) -{ - Py_ssize_t len = asdl_seq_LEN(strings); - assert(len > 0); - - Token *first = asdl_seq_GET_UNTYPED(strings, 0); - Token *last = asdl_seq_GET_UNTYPED(strings, len - 1); - - int bytesmode = 0; - PyObject *bytes_str = NULL; - - FstringParser state; - _PyPegen_FstringParser_Init(&state); - - for (Py_ssize_t i = 0; i < len; i++) { - Token *t = asdl_seq_GET_UNTYPED(strings, i); - - int this_bytesmode; - int this_rawmode; - PyObject *s; - const char *fstr; - Py_ssize_t fstrlen = -1; - - if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) { - goto error; - } - - /* Check that we are not mixing bytes with unicode. */ - if (i != 0 && bytesmode != this_bytesmode) { - RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals"); - Py_XDECREF(s); - goto error; - } - bytesmode = this_bytesmode; - - if (fstr != NULL) { - assert(s == NULL && !bytesmode); - - int result = _PyPegen_FstringParser_ConcatFstring(p, &state, &fstr, fstr + fstrlen, - this_rawmode, 0, first, t, last); - if (result < 0) { - goto error; - } - } - else { - /* String or byte string. */ - assert(s != NULL && fstr == NULL); - assert(bytesmode ? PyBytes_CheckExact(s) : PyUnicode_CheckExact(s)); - - if (bytesmode) { - if (i == 0) { - bytes_str = s; - } - else { - PyBytes_ConcatAndDel(&bytes_str, s); - if (!bytes_str) { - goto error; - } - } - } - else { - /* This is a regular string. Concatenate it. 
*/ - if (_PyPegen_FstringParser_ConcatAndDel(&state, s) < 0) { - goto error; - } - } - } - } - - if (bytesmode) { - if (_PyArena_AddPyObject(p->arena, bytes_str) < 0) { - goto error; - } - return _PyAST_Constant(bytes_str, NULL, first->lineno, - first->col_offset, last->end_lineno, - last->end_col_offset, p->arena); - } - - return _PyPegen_FstringParser_Finish(p, &state, first, last); - -error: - Py_XDECREF(bytes_str); - _PyPegen_FstringParser_Dealloc(&state); - if (PyErr_Occurred()) { - raise_decode_error(p); - } - return NULL; -} - -expr_ty -_PyPegen_ensure_imaginary(Parser *p, expr_ty exp) -{ - if (exp->kind != Constant_kind || !PyComplex_CheckExact(exp->v.Constant.value)) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "imaginary number required in complex literal"); - return NULL; - } - return exp; -} - -expr_ty -_PyPegen_ensure_real(Parser *p, expr_ty exp) -{ - if (exp->kind != Constant_kind || PyComplex_CheckExact(exp->v.Constant.value)) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION(exp, "real number required in complex literal"); - return NULL; - } - return exp; -} - -mod_ty -_PyPegen_make_module(Parser *p, asdl_stmt_seq *a) { - asdl_type_ignore_seq *type_ignores = NULL; - Py_ssize_t num = p->type_ignore_comments.num_items; - if (num > 0) { - // Turn the raw (comment, lineno) pairs into TypeIgnore objects in the arena - type_ignores = _Py_asdl_type_ignore_seq_new(num, p->arena); - if (type_ignores == NULL) { - return NULL; - } - for (int i = 0; i < num; i++) { - PyObject *tag = _PyPegen_new_type_comment(p, p->type_ignore_comments.items[i].comment); - if (tag == NULL) { - return NULL; - } - type_ignore_ty ti = _PyAST_TypeIgnore(p->type_ignore_comments.items[i].lineno, - tag, p->arena); - if (ti == NULL) { - return NULL; - } - asdl_seq_SET(type_ignores, i, ti); - } - } - return _PyAST_Module(a, type_ignores, p->arena); -} - -// Error reporting helpers - -expr_ty -_PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type) -{ - if (e == NULL) { - return NULL; - } - -#define VISIT_CONTAINER(CONTAINER, TYPE) do { \ - Py_ssize_t len = asdl_seq_LEN((CONTAINER)->v.TYPE.elts);\ - for (Py_ssize_t i = 0; i < len; i++) {\ - expr_ty other = asdl_seq_GET((CONTAINER)->v.TYPE.elts, i);\ - expr_ty child = _PyPegen_get_invalid_target(other, targets_type);\ - if (child != NULL) {\ - return child;\ - }\ - }\ - } while (0) - - // We only need to visit List and Tuple nodes recursively as those - // are the only ones that can contain valid names in targets when - // they are parsed as expressions. Any other kind of expression - // that is a container (like Sets or Dicts) is directly invalid and - // we don't need to visit it recursively. - - switch (e->kind) { - case List_kind: - VISIT_CONTAINER(e, List); - return NULL; - case Tuple_kind: - VISIT_CONTAINER(e, Tuple); - return NULL; - case Starred_kind: - if (targets_type == DEL_TARGETS) { - return e; - } - return _PyPegen_get_invalid_target(e->v.Starred.value, targets_type); - case Compare_kind: - // This is needed, because the `a in b` in `for a in b` gets parsed - // as a comparison, and so we need to search the left side of the comparison - // for invalid targets. 
- if (targets_type == FOR_TARGETS) { - cmpop_ty cmpop = (cmpop_ty) asdl_seq_GET(e->v.Compare.ops, 0); - if (cmpop == In) { - return _PyPegen_get_invalid_target(e->v.Compare.left, targets_type); - } - return NULL; - } - return e; - case Name_kind: - case Subscript_kind: - case Attribute_kind: - return NULL; - default: - return e; - } -} - -void *_PyPegen_arguments_parsing_error(Parser *p, expr_ty e) { - int kwarg_unpacking = 0; - for (Py_ssize_t i = 0, l = asdl_seq_LEN(e->v.Call.keywords); i < l; i++) { - keyword_ty keyword = asdl_seq_GET(e->v.Call.keywords, i); - if (!keyword->arg) { - kwarg_unpacking = 1; - } - } - - const char *msg = NULL; - if (kwarg_unpacking) { - msg = "positional argument follows keyword argument unpacking"; - } else { - msg = "positional argument follows keyword argument"; - } - - return RAISE_SYNTAX_ERROR(msg); -} - - -static inline expr_ty -_PyPegen_get_last_comprehension_item(comprehension_ty comprehension) { - if (comprehension->ifs == NULL || asdl_seq_LEN(comprehension->ifs) == 0) { - return comprehension->iter; - } - return PyPegen_last_item(comprehension->ifs, expr_ty); -} - -void * -_PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq *comprehensions) -{ - /* The rule that calls this function is 'args for_if_clauses'. - For the input f(L, x for x in y), L and x are in args and - the for is parsed as a for_if_clause. We have to check if - len <= 1, so that input like dict((a, b) for a, b in x) - gets successfully parsed and then we pass the last - argument (x in the above example) as the location of the - error */ - Py_ssize_t len = asdl_seq_LEN(args->v.Call.args); - if (len <= 1) { - return NULL; - } - - comprehension_ty last_comprehension = PyPegen_last_item(comprehensions, comprehension_ty); - - return RAISE_SYNTAX_ERROR_KNOWN_RANGE( - (expr_ty) asdl_seq_GET(args->v.Call.args, len - 1), - _PyPegen_get_last_comprehension_item(last_comprehension), - "Generator expression must be parenthesized" - ); -} - - -expr_ty _PyPegen_collect_call_seqs(Parser *p, asdl_expr_seq *a, asdl_seq *b, - int lineno, int col_offset, int end_lineno, - int end_col_offset, PyArena *arena) { - Py_ssize_t args_len = asdl_seq_LEN(a); - Py_ssize_t total_len = args_len; - - if (b == NULL) { - return _PyAST_Call(_PyPegen_dummy_name(p), a, NULL, lineno, col_offset, - end_lineno, end_col_offset, arena); - - } - - asdl_expr_seq *starreds = _PyPegen_seq_extract_starred_exprs(p, b); - asdl_keyword_seq *keywords = _PyPegen_seq_delete_starred_exprs(p, b); - - if (starreds) { - total_len += asdl_seq_LEN(starreds); - } - - asdl_expr_seq *args = _Py_asdl_expr_seq_new(total_len, arena); - - Py_ssize_t i = 0; - for (i = 0; i < args_len; i++) { - asdl_seq_SET(args, i, asdl_seq_GET(a, i)); - } - for (; i < total_len; i++) { - asdl_seq_SET(args, i, asdl_seq_GET(starreds, i - args_len)); - } - - return _PyAST_Call(_PyPegen_dummy_name(p), args, keywords, lineno, - col_offset, end_lineno, end_col_offset, arena); -} +} \ No newline at end of file diff --git a/Parser/pegen.h b/Parser/pegen.h index 8721d7e..e5e712a 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -23,6 +23,8 @@ #define PyPARSE_TYPE_COMMENTS 0x0040 #define PyPARSE_ASYNC_HACKS 0x0080 +#define CURRENT_POS (-5) + typedef struct _memo { int type; void *node; @@ -114,6 +116,7 @@ typedef struct { int is_keyword; } KeywordOrStarred; +// Internal parser functions #if defined(Py_DEBUG) void _PyPegen_clear_memo_statistics(void); PyObject *_PyPegen_get_memo_statistics(void); @@ -123,7 +126,6 @@ int 
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node); int _PyPegen_update_memo(Parser *p, int mark, int type, void *node); int _PyPegen_is_memoized(Parser *p, int type, void *pres); - int _PyPegen_lookahead_with_name(int, expr_ty (func)(Parser *), Parser *); int _PyPegen_lookahead_with_int(int, Token *(func)(Parser *, int), Parser *, int); int _PyPegen_lookahead_with_string(int , expr_ty (func)(Parser *, const char*), Parser *, const char*); @@ -139,23 +141,24 @@ int _PyPegen_fill_token(Parser *p); expr_ty _PyPegen_name_token(Parser *p); expr_ty _PyPegen_number_token(Parser *p); void *_PyPegen_string_token(Parser *p); -const char *_PyPegen_get_expr_name(expr_ty); Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset); + +// Error handling functions and APIs +typedef enum { + STAR_TARGETS, + DEL_TARGETS, + FOR_TARGETS +} TARGETS_TYPE; + +int _Pypegen_raise_decode_error(Parser *p); +void _PyPegen_raise_tokenizer_init_error(PyObject *filename); +int _Pypegen_tokenizer_error(Parser *p); void *_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...); void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, Py_ssize_t lineno, Py_ssize_t col_offset, Py_ssize_t end_lineno, Py_ssize_t end_col_offset, const char *errmsg, va_list va); -void *_PyPegen_dummy_name(Parser *p, ...); - -void * _PyPegen_seq_last_item(asdl_seq *seq); -#define PyPegen_last_item(seq, type) ((type)_PyPegen_seq_last_item((asdl_seq*)seq)) - -void * _PyPegen_seq_first_item(asdl_seq *seq); -#define PyPegen_first_item(seq, type) ((type)_PyPegen_seq_first_item((asdl_seq*)seq)) - -#define CURRENT_POS (-5) - +void _Pypegen_set_syntax_error(Parser* p, Token* last_token); Py_LOCAL_INLINE(void *) RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype, Py_ssize_t lineno, Py_ssize_t col_offset, @@ -170,10 +173,6 @@ RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype, va_end(va); return NULL; } - -#define UNUSED(expr) do { (void)(expr); } while (0) -#define EXTRA_EXPR(head, tail) head->lineno, (head)->col_offset, (tail)->end_lineno, (tail)->end_col_offset, p->arena -#define EXTRA _start_lineno, _start_col_offset, _end_lineno, _end_col_offset, p->arena #define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__) #define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__) #define RAISE_SYNTAX_ERROR_KNOWN_RANGE(a, b, msg, ...) \ @@ -182,6 +181,7 @@ RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype, RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, (a)->lineno, (a)->col_offset, (a)->end_lineno, (a)->end_col_offset, msg, ##__VA_ARGS__) #define RAISE_SYNTAX_ERROR_STARTING_FROM(a, msg, ...) 
\ RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, (a)->lineno, (a)->col_offset, CURRENT_POS, CURRENT_POS, msg, ##__VA_ARGS__) +#define RAISE_SYNTAX_ERROR_INVALID_TARGET(type, e) _RAISE_SYNTAX_ERROR_INVALID_TARGET(p, type, e) Py_LOCAL_INLINE(void *) CHECK_CALL(Parser *p, void *result) @@ -207,6 +207,39 @@ CHECK_CALL_NULL_ALLOWED(Parser *p, void *result) #define CHECK(type, result) ((type) CHECK_CALL(p, result)) #define CHECK_NULL_ALLOWED(type, result) ((type) CHECK_CALL_NULL_ALLOWED(p, result)) +expr_ty _PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type); +const char *_PyPegen_get_expr_name(expr_ty); +Py_LOCAL_INLINE(void *) +_RAISE_SYNTAX_ERROR_INVALID_TARGET(Parser *p, TARGETS_TYPE type, void *e) +{ + expr_ty invalid_target = CHECK_NULL_ALLOWED(expr_ty, _PyPegen_get_invalid_target(e, type)); + if (invalid_target != NULL) { + const char *msg; + if (type == STAR_TARGETS || type == FOR_TARGETS) { + msg = "cannot assign to %s"; + } + else { + msg = "cannot delete %s"; + } + return RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + invalid_target, + msg, + _PyPegen_get_expr_name(invalid_target) + ); + } + return RAISE_SYNTAX_ERROR("invalid syntax"); +} + +// Action utility functions + +void *_PyPegen_dummy_name(Parser *p, ...); +void * _PyPegen_seq_last_item(asdl_seq *seq); +#define PyPegen_last_item(seq, type) ((type)_PyPegen_seq_last_item((asdl_seq*)seq)) +void * _PyPegen_seq_first_item(asdl_seq *seq); +#define PyPegen_first_item(seq, type) ((type)_PyPegen_seq_first_item((asdl_seq*)seq)) +#define UNUSED(expr) do { (void)(expr); } while (0) +#define EXTRA_EXPR(head, tail) head->lineno, (head)->col_offset, (tail)->end_lineno, (tail)->end_col_offset, p->arena +#define EXTRA _start_lineno, _start_col_offset, _end_lineno, _end_col_offset, p->arena PyObject *_PyPegen_new_type_comment(Parser *, const char *); Py_LOCAL_INLINE(PyObject *) @@ -248,13 +281,6 @@ INVALID_VERSION_CHECK(Parser *p, int version, char *msg, void *node) arg_ty _PyPegen_add_type_comment_to_arg(Parser *, arg_ty, Token *); PyObject *_PyPegen_new_identifier(Parser *, const char *); -Parser *_PyPegen_Parser_New(struct tok_state *, int, int, int, int *, PyArena *); -void _PyPegen_Parser_Free(Parser *); -mod_ty _PyPegen_run_parser_from_file_pointer(FILE *, int, PyObject *, const char *, - const char *, const char *, PyCompilerFlags *, int *, PyArena *); -void *_PyPegen_run_parser(Parser *); -mod_ty _PyPegen_run_parser_from_string(const char *, int, PyObject *, PyCompilerFlags *, PyArena *); -asdl_stmt_seq *_PyPegen_interactive_exit(Parser *); asdl_seq *_PyPegen_singleton_seq(Parser *, void *); asdl_seq *_PyPegen_seq_insert_in_front(Parser *, void *, asdl_seq *); asdl_seq *_PyPegen_seq_append_to_end(Parser *, asdl_seq *, void *); @@ -295,40 +321,18 @@ asdl_seq *_PyPegen_join_sequences(Parser *, asdl_seq *, asdl_seq *); int _PyPegen_check_barry_as_flufl(Parser *, Token *); int _PyPegen_check_legacy_stmt(Parser *p, expr_ty t); mod_ty _PyPegen_make_module(Parser *, asdl_stmt_seq *); - -// Error reporting helpers -typedef enum { - STAR_TARGETS, - DEL_TARGETS, - FOR_TARGETS -} TARGETS_TYPE; -expr_ty _PyPegen_get_invalid_target(expr_ty e, TARGETS_TYPE targets_type); -#define RAISE_SYNTAX_ERROR_INVALID_TARGET(type, e) _RAISE_SYNTAX_ERROR_INVALID_TARGET(p, type, e) - -Py_LOCAL_INLINE(void *) -_RAISE_SYNTAX_ERROR_INVALID_TARGET(Parser *p, TARGETS_TYPE type, void *e) -{ - expr_ty invalid_target = CHECK_NULL_ALLOWED(expr_ty, _PyPegen_get_invalid_target(e, type)); - if (invalid_target != NULL) { - const char *msg; - if (type == STAR_TARGETS || 
type == FOR_TARGETS) { - msg = "cannot assign to %s"; - } - else { - msg = "cannot delete %s"; - } - return RAISE_SYNTAX_ERROR_KNOWN_LOCATION( - invalid_target, - msg, - _PyPegen_get_expr_name(invalid_target) - ); - } - return RAISE_SYNTAX_ERROR("invalid syntax"); -} - void *_PyPegen_arguments_parsing_error(Parser *, expr_ty); void *_PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq *comprehensions); +// Parser API + +Parser *_PyPegen_Parser_New(struct tok_state *, int, int, int, int *, PyArena *); +void _PyPegen_Parser_Free(Parser *); +mod_ty _PyPegen_run_parser_from_file_pointer(FILE *, int, PyObject *, const char *, + const char *, const char *, PyCompilerFlags *, int *, PyArena *); +void *_PyPegen_run_parser(Parser *); +mod_ty _PyPegen_run_parser_from_string(const char *, int, PyObject *, PyCompilerFlags *, PyArena *); +asdl_stmt_seq *_PyPegen_interactive_exit(Parser *); // Generated function in parse.c - function definition in python.gram void *_PyPegen_parse(Parser *); diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c new file mode 100644 index 0000000..6eeab0a --- /dev/null +++ b/Parser/pegen_errors.c @@ -0,0 +1,425 @@ +#include +#include + +#include "tokenizer.h" +#include "pegen.h" + +// TOKENIZER ERRORS + +void +_PyPegen_raise_tokenizer_init_error(PyObject *filename) +{ + if (!(PyErr_ExceptionMatches(PyExc_LookupError) + || PyErr_ExceptionMatches(PyExc_SyntaxError) + || PyErr_ExceptionMatches(PyExc_ValueError) + || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { + return; + } + PyObject *errstr = NULL; + PyObject *tuple = NULL; + PyObject *type; + PyObject *value; + PyObject *tback; + PyErr_Fetch(&type, &value, &tback); + errstr = PyObject_Str(value); + if (!errstr) { + goto error; + } + + PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); + if (!tmp) { + goto error; + } + + tuple = PyTuple_Pack(2, errstr, tmp); + Py_DECREF(tmp); + if (!value) { + goto error; + } + PyErr_SetObject(PyExc_SyntaxError, tuple); + +error: + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(tback); + Py_XDECREF(errstr); + Py_XDECREF(tuple); +} + +static inline void +raise_unclosed_parentheses_error(Parser *p) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + int error_col = p->tok->parencolstack[p->tok->level-1]; + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + error_lineno, error_col, error_lineno, -1, + "'%c' was never closed", + p->tok->parenstack[p->tok->level-1]); +} + +int +_Pypegen_tokenizer_error(Parser *p) +{ + if (PyErr_Occurred()) { + return -1; + } + + const char *msg = NULL; + PyObject* errtype = PyExc_SyntaxError; + Py_ssize_t col_offset = -1; + switch (p->tok->done) { + case E_TOKEN: + msg = "invalid token"; + break; + case E_EOF: + if (p->tok->level) { + raise_unclosed_parentheses_error(p); + } else { + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + } + return -1; + case E_DEDENT: + RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); + return -1; + case E_INTR: + if (!PyErr_Occurred()) { + PyErr_SetNone(PyExc_KeyboardInterrupt); + } + return -1; + case E_NOMEM: + PyErr_NoMemory(); + return -1; + case E_TABSPACE: + errtype = PyExc_TabError; + msg = "inconsistent use of tabs and spaces in indentation"; + break; + case E_TOODEEP: + errtype = PyExc_IndentationError; + msg = "too many levels of indentation"; + break; + case E_LINECONT: { + col_offset = p->tok->cur - p->tok->buf - 1; + msg = "unexpected character after line continuation character"; + break; + } + default: + 
msg = "unknown parsing error"; + } + + RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, + col_offset >= 0 ? col_offset : 0, + p->tok->lineno, -1, msg); + return -1; +} + +int +_Pypegen_raise_decode_error(Parser *p) +{ + assert(PyErr_Occurred()); + const char *errtype = NULL; + if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { + errtype = "unicode error"; + } + else if (PyErr_ExceptionMatches(PyExc_ValueError)) { + errtype = "value error"; + } + if (errtype) { + PyObject *type; + PyObject *value; + PyObject *tback; + PyObject *errstr; + PyErr_Fetch(&type, &value, &tback); + errstr = PyObject_Str(value); + if (errstr) { + RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); + Py_DECREF(errstr); + } + else { + PyErr_Clear(); + RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); + } + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(tback); + } + + return -1; +} + +static int +_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { + // Tokenize the whole input to see if there are any tokenization + // errors such as mistmatching parentheses. These will get priority + // over generic syntax errors only if the line number of the error is + // before the one that we had for the generic error. + + // We don't want to tokenize to the end for interactive input + if (p->tok->prompt != NULL) { + return 0; + } + + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + + Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; + Py_ssize_t current_err_line = current_token->lineno; + + int ret = 0; + + for (;;) { + const char *start; + const char *end; + switch (_PyTokenizer_Get(p->tok, &start, &end)) { + case ERRORTOKEN: + if (p->tok->level != 0) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + if (current_err_line > error_lineno) { + raise_unclosed_parentheses_error(p); + ret = -1; + goto exit; + } + } + break; + case ENDMARKER: + break; + default: + continue; + } + break; + } + + +exit: + if (PyErr_Occurred()) { + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(traceback); + } else { + PyErr_Restore(type, value, traceback); + } + return ret; +} + +// PARSER ERRORS + +void * +_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) +{ + if (p->fill == 0) { + va_list va; + va_start(va, errmsg); + _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); + va_end(va); + return NULL; + } + + Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; + Py_ssize_t col_offset; + Py_ssize_t end_col_offset = -1; + if (t->col_offset == -1) { + if (p->tok->cur == p->tok->buf) { + col_offset = 0; + } else { + const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; + col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); + } + } else { + col_offset = t->col_offset + 1; + } + + if (t->end_col_offset != -1) { + end_col_offset = t->end_col_offset + 1; + } + + va_list va; + va_start(va, errmsg); + _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); + va_end(va); + + return NULL; +} + +static PyObject * +get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) +{ + /* If the file descriptor is interactive, the source lines of the current + * (multi-line) statement are stored in p->tok->interactive_src_start. + * If not, we're parsing from a string, which means that the whole source + * is stored in p->tok->str. 
*/ + assert(p->tok->fp == NULL || p->tok->fp == stdin); + + char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; + assert(cur_line != NULL); + + for (int i = 0; i < lineno - 1; i++) { + cur_line = strchr(cur_line, '\n') + 1; + } + + char *next_newline; + if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line + next_newline = cur_line + strlen(cur_line); + } + return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); +} + +void * +_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, + Py_ssize_t lineno, Py_ssize_t col_offset, + Py_ssize_t end_lineno, Py_ssize_t end_col_offset, + const char *errmsg, va_list va) +{ + PyObject *value = NULL; + PyObject *errstr = NULL; + PyObject *error_line = NULL; + PyObject *tmp = NULL; + p->error_indicator = 1; + + if (end_lineno == CURRENT_POS) { + end_lineno = p->tok->lineno; + } + if (end_col_offset == CURRENT_POS) { + end_col_offset = p->tok->cur - p->tok->line_start; + } + + if (p->start_rule == Py_fstring_input) { + const char *fstring_msg = "f-string: "; + Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg); + + char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character + if (!new_errmsg) { + return (void *) PyErr_NoMemory(); + } + + // Copy both strings into new buffer + memcpy(new_errmsg, fstring_msg, strlen(fstring_msg)); + memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg)); + new_errmsg[len] = 0; + errmsg = new_errmsg; + } + errstr = PyUnicode_FromFormatV(errmsg, va); + if (!errstr) { + goto error; + } + + if (p->tok->fp_interactive) { + error_line = get_error_line_from_tokenizer_buffers(p, lineno); + } + else if (p->start_rule == Py_file_input) { + error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, + (int) lineno, p->tok->encoding); + } + + if (!error_line) { + /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, + then we need to find the error line from some other source, because + p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly + failed or we're parsing from a string or the REPL. 
There's a third edge case where + we're actually parsing from a file, which has an E_EOF SyntaxError and in that case + `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which + does not physically exist */ + assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); + + if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { + Py_ssize_t size = p->tok->inp - p->tok->buf; + error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); + } + else if (p->tok->fp == NULL || p->tok->fp == stdin) { + error_line = get_error_line_from_tokenizer_buffers(p, lineno); + } + else { + error_line = PyUnicode_FromStringAndSize("", 0); + } + if (!error_line) { + goto error; + } + } + + if (p->start_rule == Py_fstring_input) { + col_offset -= p->starting_col_offset; + end_col_offset -= p->starting_col_offset; + } + + Py_ssize_t col_number = col_offset; + Py_ssize_t end_col_number = end_col_offset; + + if (p->tok->encoding != NULL) { + col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); + if (col_number < 0) { + goto error; + } + if (end_col_number > 0) { + Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); + if (end_col_offset < 0) { + goto error; + } else { + end_col_number = end_col_offset; + } + } + } + tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); + if (!tmp) { + goto error; + } + value = PyTuple_Pack(2, errstr, tmp); + Py_DECREF(tmp); + if (!value) { + goto error; + } + PyErr_SetObject(errtype, value); + + Py_DECREF(errstr); + Py_DECREF(value); + if (p->start_rule == Py_fstring_input) { + PyMem_Free((void *)errmsg); + } + return NULL; + +error: + Py_XDECREF(errstr); + Py_XDECREF(error_line); + if (p->start_rule == Py_fstring_input) { + PyMem_Free((void *)errmsg); + } + return NULL; +} + +void +_Pypegen_set_syntax_error(Parser* p, Token* last_token) { + // Existing sintax error + if (PyErr_Occurred()) { + // Prioritize tokenizer errors to custom syntax errors raised + // on the second phase only if the errors come from the parser. + if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) { + _PyPegen_tokenize_full_source_to_check_for_errors(p); + } + // Propagate the existing syntax error. + return; + } + // Initialization error + if (p->fill == 0) { + RAISE_SYNTAX_ERROR("error at start before reading any input"); + } + // Parser encountered EOF (End of File) unexpectedtly + if (p->tok->done == E_EOF) { + if (p->tok->level) { + raise_unclosed_parentheses_error(p); + } else { + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + } + return; + } + // Indentation error in the tokenizer + if (last_token->type == INDENT || last_token->type == DEDENT) { + RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); + return; + } + // Unknown error (generic case) + + // Use the last token we found on the first pass to avoid reporting + // incorrect locations for generic syntax errors just because we reached + // further away when trying to find specific syntax errors in the second + // pass. + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); + // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing + // generic SyntaxError we just raised if errors are found. 
+ _PyPegen_tokenize_full_source_to_check_for_errors(p); +} \ No newline at end of file diff --git a/Tools/peg_generator/Makefile b/Tools/peg_generator/Makefile index 6ad9c91..d010f19 100644 --- a/Tools/peg_generator/Makefile +++ b/Tools/peg_generator/Makefile @@ -22,7 +22,7 @@ data/xxl.py: build: peg_extension/parse.c -peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen.c ../../Parser/string_parser.c ../../Parser/*.h pegen/grammar_parser.py +peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen.c ../../Parser/pegen_errors.c ../../Parser/string_parser.c ../../Parser/action_helpers.c ../../Parser/*.h pegen/grammar_parser.py $(PYTHON) -m pegen -q c $(GRAMMAR) $(TOKENS) -o peg_extension/parse.c --compile-extension clean: diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py index bf01078..c69e5c9 100644 --- a/Tools/peg_generator/pegen/build.py +++ b/Tools/peg_generator/pegen/build.py @@ -69,6 +69,8 @@ def compile_c_extension( str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"), str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer.c"), str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"), + str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"), str(MOD_DIR.parent.parent.parent / "Parser" / "string_parser.c"), str(MOD_DIR.parent / "peg_extension" / "peg_extension.c"), generated_source_path, -- cgit v0.12
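The error-reporting half of the split now lives in Parser/pegen.h and Parser/pegen_errors.c: `RAISE_SYNTAX_ERROR_INVALID_TARGET` routes through `_PyPegen_get_invalid_target` and formats either "cannot assign to %s" or "cannot delete %s" with `_PyPegen_get_expr_name`, depending on the `TARGETS_TYPE`. The sketch below is not part of the patch; it shows the kind of diagnostics those helpers drive, and the exact wording is an assumption that changes between CPython versions.

# Minimal sketch (not part of the patch): diagnostics driven by the
# invalid-target helpers declared in Parser/pegen.h. The messages in the
# comments are assumptions; actual wording varies across versions.
cases = [
    "f(x) = 1",                  # STAR_TARGETS -> e.g. "cannot assign to function call"
    "del f(x)",                  # DEL_TARGETS  -> e.g. "cannot delete function call"
    "for 1 in range(3): pass",   # FOR_TARGETS  -> e.g. "cannot assign to literal"
]
for src in cases:
    try:
        compile(src, "<demo>", "exec")
    except SyntaxError as exc:
        print(f"{src!r:30} -> {exc.msg}")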