From ebebb6429c224c713e1c63a0b05d4840f52c7415 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 23 Apr 2020 18:36:06 +0300 Subject: bpo-40334: Improve various PEG-Parser related stuff (GH-19669) The changes in this commit are all related to @vstinner's original review comments of the initial PEP 617 implementation PR. --- Include/internal/pegen_interface.h | 34 +++++++++++++++++ Include/pegen_interface.h | 32 ---------------- Makefile.pre.in | 2 +- Modules/_peg_parser.c | 2 +- PCbuild/pythoncore.vcxproj | 2 +- Parser/pegen/peg_api.c | 2 +- Parser/pegen/pegen.c | 76 ++++++++++++++++++++++++-------------- Python/pythonrun.c | 2 +- 8 files changed, 88 insertions(+), 64 deletions(-) create mode 100644 Include/internal/pegen_interface.h delete mode 100644 Include/pegen_interface.h diff --git a/Include/internal/pegen_interface.h b/Include/internal/pegen_interface.h new file mode 100644 index 0000000..d8621c1 --- /dev/null +++ b/Include/internal/pegen_interface.h @@ -0,0 +1,34 @@ +#ifndef Py_PEGENINTERFACE +#define Py_PEGENINTERFACE +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "Python.h" +#include "Python-ast.h" + +PyAPI_FUNC(mod_ty) PyPegen_ASTFromFile(const char *filename, int mode, PyArena *arena); +PyAPI_FUNC(mod_ty) PyPegen_ASTFromString(const char *str, int mode, PyCompilerFlags *flags, + PyArena *arena); +PyAPI_FUNC(mod_ty) PyPegen_ASTFromStringObject(const char *str, PyObject* filename, int mode, + PyCompilerFlags *flags, PyArena *arena); +PyAPI_FUNC(mod_ty) PyPegen_ASTFromFileObject(FILE *fp, PyObject *filename_ob, + int mode, const char *enc, const char *ps1, + const char *ps2, int *errcode, PyArena *arena); +PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFile(const char *filename, int mode); +PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromString(const char *str, int mode, + PyCompilerFlags *flags); +PyAPI_FUNC(PyCodeObject *) 
PyPegen_CodeObjectFromFileObject(FILE *, PyObject *filename_ob, + int mode, const char *enc, + const char *ps1, + const char *ps2, + int *errcode); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_PEGENINTERFACE*/ diff --git a/Include/pegen_interface.h b/Include/pegen_interface.h deleted file mode 100644 index bf5b296..0000000 --- a/Include/pegen_interface.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef Py_LIMITED_API -#ifndef Py_PEGENINTERFACE -#define Py_PEGENINTERFACE -#ifdef __cplusplus -extern "C" { -#endif - -#include "Python.h" -#include "Python-ast.h" - -PyAPI_FUNC(mod_ty) PyPegen_ASTFromFile(const char *filename, int mode, PyArena *arena); -PyAPI_FUNC(mod_ty) PyPegen_ASTFromString(const char *str, int mode, PyCompilerFlags *flags, - PyArena *arena); -PyAPI_FUNC(mod_ty) PyPegen_ASTFromStringObject(const char *str, PyObject* filename, int mode, - PyCompilerFlags *flags, PyArena *arena); -PyAPI_FUNC(mod_ty) PyPegen_ASTFromFileObject(FILE *fp, PyObject *filename_ob, - int mode, const char *enc, const char *ps1, - const char *ps2, int *errcode, PyArena *arena); -PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFile(const char *filename, int mode); -PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromString(const char *str, int mode, - PyCompilerFlags *flags); -PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFileObject(FILE *, PyObject *filename_ob, - int mode, const char *enc, - const char *ps1, - const char *ps2, - int *errcode); - -#ifdef __cplusplus -} -#endif -#endif /* !Py_PEGENINTERFACE*/ -#endif /* !Py_LIMITED_API */ diff --git a/Makefile.pre.in b/Makefile.pre.in index 29d7e34..3e4b20b 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -304,7 +304,7 @@ PEGEN_OBJS= \ PEGEN_HEADERS= \ - $(srcdir)/Include/pegen_interface.h \ + $(srcdir)/Include/internal/pegen_interface.h \ $(srcdir)/Parser/pegen/pegen.h \ $(srcdir)/Parser/pegen/parse_string.h diff --git a/Modules/_peg_parser.c b/Modules/_peg_parser.c index 0a84edc..cb5f9aa 100644 --- a/Modules/_peg_parser.c +++ 
b/Modules/_peg_parser.c @@ -1,5 +1,5 @@ #include <Python.h> -#include <pegen_interface.h> +#include "pegen_interface.h" PyObject * _Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index d795c4d..3484f44 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -161,6 +161,7 @@ + <ClInclude Include="..\Include\internal\pegen_interface.h" /> @@ -213,7 +214,6 @@ - <ClInclude Include="..\Include\pegen_interface.h" /> diff --git a/Parser/pegen/peg_api.c b/Parser/pegen/peg_api.c index 7c6903c..c42aa68 100644 --- a/Parser/pegen/peg_api.c +++ b/Parser/pegen/peg_api.c @@ -1,4 +1,4 @@ -#include <pegen_interface.h> +#include "pegen_interface.h" #include "../tokenizer.h" #include "pegen.h" diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index 0b70c95..a51c8aa 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -8,6 +8,9 @@ static int init_normalization(Parser *p) { + if (p->normalize) { + return 1; + } PyObject *m = PyImport_ImportModuleNoBlock("unicodedata"); if (!m) { @@ -36,7 +39,7 @@ _PyPegen_new_identifier(Parser *p, char *n) if (!PyUnicode_IS_ASCII(id)) { PyObject *id2; - if (!p->normalize && !init_normalization(p)) + if (!init_normalization(p)) { Py_DECREF(id); goto error; @@ -88,6 +91,9 @@ static inline Py_ssize_t byte_offset_to_character_offset(PyObject *line, int col_offset) { const char *str = PyUnicode_AsUTF8(line); + if (!str) { + return 0; + } PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL); if (!text) { return 0; @@ -171,9 +177,10 @@ _PyPegen_get_expr_name(expr_ty e) } } -static void +static int raise_decode_error(Parser *p) { + assert(PyErr_Occurred()); const char *errtype = NULL; if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { errtype = "unicode error"; @@ -197,6 +204,8 @@ raise_decode_error(Parser *p) Py_XDECREF(value); Py_XDECREF(tback); } + + return -1; } static void @@ -207,27 +216,33 @@ raise_tokenizer_init_error(PyObject *filename) || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { return; } - PyObject *type, *value, *tback, *errstr; + PyObject *errstr = NULL; + PyObject *tuple 
= NULL; + PyObject *type, *value, *tback; PyErr_Fetch(&type, &value, &tback); errstr = PyObject_Str(value); + if (!errstr) { + goto error; + } - Py_INCREF(Py_None); - PyObject *tmp = Py_BuildValue("(OiiN)", filename, 0, -1, Py_None); + PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); if (!tmp) { goto error; } - value = PyTuple_Pack(2, errstr, tmp); + tuple = PyTuple_Pack(2, errstr, tmp); Py_DECREF(tmp); if (!value) { goto error; } - PyErr_SetObject(PyExc_SyntaxError, value); + PyErr_SetObject(PyExc_SyntaxError, tuple); error: Py_XDECREF(type); Py_XDECREF(value); Py_XDECREF(tback); + Py_XDECREF(errstr); + Py_XDECREF(tuple); } static inline PyObject * @@ -337,9 +352,6 @@ tokenizer_error(Parser *p) errtype = PyExc_IndentationError; msg = "too many levels of indentation"; break; - case E_DECODE: - raise_decode_error(p); - return -1; case E_LINECONT: msg = "unexpected character after line continuation character"; break; @@ -513,7 +525,12 @@ _PyPegen_fill_token(Parser *p) const char *start, *end; int type = PyTokenizer_Get(p->tok, &start, &end); if (type == ERRORTOKEN) { - return tokenizer_error(p); + if (p->tok->done == E_DECODE) { + return raise_decode_error(p); + } + else { + return tokenizer_error(p); + } } if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) { type = NEWLINE; /* Add an extra newline */ @@ -530,13 +547,21 @@ _PyPegen_fill_token(Parser *p) if (p->fill == p->size) { int newsize = p->size * 2; - p->tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *)); - if (p->tokens == NULL) { - PyErr_Format(PyExc_MemoryError, "Realloc tokens failed"); + Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *)); + if (new_tokens == NULL) { + PyErr_NoMemory(); return -1; } + else { + p->tokens = new_tokens; + } for (int i = p->size; i < newsize; i++) { p->tokens[i] = PyMem_Malloc(sizeof(Token)); + if (p->tokens[i] == NULL) { + p->size = i; // Needed, in order to cleanup correctly after parser fails + 
PyErr_NoMemory(); + return -1; + } memset(p->tokens[i], '\0', sizeof(Token)); } p->size = newsize; @@ -566,8 +591,6 @@ _PyPegen_fill_token(Parser *p) t->end_lineno = p->starting_lineno + end_lineno; t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset; - // if (p->fill % 100 == 0) fprintf(stderr, "Filled at %d: %s \"%s\"\n", p->fill, - // token_name(type), PyBytes_AsString(t->bytes)); p->fill += 1; return 0; } @@ -614,6 +637,7 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres) { if (p->mark == p->fill) { if (_PyPegen_fill_token(p) < 0) { + p->error_indicator = 1; return -1; } } @@ -632,11 +656,9 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres) } p->mark = m->mark; *(void **)(pres) = m->node; - // fprintf(stderr, "%d < %d: memoized!\n", p->mark, p->fill); return 1; } } - // fprintf(stderr, "%d < %d: not memoized\n", p->mark, p->fill); return 0; } @@ -683,18 +705,15 @@ _PyPegen_expect_token(Parser *p, int type) { if (p->mark == p->fill) { if (_PyPegen_fill_token(p) < 0) { + p->error_indicator = 1; return NULL; } } Token *t = p->tokens[p->mark]; if (t->type != type) { - // fprintf(stderr, "No %s at %d\n", token_name(type), p->mark); return NULL; } p->mark += 1; - // fprintf(stderr, "Got %s at %d: %s\n", token_name(type), p->mark, - // PyBytes_AsString(t->bytes)); - return t; } @@ -888,8 +907,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena { Parser *p = PyMem_Malloc(sizeof(Parser)); if (p == NULL) { - PyErr_Format(PyExc_MemoryError, "Out of memory for Parser"); - return NULL; + return (Parser *) PyErr_NoMemory(); } assert(tok != NULL); p->tok = tok; @@ -898,10 +916,14 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena p->tokens = PyMem_Malloc(sizeof(Token *)); if (!p->tokens) { PyMem_Free(p); - PyErr_Format(PyExc_MemoryError, "Out of memory for tokens"); - return NULL; + return (Parser *) PyErr_NoMemory(); } p->tokens[0] = 
PyMem_Malloc(sizeof(Token)); + if (!p->tokens) { + PyMem_Free(p->tokens); + PyMem_Free(p); + return (Parser *) PyErr_NoMemory(); + } memset(p->tokens[0], '\0', sizeof(Token)); p->mark = 0; p->fill = 0; @@ -1187,7 +1209,7 @@ _PyPegen_seq_count_dots(asdl_seq *seq) number_of_dots += 1; break; default: - assert(current_expr->type == ELLIPSIS || current_expr->type == DOT); + Py_UNREACHABLE(); } } diff --git a/Python/pythonrun.c b/Python/pythonrun.c index e3fd3b2..3a2fe96 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -29,7 +29,7 @@ #include "ast.h" // PyAST_FromNodeObject() #include "marshal.h" // PyMarshal_ReadLongFromFile() -#include <pegen_interface.h> // PyPegen_ASTFrom* +#include "pegen_interface.h" // PyPegen_ASTFrom* #ifdef MS_WINDOWS # include "malloc.h" // alloca() -- cgit v0.12