| author | Pablo Galindo <Pablogsal@gmail.com> | 2019-03-01 23:34:44 (GMT) |
| --- | --- | --- |
| committer | GitHub <noreply@github.com> | 2019-03-01 23:34:44 (GMT) |
| commit | 1f24a719e7be5e49b876a5dc7daf21d01ee69faa (patch) | |
| tree | 8f8f56cab78ef671a8cb7f54b8ec2495d9a435e6 /Parser/tokenizer.c | |
| parent | 7eebbbd5b3907447eddadf5cb7cb1cc9230d15b2 (diff) | |
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814)
Pgen is the oldest piece of technology in the CPython repository; building it requires various #if[n]def PGEN hacks in other parts of the code, and it depends more and more on CPython internals. This commit removes the old pgen C code and replaces it with a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatible with the current parser.
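As a rough sketch of the pgen2-style workflow the commit message refers to, the snippet below drives the lib2to3 copy of pgen2 that ships with CPython to build grammar tables from a Grammar file at runtime. The path and printed attributes are illustrative only; this is not the new Parser/pgen entry point added by this commit.

```python
# Minimal sketch, assuming a CPython checkout with Grammar/Grammar present.
# It only illustrates the pgen2 approach (read a Grammar file, build DFA
# tables in pure Python); it is not the Parser/pgen module added here.
from lib2to3.pgen2 import driver

grammar = driver.load_grammar("Grammar/Grammar", save=False)

print(len(grammar.dfas), "nonterminal DFAs")   # one DFA per grammar rule
print(sorted(grammar.keywords)[:5], "...")     # keywords found in the grammar
```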
This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN), which can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI setup, which uses Python 3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so it is always kept in sync and avoids having to maintain duplicate token definitions.
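For context on the Grammar/Tokens point, that file is a plain-text token specification. The hypothetical helper below shows how such a single specification could be read from Python; the assumed layout (one token name per line, optionally followed by its quoted literal, with '#' comments) is inferred from the commit message rather than copied from CPython.

```python
# Hypothetical helper: parse a Grammar/Tokens-style file into {name: literal}.
# The format assumed here is inferred from the description above, not taken
# verbatim from CPython.
from typing import Dict, Optional


def read_token_spec(path: str) -> Dict[str, Optional[str]]:
    tokens: Dict[str, Optional[str]] = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            line = line.split("#", 1)[0].strip()   # drop comments and blanks
            if not line:
                continue
            parts = line.split(None, 1)
            name = parts[0]
            literal = parts[1].strip("'\"") if len(parts) > 1 else None
            tokens[name] = literal
    return tokens


if __name__ == "__main__":
    spec = read_token_spec("Grammar/Tokens")
    print(len(spec), "tokens defined")   # e.g. spec.get("LPAR") would be "("
```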
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 56 |
1 file changed, 0 insertions, 56 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 1ded9ad..44ec415 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -10,13 +10,11 @@
 #include "tokenizer.h"
 #include "errcode.h"
 
-#ifndef PGEN
 #include "unicodeobject.h"
 #include "bytesobject.h"
 #include "fileobject.h"
 #include "codecs.h"
 #include "abstract.h"
-#endif /* PGEN */
 
 /* Alternate tab spacing */
 #define ALTTABSIZE 1
@@ -81,11 +79,9 @@ tok_new(void)
     tok->enc = NULL;
     tok->encoding = NULL;
     tok->cont_line = 0;
-#ifndef PGEN
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
-#endif
     tok->type_comments = 0;
 
     return tok;
@@ -104,28 +100,6 @@ new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
     return result;
 }
 
-#ifdef PGEN
-
-static char *
-decoding_fgets(char *s, int size, struct tok_state *tok)
-{
-    return fgets(s, size, tok->fp);
-}
-
-static int
-decoding_feof(struct tok_state *tok)
-{
-    return feof(tok->fp);
-}
-
-static char *
-decode_str(const char *str, int exec_input, struct tok_state *tok)
-{
-    return new_string(str, strlen(str), tok);
-}
-
-#else /* PGEN */
-
 static char *
 error_ret(struct tok_state *tok) /* XXX */
 {
@@ -551,7 +525,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
             return error_ret(tok);
         }
     }
-#ifndef PGEN
     /* The default encoding is UTF-8, so make sure we don't have any
        non-UTF-8 sequences in it. */
     if (line && !tok->encoding) {
@@ -574,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
                      badchar, tok->filename, tok->lineno + 1);
         return error_ret(tok);
     }
-#endif
     return line;
 }
 
@@ -738,8 +710,6 @@ decode_str(const char *input, int single, struct tok_state *tok)
     return str;
 }
 
-#endif /* PGEN */
-
 /* Set up tokenizer for string */
 
 struct tok_state *
@@ -765,9 +735,7 @@ PyTokenizer_FromUTF8(const char *str, int exec_input)
     struct tok_state *tok = tok_new();
     if (tok == NULL)
         return NULL;
-#ifndef PGEN
     tok->input = str = translate_newlines(str, exec_input, tok);
-#endif
     if (str == NULL) {
         PyTokenizer_Free(tok);
         return NULL;
     }
@@ -828,11 +796,9 @@ PyTokenizer_Free(struct tok_state *tok)
 {
     if (tok->encoding != NULL)
         PyMem_FREE(tok->encoding);
-#ifndef PGEN
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
     Py_XDECREF(tok->filename);
-#endif
     if (tok->fp != NULL && tok->buf != NULL)
         PyMem_FREE(tok->buf);
     if (tok->input)
@@ -871,7 +837,6 @@ tok_nextc(struct tok_state *tok)
         }
         if (tok->prompt != NULL) {
             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
-#ifndef PGEN
             if (newtok != NULL) {
                 char *translated = translate_newlines(newtok, 0, tok);
                 PyMem_FREE(newtok);
@@ -900,7 +865,6 @@
                 strcpy(newtok, buf);
                 Py_DECREF(u);
             }
-#endif
             if (tok->nextprompt != NULL)
                 tok->prompt = tok->nextprompt;
             if (newtok == NULL)
@@ -1056,7 +1020,6 @@ tok_backup(struct tok_state *tok, int c)
 static int
 syntaxerror(struct tok_state *tok, const char *format, ...)
 {
-#ifndef PGEN
     va_list vargs;
 #ifdef HAVE_STDARG_PROTOTYPES
     va_start(vargs, format);
@@ -1069,9 +1032,6 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
                                tok->lineno,
                                (int)(tok->cur - tok->line_start));
     tok->done = E_ERROR;
-#else
-    tok->done = E_TOKEN;
-#endif
     return ERRORTOKEN;
 }
 
@@ -1083,9 +1043,6 @@ indenterror(struct tok_state *tok)
     return ERRORTOKEN;
 }
 
-#ifdef PGEN
-#define verify_identifier(tok) 1
-#else
 /* Verify that the identifier follows PEP 3131.
    All identifier strings are guaranteed to be "ready" unicode objects.
  */
@@ -1112,7 +1069,6 @@ verify_identifier(struct tok_state *tok)
         tok->done = E_IDENTIFIER;
     return result;
 }
-#endif
 
 static int
 tok_decimal_tail(struct tok_state *tok)
@@ -1667,25 +1623,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
     case '(':
     case '[':
     case '{':
-#ifndef PGEN
         if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses");
         }
         tok->parenstack[tok->level] = c;
         tok->parenlinenostack[tok->level] = tok->lineno;
-#endif
         tok->level++;
         break;
     case ')':
     case ']':
     case '}':
-#ifndef PGEN
         if (!tok->level) {
             return syntaxerror(tok, "unmatched '%c'", c);
         }
-#endif
         tok->level--;
-#ifndef PGEN
         int opening = tok->parenstack[tok->level];
         if (!((opening == '(' && c == ')') ||
               (opening == '[' && c == ']') ||
@@ -1704,7 +1655,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
                         c, opening);
             }
         }
-#endif
         break;
     }
 
@@ -1742,11 +1692,7 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
     FILE *fp;
     char *p_start =NULL , *p_end =NULL , *encoding = NULL;
 
-#ifndef PGEN
     fd = _Py_dup(fd);
-#else
-    fd = dup(fd);
-#endif
     if (fd < 0) {
         return NULL;
     }
@@ -1760,7 +1706,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
         fclose(fp);
         return NULL;
     }
-#ifndef PGEN
     if (filename != NULL) {
         Py_INCREF(filename);
         tok->filename = filename;
@@ -1773,7 +1718,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
             return encoding;
         }
     }
-#endif
     while (tok->lineno < 2 && tok->done == E_OK) {
         PyTokenizer_Get(tok, &p_start, &p_end);
     }