summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLysandros Nikolaou <lisandrosnik@gmail.com>2022-10-06 23:07:17 (GMT)
committerGitHub <noreply@github.com>2022-10-06 23:07:17 (GMT)
commitcbf0afd8a1474d68310331af9218606959d4cc22 (patch)
treecd421653d73d25c89fa8c02e5819517efa76f7b3
parentb9d2e8171696514e9226164005f7bf24bf69e66d (diff)
downloadcpython-cbf0afd8a1474d68310331af9218606959d4cc22.zip
cpython-cbf0afd8a1474d68310331af9218606959d4cc22.tar.gz
cpython-cbf0afd8a1474d68310331af9218606959d4cc22.tar.bz2
gh-97973: Return all necessary information from the tokenizer (GH-97984)
Right now, the tokenizer only returns the type and two pointers to the start and end of the token. This PR modifies the tokenizer to return the type and set all of the necessary information, so that the parser does not have to do this.
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2022-10-06-20-41-29.gh-issue-97973.gB-xWi.rst1
-rw-r--r--Parser/pegen.c54
-rw-r--r--Parser/pegen_errors.c5
-rw-r--r--Parser/tokenizer.c220
-rw-r--r--Parser/tokenizer.h8
-rw-r--r--Python/Python-tokenize.c17
6 files changed, 159 insertions, 146 deletions
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-06-20-41-29.gh-issue-97973.gB-xWi.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-20-41-29.gh-issue-97973.gB-xWi.rst
new file mode 100644
index 0000000..a0095a6
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-20-41-29.gh-issue-97973.gB-xWi.rst
@@ -0,0 +1 @@
+Modify the tokenizer to return all necessary information the parser needs to set location information in the AST nodes, so that the parser does not have to calculate those by doing pointer arithmetic.
diff --git a/Parser/pegen.c b/Parser/pegen.c
index a5d123d..1317606 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -123,16 +123,18 @@ growable_comment_array_deallocate(growable_comment_array *arr) {
}
static int
-_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
+_get_keyword_or_name_type(Parser *p, struct token *new_token)
{
+ int name_len = new_token->end_col_offset - new_token->col_offset;
assert(name_len > 0);
+
if (name_len >= p->n_keyword_lists ||
p->keywords[name_len] == NULL ||
p->keywords[name_len]->type == -1) {
return NAME;
}
for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
- if (strncmp(k->str, name, name_len) == 0) {
+ if (strncmp(k->str, new_token->start, name_len) == 0) {
return k->type;
}
}
@@ -140,33 +142,26 @@ _get_keyword_or_name_type(Parser *p, const char *name, int name_len)
}
static int
-initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
- assert(token != NULL);
+initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
+ assert(parser_token != NULL);
- token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
- token->bytes = PyBytes_FromStringAndSize(start, end - start);
- if (token->bytes == NULL) {
+ parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
+ parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
+ if (parser_token->bytes == NULL) {
return -1;
}
-
- if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
- Py_DECREF(token->bytes);
+ if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
+ Py_DECREF(parser_token->bytes);
return -1;
}
- token->level = p->tok->level;
-
- const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
- int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
- int end_lineno = p->tok->lineno;
-
- int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
- int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;
-
- token->lineno = lineno;
- token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
- token->end_lineno = end_lineno;
- token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;
+ parser_token->level = new_token->level;
+ parser_token->lineno = new_token->lineno;
+ parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
+ : new_token->col_offset;
+ parser_token->end_lineno = new_token->end_lineno;
+ parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
+ : new_token->end_col_offset;
p->fill += 1;
@@ -202,26 +197,25 @@ _resize_tokens_array(Parser *p) {
int
_PyPegen_fill_token(Parser *p)
{
- const char *start;
- const char *end;
- int type = _PyTokenizer_Get(p->tok, &start, &end);
+ struct token new_token;
+ int type = _PyTokenizer_Get(p->tok, &new_token);
// Record and skip '# type: ignore' comments
while (type == TYPE_IGNORE) {
- Py_ssize_t len = end - start;
+ Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
char *tag = PyMem_Malloc(len + 1);
if (tag == NULL) {
PyErr_NoMemory();
return -1;
}
- strncpy(tag, start, len);
+ strncpy(tag, new_token.start, len);
tag[len] = '\0';
// Ownership of tag passes to the growable array
if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
PyErr_NoMemory();
return -1;
}
- type = _PyTokenizer_Get(p->tok, &start, &end);
+ type = _PyTokenizer_Get(p->tok, &new_token);
}
// If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
@@ -244,7 +238,7 @@ _PyPegen_fill_token(Parser *p)
}
Token *t = p->tokens[p->fill];
- return initialize_token(p, t, start, end, type);
+ return initialize_token(p, t, &new_token, type);
}
#if defined(Py_DEBUG)
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 95bbd43..7738cba 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -164,11 +164,10 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
Py_ssize_t current_err_line = current_token->lineno;
int ret = 0;
+ struct token new_token;
for (;;) {
- const char *start;
- const char *end;
- switch (_PyTokenizer_Get(p->tok, &start, &end)) {
+ switch (_PyTokenizer_Get(p->tok, &new_token)) {
case ERRORTOKEN:
if (p->tok->level != 0) {
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 3c37fd9..c5d3e58 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -36,6 +36,8 @@
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
+#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
+
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
@@ -1174,8 +1176,6 @@ syntaxerror_known_range(struct tok_state *tok,
return ret;
}
-
-
static int
indenterror(struct tok_state *tok)
{
@@ -1391,12 +1391,32 @@ tok_continuation_line(struct tok_state *tok) {
}
static int
-tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
+token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
+{
+ assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
+ token->level = tok->level;
+ token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
+ token->end_lineno = tok->lineno;
+ token->col_offset = -1;
+ token->end_col_offset = -1;
+ token->start = start;
+ token->end = end;
+ if (start != NULL && end != NULL) {
+ const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
+ token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
+ token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
+ }
+ return type;
+}
+
+static int
+tok_get(struct tok_state *tok, struct token *token)
{
int c;
int blankline, nonascii;
- *p_start = *p_end = NULL;
+ const char *p_start = NULL;
+ const char *p_end = NULL;
nextline:
tok->start = NULL;
blankline = 0;
@@ -1426,7 +1446,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
// the level of indentation of whatever comes next.
cont_line_col = cont_line_col ? cont_line_col : col;
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
@@ -1461,7 +1481,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
else if (col > tok->indstack[tok->indent]) {
@@ -1469,10 +1489,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (tok->indent+1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol <= tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
@@ -1488,10 +1508,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (col != tok->indstack[tok->indent]) {
tok->done = E_DEDENT;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
}
@@ -1503,11 +1523,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
- return DEDENT;
+ return MAKE_TOKEN(DEDENT);
}
else {
tok->pendin--;
- return INDENT;
+ return MAKE_TOKEN(INDENT);
}
}
@@ -1587,34 +1607,34 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
if (is_type_ignore) {
- *p_start = ignore_end;
- *p_end = tok->cur;
+ p_start = ignore_end;
+ p_end = tok->cur;
/* If this type ignore is the only thing on the line, consume the newline also. */
if (blankline) {
tok_nextc(tok);
tok->atbol = 1;
}
- return TYPE_IGNORE;
+ return MAKE_TOKEN(TYPE_IGNORE);
} else {
- *p_start = type_start; /* after type_comment_prefix */
- *p_end = tok->cur;
- return TYPE_COMMENT;
+ p_start = type_start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(TYPE_COMMENT);
}
}
}
}
if (tok->done == E_INTERACT_STOP) {
- return ENDMARKER;
+ return MAKE_TOKEN(ENDMARKER);
}
/* Check for EOF and errors now */
if (c == EOF) {
if (tok->level) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
+ return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
}
/* Identifier (most frequent token!) */
@@ -1654,11 +1674,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
tok_backup(tok, c);
if (nonascii && !verify_identifier(tok)) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- *p_start = tok->start;
- *p_end = tok->cur;
+ p_start = tok->start;
+ p_end = tok->cur;
/* async/await parsing block. */
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
@@ -1673,10 +1693,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (!tok->async_hacks || tok->async_def) {
/* Always recognize the keywords. */
if (memcmp(tok->start, "async", 5) == 0) {
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
if (memcmp(tok->start, "await", 5) == 0) {
- return AWAIT;
+ return MAKE_TOKEN(AWAIT);
}
}
else if (memcmp(tok->start, "async", 5) == 0) {
@@ -1684,13 +1704,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
Look ahead one token to see if that is 'def'. */
struct tok_state ahead_tok;
- const char *ahead_tok_start = NULL;
- const char *ahead_tok_end = NULL;
+ struct token ahead_token;
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_tok_end);
+ ahead_tok_kind = tok_get(&ahead_tok, &ahead_token);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@@ -1700,12 +1718,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
}
}
- return NAME;
+ return MAKE_TOKEN(NAME);
}
/* Newline */
@@ -1714,15 +1732,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (blankline || tok->level > 0) {
goto nextline;
}
- *p_start = tok->start;
- *p_end = tok->cur - 1; /* Leave '\n' out of the string */
+ p_start = tok->start;
+ p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->async_def) {
/* We're somewhere inside an 'async def' function, and
we've encountered a NEWLINE after its signature. */
tok->async_def_nl = 1;
}
- return NEWLINE;
+ return MAKE_TOKEN(NEWLINE);
}
/* Period or number starting with period? */
@@ -1733,9 +1751,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
} else if (c == '.') {
c = tok_nextc(tok);
if (c == '.') {
- *p_start = tok->start;
- *p_end = tok->cur;
- return ELLIPSIS;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(ELLIPSIS);
}
else {
tok_backup(tok, c);
@@ -1745,9 +1763,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else {
tok_backup(tok, c);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return DOT;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(DOT);
}
/* Number */
@@ -1764,14 +1782,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
if (!isxdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid hexadecimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
}
do {
c = tok_nextc(tok);
} while (isxdigit(c));
} while (c == '_');
if (!verify_end_of_number(tok, c, "hexadecimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'o' || c == 'O') {
@@ -1783,12 +1801,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
if (c < '0' || c >= '8') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid octal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
}
}
do {
@@ -1796,11 +1814,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
} while ('0' <= c && c < '8');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
if (!verify_end_of_number(tok, c, "octal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'b' || c == 'B') {
@@ -1812,12 +1830,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
if (c != '0' && c != '1') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid binary literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
}
}
do {
@@ -1825,11 +1842,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
} while (c == '0' || c == '1');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
if (!verify_end_of_number(tok, c, "binary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
@@ -1841,7 +1857,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
}
if (c != '0') {
@@ -1854,7 +1870,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
nonzero = 1;
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == '.') {
@@ -1870,15 +1886,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else if (nonzero) {
/* Old-style octal: now disallowed. */
tok_backup(tok, c);
- return syntaxerror_known_range(
+ return MAKE_TOKEN(syntaxerror_known_range(
tok, (int)(tok->start + 1 - tok->line_start),
(int)(zeros_end - tok->line_start),
"leading zeros in decimal integer "
"literals are not permitted; "
- "use an 0o prefix for octal integers");
+ "use an 0o prefix for octal integers"));
}
if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
@@ -1886,7 +1902,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* Decimal */
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
{
/* Accept floating point numbers. */
@@ -1897,7 +1913,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (isdigit(c)) {
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
@@ -1911,21 +1927,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
if (!verify_end_of_number(tok, e, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok_backup(tok, e);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == 'j' || c == 'J') {
@@ -1933,18 +1949,18 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
imaginary:
c = tok_nextc(tok);
if (!verify_end_of_number(tok, c, "imaginary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
tok_backup(tok, c);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
letter_quote:
@@ -1997,7 +2013,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (c != '\n') {
tok->done = E_EOFS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
else {
syntaxerror(tok, "unterminated string literal (detected at"
@@ -2005,7 +2021,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (c != '\n') {
tok->done = E_EOLS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == quote) {
@@ -2019,15 +2035,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return STRING;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(STRING);
}
/* Line continuation */
if (c == '\\') {
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok->cont_line = 1;
goto again; /* Read next line */
@@ -2036,19 +2052,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
/* Check for two-character token */
{
int c2 = tok_nextc(tok);
- int token = _PyToken_TwoChars(c, c2);
- if (token != OP) {
+ int current_token = _PyToken_TwoChars(c, c2);
+ if (current_token != OP) {
int c3 = tok_nextc(tok);
- int token3 = _PyToken_ThreeChars(c, c2, c3);
- if (token3 != OP) {
- token = token3;
+ int current_token3 = _PyToken_ThreeChars(c, c2, c3);
+ if (current_token3 != OP) {
+ current_token = current_token3;
}
else {
tok_backup(tok, c3);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return token;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(current_token);
}
tok_backup(tok, c2);
}
@@ -2059,7 +2075,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
case '[':
case '{':
if (tok->level >= MAXLEVEL) {
- return syntaxerror(tok, "too many nested parentheses");
+ return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
}
tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno;
@@ -2070,7 +2086,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
case ']':
case '}':
if (!tok->level) {
- return syntaxerror(tok, "unmatched '%c'", c);
+ return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
}
tok->level--;
int opening = tok->parenstack[tok->level];
@@ -2079,16 +2095,16 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
(opening == '{' && c == '}')))
{
if (tok->parenlinenostack[tok->level] != tok->lineno) {
- return syntaxerror(tok,
+ return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
"opening parenthesis '%c' on line %d",
- c, opening, tok->parenlinenostack[tok->level]);
+ c, opening, tok->parenlinenostack[tok->level]));
}
else {
- return syntaxerror(tok,
+ return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
"opening parenthesis '%c'",
- c, opening);
+ c, opening));
}
}
break;
@@ -2097,20 +2113,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
if (!Py_UNICODE_ISPRINTABLE(c)) {
char hex[9];
(void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
- return syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
}
/* Punctuation character */
- *p_start = tok->start;
- *p_end = tok->cur;
- return _PyToken_OneChar(c);
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(_PyToken_OneChar(c));
}
int
-_PyTokenizer_Get(struct tok_state *tok,
- const char **p_start, const char **p_end)
+_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
- int result = tok_get(tok, p_start, p_end);
+ int result = tok_get(tok, token);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
@@ -2166,8 +2181,6 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
struct tok_state *tok;
FILE *fp;
- const char *p_start = NULL;
- const char *p_end = NULL;
char *encoding = NULL;
fp = fdopen_borrow(fd);
@@ -2191,8 +2204,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
return encoding;
}
}
+ struct token token;
while (tok->lineno < 2 && tok->done == E_OK) {
- _PyTokenizer_Get(tok, &p_start, &p_end);
+ _PyTokenizer_Get(tok, &token);
}
fclose(fp);
if (tok->encoding) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 5ac64a9..5b8c7f3 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -27,6 +27,12 @@ enum interactive_underflow_t {
IUNDERFLOW_STOP,
};
+struct token {
+ int level;
+ int lineno, col_offset, end_lineno, end_col_offset;
+ const char *start, *end;
+};
+
/* Tokenizer state */
struct tok_state {
/* Input state; buf <= cur <= inp <= end */
@@ -94,7 +100,7 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
-extern int _PyTokenizer_Get(struct tok_state *, const char **, const char **);
+extern int _PyTokenizer_Get(struct tok_state *, struct token *);
#define tok_dump _Py_tok_dump
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index c5124a6..8daa987 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -60,9 +60,8 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
- const char *start;
- const char *end;
- int type = _PyTokenizer_Get(it->tok, &start, &end);
+ struct token token;
+ int type = _PyTokenizer_Get(it->tok, &token);
if (type == ERRORTOKEN && PyErr_Occurred()) {
return NULL;
}
@@ -71,11 +70,11 @@ tokenizeriter_next(tokenizeriterobject *it)
return NULL;
}
PyObject *str = NULL;
- if (start == NULL || end == NULL) {
+ if (token.start == NULL || token.end == NULL) {
str = PyUnicode_FromString("");
}
else {
- str = PyUnicode_FromStringAndSize(start, end - start);
+ str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
}
if (str == NULL) {
return NULL;
@@ -92,11 +91,11 @@ tokenizeriter_next(tokenizeriterobject *it)
int end_lineno = it->tok->lineno;
int col_offset = -1;
int end_col_offset = -1;
- if (start != NULL && start >= line_start) {
- col_offset = (int)(start - line_start);
+ if (token.start != NULL && token.start >= line_start) {
+ col_offset = (int)(token.start - line_start);
}
- if (end != NULL && end >= it->tok->line_start) {
- end_col_offset = (int)(end - it->tok->line_start);
+ if (token.end != NULL && token.end >= it->tok->line_start) {
+ end_col_offset = (int)(token.end - it->tok->line_start);
}
return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);