author     Marta Gómez Macías <mgmacias@google.com>    2023-05-21 00:03:02 (GMT)
committer  GitHub <noreply@github.com>                 2023-05-21 00:03:02 (GMT)
commit     6715f91edcf6f379f666e18f57b8a0dcb724bf79 (patch)
tree       25724d6eb5b8ff5e713f7bfd8f6c33e5a6d87f62
parent     3ed57e4995d9f8583083483f397ddc3131720953 (diff)
download   cpython-6715f91edcf6f379f666e18f57b8a0dcb724bf79.zip
           cpython-6715f91edcf6f379f666e18f57b8a0dcb724bf79.tar.gz
           cpython-6715f91edcf6f379f666e18f57b8a0dcb724bf79.tar.bz2
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with an implementation that reuses the real C tokenizer via a private extension module. The tokenize module now implements a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward compatibility.

As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer that is currently only used through the extension module exposing it to the Python layer. This mode forces the C tokenizer to emit these extra tokens and to add the metadata needed to match the old Python implementation.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
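For illustration only (not part of this commit), a minimal sketch of the backward-compatible behaviour the compatibility layer preserves; the sample source below is made up:

    # Sketch: the public tokenize API still emits COMMENT and NL tokens,
    # now produced by the C tokenizer in its new extra-tokens mode rather
    # than by the old regex-based pure-Python tokenizer.
    import io
    import tokenize

    source = b"# heading\nx = 1  # trailing comment\n"
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)
    # Both COMMENT and NL entries appear in the output, as they did with the
    # previous pure-Python implementation.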
-rw-r--r--  Doc/library/token-list.inc | 4
-rw-r--r--  Doc/library/token.rst | 2
-rw-r--r--  Grammar/Tokens | 4
-rw-r--r--  Include/internal/pycore_global_objects_fini_generated.h | 1
-rw-r--r--  Include/internal/pycore_global_strings.h | 1
-rw-r--r--  Include/internal/pycore_runtime_init_generated.h | 1
-rw-r--r--  Include/internal/pycore_token.h | 4
-rw-r--r--  Include/internal/pycore_unicodeobject_generated.h | 3
-rw-r--r--  Lib/inspect.py | 4
-rwxr-xr-x  Lib/tabnanny.py | 10
-rw-r--r--  Lib/test/test_tabnanny.py | 4
-rw-r--r--  Lib/test/test_tokenize.py | 183
-rw-r--r--  Lib/token.py | 6
-rw-r--r--  Lib/tokenize.py | 339
-rw-r--r--  Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst | 1
-rw-r--r--  Parser/pegen.c | 4
-rw-r--r--  Parser/pegen_errors.c | 4
-rw-r--r--  Parser/token.c | 4
-rw-r--r--  Parser/tokenizer.c | 57
-rw-r--r--  Parser/tokenizer.h | 4
-rw-r--r--  Python/Python-tokenize.c | 140
-rw-r--r--  Python/clinic/Python-tokenize.c.h | 22
22 files changed, 426 insertions(+), 376 deletions(-)
diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
index 3b34509..e885de8 100644
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@@ -223,6 +223,10 @@
.. data:: FSTRING_END
+.. data:: COMMENT
+
+.. data:: NL
+
.. data:: ERRORTOKEN
.. data:: N_TOKENS
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index a1aceba..903847b 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -50,11 +50,13 @@ The following token type values aren't used by the C tokenizer but are needed fo
the :mod:`tokenize` module.
.. data:: COMMENT
+ :noindex:
Token value used to indicate a comment.
.. data:: NL
+ :noindex:
Token value used to indicate a non-terminating newline. The
:data:`NEWLINE` token indicates the end of a logical line of Python code;
diff --git a/Grammar/Tokens b/Grammar/Tokens
index 096876f..618ae81 100644
--- a/Grammar/Tokens
+++ b/Grammar/Tokens
@@ -64,9 +64,9 @@ SOFT_KEYWORD
FSTRING_START
FSTRING_MIDDLE
FSTRING_END
+COMMENT
+NL
ERRORTOKEN
# These aren't used by the C tokenizer but are needed for tokenize.py
-COMMENT
-NL
ENCODING
diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h
index 8ca3545..5a1993e 100644
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@@ -918,6 +918,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend));
+ _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false));
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h
index 8e429bb..6196787 100644
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@@ -406,6 +406,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(exception)
STRUCT_FOR_ID(exp)
STRUCT_FOR_ID(extend)
+ STRUCT_FOR_ID(extra_tokens)
STRUCT_FOR_ID(facility)
STRUCT_FOR_ID(factory)
STRUCT_FOR_ID(false)
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
index 3edf076..59ec49a 100644
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -912,6 +912,7 @@ extern "C" {
INIT_ID(exception), \
INIT_ID(exp), \
INIT_ID(extend), \
+ INIT_ID(extra_tokens), \
INIT_ID(facility), \
INIT_ID(factory), \
INIT_ID(false), \
diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h
index b9df876..c02e637 100644
--- a/Include/internal/pycore_token.h
+++ b/Include/internal/pycore_token.h
@@ -77,7 +77,9 @@ extern "C" {
#define FSTRING_START 61
#define FSTRING_MIDDLE 62
#define FSTRING_END 63
-#define ERRORTOKEN 64
+#define COMMENT 64
+#define NL 65
+#define ERRORTOKEN 66
#define N_TOKENS 68
#define NT_OFFSET 256
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h
index 0e1f717..8f8a067 100644
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@@ -1059,6 +1059,9 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
string = &_Py_ID(extend);
assert(_PyUnicode_CheckConsistency(string, 1));
_PyUnicode_InternInPlace(interp, &string);
+ string = &_Py_ID(extra_tokens);
+ assert(_PyUnicode_CheckConsistency(string, 1));
+ _PyUnicode_InternInPlace(interp, &string);
string = &_Py_ID(facility);
assert(_PyUnicode_CheckConsistency(string, 1));
_PyUnicode_InternInPlace(interp, &string);
diff --git a/Lib/inspect.py b/Lib/inspect.py
index 63f5aa9..7709a95 100644
--- a/Lib/inspect.py
+++ b/Lib/inspect.py
@@ -2187,7 +2187,7 @@ def _signature_strip_non_python_syntax(signature):
if string == ',':
current_parameter += 1
- if (type == ERRORTOKEN) and (string == '$'):
+ if (type == OP) and (string == '$'):
assert self_parameter is None
self_parameter = current_parameter
continue
@@ -2195,7 +2195,7 @@ def _signature_strip_non_python_syntax(signature):
add(string)
if (string == ','):
add(' ')
- clean_signature = ''.join(text)
+ clean_signature = ''.join(text).strip()
return clean_signature, self_parameter
diff --git a/Lib/tabnanny.py b/Lib/tabnanny.py
index 9d2df59..e2ac683 100755
--- a/Lib/tabnanny.py
+++ b/Lib/tabnanny.py
@@ -107,6 +107,10 @@ def check(file):
errprint("%r: Token Error: %s" % (file, msg))
return
+ except SyntaxError as msg:
+ errprint("%r: Token Error: %s" % (file, msg))
+ return
+
except IndentationError as msg:
errprint("%r: Indentation Error: %s" % (file, msg))
return
@@ -272,6 +276,12 @@ def format_witnesses(w):
return prefix + " " + ', '.join(firsts)
def process_tokens(tokens):
+ try:
+ _process_tokens(tokens)
+ except TabError as e:
+ raise NannyNag(e.lineno, e.msg, e.text)
+
+def _process_tokens(tokens):
INDENT = tokenize.INDENT
DEDENT = tokenize.DEDENT
NEWLINE = tokenize.NEWLINE
diff --git a/Lib/test/test_tabnanny.py b/Lib/test/test_tabnanny.py
index afb8da7..dac4731 100644
--- a/Lib/test/test_tabnanny.py
+++ b/Lib/test/test_tabnanny.py
@@ -223,7 +223,7 @@ class TestCheck(TestCase):
with TemporaryPyFile(SOURCE_CODES["nannynag_errored"]) as file_path:
out = f"{file_path!r}: *** Line 3: trouble in tab city! ***\n"
out += "offending line: '\\tprint(\"world\")\\n'\n"
- out += "indent not equal e.g. at tab size 1\n"
+ out += "inconsistent use of tabs and spaces in indentation\n"
tabnanny.verbose = 1
self.verify_tabnanny_check(file_path, out=out)
@@ -315,7 +315,7 @@ class TestCommandLine(TestCase):
def test_with_errored_file(self):
"""Should displays error when errored python file is given."""
with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path:
- stderr = f"{file_path!r}: Indentation Error: "
+ stderr = f"{file_path!r}: Token Error: "
stderr += ('unindent does not match any outer indentation level'
' (<tokenize>, line 3)')
self.validate_cmd(file_path, stderr=stderr, expect_failure=True)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 911b53e..dda7243 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3,7 +3,7 @@ from test.support import os_helper
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
- NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
+ NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
@@ -82,7 +82,7 @@ class TokenizeTest(TestCase):
NAME 'False' (4, 11) (4, 16)
COMMENT '# NEWLINE' (4, 17) (4, 26)
NEWLINE '\\n' (4, 26) (4, 27)
- DEDENT '' (5, 0) (5, 0)
+ DEDENT '' (4, 27) (4, 27)
""")
indent_error_file = b"""\
def k(x):
@@ -230,6 +230,10 @@ def k(x):
continue
self.assertEqual(number_token(lit), lit)
for lit in INVALID_UNDERSCORE_LITERALS:
+ try:
+ number_token(lit)
+ except SyntaxError:
+ continue
self.assertNotEqual(number_token(lit), lit)
def test_string(self):
@@ -381,21 +385,119 @@ c"""', """\
STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
""")
self.check_tokenize('f"abc"', """\
- STRING 'f"abc"' (1, 0) (1, 6)
+ FSTRING_START 'f"' (1, 0) (1, 2)
+ FSTRING_MIDDLE 'abc' (1, 2) (1, 5)
+ FSTRING_END '"' (1, 5) (1, 6)
""")
self.check_tokenize('fR"a{b}c"', """\
- STRING 'fR"a{b}c"' (1, 0) (1, 9)
+ FSTRING_START 'fR"' (1, 0) (1, 3)
+ FSTRING_MIDDLE 'a' (1, 3) (1, 4)
+ OP '{' (1, 4) (1, 5)
+ NAME 'b' (1, 5) (1, 6)
+ OP '}' (1, 6) (1, 7)
+ FSTRING_MIDDLE 'c' (1, 7) (1, 8)
+ FSTRING_END '"' (1, 8) (1, 9)
+ """)
+ self.check_tokenize('fR"a{{{b!r}}}c"', """\
+ FSTRING_START 'fR"' (1, 0) (1, 3)
+ FSTRING_MIDDLE 'a{' (1, 3) (1, 5)
+ OP '{' (1, 6) (1, 7)
+ NAME 'b' (1, 7) (1, 8)
+ OP '!' (1, 8) (1, 9)
+ NAME 'r' (1, 9) (1, 10)
+ OP '}' (1, 10) (1, 11)
+ FSTRING_MIDDLE '}' (1, 11) (1, 12)
+ FSTRING_MIDDLE 'c' (1, 13) (1, 14)
+ FSTRING_END '"' (1, 14) (1, 15)
+ """)
+ self.check_tokenize('f"{{{1+1}}}"', """\
+ FSTRING_START 'f"' (1, 0) (1, 2)
+ FSTRING_MIDDLE '{' (1, 2) (1, 3)
+ OP '{' (1, 4) (1, 5)
+ NUMBER '1' (1, 5) (1, 6)
+ OP '+' (1, 6) (1, 7)
+ NUMBER '1' (1, 7) (1, 8)
+ OP '}' (1, 8) (1, 9)
+ FSTRING_MIDDLE '}' (1, 9) (1, 10)
+ FSTRING_END '"' (1, 11) (1, 12)
+ """)
+ self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\
+ FSTRING_START 'f\"""' (1, 0) (1, 4)
+ OP '{' (1, 4) (1, 5)
+ FSTRING_START "f'''" (1, 5) (1, 9)
+ OP '{' (1, 9) (1, 10)
+ FSTRING_START "f'" (1, 10) (1, 12)
+ OP '{' (1, 12) (1, 13)
+ FSTRING_START 'f"' (1, 13) (1, 15)
+ OP '{' (1, 15) (1, 16)
+ NUMBER '1' (1, 16) (1, 17)
+ OP '+' (1, 17) (1, 18)
+ NUMBER '1' (1, 18) (1, 19)
+ OP '}' (1, 19) (1, 20)
+ FSTRING_END '"' (1, 20) (1, 21)
+ OP '}' (1, 21) (1, 22)
+ FSTRING_END "'" (1, 22) (1, 23)
+ OP '}' (1, 23) (1, 24)
+ FSTRING_END "'''" (1, 24) (1, 27)
+ OP '}' (1, 27) (1, 28)
+ FSTRING_END '\"""' (1, 28) (1, 31)
+ """)
+ self.check_tokenize('f""" x\nstr(data, encoding={invalid!r})\n"""', """\
+ FSTRING_START 'f\"""' (1, 0) (1, 4)
+ FSTRING_MIDDLE ' x\\nstr(data, encoding=' (1, 4) (2, 19)
+ OP '{' (2, 19) (2, 20)
+ NAME 'invalid' (2, 20) (2, 27)
+ OP '!' (2, 27) (2, 28)
+ NAME 'r' (2, 28) (2, 29)
+ OP '}' (2, 29) (2, 30)
+ FSTRING_MIDDLE ')\\n' (2, 30) (3, 0)
+ FSTRING_END '\"""' (3, 0) (3, 3)
+ """)
+ self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\
+ FSTRING_START 'f\"""' (1, 0) (1, 4)
+ FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9)
+ OP '{' (2, 9) (2, 10)
+ NAME 'None' (2, 10) (2, 14)
+ OP '}' (2, 14) (2, 15)
+ FSTRING_MIDDLE 'bad' (2, 15) (2, 18)
+ FSTRING_END '\"""' (2, 18) (2, 21)
""")
self.check_tokenize('f"""abc"""', """\
- STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
+ FSTRING_START 'f\"""' (1, 0) (1, 4)
+ FSTRING_MIDDLE 'abc' (1, 4) (1, 7)
+ FSTRING_END '\"""' (1, 7) (1, 10)
""")
self.check_tokenize(r'f"abc\
def"', """\
- STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
+ FSTRING_START 'f"' (1, 0) (1, 2)
+ FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3)
+ FSTRING_END '"' (2, 3) (2, 4)
""")
self.check_tokenize(r'Rf"abc\
def"', """\
- STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
+ FSTRING_START 'Rf"' (1, 0) (1, 3)
+ FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3)
+ FSTRING_END '"' (2, 3) (2, 4)
+ """)
+ self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\
+ FSTRING_START "f'" (1, 0) (1, 2)
+ FSTRING_MIDDLE 'some words ' (1, 2) (1, 13)
+ OP '{' (1, 13) (1, 14)
+ NAME 'a' (1, 14) (1, 15)
+ OP '+' (1, 15) (1, 16)
+ NAME 'b' (1, 16) (1, 17)
+ OP ':' (1, 17) (1, 18)
+ FSTRING_MIDDLE '.3f' (1, 18) (1, 21)
+ OP '}' (1, 21) (1, 22)
+ FSTRING_MIDDLE ' more words ' (1, 22) (1, 34)
+ OP '{' (1, 34) (1, 35)
+ NAME 'c' (1, 35) (1, 36)
+ OP '+' (1, 36) (1, 37)
+ NAME 'd' (1, 37) (1, 38)
+ OP '=' (1, 38) (1, 39)
+ OP '}' (1, 39) (1, 40)
+ FSTRING_MIDDLE ' final words' (1, 40) (1, 52)
+ FSTRING_END "'" (1, 52) (1, 53)
""")
def test_function(self):
@@ -644,8 +746,8 @@ def"', """\
NEWLINE '\\n' (2, 5) (2, 6)
INDENT ' \\t' (3, 0) (3, 9)
NAME 'pass' (3, 9) (3, 13)
- DEDENT '' (4, 0) (4, 0)
- DEDENT '' (4, 0) (4, 0)
+ DEDENT '' (3, 14) (3, 14)
+ DEDENT '' (3, 14) (3, 14)
""")
def test_non_ascii_identifiers(self):
@@ -857,7 +959,7 @@ async def foo():
NUMBER '1' (2, 17) (2, 18)
OP ':' (2, 18) (2, 19)
NAME 'pass' (2, 20) (2, 24)
- DEDENT '' (3, 0) (3, 0)
+ DEDENT '' (2, 25) (2, 25)
""")
self.check_tokenize('''async def foo(async): await''', """\
@@ -905,7 +1007,7 @@ def f():
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
- DEDENT '' (7, 0) (7, 0)
+ DEDENT '' (6, 12) (6, 12)
""")
self.check_tokenize('''\
@@ -943,7 +1045,7 @@ async def f():
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
- DEDENT '' (7, 0) (7, 0)
+ DEDENT '' (6, 12) (6, 12)
""")
class GenerateTokensTest(TokenizeTest):
@@ -968,7 +1070,7 @@ def decistmt(s):
])
else:
result.append((toknum, tokval))
- return untokenize(result).decode('utf-8')
+ return untokenize(result).decode('utf-8').strip()
class TestMisc(TestCase):
@@ -1040,33 +1142,16 @@ class Test_Tokenize(TestCase):
nonlocal first
if not first:
first = True
- return line
+ yield line
else:
- return b''
+ yield b''
# skip the initial encoding token and the end tokens
- tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
- expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+ tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+ expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")
- def test__tokenize_does_not_decode_with_encoding_none(self):
- literal = '"ЉЊЈЁЂ"'
- first = False
- def readline():
- nonlocal first
- if not first:
- first = True
- return literal
- else:
- return b''
-
- # skip the end tokens
- tokens = list(_tokenize(readline, encoding=None))[:-2]
- expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
- self.assertEqual(tokens, expected_tokens,
- "string not tokenized when encoding is None")
-
class TestDetectEncoding(TestCase):
@@ -1326,7 +1411,7 @@ class TestTokenize(TestCase):
def test_tokenize(self):
import tokenize as tokenize_module
- encoding = object()
+ encoding = "utf-8"
encoding_used = None
def mock_detect_encoding(readline):
return encoding, [b'first', b'second']
@@ -1336,7 +1421,10 @@ class TestTokenize(TestCase):
encoding_used = encoding
out = []
while True:
- next_line = readline()
+ try:
+ next_line = next(readline)
+ except StopIteration:
+ return out
if next_line:
out.append(next_line)
continue
@@ -1356,7 +1444,7 @@ class TestTokenize(TestCase):
tokenize_module._tokenize = mock__tokenize
try:
results = tokenize(mock_readline)
- self.assertEqual(list(results),
+ self.assertEqual(list(results)[1:],
[b'first', b'second', b'1', b'2', b'3', b'4'])
finally:
tokenize_module.detect_encoding = orig_detect_encoding
@@ -1652,8 +1740,8 @@ class TestRoundtrip(TestCase):
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
- with self.subTest(file=testfile):
- self.check_roundtrip(f)
+ # with self.subTest(file=testfile):
+ self.check_roundtrip(f)
def roundtrip(self, code):
@@ -2496,13 +2584,13 @@ async def f():
def test_unicode(self):
self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
- NAME 'Örter' (1, 0) (1, 6)
- EQUAL '=' (1, 7) (1, 8)
- STRING "u'places'" (1, 9) (1, 18)
- NEWLINE '' (1, 18) (1, 18)
- NAME 'grün' (2, 0) (2, 5)
- EQUAL '=' (2, 6) (2, 7)
- STRING "U'green'" (2, 8) (2, 16)
+ NAME 'Örter' (1, 0) (1, 5)
+ EQUAL '=' (1, 6) (1, 7)
+ STRING "u'places'" (1, 8) (1, 17)
+ NEWLINE '' (1, 17) (1, 17)
+ NAME 'grün' (2, 0) (2, 4)
+ EQUAL '=' (2, 5) (2, 6)
+ STRING "U'green'" (2, 7) (2, 15)
""")
def test_invalid_syntax(self):
@@ -2559,8 +2647,7 @@ async def f():
compile(valid, "<string>", "exec")
invalid = generate_source(MAXINDENT)
- tokens = list(_generate_tokens_from_c_tokenizer(invalid))
- self.assertEqual(tokens[-1].type, NEWLINE)
+ self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
self.assertRaises(
IndentationError, compile, invalid, "<string>", "exec"
)
diff --git a/Lib/token.py b/Lib/token.py
index 1459d12..487f6ed 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -67,10 +67,10 @@ SOFT_KEYWORD = 60
FSTRING_START = 61
FSTRING_MIDDLE = 62
FSTRING_END = 63
+COMMENT = 64
+NL = 65
# These aren't used by the C tokenizer but are needed for tokenize.py
-ERRORTOKEN = 64
-COMMENT = 65
-NL = 66
+ERRORTOKEN = 66
ENCODING = 67
N_TOKENS = 68
# Special definitions for cooperation with parser
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 46d2224..bfe40c6 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -56,112 +56,11 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
else:
return self.type
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
-
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
- r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-# Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
- # The valid string prefixes. Only contain the lower case versions,
- # and don't contain any permutations (include 'fr', but not
- # 'rf'). The various permutations will be generated.
- _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
- # if we add binary f-strings, add: ['fb', 'fbr']
- result = {''}
- for prefix in _valid_string_prefixes:
- for t in _itertools.permutations(prefix):
- # create a list with upper and lower versions of each
- # character
- for u in _itertools.product(*[(c, c.upper()) for c in t]):
- result.add(''.join(u))
- return result
-
-@functools.lru_cache
-def _compile(expr):
- return re.compile(expr, re.UNICODE)
-
-# Note that since _all_string_prefixes includes the empty string,
-# StringPrefix can be the empty string (making it optional).
-StringPrefix = group(*_all_string_prefixes())
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-# Single-line ' or " string.
-String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
- StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Sorting in reverse order puts the long operators before their prefixes.
-# Otherwise if = came before ==, == would get recognized as two instances
-# of =.
-Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
-Funny = group(r'\r?\n', Special)
-
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
- group("'", r'\\\r?\n'),
- StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
- group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-
-# For a given string prefix plus quotes, endpats maps it to a regex
-# to match the remainder of that string. _prefix can be empty, for
-# a normal single or triple quoted string (with no prefix).
-endpats = {}
-for _prefix in _all_string_prefixes():
- endpats[_prefix + "'"] = Single
- endpats[_prefix + '"'] = Double
- endpats[_prefix + "'''"] = Single3
- endpats[_prefix + '"""'] = Double3
-del _prefix
-
-# A set of all of the single and triple quoted string prefixes,
-# including the opening quotes.
-single_quoted = set()
-triple_quoted = set()
-for t in _all_string_prefixes():
- for u in (t + '"', t + "'"):
- single_quoted.add(u)
- for u in (t + '"""', t + "'''"):
- triple_quoted.add(u)
-del t, u
-
-tabsize = 8
class TokenError(Exception): pass
-class StopTokenizing(Exception): pass
+class StopTokenizing(Exception): pass
class Untokenizer:
@@ -213,6 +112,14 @@ class Untokenizer:
self.tokens.append(indent)
self.prev_col = len(indent)
startline = False
+ elif tok_type == FSTRING_MIDDLE:
+ if '{' in token or '}' in token:
+ end_line, end_col = end
+ end = (end_line, end_col + token.count('{') + token.count('}'))
+ token = re.sub('{', '{{', token)
+ token = re.sub('}', '}}', token)
+
+
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
@@ -255,6 +162,11 @@ class Untokenizer:
elif startline and indents:
toks_append(indents[-1])
startline = False
+ elif toknum == FSTRING_MIDDLE:
+ if '{' in tokval or '}' in tokval:
+ tokval = re.sub('{', '{{', tokval)
+ tokval = re.sub('}', '}}', tokval)
+
toks_append(tokval)
@@ -404,7 +316,6 @@ def open(filename):
buffer.close()
raise
-
def tokenize(readline):
"""
The tokenize() generator requires one argument, readline, which
@@ -425,192 +336,32 @@ def tokenize(readline):
which tells you which encoding was used to decode the bytes stream.
"""
encoding, consumed = detect_encoding(readline)
- empty = _itertools.repeat(b"")
- rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
- return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
- lnum = parenlev = continued = 0
- numchars = '0123456789'
- contstr, needcont = '', 0
- contline = None
- indents = [0]
-
+ rl_gen = _itertools.chain(consumed, iter(readline, b""))
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
- last_line = b''
- line = b''
- while True: # loop over lines in stream
- try:
- # We capture the value of the line variable here because
- # readline uses the empty string '' to signal end of input,
- # hence `line` itself will always be overwritten at the end
- # of this loop.
- last_line = line
- line = readline()
- except StopIteration:
- line = b''
-
- if encoding is not None:
- line = line.decode(encoding)
- lnum += 1
- pos, max = 0, len(line)
-
- if contstr: # continued string
- if not line:
- raise TokenError("EOF in multi-line string", strstart)
- endmatch = endprog.match(line)
- if endmatch:
- pos = end = endmatch.end(0)
- yield TokenInfo(STRING, contstr + line[:end],
- strstart, (lnum, end), contline + line)
- contstr, needcont = '', 0
- contline = None
- elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
- yield TokenInfo(ERRORTOKEN, contstr + line,
- strstart, (lnum, len(line)), contline)
- contstr = ''
- contline = None
- continue
- else:
- contstr = contstr + line
- contline = contline + line
- continue
-
- elif parenlev == 0 and not continued: # new statement
- if not line: break
- column = 0
- while pos < max: # measure leading whitespace
- if line[pos] == ' ':
- column += 1
- elif line[pos] == '\t':
- column = (column//tabsize + 1)*tabsize
- elif line[pos] == '\f':
- column = 0
- else:
- break
- pos += 1
- if pos == max:
- break
-
- if line[pos] in '#\r\n': # skip comments or blank lines
- if line[pos] == '#':
- comment_token = line[pos:].rstrip('\r\n')
- yield TokenInfo(COMMENT, comment_token,
- (lnum, pos), (lnum, pos + len(comment_token)), line)
- pos += len(comment_token)
-
- yield TokenInfo(NL, line[pos:],
- (lnum, pos), (lnum, len(line)), line)
- continue
-
- if column > indents[-1]: # count indents or dedents
- indents.append(column)
- yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
- while column < indents[-1]:
- if column not in indents:
- raise IndentationError(
- "unindent does not match any outer indentation level",
- ("<tokenize>", lnum, pos, line))
- indents = indents[:-1]
-
- yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
- else: # continued statement
- if not line:
- raise TokenError("EOF in multi-line statement", (lnum, 0))
- continued = 0
-
- while pos < max:
- pseudomatch = _compile(PseudoToken).match(line, pos)
- if pseudomatch: # scan for tokens
- start, end = pseudomatch.span(1)
- spos, epos, pos = (lnum, start), (lnum, end), end
- if start == end:
- continue
- token, initial = line[start:end], line[start]
-
- if (initial in numchars or # ordinary number
- (initial == '.' and token != '.' and token != '...')):
- yield TokenInfo(NUMBER, token, spos, epos, line)
- elif initial in '\r\n':
- if parenlev > 0:
- yield TokenInfo(NL, token, spos, epos, line)
- else:
- yield TokenInfo(NEWLINE, token, spos, epos, line)
-
- elif initial == '#':
- assert not token.endswith("\n")
- yield TokenInfo(COMMENT, token, spos, epos, line)
-
- elif token in triple_quoted:
- endprog = _compile(endpats[token])
- endmatch = endprog.match(line, pos)
- if endmatch: # all on one line
- pos = endmatch.end(0)
- token = line[start:pos]
- yield TokenInfo(STRING, token, spos, (lnum, pos), line)
- else:
- strstart = (lnum, start) # multiple lines
- contstr = line[start:]
- contline = line
- break
-
- # Check up to the first 3 chars of the token to see if
- # they're in the single_quoted set. If so, they start
- # a string.
- # We're using the first 3, because we're looking for
- # "rb'" (for example) at the start of the token. If
- # we switch to longer prefixes, this needs to be
- # adjusted.
- # Note that initial == token[:1].
- # Also note that single quote checking must come after
- # triple quote checking (above).
- elif (initial in single_quoted or
- token[:2] in single_quoted or
- token[:3] in single_quoted):
- if token[-1] == '\n': # continued string
- strstart = (lnum, start)
- # Again, using the first 3 chars of the
- # token. This is looking for the matching end
- # regex for the correct type of quote
- # character. So it's really looking for
- # endpats["'"] or endpats['"'], by trying to
- # skip string prefix characters, if any.
- endprog = _compile(endpats.get(initial) or
- endpats.get(token[1]) or
- endpats.get(token[2]))
- contstr, needcont = line[start:], 1
- contline = line
- break
- else: # ordinary string
- yield TokenInfo(STRING, token, spos, epos, line)
-
- elif initial.isidentifier(): # ordinary name
- yield TokenInfo(NAME, token, spos, epos, line)
- elif initial == '\\': # continued stmt
- continued = 1
- else:
- if initial in '([{':
- parenlev += 1
- elif initial in ')]}':
- parenlev -= 1
- yield TokenInfo(OP, token, spos, epos, line)
- else:
- yield TokenInfo(ERRORTOKEN, line[pos],
- (lnum, pos), (lnum, pos+1), line)
- pos += 1
-
- # Add an implicit NEWLINE if the input doesn't end in one
- if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
- yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
- for indent in indents[1:]: # pop remaining indent levels
- yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
- yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+ yield from _tokenize(rl_gen, encoding)
+
+def _tokenize(rl_gen, encoding):
+ source = b"".join(rl_gen).decode(encoding)
+ token = None
+ for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+ # TODO: Marta -> clean this up
+ if 6 < token.type <= 54:
+ token = token._replace(type=OP)
+ if token.type in {ASYNC, AWAIT}:
+ token = token._replace(type=NAME)
+ if token.type == NEWLINE:
+ l_start, c_start = token.start
+ l_end, c_end = token.end
+ token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
+
+ yield token
+ if token is not None:
+ last_line, _ = token.start
+ yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
def generate_tokens(readline):
@@ -619,7 +370,16 @@ def generate_tokens(readline):
This has the same API as tokenize(), except that it expects the *readline*
callable to return str objects instead of bytes.
"""
- return _tokenize(readline, None)
+ def _gen():
+ while True:
+ try:
+ line = readline()
+ except StopIteration:
+ return
+ if not line:
+ return
+ yield line.encode()
+ return _tokenize(_gen(), 'utf-8')
def main():
import argparse
@@ -656,7 +416,10 @@ def main():
tokens = list(tokenize(f.readline))
else:
filename = "<stdin>"
- tokens = _tokenize(sys.stdin.readline, None)
+ tokens = _tokenize(
+ (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
+ ), "utf-8")
+
# Output the tokenization
for token in tokens:
@@ -682,10 +445,10 @@ def main():
perror("unexpected error: %s" % err)
raise
-def _generate_tokens_from_c_tokenizer(source):
+def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
import _tokenize as c_tokenizer
- for info in c_tokenizer.TokenizerIter(source):
+ for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
tok, type, lineno, end_lineno, col_off, end_col_off, line = info
yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
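The rewritten _tokenize() above is driven entirely by this private helper. A minimal sketch of exercising the helper directly, assuming only the signature added in this diff (the input string is illustrative):

    # Sketch based on the helper above; with extra_tokens=True the C tokenizer
    # also yields COMMENT and NL, which _tokenize() then remaps for the old
    # API (exact operator types to OP, ASYNC/AWAIT to NAME).
    from tokenize import _generate_tokens_from_c_tokenizer, tok_name

    src = "# note\na = 1\n"
    for tok in _generate_tokens_from_c_tokenizer(src, extra_tokens=True):
        print(tok_name[tok.type], repr(tok.string), tok.start, tok.end)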
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst
new file mode 100644
index 0000000..ff831c9
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst
@@ -0,0 +1 @@
+Implement PEP 701 changes in the :mod:`tokenize` module. Patch by Marta Gómez Macías and Pablo Galindo Salgado
diff --git a/Parser/pegen.c b/Parser/pegen.c
index da410ea..b031a6f 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -208,7 +208,7 @@ int
_PyPegen_fill_token(Parser *p)
{
struct token new_token;
- new_token.metadata = NULL;
+ _PyToken_Init(&new_token);
int type = _PyTokenizer_Get(p->tok, &new_token);
// Record and skip '# type: ignore' comments
@@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p)
Token *t = p->tokens[p->fill];
return initialize_token(p, t, &new_token, type);
error:
- Py_XDECREF(new_token.metadata);
+ _PyToken_Free(&new_token);
return -1;
}
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 1f227da..af52905 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
int ret = 0;
struct token new_token;
- new_token.metadata = NULL;
+ _PyToken_Init(&new_token);
for (;;) {
switch (_PyTokenizer_Get(p->tok, &new_token)) {
@@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
exit:
- Py_XDECREF(new_token.metadata);
+ _PyToken_Free(&new_token);
// If we're in an f-string, we want the syntax error in the expression part
// to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
// do not swallow it.
diff --git a/Parser/token.c b/Parser/token.c
index 82267fb..2bc963a 100644
--- a/Parser/token.c
+++ b/Parser/token.c
@@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = {
"FSTRING_START",
"FSTRING_MIDDLE",
"FSTRING_END",
+ "COMMENT",
+ "NL",
"<ERRORTOKEN>",
- "<COMMENT>",
- "<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index c5dc9e7..fb94fbe 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -111,6 +111,8 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
+ tok->tok_extra_tokens = 0;
+ tok->comment_newline = 0;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
tok->tok_mode_stack_index = 0;
tok->tok_report_warnings = 1;
@@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
PyMem_Free(tok);
}
+void
+_PyToken_Free(struct token *token) {
+ Py_XDECREF(token->metadata);
+}
+
+void
+_PyToken_Init(struct token *token) {
+ token->metadata = NULL;
+}
+
static int
tok_readline_raw(struct tok_state *tok)
{
@@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
return type;
}
+
static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
@@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = -1;
blankline = 0;
+
/* Get indentation level */
if (tok->atbol) {
int col = 0;
@@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = tok->col_offset;
/* Return pending indents/dedents */
- if (tok->pendin != 0) {
+ if (tok->pendin != 0) {
if (tok->pendin < 0) {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->cur;
+ p_end = tok->cur;
+ }
tok->pendin++;
return MAKE_TOKEN(DEDENT);
}
else {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->buf;
+ p_end = tok->cur;
+ }
tok->pendin--;
return MAKE_TOKEN(INDENT);
}
@@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
}
- const char *prefix, *p, *type_start;
+ const char* p = NULL;
+ const char *prefix, *type_start;
int current_starting_col_offset;
while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}
+ if (tok->tok_extra_tokens) {
+ p = tok->start;
+ }
+
if (tok->type_comments) {
p = tok->start;
current_starting_col_offset = tok->starting_col_offset;
@@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
}
+ if (tok->tok_extra_tokens) {
+ tok_backup(tok, c); /* don't eat the newline or EOF */
+ p_start = p;
+ p_end = tok->cur;
+ tok->comment_newline = blankline;
+ return MAKE_TOKEN(COMMENT);
+ }
}
if (tok->done == E_INTERACT_STOP) {
@@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
struct tok_state ahead_tok;
struct token ahead_token;
+ _PyToken_Init(&ahead_token);
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
+ _PyToken_Free(&ahead_token);
return MAKE_TOKEN(ASYNC);
}
+ _PyToken_Free(&ahead_token);
}
}
@@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '\n') {
tok->atbol = 1;
if (blankline || tok->level > 0) {
+ if (tok->tok_extra_tokens) {
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NL);
+ }
goto nextline;
}
+ if (tok->comment_newline && tok->tok_extra_tokens) {
+ tok->comment_newline = 0;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NL);
+ }
p_start = tok->start;
p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
@@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
f_string_middle:
+ // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
+ // this.
+ tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {
@@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
// if fetching the encoding shows a warning.
tok->report_warnings = 0;
while (tok->lineno < 2 && tok->done == E_OK) {
+ _PyToken_Init(&token);
_PyTokenizer_Get(tok, &token);
+ _PyToken_Free(&token);
}
fclose(fp);
if (tok->encoding) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index fd169cf..3f34763 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -128,6 +128,8 @@ struct tok_state {
tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
int tok_mode_stack_index;
int tok_report_warnings;
+ int tok_extra_tokens;
+ int comment_newline;
#ifdef Py_DEBUG
int debug;
#endif
@@ -138,6 +140,8 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
+extern void _PyToken_Free(struct token *);
+extern void _PyToken_Init(struct token *);
extern int _PyTokenizer_Get(struct tok_state *, struct token *);
#define tok_dump _Py_tok_dump
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 3394a51..ece2386 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -1,5 +1,8 @@
#include "Python.h"
+#include "errcode.h"
#include "../Parser/tokenizer.h"
+#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
static struct PyModuleDef _tokenizemodule;
@@ -34,11 +37,14 @@ typedef struct
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
source: str
+ *
+ extra_tokens: bool
[clinic start generated code]*/
static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source)
-/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+ int extra_tokens)
+/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
{
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
if (self == NULL) {
@@ -54,20 +60,123 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
return NULL;
}
self->tok->filename = filename;
+ if (extra_tokens) {
+ self->tok->tok_extra_tokens = 1;
+ }
return (PyObject *)self;
}
+static int
+_tokenizer_error(struct tok_state *tok)
+{
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+
+ const char *msg = NULL;
+ PyObject* errtype = PyExc_SyntaxError;
+ switch (tok->done) {
+ case E_TOKEN:
+ msg = "invalid token";
+ break;
+ case E_EOF:
+ if (tok->level) {
+ PyErr_Format(PyExc_SyntaxError,
+ "parenthesis '%c' was never closed",
+ tok->parenstack[tok->level-1]);
+ } else {
+ PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
+ }
+ return -1;
+ case E_DEDENT:
+ PyErr_Format(PyExc_IndentationError,
+ "unindent does not match any outer indentation level "
+ "(<tokenize>, line %d)",
+ tok->lineno);
+ return -1;
+ case E_INTR:
+ if (!PyErr_Occurred()) {
+ PyErr_SetNone(PyExc_KeyboardInterrupt);
+ }
+ return -1;
+ case E_NOMEM:
+ PyErr_NoMemory();
+ return -1;
+ case E_TABSPACE:
+ errtype = PyExc_TabError;
+ msg = "inconsistent use of tabs and spaces in indentation";
+ break;
+ case E_TOODEEP:
+ errtype = PyExc_IndentationError;
+ msg = "too many levels of indentation";
+ break;
+ case E_LINECONT: {
+ msg = "unexpected character after line continuation character";
+ break;
+ }
+ default:
+ msg = "unknown tokenization error";
+ }
+
+ PyObject* errstr = NULL;
+ PyObject* error_line = NULL;
+ PyObject* tmp = NULL;
+ PyObject* value = NULL;
+ int result = 0;
+
+ Py_ssize_t size = tok->inp - tok->buf;
+ error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
+ if (!error_line) {
+ result = -1;
+ goto exit;
+ }
+
+ tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0);
+ if (!tmp) {
+ result = -1;
+ goto exit;
+ }
+
+ errstr = PyUnicode_FromString(msg);
+ if (!errstr) {
+ result = -1;
+ goto exit;
+ }
+
+ value = PyTuple_Pack(2, errstr, tmp);
+ if (!value) {
+ result = -1;
+ goto exit;
+ }
+
+ PyErr_SetObject(errtype, value);
+
+exit:
+ Py_XDECREF(errstr);
+ Py_XDECREF(error_line);
+ Py_XDECREF(tmp);
+ Py_XDECREF(value);
+ return result;
+}
+
static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
+ PyObject* result = NULL;
struct token token;
+ _PyToken_Init(&token);
+
int type = _PyTokenizer_Get(it->tok, &token);
- if (type == ERRORTOKEN && PyErr_Occurred()) {
- return NULL;
+ if (type == ERRORTOKEN) {
+ if(!PyErr_Occurred()) {
+ _tokenizer_error(it->tok);
+ assert(PyErr_Occurred());
+ }
+ goto exit;
}
if (type == ERRORTOKEN || type == ENDMARKER) {
PyErr_SetString(PyExc_StopIteration, "EOF");
- return NULL;
+ goto exit;
}
PyObject *str = NULL;
if (token.start == NULL || token.end == NULL) {
@@ -77,28 +186,31 @@ tokenizeriter_next(tokenizeriterobject *it)
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
}
if (str == NULL) {
- return NULL;
+ goto exit;
}
Py_ssize_t size = it->tok->inp - it->tok->buf;
PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
if (line == NULL) {
Py_DECREF(str);
- return NULL;
+ goto exit;
}
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
- int lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
- int end_lineno = it->tok->lineno;
- int col_offset = -1;
- int end_col_offset = -1;
+ Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
+ Py_ssize_t end_lineno = it->tok->lineno;
+ Py_ssize_t col_offset = -1;
+ Py_ssize_t end_col_offset = -1;
if (token.start != NULL && token.start >= line_start) {
- col_offset = (int)(token.start - line_start);
+ col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
}
if (token.end != NULL && token.end >= it->tok->line_start) {
- end_col_offset = (int)(token.end - it->tok->line_start);
+ end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
}
- return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+ result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+exit:
+ _PyToken_Free(&token);
+ return result;
}
static void
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h
index 6af9374..7e77938 100644
--- a/Python/clinic/Python-tokenize.c.h
+++ b/Python/clinic/Python-tokenize.c.h
@@ -9,7 +9,8 @@ preserve
static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source);
+tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+ int extra_tokens);
static PyObject *
tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
@@ -17,14 +18,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
- #define NUM_KEYWORDS 1
+ #define NUM_KEYWORDS 2
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
- .ob_item = { &_Py_ID(source), },
+ .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -33,19 +34,20 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
- static const char * const _keywords[] = {"source", NULL};
+ static const char * const _keywords[] = {"source", "extra_tokens", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "tokenizeriter",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
- PyObject *argsbuf[1];
+ PyObject *argsbuf[2];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
const char *source;
+ int extra_tokens;
- fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
+ fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
if (!fastargs) {
goto exit;
}
@@ -62,9 +64,13 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_ValueError, "embedded null character");
goto exit;
}
- return_value = tokenizeriter_new_impl(type, source);
+ extra_tokens = PyObject_IsTrue(fastargs[1]);
+ if (extra_tokens < 0) {
+ goto exit;
+ }
+ return_value = tokenizeriter_new_impl(type, source, extra_tokens);
exit:
return return_value;
}
-/*[clinic end generated code: output=8c2c09f651961986 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/