summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPablo Galindo Salgado <Pablogsal@gmail.com>2023-05-26 14:46:22 (GMT)
committerGitHub <noreply@github.com>2023-05-26 14:46:22 (GMT)
commit3fdb55c48291a459fb1e33edb5140ec0383222df (patch)
tree4c55319f16464b4fde991419d6f51583c4e8628f
parent2cb445635e99d4401949cabebd373288cfdd0138 (diff)
downloadcpython-3fdb55c48291a459fb1e33edb5140ec0383222df.zip
cpython-3fdb55c48291a459fb1e33edb5140ec0383222df.tar.gz
cpython-3fdb55c48291a459fb1e33edb5140ec0383222df.tar.bz2
gh-104972: Ensure that line attributes in tokens in the tokenize module are correct (#104975)
-rw-r--r--Lib/idlelib/idle_test/test_editor.py4
-rw-r--r--Lib/test/test_tokenize.py15
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst2
-rw-r--r--Python/Python-tokenize.c9
4 files changed, 21 insertions, 9 deletions
diff --git a/Lib/idlelib/idle_test/test_editor.py b/Lib/idlelib/idle_test/test_editor.py
index ba59c40..9296a6d 100644
--- a/Lib/idlelib/idle_test/test_editor.py
+++ b/Lib/idlelib/idle_test/test_editor.py
@@ -201,8 +201,8 @@ class IndentSearcherTest(unittest.TestCase):
test_info = (# text, (block, indent))
("", (None, None)),
("[1,", (None, None)), # TokenError
- ("if 1:\n", ('if 1:', None)),
- ("if 1:\n 2\n 3\n", ('if 1:', ' 2')),
+ ("if 1:\n", ('if 1:\n', None)),
+ ("if 1:\n 2\n 3\n", ('if 1:\n', ' 2\n')),
)
for code, expected_pair in test_info:
with self.subTest(code=code):
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 251ce2b..0b7c258 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1174,7 +1174,7 @@ class Test_Tokenize(TestCase):
# skip the initial encoding token and the end tokens
tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
- expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+ expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")
@@ -1657,7 +1657,6 @@ class TestRoundtrip(TestCase):
code = f.encode('utf-8')
else:
code = f.read()
- f.close()
readline = iter(code.splitlines(keepends=True)).__next__
tokens5 = list(tokenize(readline))
tokens2 = [tok[:2] for tok in tokens5]
@@ -1672,6 +1671,17 @@ class TestRoundtrip(TestCase):
tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
self.assertEqual(tokens2_from5, tokens2)
+ def check_line_extraction(self, f):
+ if isinstance(f, str):
+ code = f.encode('utf-8')
+ else:
+ code = f.read()
+ readline = iter(code.splitlines(keepends=True)).__next__
+ for tok in tokenize(readline):
+ if tok.type in {ENCODING, ENDMARKER}:
+ continue
+ self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])
+
def test_roundtrip(self):
# There are some standard formatting practices that are easy to get right.
@@ -1768,6 +1778,7 @@ class TestRoundtrip(TestCase):
with open(testfile, 'rb') as f:
# with self.subTest(file=testfile):
self.check_roundtrip(f)
+ self.check_line_extraction(f)
def roundtrip(self, code):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst
new file mode 100644
index 0000000..05d50c1
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst
@@ -0,0 +1,2 @@
+Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in
+the :mod:`tokenize` module is always correct. Patch by Pablo Galindo.
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 0023e30..88087c1 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it)
goto exit;
}
- Py_ssize_t size = it->tok->inp - it->tok->buf;
- assert(it->tok->buf[size-1] == '\n');
- size -= 1; // Remove the newline character from the end of the line
- PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+ const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+ Py_ssize_t size = it->tok->inp - line_start;
+ PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
if (line == NULL) {
Py_DECREF(str);
goto exit;
}
- const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
Py_ssize_t end_lineno = it->tok->lineno;
Py_ssize_t col_offset = -1;