diff options
-rw-r--r-- | Lib/test/test_tokenize.py | 21 | ||||
-rw-r--r-- | Lib/tokenize.py | 17 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
3 files changed, 40 insertions, 1 deletion
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index b4e114c..42fc78f 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -5,6 +5,8 @@ The tests can be really simple. Given a small fragment of source code, print out a table with tokens. The ENDMARKER is omitted for brevity. + >>> import glob + >>> dump_tokens("1 + 1") ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1' (1, 0) (1, 1) @@ -835,7 +837,7 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, open as tokenize_open, Untokenizer) from io import BytesIO from unittest import TestCase, mock -import os, sys, glob +import os import token def dump_tokens(s): @@ -1427,6 +1429,22 @@ class UntokenizeTest(TestCase): self.assertEqual(untokenize(iter(tokens)), b'Hello ') +class TestRoundtrip(TestCase): + def roundtrip(self, code): + if isinstance(code, str): + code = code.encode('utf-8') + return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8') + + def test_indentation_semantics_retained(self): + """ + Ensure that although whitespace might be mutated in a roundtrip, + the semantic meaning of the indentation remains consistent. 
+ """ + code = "if False:\n\tx=3\n\tx=3\n" + codelines = self.roundtrip(code).split('\n') + self.assertEqual(codelines[1], codelines[2]) + + __test__ = {"doctests" : doctests, 'decistmt': decistmt} def test_main(): @@ -1437,6 +1455,7 @@ def test_main(): support.run_unittest(TestDetectEncoding) support.run_unittest(TestTokenize) support.run_unittest(UntokenizeTest) + support.run_unittest(TestRoundtrip) if __name__ == "__main__": test_main() diff --git a/Lib/tokenize.py b/Lib/tokenize.py index f58c286..3ec9018 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -244,6 +244,8 @@ class Untokenizer: def untokenize(self, iterable): it = iter(iterable) + indents = [] + startline = False for t in it: if len(t) == 2: self.compat(t, it) @@ -254,6 +256,21 @@ class Untokenizer: continue if tok_type == ENDMARKER: break + if tok_type == INDENT: + indents.append(token) + continue + elif tok_type == DEDENT: + indents.pop() + self.prev_row, self.prev_col = end + continue + elif tok_type in (NEWLINE, NL): + startline = True + elif startline and indents: + indent = indents[-1] + if start[1] >= len(indent): + self.tokens.append(indent) + self.prev_col = len(indent) + startline = False self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end @@ -47,6 +47,9 @@ Core and Builtins Library ------- +- Issue #20387: Restore semantic round-trip correctness in tokenize/untokenize + for tab-indented blocks. + - Issue #24456: Fixed possible buffer over-read in adpcm2lin() and lin2adpcm() functions of the audioop module. |