summary refs log tree commit diff stats
path: root/Lib/test/test_tokenize.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test/test_tokenize.py')
-rw-r--r-- Lib/test/test_tokenize.py 27
1 files changed, 27 insertions, 0 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 75710db..480bff7 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,4 +1,5 @@
import os
+import re
import token
import tokenize
import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
def contains_ambiguous_backslash(source):
    """Return `True` if the source contains a backslash on a
    line by itself. For example:

        a = (1
            \\
        )

    Code like this cannot be untokenized exactly. This is because
    the tokenizer does not produce any tokens for the line containing
    the backslash and so there is no way to know its indent.

    *source* is the program text as a bytes object. Lines may end in
    either ``\\n`` or ``\\r\\n``. The backslash-only line is detected
    wherever it occurs, including when it is the very first line of
    the source (where there is no preceding newline to anchor on).
    """
    # A "line start" is either the beginning of the input or the position
    # right after a newline.  Anchoring only on b'\n' would miss a
    # backslash-only first line, whose indent is just as unrecoverable.
    pattern = re.compile(br'(?:\A|\n)\s*\\\r?\n')
    return pattern.search(source) is not None
+
+
class TestRoundtrip(TestCase):
def check_roundtrip(self, f):
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
tokenize.untokenize(), and the latter tokenized again to 2-tuples.
The test fails if the 3 pair tokenizations do not match.
+ If the source code can be untokenized unambiguously, the
+ untokenized code must match the original code exactly.
+
When untokenize bugs are fixed, untokenize with 5-tuples should
reproduce code that does not contain a backslash continuation
following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ class TestRoundtrip(TestCase):
tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
self.assertEqual(tokens2_from5, tokens2)
+ if not contains_ambiguous_backslash(code):
+ # The BOM does not produce a token so there is no way to preserve it.
+ code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+ readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+ untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+ self.assertEqual(code_without_bom, untokenized_code)
+
def check_line_extraction(self, f):
if isinstance(f, str):
code = f.encode('utf-8')