summary | refs | log | tree | commit | diff | stats
path: root/Lib/test/test_tokenize.py
diff options
context:
space:
mode:
author: Benjamin Peterson <benjamin@python.org> 2009-10-09 21:43:09 (GMT)
committer: Benjamin Peterson <benjamin@python.org> 2009-10-09 21:43:09 (GMT)
commit: d3afadaa4908df544e0181c11199e59b1bfb5c37 (patch)
tree: 5b214ec4a85f64411b50dd40499bf9a7691d4a5f /Lib/test/test_tokenize.py
parent: ffc08fcad6d91a50224914e94eae6505b2e55548 (diff)
download: cpython-d3afadaa4908df544e0181c11199e59b1bfb5c37.zip
cpython-d3afadaa4908df544e0181c11199e59b1bfb5c37.tar.gz
cpython-d3afadaa4908df544e0181c11199e59b1bfb5c37.tar.bz2
normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does
Diffstat (limited to 'Lib/test/test_tokenize.py')
-rw-r--r-- Lib/test/test_tokenize.py 30
1 file changed, 29 insertions, 1 deletion
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index f395ed4..ba705ba 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
b'do_something(else)\n'
)
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
- self.assertEquals(encoding, 'latin-1')
+ self.assertEquals(encoding, 'iso-8859-1')
self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
def test_matched_bom_and_cookie_first_line(self):
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
readline = self.get_readline(lines)
self.assertRaises(SyntaxError, detect_encoding, readline)
+ def test_latin1_normalization(self):
+ # See get_normal_name() in tokenizer.c.
+ encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+ "iso-8859-1-unix", "iso-latin-1-mac")
+ for encoding in encodings:
+ for rep in ("-", "_"):
+ enc = encoding.replace("-", rep)
+ lines = (b"#!/usr/bin/python\n",
+ b"# coding: " + enc.encode("ascii") + b"\n",
+ b"print(things)\n",
+ b"do_something += 4\n")
+ rl = self.get_readline(lines)
+ found, consumed_lines = detect_encoding(rl)
+ self.assertEquals(found, "iso-8859-1")
+
+ def test_utf8_normalization(self):
+ # See get_normal_name() in tokenizer.c.
+ encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+ for encoding in encodings:
+ for rep in ("-", "_"):
+ enc = encoding.replace("-", rep)
+ lines = (b"#!/usr/bin/python\n",
+ b"# coding: " + enc.encode("ascii") + b"\n",
+ b"1 + 3\n")
+ rl = self.get_readline(lines)
+ found, consumed_lines = detect_encoding(rl)
+ self.assertEquals(found, "utf-8")
+
def test_short_files(self):
readline = self.get_readline((b'print(something)\n',))
encoding, consumed_lines = detect_encoding(readline)