author      Trent Nelson <trent.nelson@snakebite.org>    2008-03-18 22:41:35 (GMT)
committer   Trent Nelson <trent.nelson@snakebite.org>    2008-03-18 22:41:35 (GMT)
commit      428de65ca99492436130165bfbaeb56d6d1daec7 (patch)
tree        d6c11516a28d8ca658e1f35ac6d7cc802958e336 /Lib/test/test_tokenize.py
parent      112367a980481d54f8c21802ee2538a3485fdd41 (diff)
download    cpython-428de65ca99492436130165bfbaeb56d6d1daec7.zip
            cpython-428de65ca99492436130165bfbaeb56d6d1daec7.tar.gz
            cpython-428de65ca99492436130165bfbaeb56d6d1daec7.tar.bz2
- Issue #719888: Updated tokenize to use a bytes API. generate_tokens has been
  renamed tokenize and now works with bytes rather than strings. A new
  detect_encoding function has been added for determining source file encoding
  according to PEP-0263. Token sequences returned by tokenize always start with
  an ENCODING token which specifies the encoding used to decode the file. This
  token is used to encode the output of untokenize back to bytes. Credit goes
  to Michael "I'm-going-to-name-my-first-child-unittest" Foord from Resolver
  Systems for this work.
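As an illustrative sketch (not part of the commit itself), the reworked bytes API can be exercised as below; it mirrors the roundtrip() helper added in this patch and assumes Python 3.0 with the change applied:

    from io import BytesIO
    from tokenize import tokenize, untokenize

    source = b"x = 1 + 2\n"

    # tokenize() now takes a readline callable that yields bytes; the first
    # token it emits is the ENCODING token naming the charset used to decode
    # the remaining lines.
    tokens = list(tokenize(BytesIO(source).readline))
    print(tokens[0])    # e.g. (ENCODING, 'utf-8', (0, 0), (0, 0), '')

    # untokenize() re-encodes its output with that same encoding, so the
    # round trip ends in bytes again.
    new_bytes = untokenize([tok[:2] for tok in tokens])
    assert isinstance(new_bytes, bytes)

The roundtrip() helper in the diff below does essentially this and then re-tokenizes new_bytes to verify that the second token stream matches the first.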
Diffstat (limited to 'Lib/test/test_tokenize.py')
-rw-r--r--   Lib/test/test_tokenize.py   351
1 files changed, 305 insertions, 46 deletions
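The new detect_encoding() helper can also be used on its own to resolve the source encoding per PEP 263 before any tokenization happens. A minimal sketch, using a made-up source snippet rather than one of the test data files:

    from io import BytesIO
    from tokenize import detect_encoding

    # A coding cookie on the first line overrides the utf-8 default.
    source = b"# -*- coding: latin-1 -*-\nprint('hi')\n"
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    print(encoding)    # 'latin-1'
    print(consumed)    # the raw byte lines read while looking for a BOM/cookie

When neither a BOM nor a cookie is present, detect_encoding() falls back to 'utf-8', which is why every dump_tokens() doctest below begins with an ENCODING 'utf-8' line.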
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 2938520..308158f 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,13 +1,14 @@
+# -*- coding: utf-8 -*-
+
doctests = """
Tests for the tokenize module.
- >>> import glob, random, sys
-
The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.
>>> dump_tokens("1 + 1")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '1' (1, 0) (1, 1)
OP '+' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
@@ -15,6 +16,7 @@ brevity.
>>> dump_tokens("if False:\\n"
... " # NL\\n"
... " True = False # NEWLINE\\n")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'if' (1, 0) (1, 2)
NAME 'False' (1, 3) (1, 8)
OP ':' (1, 8) (1, 9)
@@ -34,27 +36,12 @@ brevity.
... x += 2
... x += 5
... \"""
- >>> for tok in generate_tokens(StringIO(indent_error_file).readline): pass
+ >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
+ >>> for tok in tokenize(readline): pass
Traceback (most recent call last):
...
IndentationError: unindent does not match any outer indentation level
-Test roundtrip for `untokenize`. `f` is an open file or a string. The source
-code in f is tokenized, converted back to source code via tokenize.untokenize(),
-and tokenized again from the latter. The test fails if the second tokenization
-doesn't match the first.
-
- >>> def roundtrip(f):
- ... if isinstance(f, str): f = StringIO(f)
- ... token_list = list(generate_tokens(f.readline))
- ... f.close()
- ... tokens1 = [tok[:2] for tok in token_list]
- ... new_text = untokenize(tokens1)
- ... readline = iter(new_text.splitlines(1)).__next__
- ... tokens2 = [tok[:2] for tok in generate_tokens(readline)]
- ... return tokens1 == tokens2
- ...
-
There are some standard formatting practices that are easy to get right.
>>> roundtrip("if x == 1:\\n"
@@ -67,14 +54,14 @@ There are some standard formatting practices that are easy to get right.
Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. Note that we use hex escapes to make the
-two trailing blanks apperant in the expected output.
+two trailing blanks apparent in the expected output.
>>> roundtrip("if x == 1 : \\n"
... " print(x)\\n")
True
>>> f = test_support.findfile("tokenize_tests.txt")
- >>> roundtrip(open(f))
+ >>> roundtrip(open(f, 'rb'))
True
>>> roundtrip("if x == 1:\\n"
@@ -122,27 +109,33 @@ Balancing continuation
Ordinary integers and binary operators
>>> dump_tokens("0xff <= 255")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '0xff' (1, 0) (1, 4)
OP '<=' (1, 5) (1, 7)
NUMBER '255' (1, 8) (1, 11)
>>> dump_tokens("0b10 <= 255")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '0b10' (1, 0) (1, 4)
OP '<=' (1, 5) (1, 7)
NUMBER '255' (1, 8) (1, 11)
>>> dump_tokens("0o123 <= 0O123")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '0o123' (1, 0) (1, 5)
OP '<=' (1, 6) (1, 8)
NUMBER '0O123' (1, 9) (1, 14)
>>> dump_tokens("1234567 > ~0x15")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '1234567' (1, 0) (1, 7)
OP '>' (1, 8) (1, 9)
OP '~' (1, 10) (1, 11)
NUMBER '0x15' (1, 11) (1, 15)
>>> dump_tokens("2134568 != 1231515")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '2134568' (1, 0) (1, 7)
OP '!=' (1, 8) (1, 10)
NUMBER '1231515' (1, 11) (1, 18)
>>> dump_tokens("(-124561-1) & 200000000")
+ ENCODING 'utf-8' (0, 0) (0, 0)
OP '(' (1, 0) (1, 1)
OP '-' (1, 1) (1, 2)
NUMBER '124561' (1, 2) (1, 8)
@@ -152,15 +145,18 @@ Ordinary integers and binary operators
OP '&' (1, 12) (1, 13)
NUMBER '200000000' (1, 14) (1, 23)
>>> dump_tokens("0xdeadbeef != -1")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '0xdeadbeef' (1, 0) (1, 10)
OP '!=' (1, 11) (1, 13)
OP '-' (1, 14) (1, 15)
NUMBER '1' (1, 15) (1, 16)
>>> dump_tokens("0xdeadc0de & 12345")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '0xdeadc0de' (1, 0) (1, 10)
OP '&' (1, 11) (1, 12)
NUMBER '12345' (1, 13) (1, 18)
>>> dump_tokens("0xFF & 0x15 | 1234")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NUMBER '0xFF' (1, 0) (1, 4)
OP '&' (1, 5) (1, 6)
NUMBER '0x15' (1, 7) (1, 11)
@@ -170,18 +166,22 @@ Ordinary integers and binary operators
Long integers
>>> dump_tokens("x = 0")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '0' (1, 4) (1, 5)
>>> dump_tokens("x = 0xfffffffffff")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '0xffffffffff (1, 4) (1, 17)
>>> dump_tokens("x = 123141242151251616110")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '123141242151 (1, 4) (1, 25)
>>> dump_tokens("x = -15921590215012591")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
OP '-' (1, 4) (1, 5)
@@ -190,32 +190,39 @@ Long integers
Floating point numbers
>>> dump_tokens("x = 3.14159")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '3.14159' (1, 4) (1, 11)
>>> dump_tokens("x = 314159.")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '314159.' (1, 4) (1, 11)
>>> dump_tokens("x = .314159")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '.314159' (1, 4) (1, 11)
>>> dump_tokens("x = 3e14159")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '3e14159' (1, 4) (1, 11)
>>> dump_tokens("x = 3E123")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '3E123' (1, 4) (1, 9)
>>> dump_tokens("x+y = 3e-1230")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '+' (1, 1) (1, 2)
NAME 'y' (1, 2) (1, 3)
OP '=' (1, 4) (1, 5)
NUMBER '3e-1230' (1, 6) (1, 13)
>>> dump_tokens("x = 3.14e159")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '3.14e159' (1, 4) (1, 12)
@@ -223,6 +230,7 @@ Floating point numbers
String literals
>>> dump_tokens("x = ''; y = \\\"\\\"")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING "''" (1, 4) (1, 6)
@@ -231,6 +239,7 @@ String literals
OP '=' (1, 10) (1, 11)
STRING '""' (1, 12) (1, 14)
>>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING '\\'"\\'' (1, 4) (1, 7)
@@ -239,24 +248,28 @@ String literals
OP '=' (1, 11) (1, 12)
STRING '"\\'"' (1, 13) (1, 16)
>>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING '"doesn\\'t "' (1, 4) (1, 14)
NAME 'shrink' (1, 14) (1, 20)
STRING '", does it"' (1, 20) (1, 31)
>>> dump_tokens("x = 'abc' + 'ABC'")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING "'abc'" (1, 4) (1, 9)
OP '+' (1, 10) (1, 11)
STRING "'ABC'" (1, 12) (1, 17)
>>> dump_tokens('y = "ABC" + "ABC"')
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'y' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING '"ABC"' (1, 4) (1, 9)
OP '+' (1, 10) (1, 11)
STRING '"ABC"' (1, 12) (1, 17)
>>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING "r'abc'" (1, 4) (1, 10)
@@ -267,6 +280,7 @@ String literals
OP '+' (1, 29) (1, 30)
STRING "R'ABC'" (1, 31) (1, 37)
>>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'y' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
STRING 'r"abc"' (1, 4) (1, 10)
@@ -280,6 +294,7 @@ String literals
Operators
>>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'def' (1, 0) (1, 3)
NAME 'd22' (1, 4) (1, 7)
OP '(' (1, 7) (1, 8)
@@ -301,6 +316,7 @@ Operators
OP ':' (1, 27) (1, 28)
NAME 'pass' (1, 29) (1, 33)
>>> dump_tokens("def d01v_(a=1, *k, **w): pass")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'def' (1, 0) (1, 3)
NAME 'd01v_' (1, 4) (1, 9)
OP '(' (1, 9) (1, 10)
@@ -321,6 +337,7 @@ Comparison
>>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'if' (1, 0) (1, 2)
NUMBER '1' (1, 3) (1, 4)
OP '<' (1, 5) (1, 6)
@@ -357,6 +374,7 @@ Comparison
Shift
>>> dump_tokens("x = 1 << 1 >> 5")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
@@ -368,6 +386,7 @@ Shift
Additive
>>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
@@ -390,6 +409,7 @@ Additive
Multiplicative
>>> dump_tokens("x = 1//1*1/5*12%0x12")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'x' (1, 0) (1, 1)
OP '=' (1, 2) (1, 3)
NUMBER '1' (1, 4) (1, 5)
@@ -407,6 +427,7 @@ Multiplicative
Unary
>>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
+ ENCODING 'utf-8' (0, 0) (0, 0)
OP '~' (1, 0) (1, 1)
NUMBER '1' (1, 1) (1, 2)
OP '^' (1, 3) (1, 4)
@@ -419,6 +440,7 @@ Unary
OP '-' (1, 16) (1, 17)
NUMBER '1' (1, 17) (1, 18)
>>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
+ ENCODING 'utf-8' (0, 0) (0, 0)
OP '-' (1, 0) (1, 1)
NUMBER '1' (1, 1) (1, 2)
OP '*' (1, 2) (1, 3)
@@ -442,6 +464,7 @@ Unary
Selector
>>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
+ ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'import' (1, 0) (1, 6)
NAME 'sys' (1, 7) (1, 10)
OP ',' (1, 10) (1, 11)
@@ -463,6 +486,7 @@ Selector
Methods
>>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
+ ENCODING 'utf-8' (0, 0) (0, 0)
OP '@' (1, 0) (1, 1)
NAME 'staticmethod (1, 1) (1, 13)
NEWLINE '\\n' (1, 13) (1, 14)
@@ -485,42 +509,43 @@ Backslash means line continuation, except for comments
True
>>> roundtrip("# Comment \\\\nx = 0")
True
-
- >>>
- >>> tempdir = os.path.dirname(f) or os.curdir
- >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
- >>> if not test_support.is_resource_enabled("compiler"):
- ... testfiles = random.sample(testfiles, 10)
- ...
- >>> for testfile in testfiles:
- ... if not roundtrip(open(testfile)): break
- ... else: True
- True
"""
-
from test import test_support
-from tokenize import (tokenize, untokenize, generate_tokens, NUMBER, NAME, OP,
- STRING, ENDMARKER, tok_name)
-from io import StringIO
-import os
+from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+ STRING, ENDMARKER, tok_name, detect_encoding)
+from io import BytesIO
+from unittest import TestCase
+import os, sys, glob
def dump_tokens(s):
"""Print out the tokens in s in a table format.
The ENDMARKER is omitted.
"""
- f = StringIO(s)
- for type, token, start, end, line in generate_tokens(f.readline):
+ f = BytesIO(s.encode('utf-8'))
+ for type, token, start, end, line in tokenize(f.readline):
if type == ENDMARKER:
break
type = tok_name[type]
print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
-def roundtrip(s):
- f = StringIO(s)
- source = untokenize(generate_tokens(f.readline))
- print(source, end="")
+def roundtrip(f):
+ """
+ Test roundtrip for `untokenize`. `f` is an open file or a string.
+ The source code in f is tokenized, converted back to source code via
+ tokenize.untokenize(), and tokenized again from the latter. The test
+ fails if the second tokenization doesn't match the first.
+ """
+ if isinstance(f, str):
+ f = BytesIO(f.encode('utf-8'))
+ token_list = list(tokenize(f.readline))
+ f.close()
+ tokens1 = [tok[:2] for tok in token_list]
+ new_bytes = untokenize(tokens1)
+ readline = (line for line in new_bytes.splitlines(1)).__next__
+ tokens2 = [tok[:2] for tok in tokenize(readline)]
+ return tokens1 == tokens2
# This is an example from the docs, set up as a doctest.
def decistmt(s):
@@ -545,9 +570,8 @@ def decistmt(s):
>>> exec(decistmt(s))
-3.217160342717258261933904529E-7
"""
-
result = []
- g = generate_tokens(StringIO(s).readline) # tokenize the string
+ g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
for toknum, tokval, _, _, _ in g:
if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
result.extend([
@@ -558,7 +582,238 @@ def decistmt(s):
])
else:
result.append((toknum, tokval))
- return untokenize(result)
+ return untokenize(result).decode('utf-8')
+
+
+class TestTokenizerAdheresToPep0263(TestCase):
+ """
+ Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
+ """
+
+ def _testFile(self, filename):
+ path = os.path.join(os.path.dirname(__file__), filename)
+ return roundtrip(open(path, 'rb'))
+
+ def test_utf8_coding_cookie_and_no_utf8_bom(self):
+ f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
+ self.assertTrue(self._testFile(f))
+
+ def test_latin1_coding_cookie_and_utf8_bom(self):
+ """
+ As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
+ allowed encoding for the comment is 'utf-8'. The text file used in
+ this test starts with a BOM signature, but specifies latin1 as the
+ coding, so verify that a SyntaxError is raised, which matches the
+ behaviour of the interpreter when it encounters a similar condition.
+ """
+ f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
+ self.failUnlessRaises(SyntaxError, self._testFile, f)
+
+ def test_no_coding_cookie_and_utf8_bom(self):
+ f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
+ self.assertTrue(self._testFile(f))
+
+ def test_utf8_coding_cookie_and_utf8_bom(self):
+ f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
+ self.assertTrue(self._testFile(f))
+
+
+class Test_Tokenize(TestCase):
+
+ def test__tokenize_decodes_with_specified_encoding(self):
+ literal = '"ЉЊЈЁЂ"'
+ line = literal.encode('utf-8')
+ first = False
+ def readline():
+ nonlocal first
+ if not first:
+ first = True
+ return line
+ else:
+ return b''
+
+ # skip the initial encoding token and the end token
+ tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+ expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+ self.assertEquals(tokens, expected_tokens,
+ "bytes not decoded with encoding")
+
+ def test__tokenize_does_not_decode_with_encoding_none(self):
+ literal = '"ЉЊЈЁЂ"'
+ first = False
+ def readline():
+ nonlocal first
+ if not first:
+ first = True
+ return literal
+ else:
+ return b''
+
+ # skip the end token
+ tokens = list(_tokenize(readline, encoding=None))[:-1]
+ expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+ self.assertEquals(tokens, expected_tokens,
+ "string not tokenized when encoding is None")
+
+
+class TestDetectEncoding(TestCase):
+
+ def get_readline(self, lines):
+ index = 0
+ def readline():
+ nonlocal index
+ if index == len(lines):
+ raise StopIteration
+ line = lines[index]
+ index += 1
+ return line
+ return readline
+
+ def test_no_bom_no_encoding_cookie(self):
+ lines = (
+ b'# something\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines, list(lines[:2]))
+
+ def test_bom_no_cookie(self):
+ lines = (
+ b'\xef\xbb\xbf# something\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines,
+ [b'# something\n', b'print(something)\n'])
+
+ def test_cookie_first_line_no_bom(self):
+ lines = (
+ b'# -*- coding: latin-1 -*-\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+ self.assertEquals(encoding, 'latin-1')
+ self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
+
+ def test_matched_bom_and_cookie_first_line(self):
+ lines = (
+ b'\xef\xbb\xbf# coding=utf-8\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])
+
+ def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
+ lines = (
+ b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ readline = self.get_readline(lines)
+ self.assertRaises(SyntaxError, detect_encoding, readline)
+
+ def test_cookie_second_line_no_bom(self):
+ lines = (
+ b'#! something\n',
+ b'# vim: set fileencoding=ascii :\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+ self.assertEquals(encoding, 'ascii')
+ expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
+ self.assertEquals(consumed_lines, expected)
+
+ def test_matched_bom_and_cookie_second_line(self):
+ lines = (
+ b'\xef\xbb\xbf#! something\n',
+ b'f# coding=utf-8\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines,
+ [b'#! something\n', b'f# coding=utf-8\n'])
+
+ def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
+ lines = (
+ b'\xef\xbb\xbf#! something\n',
+ b'# vim: set fileencoding=ascii :\n',
+ b'print(something)\n',
+ b'do_something(else)\n'
+ )
+ readline = self.get_readline(lines)
+ self.assertRaises(SyntaxError, detect_encoding, readline)
+
+ def test_short_files(self):
+ readline = self.get_readline((b'print(something)\n',))
+ encoding, consumed_lines = detect_encoding(readline)
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines, [b'print(something)\n'])
+
+ encoding, consumed_lines = detect_encoding(self.get_readline(()))
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines, [])
+
+ readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
+ encoding, consumed_lines = detect_encoding(readline)
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines, [b'print(something)\n'])
+
+ readline = self.get_readline((b'\xef\xbb\xbf',))
+ encoding, consumed_lines = detect_encoding(readline)
+ self.assertEquals(encoding, 'utf-8')
+ self.assertEquals(consumed_lines, [])
+
+
+class TestTokenize(TestCase):
+
+ def test_tokenize(self):
+ import tokenize as tokenize_module
+ encoding = object()
+ encoding_used = None
+ def mock_detect_encoding(readline):
+ return encoding, ['first', 'second']
+
+ def mock__tokenize(readline, encoding):
+ nonlocal encoding_used
+ encoding_used = encoding
+ out = []
+ while True:
+ next_line = readline()
+ if next_line:
+ out.append(next_line)
+ continue
+ return out
+
+ counter = 0
+ def mock_readline():
+ nonlocal counter
+ counter += 1
+ if counter == 5:
+ return b''
+ return counter
+
+ orig_detect_encoding = tokenize_module.detect_encoding
+ orig__tokenize = tokenize_module._tokenize
+ tokenize_module.detect_encoding = mock_detect_encoding
+ tokenize_module._tokenize = mock__tokenize
+ try:
+ results = tokenize(mock_readline)
+ self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4])
+ finally:
+ tokenize_module.detect_encoding = orig_detect_encoding
+ tokenize_module._tokenize = orig__tokenize
+
+ self.assertTrue(encoding_used, encoding)
__test__ = {"doctests" : doctests, 'decistmt': decistmt}
@@ -566,6 +821,10 @@ __test__ = {"doctests" : doctests, 'decistmt': decistmt}
def test_main():
from test import test_tokenize
test_support.run_doctest(test_tokenize, True)
+ test_support.run_unittest(TestTokenizerAdheresToPep0263)
+ test_support.run_unittest(Test_Tokenize)
+ test_support.run_unittest(TestDetectEncoding)
+ test_support.run_unittest(TestTokenize)
if __name__ == "__main__":
test_main()