Diffstat (limited to 'Lib/test/test_tokenize.py')
| Mode       | Path                      | Lines changed |
| -rw-r--r-- | Lib/test/test_tokenize.py | 589           |

1 file changed, 460 insertions, 129 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 489f68f..f9652ce 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,13 +1,12 @@ doctests = """ Tests for the tokenize module. - >>> import glob, random, sys - The tests can be really simple. Given a small fragment of source code, print out a table with tokens. The ENDMARK is omitted for brevity. >>> dump_tokens("1 + 1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1' (1, 0) (1, 1) OP '+' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -15,6 +14,7 @@ brevity. >>> dump_tokens("if False:\\n" ... " # NL\\n" ... " True = False # NEWLINE\\n") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NAME 'False' (1, 3) (1, 8) OP ':' (1, 8) (1, 9) @@ -34,32 +34,16 @@ brevity. ... x += 2 ... x += 5 ... \""" - - >>> for tok in generate_tokens(StringIO(indent_error_file).readline): pass + >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline + >>> for tok in tokenize(readline): pass Traceback (most recent call last): ... IndentationError: unindent does not match any outer indentation level -Test roundtrip for `untokenize`. `f` is an open file or a string. The source -code in f is tokenized, converted back to source code via tokenize.untokenize(), -and tokenized again from the latter. The test fails if the second tokenization -doesn't match the first. - - >>> def roundtrip(f): - ... if isinstance(f, str): f = StringIO(f) - ... token_list = list(generate_tokens(f.readline)) - ... f.close() - ... tokens1 = [tok[:2] for tok in token_list] - ... new_text = untokenize(tokens1) - ... readline = iter(new_text.splitlines(1)).next - ... tokens2 = [tok[:2] for tok in generate_tokens(readline)] - ... return tokens1 == tokens2 - ... - There are some standard formatting practices that are easy to get right. >>> roundtrip("if x == 1:\\n" - ... " print x\\n") + ... " print(x)\\n") True >>> roundtrip("# This is a comment\\n# This also") @@ -68,26 +52,26 @@ There are some standard formatting practices that are easy to get right. Some people use different formatting conventions, which makes untokenize a little trickier. Note that this test involves trailing whitespace after the colon. Note that we use hex escapes to make the -two trailing blanks apperant in the expected output. +two trailing blanks apparent in the expected output. >>> roundtrip("if x == 1 : \\n" - ... " print x\\n") + ... " print(x)\\n") True - >>> f = test_support.findfile("tokenize_tests" + os.extsep + "txt") - >>> roundtrip(open(f)) + >>> f = support.findfile("tokenize_tests.txt") + >>> roundtrip(open(f, 'rb')) True >>> roundtrip("if x == 1:\\n" ... " # A comment by itself.\\n" - ... " print x # Comment here, too.\\n" + ... " print(x) # Comment here, too.\\n" ... " # Another comment.\\n" ... "after_if = True\\n") True >>> roundtrip("if (x # The comments need to go in the right place\\n" ... " == 1):\\n" - ... " print 'x==1'\\n") + ... " print('x==1')\\n") True >>> roundtrip("class Test: # A comment here\\n" @@ -102,8 +86,8 @@ Some error-handling code >>> roundtrip("try: import somemodule\\n" ... "except ImportError: # comment\\n" - ... " print 'Can not import' # comment2\\n" - ... "else: print 'Loaded'\\n") + ... " print('Can not import' # comment2\\n)" + ... 
"else: print('Loaded')\\n") True Balancing continuation @@ -123,27 +107,33 @@ Balancing continuation Ordinary integers and binary operators >>> dump_tokens("0xff <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xff' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) >>> dump_tokens("0b10 <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0b10' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) - >>> dump_tokens("0o123 <= 0123") + >>> dump_tokens("0o123 <= 0O123") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0o123' (1, 0) (1, 5) OP '<=' (1, 6) (1, 8) - NUMBER '0123' (1, 9) (1, 13) - >>> dump_tokens("01234567 > ~0x15") - NUMBER '01234567' (1, 0) (1, 8) - OP '>' (1, 9) (1, 10) - OP '~' (1, 11) (1, 12) - NUMBER '0x15' (1, 12) (1, 16) - >>> dump_tokens("2134568 != 01231515") + NUMBER '0O123' (1, 9) (1, 14) + >>> dump_tokens("1234567 > ~0x15") + ENCODING 'utf-8' (0, 0) (0, 0) + NUMBER '1234567' (1, 0) (1, 7) + OP '>' (1, 8) (1, 9) + OP '~' (1, 10) (1, 11) + NUMBER '0x15' (1, 11) (1, 15) + >>> dump_tokens("2134568 != 1231515") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '2134568' (1, 0) (1, 7) OP '!=' (1, 8) (1, 10) - NUMBER '01231515' (1, 11) (1, 19) - >>> dump_tokens("(-124561-1) & 0200000000") + NUMBER '1231515' (1, 11) (1, 18) + >>> dump_tokens("(-124561-1) & 200000000") + ENCODING 'utf-8' (0, 0) (0, 0) OP '(' (1, 0) (1, 1) OP '-' (1, 1) (1, 2) NUMBER '124561' (1, 2) (1, 8) @@ -151,17 +141,20 @@ Ordinary integers and binary operators NUMBER '1' (1, 9) (1, 10) OP ')' (1, 10) (1, 11) OP '&' (1, 12) (1, 13) - NUMBER '0200000000' (1, 14) (1, 24) + NUMBER '200000000' (1, 14) (1, 23) >>> dump_tokens("0xdeadbeef != -1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadbeef' (1, 0) (1, 10) OP '!=' (1, 11) (1, 13) OP '-' (1, 14) (1, 15) NUMBER '1' (1, 15) (1, 16) - >>> dump_tokens("0xdeadc0de & 012345") + >>> dump_tokens("0xdeadc0de & 12345") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadc0de' (1, 0) (1, 10) OP '&' (1, 11) (1, 12) - NUMBER '012345' (1, 13) (1, 19) + NUMBER '12345' (1, 13) (1, 18) >>> dump_tokens("0xFF & 0x15 | 1234") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xFF' (1, 0) (1, 4) OP '&' (1, 5) (1, 6) NUMBER '0x15' (1, 7) (1, 11) @@ -170,53 +163,64 @@ Ordinary integers and binary operators Long integers - >>> dump_tokens("x = 0L") + >>> dump_tokens("x = 0") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - NUMBER '0L' (1, 4) (1, 6) + NUMBER '0' (1, 4) (1, 5) >>> dump_tokens("x = 0xfffffffffff") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '0xffffffffff (1, 4) (1, 17) - >>> dump_tokens("x = 123141242151251616110l") + >>> dump_tokens("x = 123141242151251616110") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - NUMBER '123141242151 (1, 4) (1, 26) - >>> dump_tokens("x = -15921590215012591L") + NUMBER '123141242151 (1, 4) (1, 25) + >>> dump_tokens("x = -15921590215012591") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) OP '-' (1, 4) (1, 5) - NUMBER '159215902150 (1, 5) (1, 23) + NUMBER '159215902150 (1, 5) (1, 22) Floating point numbers >>> dump_tokens("x = 3.14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14159' (1, 4) (1, 11) >>> dump_tokens("x = 314159.") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '314159.' 
(1, 4) (1, 11) >>> dump_tokens("x = .314159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '.314159' (1, 4) (1, 11) >>> dump_tokens("x = 3e14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3e14159' (1, 4) (1, 11) >>> dump_tokens("x = 3E123") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3E123' (1, 4) (1, 9) >>> dump_tokens("x+y = 3e-1230") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '+' (1, 1) (1, 2) NAME 'y' (1, 2) (1, 3) OP '=' (1, 4) (1, 5) NUMBER '3e-1230' (1, 6) (1, 13) >>> dump_tokens("x = 3.14e159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14e159' (1, 4) (1, 12) @@ -224,6 +228,7 @@ Floating point numbers String literals >>> dump_tokens("x = ''; y = \\\"\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "''" (1, 4) (1, 6) @@ -232,6 +237,7 @@ String literals OP '=' (1, 10) (1, 11) STRING '""' (1, 12) (1, 14) >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '\\'"\\'' (1, 4) (1, 7) @@ -240,72 +246,53 @@ String literals OP '=' (1, 11) (1, 12) STRING '"\\'"' (1, 13) (1, 16) >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '"doesn\\'t "' (1, 4) (1, 14) NAME 'shrink' (1, 14) (1, 20) STRING '", does it"' (1, 20) (1, 31) - >>> dump_tokens("x = u'abc' + U'ABC'") + >>> dump_tokens("x = 'abc' + 'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING "u'abc'" (1, 4) (1, 10) - OP '+' (1, 11) (1, 12) - STRING "U'ABC'" (1, 13) (1, 19) - >>> dump_tokens('y = u"ABC" + U"ABC"') + STRING "'abc'" (1, 4) (1, 9) + OP '+' (1, 10) (1, 11) + STRING "'ABC'" (1, 12) (1, 17) + >>> dump_tokens('y = "ABC" + "ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING 'u"ABC"' (1, 4) (1, 10) - OP '+' (1, 11) (1, 12) - STRING 'U"ABC"' (1, 13) (1, 19) - >>> dump_tokens("x = ur'abc' + Ur'ABC' + uR'ABC' + UR'ABC'") + STRING '"ABC"' (1, 4) (1, 9) + OP '+' (1, 10) (1, 11) + STRING '"ABC"' (1, 12) (1, 17) + >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING "ur'abc'" (1, 4) (1, 11) - OP '+' (1, 12) (1, 13) - STRING "Ur'ABC'" (1, 14) (1, 21) - OP '+' (1, 22) (1, 23) - STRING "uR'ABC'" (1, 24) (1, 31) - OP '+' (1, 32) (1, 33) - STRING "UR'ABC'" (1, 34) (1, 41) - >>> dump_tokens('y = ur"abc" + Ur"ABC" + uR"ABC" + UR"ABC"') + STRING "r'abc'" (1, 4) (1, 10) + OP '+' (1, 11) (1, 12) + STRING "r'ABC'" (1, 13) (1, 19) + OP '+' (1, 20) (1, 21) + STRING "R'ABC'" (1, 22) (1, 28) + OP '+' (1, 29) (1, 30) + STRING "R'ABC'" (1, 31) (1, 37) + >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING 'ur"abc"' (1, 4) (1, 11) - OP '+' (1, 12) (1, 13) - STRING 'Ur"ABC"' (1, 14) (1, 21) - OP '+' (1, 22) (1, 23) - STRING 'uR"ABC"' (1, 24) (1, 31) - OP '+' (1, 32) (1, 33) - STRING 'UR"ABC"' (1, 34) (1, 41) - - >>> dump_tokens("b'abc' + B'abc'") - STRING "b'abc'" (1, 0) (1, 6) - OP '+' (1, 7) (1, 8) - STRING "B'abc'" (1, 9) (1, 15) - >>> dump_tokens('b"abc" + B"abc"') - STRING 'b"abc"' (1, 0) (1, 6) - OP '+' (1, 7) (1, 8) - STRING 'B"abc"' (1, 9) (1, 15) - >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + 
BR'abc'") - STRING "br'abc'" (1, 0) (1, 7) - OP '+' (1, 8) (1, 9) - STRING "bR'abc'" (1, 10) (1, 17) - OP '+' (1, 18) (1, 19) - STRING "Br'abc'" (1, 20) (1, 27) - OP '+' (1, 28) (1, 29) - STRING "BR'abc'" (1, 30) (1, 37) - >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"') - STRING 'br"abc"' (1, 0) (1, 7) - OP '+' (1, 8) (1, 9) - STRING 'bR"abc"' (1, 10) (1, 17) - OP '+' (1, 18) (1, 19) - STRING 'Br"abc"' (1, 20) (1, 27) - OP '+' (1, 28) (1, 29) - STRING 'BR"abc"' (1, 30) (1, 37) + STRING 'r"abc"' (1, 4) (1, 10) + OP '+' (1, 11) (1, 12) + STRING 'r"ABC"' (1, 13) (1, 19) + OP '+' (1, 20) (1, 21) + STRING 'R"ABC"' (1, 22) (1, 28) + OP '+' (1, 29) (1, 30) + STRING 'R"ABC"' (1, 31) (1, 37) Operators >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd22' (1, 4) (1, 7) OP '(' (1, 7) (1, 8) @@ -327,6 +314,7 @@ Operators OP ':' (1, 27) (1, 28) NAME 'pass' (1, 29) (1, 33) >>> dump_tokens("def d01v_(a=1, *k, **w): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd01v_' (1, 4) (1, 9) OP '(' (1, 9) (1, 10) @@ -347,6 +335,7 @@ Comparison >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + ... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NUMBER '1' (1, 3) (1, 4) OP '<' (1, 5) (1, 6) @@ -383,6 +372,7 @@ Comparison Shift >>> dump_tokens("x = 1 << 1 >> 5") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -393,7 +383,8 @@ Shift Additive - >>> dump_tokens("x = 1 - y + 15 - 01 + 0x124 + z + a[5]") + >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -402,20 +393,21 @@ Additive OP '+' (1, 10) (1, 11) NUMBER '15' (1, 12) (1, 14) OP '-' (1, 15) (1, 16) - NUMBER '01' (1, 17) (1, 19) - OP '+' (1, 20) (1, 21) - NUMBER '0x124' (1, 22) (1, 27) - OP '+' (1, 28) (1, 29) - NAME 'z' (1, 30) (1, 31) - OP '+' (1, 32) (1, 33) - NAME 'a' (1, 34) (1, 35) - OP '[' (1, 35) (1, 36) - NUMBER '5' (1, 36) (1, 37) - OP ']' (1, 37) (1, 38) + NUMBER '1' (1, 17) (1, 18) + OP '+' (1, 19) (1, 20) + NUMBER '0x124' (1, 21) (1, 26) + OP '+' (1, 27) (1, 28) + NAME 'z' (1, 29) (1, 30) + OP '+' (1, 31) (1, 32) + NAME 'a' (1, 33) (1, 34) + OP '[' (1, 34) (1, 35) + NUMBER '5' (1, 35) (1, 36) + OP ']' (1, 36) (1, 37) Multiplicative >>> dump_tokens("x = 1//1*1/5*12%0x12") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -433,6 +425,7 @@ Multiplicative Unary >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '~' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '^' (1, 3) (1, 4) @@ -445,6 +438,7 @@ Unary OP '-' (1, 16) (1, 17) NUMBER '1' (1, 17) (1, 18) >>> dump_tokens("-1*1/1+1*1//1 - ---1**1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '-' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '*' (1, 2) (1, 3) @@ -468,6 +462,7 @@ Unary Selector >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'import' (1, 0) (1, 6) NAME 'sys' (1, 7) (1, 10) OP ',' (1, 10) (1, 11) @@ -489,6 +484,7 @@ Selector Methods >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass") + ENCODING 'utf-8' (0, 0) (0, 0) OP '@' (1, 0) (1, 1) NAME 'staticmethod (1, 1) (1, 13) NEWLINE '\\n' (1, 13) (1, 14) @@ -520,22 +516,27 @@ Two string literals on the same line Test roundtrip on random python modules. 
pass the '-ucpu' option to process the full directory. - >>> + >>> import random >>> tempdir = os.path.dirname(f) or os.curdir >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py")) - >>> if not test_support.is_resource_enabled("cpu"): +tokenize is broken on test_pep3131.py because regular expressions are broken on +the obscure unicode identifiers in it. *sigh* + >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py")) + >>> if not support.is_resource_enabled("cpu"): ... testfiles = random.sample(testfiles, 10) ... >>> for testfile in testfiles: - ... if not roundtrip(open(testfile)): - ... print "Roundtrip failed for file %s" % testfile + ... if not roundtrip(open(testfile, 'rb')): + ... print("Roundtrip failed for file %s" % testfile) ... break ... else: True True Evil tabs + >>> dump_tokens("def f():\\n\\tif x\\n \\tpass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'f' (1, 4) (1, 5) OP '(' (1, 5) (1, 6) @@ -553,44 +554,77 @@ Evil tabs Pathological whitespace (http://bugs.python.org/issue16152) >>> dump_tokens("@ ") + ENCODING 'utf-8' (0, 0) (0, 0) OP '@' (1, 0) (1, 1) -""" +Non-ascii identifiers + + >>> dump_tokens("Örter = 'places'\\ngrün = 'green'") + ENCODING 'utf-8' (0, 0) (0, 0) + NAME 'Örter' (1, 0) (1, 5) + OP '=' (1, 6) (1, 7) + STRING "'places'" (1, 8) (1, 16) + NEWLINE '\\n' (1, 16) (1, 17) + NAME 'grün' (2, 0) (2, 4) + OP '=' (2, 5) (2, 6) + STRING "'green'" (2, 7) (2, 14) +""" -from test import test_support -from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, - STRING, ENDMARKER, tok_name) -from StringIO import StringIO -import os +from test import support +from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, + STRING, ENDMARKER, tok_name, detect_encoding, + open as tokenize_open) +from io import BytesIO +from unittest import TestCase +import os, sys, glob def dump_tokens(s): """Print out the tokens in s in a table format. The ENDMARKER is omitted. """ - f = StringIO(s) - for type, token, start, end, line in generate_tokens(f.readline): + f = BytesIO(s.encode('utf-8')) + for type, token, start, end, line in tokenize(f.readline): if type == ENDMARKER: break type = tok_name[type] print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()) +def roundtrip(f): + """ + Test roundtrip for `untokenize`. `f` is an open file or a string. + The source code in f is tokenized, converted back to source code via + tokenize.untokenize(), and tokenized again from the latter. The test + fails if the second tokenization doesn't match the first. + """ + if isinstance(f, str): + f = BytesIO(f.encode('utf-8')) + try: + token_list = list(tokenize(f.readline)) + finally: + f.close() + tokens1 = [tok[:2] for tok in token_list] + new_bytes = untokenize(tokens1) + readline = (line for line in new_bytes.splitlines(1)).__next__ + tokens2 = [tok[:2] for tok in tokenize(readline)] + return tokens1 == tokens2 + # This is an example from the docs, set up as a doctest. def decistmt(s): """Substitute Decimals for floats in a string of statements. >>> from decimal import Decimal - >>> s = 'print +21.3e-5*-.1234/81.7' + >>> s = 'print(+21.3e-5*-.1234/81.7)' >>> decistmt(s) - "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')" + "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))" The format of the exponent is inherited from the platform C library. Known cases are "e-007" (Windows) and "e-07" (not Windows). 
Since - we're only showing 12 digits, and the 13th isn't close to 5, the + we're only showing 11 digits, and the 12th isn't close to 5, the rest of the output should be platform-independent. >>> exec(s) #doctest: +ELLIPSIS - -3.21716034272e-0...7 + -3.2171603427...e-0...7 Output from calculations with Decimal should be identical across all platforms. @@ -598,9 +632,8 @@ def decistmt(s): >>> exec(decistmt(s)) -3.217160342717258261933904529E-7 """ - result = [] - g = generate_tokens(StringIO(s).readline) # tokenize the string + g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string for toknum, tokval, _, _, _ in g: if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens result.extend([ @@ -611,15 +644,313 @@ def decistmt(s): ]) else: result.append((toknum, tokval)) - return untokenize(result) + return untokenize(result).decode('utf-8') -__test__ = {"doctests" : doctests, 'decistmt': decistmt} +class TestTokenizerAdheresToPep0263(TestCase): + """ + Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263. + """ + def _testFile(self, filename): + path = os.path.join(os.path.dirname(__file__), filename) + return roundtrip(open(path, 'rb')) + + def test_utf8_coding_cookie_and_no_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + def test_latin1_coding_cookie_and_utf8_bom(self): + """ + As per PEP 0263, if a file starts with a utf-8 BOM signature, the only + allowed encoding for the comment is 'utf-8'. The text file used in + this test starts with a BOM signature, but specifies latin1 as the + coding, so verify that a SyntaxError is raised, which matches the + behaviour of the interpreter when it encounters a similar condition. + """ + f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt' + self.assertRaises(SyntaxError, self._testFile, f) + + def test_no_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt' + self.assertTrue(self._testFile(f)) + + def test_utf8_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + def test_bad_coding_cookie(self): + self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py') + self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py') + + +class Test_Tokenize(TestCase): + + def test__tokenize_decodes_with_specified_encoding(self): + literal = '"ЉЊЈЁЂ"' + line = literal.encode('utf-8') + first = False + def readline(): + nonlocal first + if not first: + first = True + return line + else: + return b'' + + # skip the initial encoding token and the end token + tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEqual(tokens, expected_tokens, + "bytes not decoded with encoding") + + def test__tokenize_does_not_decode_with_encoding_none(self): + literal = '"ЉЊЈЁЂ"' + first = False + def readline(): + nonlocal first + if not first: + first = True + return literal + else: + return b'' + + # skip the end token + tokens = list(_tokenize(readline, encoding=None))[:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEqual(tokens, expected_tokens, + "string not tokenized when encoding is None") + + +class TestDetectEncoding(TestCase): + + def get_readline(self, lines): + index = 0 + def readline(): + nonlocal index + if index == len(lines): + raise StopIteration + line = lines[index] + index += 1 + return 
line + return readline + + def test_no_bom_no_encoding_cookie(self): + lines = ( + b'# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, list(lines[:2])) + + def test_bom_no_cookie(self): + lines = ( + b'\xef\xbb\xbf# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, + [b'# something\n', b'print(something)\n']) + + def test_cookie_first_line_no_bom(self): + lines = ( + b'# -*- coding: latin-1 -*-\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso-8859-1') + self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) + + def test_matched_bom_and_cookie_first_line(self): + lines = ( + b'\xef\xbb\xbf# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_cookie_second_line_no_bom(self): + lines = ( + b'#! something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'ascii') + expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] + self.assertEqual(consumed_lines, expected) + + def test_matched_bom_and_cookie_second_line(self): + lines = ( + b'\xef\xbb\xbf#! something\n', + b'f# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, + [b'#! something\n', b'f# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf#! something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_latin1_normalization(self): + # See get_normal_name() in tokenizer.c. 
+ encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", + "iso-8859-1-unix", "iso-latin-1-mac") + for encoding in encodings: + for rep in ("-", "_"): + enc = encoding.replace("-", rep) + lines = (b"#!/usr/bin/python\n", + b"# coding: " + enc.encode("ascii") + b"\n", + b"print(things)\n", + b"do_something += 4\n") + rl = self.get_readline(lines) + found, consumed_lines = detect_encoding(rl) + self.assertEqual(found, "iso-8859-1") + + def test_syntaxerror_latin1(self): + # Issue 14629: need to raise SyntaxError if the first + # line(s) have non-UTF-8 characters + lines = ( + b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + + def test_utf8_normalization(self): + # See get_normal_name() in tokenizer.c. + encodings = ("utf-8", "utf-8-mac", "utf-8-unix") + for encoding in encodings: + for rep in ("-", "_"): + enc = encoding.replace("-", rep) + lines = (b"#!/usr/bin/python\n", + b"# coding: " + enc.encode("ascii") + b"\n", + b"1 + 3\n") + rl = self.get_readline(lines) + found, consumed_lines = detect_encoding(rl) + self.assertEqual(found, "utf-8") + + def test_short_files(self): + readline = self.get_readline((b'print(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, [b'print(something)\n']) + + encoding, consumed_lines = detect_encoding(self.get_readline(())) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, []) + + readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, [b'print(something)\n']) + + readline = self.get_readline((b'\xef\xbb\xbf',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, []) + + readline = self.get_readline((b'# coding: bad\n',)) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_open(self): + filename = support.TESTFN + '.py' + self.addCleanup(support.unlink, filename) + + # test coding cookie + for encoding in ('iso-8859-15', 'utf-8'): + with open(filename, 'w', encoding=encoding) as fp: + print("# coding: %s" % encoding, file=fp) + print("print('euro:\u20ac')", file=fp) + with tokenize_open(filename) as fp: + self.assertEqual(fp.encoding, encoding) + self.assertEqual(fp.mode, 'r') + + # test BOM (no coding cookie) + with open(filename, 'w', encoding='utf-8-sig') as fp: + print("print('euro:\u20ac')", file=fp) + with tokenize_open(filename) as fp: + self.assertEqual(fp.encoding, 'utf-8-sig') + self.assertEqual(fp.mode, 'r') + +class TestTokenize(TestCase): + + def test_tokenize(self): + import tokenize as tokenize_module + encoding = object() + encoding_used = None + def mock_detect_encoding(readline): + return encoding, ['first', 'second'] + + def mock__tokenize(readline, encoding): + nonlocal encoding_used + encoding_used = encoding + out = [] + while True: + next_line = readline() + if next_line: + out.append(next_line) + continue + return out + + counter = 0 + def mock_readline(): + nonlocal counter + counter += 1 + if counter == 5: + return b'' + return counter + + orig_detect_encoding = tokenize_module.detect_encoding + orig__tokenize = tokenize_module._tokenize + tokenize_module.detect_encoding = mock_detect_encoding + tokenize_module._tokenize = mock__tokenize + try: + results 
= tokenize(mock_readline)
+            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
+        finally:
+            tokenize_module.detect_encoding = orig_detect_encoding
+            tokenize_module._tokenize = orig__tokenize
+
+        self.assertTrue(encoding_used, encoding)
+
+
+__test__ = {"doctests" : doctests, 'decistmt': decistmt}

 def test_main():
     from test import test_tokenize
-    test_support.run_doctest(test_tokenize, True)
+    support.run_doctest(test_tokenize, True)
+    support.run_unittest(TestTokenizerAdheresToPep0263)
+    support.run_unittest(Test_Tokenize)
+    support.run_unittest(TestDetectEncoding)
+    support.run_unittest(TestTokenize)

 if __name__ == "__main__":
     test_main()
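
The pattern the updated doctests exercise throughout this diff is that Python 3's tokenize.tokenize() consumes a readline callable returning *bytes* and emits an ENCODING token before anything else, which is why every expected output gained an "ENCODING 'utf-8' (0, 0) (0, 0)" line and why dump_tokens() now wraps the source in a BytesIO. A minimal sketch of that pattern (the dump() helper name is illustrative, not part of the test file):

    import io
    import tokenize

    def dump(source_text):
        # Mirror of the dump_tokens() helper in the diff: encode the str to
        # UTF-8 bytes, because tokenize.tokenize() wants a bytes readline.
        readline = io.BytesIO(source_text.encode('utf-8')).readline
        for tok in tokenize.tokenize(readline):
            print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

    dump("1 + 1")
    # The first token printed is ENCODING 'utf-8' at (0, 0)-(0, 0); the
    # NUMBER/OP/NUMBER tokens follow, and ENDMARKER comes last.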
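The roundtrip() helper added by the diff relies on another Python 3 change: untokenize() returns bytes when the token stream starts with the ENCODING token produced by tokenize(). A rough standalone equivalent, assuming UTF-8 source and comparing only the (type, string) pairs as the test does:

    import io
    from tokenize import tokenize, untokenize

    def roundtrip_ok(source_text):
        # Tokenize, regenerate source with untokenize(), tokenize again,
        # and compare the (type, string) pairs of both passes.
        buf = io.BytesIO(source_text.encode('utf-8'))
        tokens1 = [tok[:2] for tok in tokenize(buf.readline)]
        new_bytes = untokenize(tokens1)   # bytes: the ENCODING token sets the encoding
        readline = iter(new_bytes.splitlines(keepends=True)).__next__
        tokens2 = [tok[:2] for tok in tokenize(readline)]
        return tokens1 == tokens2

    print(roundtrip_ok("if x == 1:\n    print(x)\n"))   # expected: True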
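The new TestDetectEncoding and test_open cases are built on two APIs: tokenize.detect_encoding(), which reads at most the first two lines looking for a UTF-8 BOM and/or a PEP 263 coding cookie and returns (encoding, consumed_lines), and tokenize.open(), which opens a source file read-only using that detected encoding. A short sketch of both; the scratch filename is hypothetical:

    import io
    import tokenize

    source = b"# -*- coding: latin-1 -*-\nprint('hi')\n"
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding)   # 'iso-8859-1' -- latin-1 is normalized, cf. test_latin1_normalization
    print(consumed)   # the raw line(s) read while deciding, here just the cookie line

    # A UTF-8 BOM combined with a non-UTF-8 cookie is rejected, matching
    # test_mismatched_bom_and_cookie_first_line_raises_syntaxerror above.
    bad = b"\xef\xbb\xbf# coding: ascii\nprint('hi')\n"
    try:
        tokenize.detect_encoding(io.BytesIO(bad).readline)
    except SyntaxError as exc:
        print("rejected:", exc)

    # tokenize.open() applies detect_encoding() and returns a text-mode file.
    with open("scratch_example.py", "wb") as fp:      # hypothetical scratch file
        fp.write(source)
    with tokenize.open("scratch_example.py") as fp:
        print(fp.encoding)   # 'iso-8859-1'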
