Lib/test/test_tokenize.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223

"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file.  This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple.  Given a small fragment of source
code, print out a table with the tokens.  The ENDMARKER token is
omitted for brevity.

>>> dump_tokens("1 + 1")
NUMBER      '1'           (1, 0) (1, 1)
OP          '+'           (1, 2) (1, 3)
NUMBER      '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module.  The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME        'if'          (1, 0) (1, 2)
NAME        'False'       (1, 3) (1, 8)
OP          ':'           (1, 8) (1, 9)
NEWLINE     '\\n'          (1, 9) (1, 10)
COMMENT     '# NL'        (2, 4) (2, 8)
NL          '\\n'          (2, 8) (2, 9)
INDENT      '    '        (3, 0) (3, 4)
NAME        'True'        (3, 4) (3, 8)
OP          '='           (3, 9) (3, 10)
NAME        'False'       (3, 11) (3, 16)
COMMENT     '# NEWLINE'   (3, 17) (3, 26)
NEWLINE     '\\n'          (3, 26) (3, 27)
DEDENT      ''            (4, 0) (4, 0)


There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier.  Note that this test involves trailing
whitespace after the colon.  Note that we use hex escapes to make the
two trailing blanks apparent in the expected output.

>>> roundtrip("if   x  ==  1  :  \\n"
...           "  print x\\n")
if   x  ==  1  :\x20\x20
  print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x  # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x  # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x  # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x  # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random, time, sys
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# Minimum number of seconds between the 'still working' progress messages
# that test_main() prints while it loops over the test files.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`.  `f` is a file path.  The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter.  The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    """Check that untokenize() output re-tokenizes to the same tokens.

    `f` is the path of a source file.  Its tokens are reduced to
    (type, string) pairs, converted back to source text with untokenize(),
    and that text is tokenized again.  TestFailed is raised if the two
    lists of pairs differ; otherwise nothing is returned.
    """
    ## print 'Testing:', f
    source_file = open(f)
    try:
        # Only the token type and string take part in the comparison.
        first_pass = [(kind, text) for kind, text, _, _, _
                      in generate_tokens(source_file.readline)]
    finally:
        source_file.close()

    regenerated = untokenize(first_pass)
    # Feed the regenerated text back to the tokenizer one line at a time,
    # keeping the line endings.
    line_source = iter(regenerated.splitlines(True))
    second_pass = [(kind, text) for kind, text, _, _, _
                   in generate_tokens(line_source.next)]
    if first_pass == second_pass:
        return
    raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print "%(type)-10.10s  %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print source,

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Rewrite a string of statements so float literals become Decimals.

    Every NUMBER token whose text contains a '.' is replaced by a call to
    Decimal() on the literal's text; all other tokens pass through as is.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    pieces = []
    for tok in generate_tokens(StringIO(s).readline):
        kind, text = tok[0], tok[1]
        is_float_literal = kind == NUMBER and '.' in text
        if not is_float_literal:
            # Everything else is copied through as a (type, string) pair.
            pieces.append((kind, text))
            continue
        # Emit the tokens for: Decimal('<literal text>')
        pieces += [
            (NAME, 'Decimal'),
            (OP, '('),
            (STRING, repr(text)),
            (OP, ')'),
        ]
    return untokenize(pieces)

def test_main():
    if verbose:
        print 'starting...'

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.py to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_test.py too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print still working message since this test can be really slow
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print >>sys.__stdout__, '  test_main still working, be patient...'
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detecton of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError:")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

# Run the whole test suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()