"""Helper module to tranlate 3.5 type comments to 3.6 variable annotations."""
import re
import os
import ast
import argparse
import tokenize
from collections import defaultdict
from textwrap import dedent
from io import BytesIO

__all__ = ['com2ann', 'TYPE_COM']

TYPE_COM = re.compile(r'\s*#\s*type\s*:.*$', flags=re.DOTALL)
TRAIL_OR_COM = re.compile(r'\s*$|\s*#.*$', flags=re.DOTALL)
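
# Illustrative matches for the two patterns above (comments only; the
# sample lines are made up for demonstration):
#
#   TYPE_COM applied to      "x = []  # type: List[int]\n"
#       matches              "  # type: List[int]\n"
#   TRAIL_OR_COM applied to  "List[int]  # real comment\n"
#       matches              "  # real comment\n"  (the trailing sub-comment)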


class _Data:
    """Internal class describing global data on file."""
    def __init__(self, lines, tokens):
        self.lines = lines
        self.tokens = tokens
        ttab = defaultdict(list)  # maps 1-based line number to token indices
        for i, tok in enumerate(tokens):
            ttab[tok.start[0]].append(i)
        self.ttab = ttab
        self.success = []  # lines where type comments were processed
        self.fail = []  # lines where type comments were rejected
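
# Note: d.lines is the 0-based list from splitlines(), while tokenize
# reports 1-based line numbers, which is why the tokens of line index
# lcom are looked up as d.ttab[lcom + 1] elsewhere in this module.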


def skip_blank(d, lno):
    """Return the index of the first non-blank line at or after lno."""
    while d.lines[lno].strip() == '':
        lno += 1
    return lno


def find_start(d, lcom):
    """Find the first line of the assignment whose type comment is on lcom."""
    i = d.ttab[lcom + 1][-2] # index of type comment token in tokens list
    while ((d.tokens[i].exact_type != tokenize.NEWLINE) and
           (d.tokens[i].exact_type != tokenize.ENCODING)):
        i -= 1
    lno = d.tokens[i].start[0]
    return skip_blank(d, lno)
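
# Example (comments only): given
#
#   x = {
#       1: 2,
#   }  # type: Dict[int, int]
#
# the type comment sits on the last physical line; find_start() walks the
# token stream back to the NEWLINE ending the previous statement (or to the
# ENCODING token at the start of the file) and returns the index of the
# first non-blank line after it, here the line "x = {".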


def check_target(stmt):
    """Check that stmt assigns to a single Name, Attribute, or Subscript."""
    if not stmt.body:
        return False
    assign = stmt.body[0]
    if not isinstance(assign, ast.Assign) or len(assign.targets) != 1:
        return False
    targ = assign.targets[0]
    return isinstance(targ, (ast.Name, ast.Attribute, ast.Subscript))
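
# For illustration (comments only): the kinds of statements check_target()
# accepts or rejects once they carry a "# type:" comment:
#
#   accepted:   x = value        obj.attr = value      d[key] = value
#   rejected:   x, y = value     x = y = value         x += value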


def find_eq(d, lstart):
    """Find the assignment's equals sign starting from lstart.

    Any '=' nested inside brackets, as in d[f(x=1)] = 5, is skipped.
    """
    col = pars = 0
    lno = lstart
    while d.lines[lno][col] != '=' or pars != 0:
        ch = d.lines[lno][col]
        if ch in '([{':
            pars += 1
        elif ch in ')]}':
            pars -= 1
        if ch == '#' or col == len(d.lines[lno])-1:
            lno = skip_blank(d, lno+1)
            col = 0
        else:
            col += 1
    return lno, col
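
# Example (comments only): in "d[f(x=1)] = 5  # type: int" the first '='
# is inside brackets, so pars != 0 and it is skipped; find_eq() returns
# the position of the second '=', the actual assignment operator.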


def find_val(d, poseq):
    """Find position of first char of assignment value starting from poseq."""
    lno, col = poseq
    while (d.lines[lno][col].isspace() or d.lines[lno][col] in '=\\'):
        if col == len(d.lines[lno])-1:
            lno += 1
            col = 0
        else:
            col += 1
    return lno, col


def find_targ(d, poseq):
    """Find position of last char of target (annotation goes here)."""
    lno, col = poseq
    while (d.lines[lno][col].isspace() or d.lines[lno][col] in '=\\'):
        if col == 0:
            lno -= 1
            col = len(d.lines[lno])-1
        else:
            col -= 1
    return lno, col+1
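
# Example positions (comments only) for the line
# "coords  =  (1, 2)  # type: Tuple[int, int]":
#
#   find_eq()   -> position of '='
#   find_val()  -> position of '(' (first char of the value)
#   find_targ() -> column just past "coords", where ': Tuple[int, int]'
#                  will be inserted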


def trim(new_lines, string, ltarg, poseq, lcom, ccom):
    """Remove None or Ellipsis from assignment value.

    Also remove surrounding parens if the value is (None), (...), etc.
    string -- 'None' or '...'
    ltarg -- line where last char of target is located
    poseq -- position of equal sign
    lcom, ccom -- position of type comment
    """
    nopars = lambda s: s.replace('(', '').replace(')', '')
    leq, ceq = poseq
    end = ccom if leq == lcom else len(new_lines[leq])
    subline = new_lines[leq][:ceq]
    if leq == ltarg:
        subline = subline.rstrip()
    new_lines[leq] = subline + (new_lines[leq][end:] if leq == lcom
                                else new_lines[leq][ceq+1:end])

    for lno in range(leq + 1, lcom):
        new_lines[lno] = nopars(new_lines[lno])

    if lcom != leq:
        subline = nopars(new_lines[lcom][:ccom]).replace(string, '')
        if (not subline.isspace()):
            subline = subline.rstrip()
        new_lines[lcom] = subline + new_lines[lcom][ccom:]
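
# Example effect of trim() (comments only): with drop_None=True the line
#
#   x = None  # type: Optional[int]
#
# has its "None" value removed here, so that once the annotation is
# written in _com2ann() it becomes
#
#   x: Optional[int]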


def _com2ann(d, drop_None, drop_Ellipsis):
    new_lines = d.lines[:]
    for lcom, line in enumerate(d.lines):
        match = re.search(TYPE_COM, line)
        if match:
            # strip " #  type  :  annotation  \n" -> "annotation  \n"
            tp = match.group().lstrip()[1:].lstrip()[4:].lstrip()[1:].lstrip()
            submatch = re.search(TRAIL_OR_COM, tp)
            subcom = ''
            if submatch and submatch.group():
                subcom = submatch.group()
                tp = tp[:submatch.start()]
            if tp == 'ignore':
                continue
            ccom = match.start()
            if not any(d.tokens[i].exact_type == tokenize.COMMENT
                       for i in d.ttab[lcom + 1]):
                d.fail.append(lcom)
                continue # type comment inside string
            lstart = find_start(d, lcom)
            stmt_str = dedent(''.join(d.lines[lstart:lcom+1]))
            try:
                stmt = ast.parse(stmt_str)
            except SyntaxError:
                d.fail.append(lcom)
                continue # for or with statements
            if not check_target(stmt):
                d.fail.append(lcom)
                continue

            d.success.append(lcom)
            val = stmt.body[0].value

            # writing output now
            poseq = find_eq(d, lstart)
            lval, cval = find_val(d, poseq)
            ltarg, ctarg = find_targ(d, poseq)

            op_par = ''
            cl_par = ''
            if isinstance(val, ast.Tuple):
                if d.lines[lval][cval] != '(':
                    op_par = '('
                    cl_par = ')'
            # write the comment first
            new_lines[lcom] = d.lines[lcom][:ccom].rstrip() + cl_par + subcom
            ccom = len(d.lines[lcom][:ccom].rstrip())

            string = False
            if isinstance(val, ast.Tuple):
                # t = 1, 2 -> t = (1, 2); only the latter is allowed
                # with an annotation
                free_place = int(new_lines[lval][cval-2:cval] == '  ')
                new_lines[lval] = (new_lines[lval][:cval-free_place] +
                                   op_par + new_lines[lval][cval:])
            elif isinstance(val, ast.Ellipsis) and drop_Ellipsis:
                string = '...'
            elif (isinstance(val, ast.NameConstant) and
                  val.value is None and drop_None):
                string = 'None'
            if string:
                trim(new_lines, string, ltarg, poseq, lcom, ccom)

            # finally write an annotation
            new_lines[ltarg] = (new_lines[ltarg][:ctarg] + ': ' + tp +
                                new_lines[ltarg][ctarg:])
    return ''.join(new_lines)
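
# Example of the tuple special case above (comments only):
#
#   t = 1, 2  # type: Tuple[int, int]
#
# becomes
#
#   t: Tuple[int, int] = (1, 2)
#
# since the bare tuple value is parenthesized when the annotation is added.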


def com2ann(code, *, drop_None=False, drop_Ellipsis=False, silent=False):
    """Translate type comments to type annotations in code.

    Take code as a string and return a string in which::

      variable = value # type: annotation # real comment

    is translated to::

      variable: annotation = value # real comment

    For unsupported syntax cases, the type comments are
    left intact. If drop_None or drop_Ellipsis is True,
    translate correspondingly::

      variable = None # type: annotation
      variable = ... # type: annotation

    into::

      variable: annotation

    The tool tries to preserve code formatting as much as
    possible, but an exact translation is not guaranteed.
    A summary of translated comments is printed by default.
    """
    try:
        ast.parse(code)  # only work with files free of syntax errors
    except SyntaxError:
        return None
    lines = code.splitlines(keepends=True)
    rl = BytesIO(code.encode('utf-8')).readline
    tokens = list(tokenize.tokenize(rl))

    data = _Data(lines, tokens)
    new_code = _com2ann(data, drop_None, drop_Ellipsis)

    if not silent:
        if data.success:
            print('Comments translated on lines:',
                  ', '.join(str(lno+1) for lno in data.success))
        if data.fail:
            print('Comments rejected on lines:',
                  ', '.join(str(lno+1) for lno in data.fail))
        if not data.success and not data.fail:
            print('No type comments found')

    return new_code
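
# Illustrative usage of com2ann() (doctest-style comments, not executed):
#
#   >>> com2ann("x = []  # type: List[int]\n", silent=True)
#   'x: List[int] = []\n'
#   >>> com2ann("t = 1, 2  # type: Tuple[int, int]\n", silent=True)
#   't: Tuple[int, int] = (1, 2)\n'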


def translate_file(infile, outfile, dnone, dell, silent):
    """Translate type comments in infile and write the result to outfile."""
    try:
        descr = tokenize.open(infile)
    except SyntaxError:
        # tokenize.open() raises SyntaxError for a bad encoding declaration
        print("Cannot open", infile)
        return
    with descr as f:
        code = f.read()
        enc = f.encoding
    if not silent:
        print('File:', infile)
    new_code = com2ann(code, drop_None=dnone,
                             drop_Ellipsis=dell,
                             silent=silent)
    if new_code is None:
        print("SyntaxError in", infile)
        return
    with open(outfile, 'wb') as f:
        f.write(new_code.encode(enc))


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-o", "--outfile",
                        help="output file, will be overwritten if exists,\n"
                             "defaults to input file")
    parser.add_argument("infile",
                        help="input file or directory for translation, must\n"
                             "contain no syntax errors, for directory\n"
                             "the outfile is ignored and translation is\n"
                             "made in place")
    parser.add_argument("-s", "--silent",
                        help="Do not print summary for line numbers of\n"
                             "translated and rejected comments",
                        action="store_true")
    parser.add_argument("-n", "--drop-none",
                   help="drop any None as assignment value during\n"
                        "translation if it is annotated by a type coment",
                   action="store_true")
    parser.add_argument("-e", "--drop-ellipsis",
                   help="drop any Ellipsis (...) as assignment value during\n"
                        "translation if it is annotated by a type coment",
                   action="store_true")
    args = parser.parse_args()
    if args.outfile is None:
        args.outfile = args.infile

    if os.path.isfile(args.infile):
        translate_file(args.infile, args.outfile,
                       args.drop_none, args.drop_ellipsis, args.silent)
    else:
        for root, dirs, files in os.walk(args.infile):
            for afile in files:
                _, ext = os.path.splitext(afile)
                if ext in ('.py', '.pyi'):
                    fname = os.path.join(root, afile)
                    translate_file(fname, fname,
                                   args.drop_none, args.drop_ellipsis,
                                   args.silent)
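
# Illustrative command-line invocations (file names are hypothetical):
#
#   python com2ann.py -o annotated.py legacy.py   # write to a separate file
#   python com2ann.py -n -e script.py             # rewrite in place, dropping
#                                                 # None and ... values
#   python com2ann.py package_dir/                # rewrite all .py/.pyi files
#                                                 # under the directory in place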