author    Serhiy Storchaka <storchaka@gmail.com>    2018-12-22 09:18:40 (GMT)
committer GitHub <noreply@github.com>               2018-12-22 09:18:40 (GMT)
commit    8ac658114dec4964479baecfbc439fceb40eaa79 (patch)
tree      e66c4c3beda293a6fdf01763306697d15d0af157 /Tools
parent    c1b4b0f6160e1919394586f44b12538505fed300 (diff)
bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)
"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on the read-only sources tree.
"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of been executable itself.
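
The generator only rewrites the region of "Lib/symbol.py" between two marker
comments and leaves the rest of the file untouched; a minimal sketch of the
skeleton it expects, with illustrative constant values, is:

# Hedged sketch of the Lib/symbol.py skeleton that generate_symbol_py.py fills
# in; only the lines between the markers are regenerated, values illustrative.
#--start constants--
single_input = 256
file_input = 257
#--end constants--
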
Added new make targets "regen-token" and "regen-symbol", which are now
dependencies of "regen-all".
The documentation now contains the strings for operator and punctuation tokens.
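
For reference, the generators can also be driven directly from Python rather
than through make; this is a hedged sketch based on the main() entry points in
the scripts below, and it assumes it is run from the top of a CPython checkout
with Tools/scripts added to sys.path (that path setup is an assumption made
only for illustration):

# Hedged sketch: invoking the generators directly instead of via
# "make regen-token" / "make regen-symbol".
import sys
sys.path.insert(0, 'Tools/scripts')            # assumption: run from the repo root
import generate_token
import generate_symbol_py

generate_token.main('h', 'Grammar/Tokens', 'Include/token.h')    # Include/token.h
generate_token.main('c', 'Grammar/Tokens', 'Parser/token.c')     # Parser/token.c
generate_token.main('py', 'Grammar/Tokens', 'Lib/token.py')      # Lib/token.py
generate_symbol_py.main('Include/graminit.h', 'Lib/symbol.py')   # Lib/symbol.py
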
Diffstat (limited to 'Tools')
-rwxr-xr-x  Tools/scripts/generate_symbol_py.py   53
-rw-r--r--  Tools/scripts/generate_token.py      268
2 files changed, 321 insertions, 0 deletions
diff --git a/Tools/scripts/generate_symbol_py.py b/Tools/scripts/generate_symbol_py.py
new file mode 100755
index 0000000..9219b09
--- /dev/null
+++ b/Tools/scripts/generate_symbol_py.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python3
+# This script generates the symbol.py source file.
+
+import sys
+import re
+
+def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"):
+    try:
+        fp = open(inFileName)
+    except OSError as err:
+        sys.stderr.write("I/O error: %s\n" % str(err))
+        sys.exit(1)
+    with fp:
+        lines = fp.read().split("\n")
+    prog = re.compile(
+        "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+        re.IGNORECASE)
+    tokens = {}
+    for line in lines:
+        match = prog.match(line)
+        if match:
+            name, val = match.group(1, 2)
+            val = int(val)
+            tokens[val] = name  # reverse so we can sort them...
+    keys = sorted(tokens.keys())
+    # load the output skeleton from the target:
+    try:
+        fp = open(outFileName)
+    except OSError as err:
+        sys.stderr.write("I/O error: %s\n" % str(err))
+        sys.exit(2)
+    with fp:
+        format = fp.read().split("\n")
+    try:
+        start = format.index("#--start constants--") + 1
+        end = format.index("#--end constants--")
+    except ValueError:
+        sys.stderr.write("target does not contain format markers")
+        sys.exit(3)
+    lines = []
+    for val in keys:
+        lines.append("%s = %d" % (tokens[val], val))
+    format[start:end] = lines
+    try:
+        fp = open(outFileName, 'w')
+    except OSError as err:
+        sys.stderr.write("I/O error: %s\n" % str(err))
+        sys.exit(4)
+    with fp:
+        fp.write("\n".join(format))
+
+if __name__ == '__main__':
+    main(*sys.argv[1:])
diff --git a/Tools/scripts/generate_token.py b/Tools/scripts/generate_token.py
new file mode 100644
index 0000000..f2745e8
--- /dev/null
+++ b/Tools/scripts/generate_token.py
@@ -0,0 +1,268 @@
+#! /usr/bin/env python3
+# This script generates token related files from Grammar/Tokens:
+#
+#   Doc/library/token-list.inc
+#   Include/token.h
+#   Parser/token.c
+#   Lib/token.py
+
+
+NT_OFFSET = 256
+
+def load_tokens(path):
+    tok_names = []
+    string_to_tok = {}
+    ERRORTOKEN = None
+    with open(path) as fp:
+        for line in fp:
+            line = line.strip()
+            # strip comments
+            i = line.find('#')
+            if i >= 0:
+                line = line[:i].strip()
+            if not line:
+                continue
+            fields = line.split()
+            name = fields[0]
+            value = len(tok_names)
+            if name == 'ERRORTOKEN':
+                ERRORTOKEN = value
+            string = fields[1] if len(fields) > 1 else None
+            if string:
+                string = eval(string)
+                string_to_tok[string] = value
+            tok_names.append(name)
+    return tok_names, ERRORTOKEN, string_to_tok
+
+
+def update_file(file, content):
+    try:
+        with open(file, 'r') as fobj:
+            if fobj.read() == content:
+                return False
+    except (OSError, ValueError):
+        pass
+    with open(file, 'w') as fobj:
+        fobj.write(content)
+    return True
+
+
+token_h_template = """\
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+/* Token types */
+#ifndef Py_LIMITED_API
+#ifndef Py_TOKEN_H
+#define Py_TOKEN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef TILDE   /* Prevent clash of our definition with system macro.
+                  Ex AIX, ioctl.h */
+
+%s\
+#define N_TOKENS        %d
+#define NT_OFFSET       %d
+
+/* Special definitions for cooperation with parser */
+
+#define ISTERMINAL(x)           ((x) < NT_OFFSET)
+#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
+#define ISEOF(x)                ((x) == ENDMARKER)
+
+
+PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
+PyAPI_FUNC(int) PyToken_OneChar(int);
+PyAPI_FUNC(int) PyToken_TwoChars(int, int);
+PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TOKEN_H */
+#endif /* Py_LIMITED_API */
+"""
+
+def make_h(infile, outfile='Include/token.h'):
+    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+
+    defines = []
+    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
+        defines.append("#define %-15s %d\n" % (name, value))
+
+    if update_file(outfile, token_h_template % (
+            ''.join(defines),
+            len(tok_names),
+            NT_OFFSET
+        )):
+        print("%s regenerated from %s" % (outfile, infile))
+
+
+token_c_template = """\
+/* Auto-generated by Tools/scripts/generate_token.py */
+
+#include "Python.h"
+#include "token.h"
+
+/* Token names */
+
+const char * const _PyParser_TokenNames[] = {
+%s\
+};
+
+/* Return the token corresponding to a single character */
+
+int
+PyToken_OneChar(int c1)
+{
+%s\
+    return OP;
+}
+
+int
+PyToken_TwoChars(int c1, int c2)
+{
+%s\
+    return OP;
+}
+
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
+{
+%s\
+    return OP;
+}
+"""
+
+def generate_chars_to_token(mapping, n=1):
+    result = []
+    write = result.append
+    indent = '    ' * n
+    write(indent)
+    write('switch (c%d) {\n' % (n,))
+    for c in sorted(mapping):
+        write(indent)
+        value = mapping[c]
+        if isinstance(value, dict):
+            write("case '%s':\n" % (c,))
+            write(generate_chars_to_token(value, n + 1))
+            write(indent)
+            write('    break;\n')
+        else:
+            write("case '%s': return %s;\n" % (c, value))
+    write(indent)
+    write('}\n')
+    return ''.join(result)
+
+def make_c(infile, outfile='Parser/token.c'):
+    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+    string_to_tok['<>'] = string_to_tok['!=']
+    chars_to_token = {}
+    for string, value in string_to_tok.items():
+        assert 1 <= len(string) <= 3
+        name = tok_names[value]
+        m = chars_to_token.setdefault(len(string), {})
+        for c in string[:-1]:
+            m = m.setdefault(c, {})
+        m[string[-1]] = name
+
+    names = []
+    for value, name in enumerate(tok_names):
+        if value >= ERRORTOKEN:
+            name = '<%s>' % name
+        names.append('    "%s",\n' % name)
+    names.append('    "<N_TOKENS>",\n')
+
+    if update_file(outfile, token_c_template % (
+            ''.join(names),
+            generate_chars_to_token(chars_to_token[1]),
+            generate_chars_to_token(chars_to_token[2]),
+            generate_chars_to_token(chars_to_token[3])
+        )):
+        print("%s regenerated from %s" % (outfile, infile))
+
+
+token_inc_template = """\
+.. Auto-generated by Tools/scripts/generate_token.py
+%s
+.. data:: N_TOKENS
+
+.. data:: NT_OFFSET
+"""
+
+def make_rst(infile, outfile='Doc/library/token-list.inc'):
+    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+    tok_to_string = {value: s for s, value in string_to_tok.items()}
+
+    names = []
+    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
+        names.append('.. data:: %s' % (name,))
+        if value in tok_to_string:
+            names.append('')
+            names.append('   Token value for ``"%s"``.'
+                         % tok_to_string[value])
+        names.append('')
+
+    if update_file(outfile, token_inc_template % '\n'.join(names)):
+        print("%s regenerated from %s" % (outfile, infile))
+
+
+token_py_template = '''\
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
+
+__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
+
+%s
+N_TOKENS = %d
+# Special definitions for cooperation with parser
+NT_OFFSET = %d
+
+tok_name = {value: name
+            for name, value in globals().items()
+            if isinstance(value, int) and not name.startswith('_')}
+__all__.extend(tok_name.values())
+
+EXACT_TOKEN_TYPES = {
+%s
+}
+
+def ISTERMINAL(x):
+    return x < NT_OFFSET
+
+def ISNONTERMINAL(x):
+    return x >= NT_OFFSET
+
+def ISEOF(x):
+    return x == ENDMARKER
+'''
+
+def make_py(infile, outfile='Lib/token.py'):
+    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
+
+    constants = []
+    for value, name in enumerate(tok_names):
+        constants.append('%s = %d' % (name, value))
+    constants.insert(ERRORTOKEN,
+        "# These aren't used by the C tokenizer but are needed for tokenize.py")
+
+    token_types = []
+    for s, value in sorted(string_to_tok.items()):
+        token_types.append('    %r: %s,' % (s, tok_names[value]))
+
+    if update_file(outfile, token_py_template % (
+            '\n'.join(constants),
+            len(tok_names),
+            NT_OFFSET,
+            '\n'.join(token_types),
+        )):
+        print("%s regenerated from %s" % (outfile, infile))
+
+
+def main(op, infile='Grammar/Tokens', *args):
+    make = globals()['make_' + op]
+    make(infile, *args)
+
+
+if __name__ == '__main__':
+    import sys
+    main(*sys.argv[1:])
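
To close the picture (not part of the commit itself), token_py_template above
yields a Lib/token.py whose public names can be exercised roughly as follows;
the concrete values printed depend on the current Grammar/Tokens:

# Hedged sketch of using a regenerated Lib/token.py; the names come from
# token_py_template above, the numeric values are illustrative.
import token

print(token.tok_name[token.NAME])        # -> 'NAME'
print(token.ISTERMINAL(token.NAME))      # -> True, terminals sit below NT_OFFSET
print(token.EXACT_TOKEN_TYPES['!='])     # -> the numeric value of token.NOTEQUAL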