author     Serhiy Storchaka <storchaka@gmail.com>      2018-12-22 09:18:40 (GMT)
committer  GitHub <noreply@github.com>                 2018-12-22 09:18:40 (GMT)
commit     8ac658114dec4964479baecfbc439fceb40eaa79 (patch)
tree       e66c4c3beda293a6fdf01763306697d15d0af157 /Lib
parent     c1b4b0f6160e1919394586f44b12538505fed300 (diff)
bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)
"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on the read-only sources tree.
"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of been executable itself.
Added new make targets "regen-token" and "regen-symbol" which are now
dependencies of "regen-all".
The documentation contains now strings for operators and punctuation tokens.
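
A minimal usage sketch (illustrative only, not part of this patch): with the
exact-token data now living in "Lib/token.py", the mapping that "Lib/tokenize.py"
previously kept private can be imported from the token module and used to look
up the generated constants for operator strings.

    from token import EXACT_TOKEN_TYPES, tok_name

    # '->' maps to RARROW (value 51 in this revision of Grammar/Tokens).
    tok_type = EXACT_TOKEN_TYPES['->']
    print(tok_type, tok_name[tok_type])   # 51 RARROW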
Diffstat (limited to 'Lib')
-rw-r--r--[-rwxr-xr-x]   Lib/symbol.py               20
-rw-r--r--               Lib/test/test_symbol.py      5
-rw-r--r--               Lib/test/test_tokenize.py    2
-rw-r--r--               Lib/token.py               134
-rw-r--r--               Lib/tokenize.py             66
5 files changed, 70 insertions, 157 deletions
diff --git a/Lib/symbol.py b/Lib/symbol.py
index dc7dcba..40d0ed1 100755..100644
--- a/Lib/symbol.py
+++ b/Lib/symbol.py
@@ -1,5 +1,3 @@
-#! /usr/bin/env python3
-
 """Non-terminal symbols of Python grammar (from "graminit.h")."""
 
 # This file is automatically generated; please don't muck it up!
@@ -7,7 +5,11 @@
 # To update the symbols in this file, 'cd' to the top directory of
 # the python source tree after building the interpreter and run:
 #
-# ./python Lib/symbol.py
+# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
+#
+# or just
+#
+# make regen-symbol
 
 #--start constants--
 single_input = 256
@@ -103,14 +105,4 @@ sym_name = {}
 for _name, _value in list(globals().items()):
     if type(_value) is type(0):
         sym_name[_value] = _name
-
-
-def _main():
-    import sys
-    import token
-    if len(sys.argv) == 1:
-        sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
-    token._main()
-
-if __name__ == "__main__":
-    _main()
+del _name, _value
diff --git a/Lib/test/test_symbol.py b/Lib/test/test_symbol.py
index c1306f5..ed86aec 100644
--- a/Lib/test/test_symbol.py
+++ b/Lib/test/test_symbol.py
@@ -6,6 +6,9 @@ import subprocess
 
 
 SYMBOL_FILE = support.findfile('symbol.py')
+GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
+                               '..', '..', 'Tools', 'scripts',
+                               'generate_symbol_py.py')
 GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
                             '..', '..', 'Include', 'graminit.h')
 TEST_PY_FILE = 'symbol_test.py'
@@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase):
 
     def _generate_symbols(self, grammar_file, target_symbol_py_file):
         proc = subprocess.Popen([sys.executable,
-                                 SYMBOL_FILE,
+                                 GEN_SYMBOL_FILE,
                                  grammar_file,
                                  target_symbol_py_file], stderr=subprocess.PIPE)
         stderr = proc.communicate()[1]
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ff14479..04a1254 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase):
             testfiles = random.sample(testfiles, 10)
 
         for testfile in testfiles:
+            if support.verbose >= 2:
+                print('tokenize', testfile)
             with open(testfile, 'rb') as f:
                 with self.subTest(file=testfile):
                     self.check_roundtrip(f)
diff --git a/Lib/token.py b/Lib/token.py
index ba13205..5af7e6b 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -1,15 +1,8 @@
-"""Token constants (from "token.h")."""
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
 
 __all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
 
-# This file is automatically generated; please don't muck it up!
-#
-# To update the symbols in this file, 'cd' to the top directory of
-# the python source tree after building the interpreter and run:
-#
-# ./python Lib/token.py
-
-#--start constants--
 ENDMARKER = 0
 NAME = 1
 NUMBER = 2
@@ -63,23 +56,70 @@ AT = 49
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
-# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
-ERRORTOKEN = 54
 # These aren't used by the C tokenizer but are needed for tokenize.py
+ERRORTOKEN = 54
 COMMENT = 55
 NL = 56
 ENCODING = 57
 N_TOKENS = 58
 # Special definitions for cooperation with parser
 NT_OFFSET = 256
-#--end constants--
 
 tok_name = {value: name
             for name, value in globals().items()
             if isinstance(value, int) and not name.startswith('_')}
 __all__.extend(tok_name.values())
 
+EXACT_TOKEN_TYPES = {
+    '!=': NOTEQUAL,
+    '%': PERCENT,
+    '%=': PERCENTEQUAL,
+    '&': AMPER,
+    '&=': AMPEREQUAL,
+    '(': LPAR,
+    ')': RPAR,
+    '*': STAR,
+    '**': DOUBLESTAR,
+    '**=': DOUBLESTAREQUAL,
+    '*=': STAREQUAL,
+    '+': PLUS,
+    '+=': PLUSEQUAL,
+    ',': COMMA,
+    '-': MINUS,
+    '-=': MINEQUAL,
+    '->': RARROW,
+    '.': DOT,
+    '...': ELLIPSIS,
+    '/': SLASH,
+    '//': DOUBLESLASH,
+    '//=': DOUBLESLASHEQUAL,
+    '/=': SLASHEQUAL,
+    ':': COLON,
+    ';': SEMI,
+    '<': LESS,
+    '<<': LEFTSHIFT,
+    '<<=': LEFTSHIFTEQUAL,
+    '<=': LESSEQUAL,
+    '=': EQUAL,
+    '==': EQEQUAL,
+    '>': GREATER,
+    '>=': GREATEREQUAL,
+    '>>': RIGHTSHIFT,
+    '>>=': RIGHTSHIFTEQUAL,
+    '@': AT,
+    '@=': ATEQUAL,
+    '[': LSQB,
+    ']': RSQB,
+    '^': CIRCUMFLEX,
+    '^=': CIRCUMFLEXEQUAL,
+    '{': LBRACE,
+    '|': VBAR,
+    '|=': VBAREQUAL,
+    '}': RBRACE,
+    '~': TILDE,
+}
+
 def ISTERMINAL(x):
     return x < NT_OFFSET
 
@@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
 
 def ISEOF(x):
     return x == ENDMARKER
-
-
-def _main():
-    import re
-    import sys
-    args = sys.argv[1:]
-    inFileName = args and args[0] or "Include/token.h"
-    outFileName = "Lib/token.py"
-    if len(args) > 1:
-        outFileName = args[1]
-    try:
-        fp = open(inFileName)
-    except OSError as err:
-        sys.stdout.write("I/O error: %s\n" % str(err))
-        sys.exit(1)
-    with fp:
-        lines = fp.read().split("\n")
-    prog = re.compile(
-        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
-        re.IGNORECASE)
-    comment_regex = re.compile(
-        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
-        re.IGNORECASE)
-
-    tokens = {}
-    prev_val = None
-    for line in lines:
-        match = prog.match(line)
-        if match:
-            name, val = match.group(1, 2)
-            val = int(val)
-            tokens[val] = {'token': name}  # reverse so we can sort them...
-            prev_val = val
-        else:
-            comment_match = comment_regex.match(line)
-            if comment_match and prev_val is not None:
-                comment = comment_match.group(1)
-                tokens[prev_val]['comment'] = comment
-    keys = sorted(tokens.keys())
-    # load the output skeleton from the target:
-    try:
-        fp = open(outFileName)
-    except OSError as err:
-        sys.stderr.write("I/O error: %s\n" % str(err))
-        sys.exit(2)
-    with fp:
-        format = fp.read().split("\n")
-    try:
-        start = format.index("#--start constants--") + 1
-        end = format.index("#--end constants--")
-    except ValueError:
-        sys.stderr.write("target does not contain format markers")
-        sys.exit(3)
-    lines = []
-    for key in keys:
-        lines.append("%s = %d" % (tokens[key]["token"], key))
-        if "comment" in tokens[key]:
-            lines.append("# %s" % tokens[key]["comment"])
-    format[start:end] = lines
-    try:
-        fp = open(outFileName, 'w')
-    except OSError as err:
-        sys.stderr.write("I/O error: %s\n" % str(err))
-        sys.exit(4)
-    with fp:
-        fp.write("\n".join(format))
-
-
-if __name__ == "__main__":
-    _main()
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index fce010b..cf1ecc9 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ import itertools as _itertools
 import re
 import sys
 from token import *
+from token import EXACT_TOKEN_TYPES
 
 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                            "untokenize", "TokenInfo"]
 del token
 
-EXACT_TOKEN_TYPES = {
-    '(': LPAR,
-    ')': RPAR,
-    '[': LSQB,
-    ']': RSQB,
-    ':': COLON,
-    ',': COMMA,
-    ';': SEMI,
-    '+': PLUS,
-    '-': MINUS,
-    '*': STAR,
-    '/': SLASH,
-    '|': VBAR,
-    '&': AMPER,
-    '<': LESS,
-    '>': GREATER,
-    '=': EQUAL,
-    '.': DOT,
-    '%': PERCENT,
-    '{': LBRACE,
-    '}': RBRACE,
-    '==': EQEQUAL,
-    '!=': NOTEQUAL,
-    '<=': LESSEQUAL,
-    '>=': GREATEREQUAL,
-    '~': TILDE,
-    '^': CIRCUMFLEX,
-    '<<': LEFTSHIFT,
-    '>>': RIGHTSHIFT,
-    '**': DOUBLESTAR,
-    '+=': PLUSEQUAL,
-    '-=': MINEQUAL,
-    '*=': STAREQUAL,
-    '/=': SLASHEQUAL,
-    '%=': PERCENTEQUAL,
-    '&=': AMPEREQUAL,
-    '|=': VBAREQUAL,
-    '^=': CIRCUMFLEXEQUAL,
-    '<<=': LEFTSHIFTEQUAL,
-    '>>=': RIGHTSHIFTEQUAL,
-    '**=': DOUBLESTAREQUAL,
-    '//': DOUBLESLASH,
-    '//=': DOUBLESLASHEQUAL,
-    '...': ELLIPSIS,
-    '->': RARROW,
-    '@': AT,
-    '@=': ATEQUAL,
-}
-
 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
     def __repr__(self):
         annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*/%&@|^=<>]=?",
-                 r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
 
 PlainToken = group(Number, Funny, String, Name)
 Token = Ignore + PlainToken
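
As context for the "Lib/tokenize.py" hunk above: the hand-maintained Operator,
Bracket and Special patterns are replaced by an alternation derived from
EXACT_TOKEN_TYPES (with the newline case still handled separately in Funny).
A small self-contained sketch (the group() helper below only mirrors the shape
of tokenize.py's own helper) shows why reverse sorting keeps the longest-match
behaviour:

    import re
    from token import EXACT_TOKEN_TYPES

    def group(*choices):
        # A plain alternation group, like the helper used in tokenize.py.
        return '(' + '|'.join(choices) + ')'

    # In reverse lexicographic order an operator always precedes its own
    # prefixes ('**=' before '**' before '*'), so the leftmost alternative
    # that matches is the longest operator at that position.
    Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))

    print(re.match(Special, '**=').group())   # '**='
    print(re.match(Special, '//').group())    # '//'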