author    | Serhiy Storchaka <storchaka@gmail.com> | 2018-12-22 09:18:40 (GMT)
committer | GitHub <noreply@github.com>            | 2018-12-22 09:18:40 (GMT)
commit    | 8ac658114dec4964479baecfbc439fceb40eaa79 (patch)
tree      | e66c4c3beda293a6fdf01763306697d15d0af157
parent    | c1b4b0f6160e1919394586f44b12538505fed300 (diff)
bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)
"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on the read-only sources tree.
"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of been executable itself.
Added new make targets "regen-token" and "regen-symbol" which are now
dependencies of "regen-all".
The documentation now contains the strings for operators and punctuation tokens.
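Each non-blank, non-comment line of "Grammar/Tokens" is a token name, optionally followed by the quoted operator or punctuation string; the generator assigns token values by position. A simplified sketch of that parsing step (the real load_tokens() in "Tools/scripts/generate_token.py", shown in full in the diff below, additionally records the index of ERRORTOKEN):

```python
def load_tokens(path):
    tok_names = []        # token names, in numeric (positional) order
    string_to_tok = {}    # operator/punctuation string -> token value
    with open(path) as fp:
        for line in fp:
            line = line.split('#', 1)[0].strip()   # drop comments and blank lines
            if not line:
                continue
            fields = line.split()
            name, value = fields[0], len(tok_names)
            if len(fields) > 1:
                # e.g. "PLUSEQUAL '+='" maps the string '+=' to the value of PLUSEQUAL
                string_to_tok[eval(fields[1])] = value
            tok_names.append(name)
    return tok_names, string_to_tok
```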
-rw-r--r--              | .gitattributes                                                               |   4
-rw-r--r--              | Doc/library/token-list.inc                                                   | 206
-rw-r--r--              | Doc/library/token.rst                                                        |  59
-rw-r--r--              | Grammar/Tokens                                                               |  62
-rw-r--r--              | Include/token.h                                                              |  11
-rw-r--r-- [-rwxr-xr-x] | Lib/symbol.py                                                                |  20
-rw-r--r--              | Lib/test/test_symbol.py                                                      |   5
-rw-r--r--              | Lib/test/test_tokenize.py                                                    |   2
-rw-r--r--              | Lib/token.py                                                                 | 134
-rw-r--r--              | Lib/tokenize.py                                                              |  66
-rw-r--r--              | Makefile.pre.in                                                              |  36
-rw-r--r--              | Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst |   2
-rw-r--r--              | PCbuild/pythoncore.vcxproj                                                   |   1
-rw-r--r--              | PCbuild/pythoncore.vcxproj.filters                                           |   3
-rw-r--r--              | Parser/token.c                                                               | 233
-rw-r--r--              | Parser/tokenizer.c                                                           | 237
-rwxr-xr-x              | Tools/scripts/generate_symbol_py.py                                          |  53
-rw-r--r--              | Tools/scripts/generate_token.py                                              | 268
18 files changed, 940 insertions, 462 deletions
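On the Python side, the practical effect is that "Lib/token.py" now owns the operator-string table that "Lib/tokenize.py" previously defined, and the numeric token values follow the order of "Grammar/Tokens". A quick check against a build containing this change (values as generated by this commit):

```python
>>> import token
>>> token.EXACT_TOKEN_TYPES['**=']
46
>>> token.tok_name[46]
'DOUBLESTAREQUAL'
```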
diff --git a/.gitattributes b/.gitattributes index 16237bb..c9a54fb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -55,3 +55,7 @@ Include/opcode.h linguist-generated=true Python/opcode_targets.h linguist-generated=true Objects/typeslots.inc linguist-generated=true Modules/unicodedata_db.h linguist-generated=true +Doc/library/token-list.inc linguist-generated=true +Include/token.h linguist-generated=true +Lib/token.py linguist-generated=true +Parser/token.c linguist-generated=true diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc new file mode 100644 index 0000000..cd6e0f2 --- /dev/null +++ b/Doc/library/token-list.inc @@ -0,0 +1,206 @@ +.. Auto-generated by Tools/scripts/generate_token.py +.. data:: ENDMARKER + +.. data:: NAME + +.. data:: NUMBER + +.. data:: STRING + +.. data:: NEWLINE + +.. data:: INDENT + +.. data:: DEDENT + +.. data:: LPAR + + Token value for ``"("``. + +.. data:: RPAR + + Token value for ``")"``. + +.. data:: LSQB + + Token value for ``"["``. + +.. data:: RSQB + + Token value for ``"]"``. + +.. data:: COLON + + Token value for ``":"``. + +.. data:: COMMA + + Token value for ``","``. + +.. data:: SEMI + + Token value for ``";"``. + +.. data:: PLUS + + Token value for ``"+"``. + +.. data:: MINUS + + Token value for ``"-"``. + +.. data:: STAR + + Token value for ``"*"``. + +.. data:: SLASH + + Token value for ``"/"``. + +.. data:: VBAR + + Token value for ``"|"``. + +.. data:: AMPER + + Token value for ``"&"``. + +.. data:: LESS + + Token value for ``"<"``. + +.. data:: GREATER + + Token value for ``">"``. + +.. data:: EQUAL + + Token value for ``"="``. + +.. data:: DOT + + Token value for ``"."``. + +.. data:: PERCENT + + Token value for ``"%"``. + +.. data:: LBRACE + + Token value for ``"{"``. + +.. data:: RBRACE + + Token value for ``"}"``. + +.. data:: EQEQUAL + + Token value for ``"=="``. + +.. data:: NOTEQUAL + + Token value for ``"!="``. + +.. data:: LESSEQUAL + + Token value for ``"<="``. + +.. data:: GREATEREQUAL + + Token value for ``">="``. + +.. data:: TILDE + + Token value for ``"~"``. + +.. data:: CIRCUMFLEX + + Token value for ``"^"``. + +.. data:: LEFTSHIFT + + Token value for ``"<<"``. + +.. data:: RIGHTSHIFT + + Token value for ``">>"``. + +.. data:: DOUBLESTAR + + Token value for ``"**"``. + +.. data:: PLUSEQUAL + + Token value for ``"+="``. + +.. data:: MINEQUAL + + Token value for ``"-="``. + +.. data:: STAREQUAL + + Token value for ``"*="``. + +.. data:: SLASHEQUAL + + Token value for ``"/="``. + +.. data:: PERCENTEQUAL + + Token value for ``"%="``. + +.. data:: AMPEREQUAL + + Token value for ``"&="``. + +.. data:: VBAREQUAL + + Token value for ``"|="``. + +.. data:: CIRCUMFLEXEQUAL + + Token value for ``"^="``. + +.. data:: LEFTSHIFTEQUAL + + Token value for ``"<<="``. + +.. data:: RIGHTSHIFTEQUAL + + Token value for ``">>="``. + +.. data:: DOUBLESTAREQUAL + + Token value for ``"**="``. + +.. data:: DOUBLESLASH + + Token value for ``"//"``. + +.. data:: DOUBLESLASHEQUAL + + Token value for ``"//="``. + +.. data:: AT + + Token value for ``"@"``. + +.. data:: ATEQUAL + + Token value for ``"@="``. + +.. data:: RARROW + + Token value for ``"->"``. + +.. data:: ELLIPSIS + + Token value for ``"..."``. + +.. data:: OP + +.. data:: ERRORTOKEN + +.. data:: N_TOKENS + +.. data:: NT_OFFSET diff --git a/Doc/library/token.rst b/Doc/library/token.rst index 3739910..5358eb5 100644 --- a/Doc/library/token.rst +++ b/Doc/library/token.rst @@ -44,64 +44,7 @@ functions. The functions mirror definitions in the Python C header files. 
The token constants are: -.. data:: ENDMARKER - NAME - NUMBER - STRING - NEWLINE - INDENT - DEDENT - LPAR - RPAR - LSQB - RSQB - COLON - COMMA - SEMI - PLUS - MINUS - STAR - SLASH - VBAR - AMPER - LESS - GREATER - EQUAL - DOT - PERCENT - LBRACE - RBRACE - EQEQUAL - NOTEQUAL - LESSEQUAL - GREATEREQUAL - TILDE - CIRCUMFLEX - LEFTSHIFT - RIGHTSHIFT - DOUBLESTAR - PLUSEQUAL - MINEQUAL - STAREQUAL - SLASHEQUAL - PERCENTEQUAL - AMPEREQUAL - VBAREQUAL - CIRCUMFLEXEQUAL - LEFTSHIFTEQUAL - RIGHTSHIFTEQUAL - DOUBLESTAREQUAL - DOUBLESLASH - DOUBLESLASHEQUAL - AT - ATEQUAL - RARROW - ELLIPSIS - OP - ERRORTOKEN - N_TOKENS - NT_OFFSET - +.. include:: token-list.inc The following token type values aren't used by the C tokenizer but are needed for the :mod:`tokenize` module. diff --git a/Grammar/Tokens b/Grammar/Tokens new file mode 100644 index 0000000..9595673 --- /dev/null +++ b/Grammar/Tokens @@ -0,0 +1,62 @@ +ENDMARKER +NAME +NUMBER +STRING +NEWLINE +INDENT +DEDENT + +LPAR '(' +RPAR ')' +LSQB '[' +RSQB ']' +COLON ':' +COMMA ',' +SEMI ';' +PLUS '+' +MINUS '-' +STAR '*' +SLASH '/' +VBAR '|' +AMPER '&' +LESS '<' +GREATER '>' +EQUAL '=' +DOT '.' +PERCENT '%' +LBRACE '{' +RBRACE '}' +EQEQUAL '==' +NOTEQUAL '!=' +LESSEQUAL '<=' +GREATEREQUAL '>=' +TILDE '~' +CIRCUMFLEX '^' +LEFTSHIFT '<<' +RIGHTSHIFT '>>' +DOUBLESTAR '**' +PLUSEQUAL '+=' +MINEQUAL '-=' +STAREQUAL '*=' +SLASHEQUAL '/=' +PERCENTEQUAL '%=' +AMPEREQUAL '&=' +VBAREQUAL '|=' +CIRCUMFLEXEQUAL '^=' +LEFTSHIFTEQUAL '<<=' +RIGHTSHIFTEQUAL '>>=' +DOUBLESTAREQUAL '**=' +DOUBLESLASH '//' +DOUBLESLASHEQUAL '//=' +AT '@' +ATEQUAL '@=' +RARROW '->' +ELLIPSIS '...' + +OP +ERRORTOKEN + +# These aren't used by the C tokenizer but are needed for tokenize.py +COMMENT +NL +ENCODING diff --git a/Include/token.h b/Include/token.h index cd1cd00..2d491e6 100644 --- a/Include/token.h +++ b/Include/token.h @@ -1,3 +1,4 @@ +/* Auto-generated by Tools/scripts/generate_token.py */ /* Token types */ #ifndef Py_LIMITED_API @@ -62,25 +63,19 @@ extern "C" { #define ATEQUAL 50 #define RARROW 51 #define ELLIPSIS 52 -/* Don't forget to update the table _PyParser_TokenNames in tokenizer.c! */ #define OP 53 #define ERRORTOKEN 54 -/* These aren't used by the C tokenizer but are needed for tokenize.py */ -#define COMMENT 55 -#define NL 56 -#define ENCODING 57 #define N_TOKENS 58 +#define NT_OFFSET 256 /* Special definitions for cooperation with parser */ -#define NT_OFFSET 256 - #define ISTERMINAL(x) ((x) < NT_OFFSET) #define ISNONTERMINAL(x) ((x) >= NT_OFFSET) #define ISEOF(x) ((x) == ENDMARKER) -PyAPI_DATA(const char *) _PyParser_TokenNames[]; /* Token names */ +PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */ PyAPI_FUNC(int) PyToken_OneChar(int); PyAPI_FUNC(int) PyToken_TwoChars(int, int); PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int); diff --git a/Lib/symbol.py b/Lib/symbol.py index dc7dcba..40d0ed1 100755..100644 --- a/Lib/symbol.py +++ b/Lib/symbol.py @@ -1,5 +1,3 @@ -#! /usr/bin/env python3 - """Non-terminal symbols of Python grammar (from "graminit.h").""" # This file is automatically generated; please don't muck it up! 
@@ -7,7 +5,11 @@ # To update the symbols in this file, 'cd' to the top directory of # the python source tree after building the interpreter and run: # -# ./python Lib/symbol.py +# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py +# +# or just +# +# make regen-symbol #--start constants-- single_input = 256 @@ -103,14 +105,4 @@ sym_name = {} for _name, _value in list(globals().items()): if type(_value) is type(0): sym_name[_value] = _name - - -def _main(): - import sys - import token - if len(sys.argv) == 1: - sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"] - token._main() - -if __name__ == "__main__": - _main() +del _name, _value diff --git a/Lib/test/test_symbol.py b/Lib/test/test_symbol.py index c1306f5..ed86aec 100644 --- a/Lib/test/test_symbol.py +++ b/Lib/test/test_symbol.py @@ -6,6 +6,9 @@ import subprocess SYMBOL_FILE = support.findfile('symbol.py') +GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__), + '..', '..', 'Tools', 'scripts', + 'generate_symbol_py.py') GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), '..', '..', 'Include', 'graminit.h') TEST_PY_FILE = 'symbol_test.py' @@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase): def _generate_symbols(self, grammar_file, target_symbol_py_file): proc = subprocess.Popen([sys.executable, - SYMBOL_FILE, + GEN_SYMBOL_FILE, grammar_file, target_symbol_py_file], stderr=subprocess.PIPE) stderr = proc.communicate()[1] diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index ff14479..04a1254 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase): testfiles = random.sample(testfiles, 10) for testfile in testfiles: + if support.verbose >= 2: + print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): self.check_roundtrip(f) diff --git a/Lib/token.py b/Lib/token.py index ba13205..5af7e6b 100644 --- a/Lib/token.py +++ b/Lib/token.py @@ -1,15 +1,8 @@ -"""Token constants (from "token.h").""" +"""Token constants.""" +# Auto-generated by Tools/scripts/generate_token.py __all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF'] -# This file is automatically generated; please don't muck it up! -# -# To update the symbols in this file, 'cd' to the top directory of -# the python source tree after building the interpreter and run: -# -# ./python Lib/token.py - -#--start constants-- ENDMARKER = 0 NAME = 1 NUMBER = 2 @@ -63,23 +56,70 @@ AT = 49 ATEQUAL = 50 RARROW = 51 ELLIPSIS = 52 -# Don't forget to update the table _PyParser_TokenNames in tokenizer.c! 
OP = 53 -ERRORTOKEN = 54 # These aren't used by the C tokenizer but are needed for tokenize.py +ERRORTOKEN = 54 COMMENT = 55 NL = 56 ENCODING = 57 N_TOKENS = 58 # Special definitions for cooperation with parser NT_OFFSET = 256 -#--end constants-- tok_name = {value: name for name, value in globals().items() if isinstance(value, int) and not name.startswith('_')} __all__.extend(tok_name.values()) +EXACT_TOKEN_TYPES = { + '!=': NOTEQUAL, + '%': PERCENT, + '%=': PERCENTEQUAL, + '&': AMPER, + '&=': AMPEREQUAL, + '(': LPAR, + ')': RPAR, + '*': STAR, + '**': DOUBLESTAR, + '**=': DOUBLESTAREQUAL, + '*=': STAREQUAL, + '+': PLUS, + '+=': PLUSEQUAL, + ',': COMMA, + '-': MINUS, + '-=': MINEQUAL, + '->': RARROW, + '.': DOT, + '...': ELLIPSIS, + '/': SLASH, + '//': DOUBLESLASH, + '//=': DOUBLESLASHEQUAL, + '/=': SLASHEQUAL, + ':': COLON, + ';': SEMI, + '<': LESS, + '<<': LEFTSHIFT, + '<<=': LEFTSHIFTEQUAL, + '<=': LESSEQUAL, + '=': EQUAL, + '==': EQEQUAL, + '>': GREATER, + '>=': GREATEREQUAL, + '>>': RIGHTSHIFT, + '>>=': RIGHTSHIFTEQUAL, + '@': AT, + '@=': ATEQUAL, + '[': LSQB, + ']': RSQB, + '^': CIRCUMFLEX, + '^=': CIRCUMFLEXEQUAL, + '{': LBRACE, + '|': VBAR, + '|=': VBAREQUAL, + '}': RBRACE, + '~': TILDE, +} + def ISTERMINAL(x): return x < NT_OFFSET @@ -88,73 +128,3 @@ def ISNONTERMINAL(x): def ISEOF(x): return x == ENDMARKER - - -def _main(): - import re - import sys - args = sys.argv[1:] - inFileName = args and args[0] or "Include/token.h" - outFileName = "Lib/token.py" - if len(args) > 1: - outFileName = args[1] - try: - fp = open(inFileName) - except OSError as err: - sys.stdout.write("I/O error: %s\n" % str(err)) - sys.exit(1) - with fp: - lines = fp.read().split("\n") - prog = re.compile( - r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)", - re.IGNORECASE) - comment_regex = re.compile( - r"^\s*/\*\s*(.+?)\s*\*/\s*$", - re.IGNORECASE) - - tokens = {} - prev_val = None - for line in lines: - match = prog.match(line) - if match: - name, val = match.group(1, 2) - val = int(val) - tokens[val] = {'token': name} # reverse so we can sort them... 
- prev_val = val - else: - comment_match = comment_regex.match(line) - if comment_match and prev_val is not None: - comment = comment_match.group(1) - tokens[prev_val]['comment'] = comment - keys = sorted(tokens.keys()) - # load the output skeleton from the target: - try: - fp = open(outFileName) - except OSError as err: - sys.stderr.write("I/O error: %s\n" % str(err)) - sys.exit(2) - with fp: - format = fp.read().split("\n") - try: - start = format.index("#--start constants--") + 1 - end = format.index("#--end constants--") - except ValueError: - sys.stderr.write("target does not contain format markers") - sys.exit(3) - lines = [] - for key in keys: - lines.append("%s = %d" % (tokens[key]["token"], key)) - if "comment" in tokens[key]: - lines.append("# %s" % tokens[key]["comment"]) - format[start:end] = lines - try: - fp = open(outFileName, 'w') - except OSError as err: - sys.stderr.write("I/O error: %s\n" % str(err)) - sys.exit(4) - with fp: - fp.write("\n".join(format)) - - -if __name__ == "__main__": - _main() diff --git a/Lib/tokenize.py b/Lib/tokenize.py index fce010b..cf1ecc9 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -32,6 +32,7 @@ import itertools as _itertools import re import sys from token import * +from token import EXACT_TOKEN_TYPES cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) @@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", "untokenize", "TokenInfo"] del token -EXACT_TOKEN_TYPES = { - '(': LPAR, - ')': RPAR, - '[': LSQB, - ']': RSQB, - ':': COLON, - ',': COMMA, - ';': SEMI, - '+': PLUS, - '-': MINUS, - '*': STAR, - '/': SLASH, - '|': VBAR, - '&': AMPER, - '<': LESS, - '>': GREATER, - '=': EQUAL, - '.': DOT, - '%': PERCENT, - '{': LBRACE, - '}': RBRACE, - '==': EQEQUAL, - '!=': NOTEQUAL, - '<=': LESSEQUAL, - '>=': GREATEREQUAL, - '~': TILDE, - '^': CIRCUMFLEX, - '<<': LEFTSHIFT, - '>>': RIGHTSHIFT, - '**': DOUBLESTAR, - '+=': PLUSEQUAL, - '-=': MINEQUAL, - '*=': STAREQUAL, - '/=': SLASHEQUAL, - '%=': PERCENTEQUAL, - '&=': AMPEREQUAL, - '|=': VBAREQUAL, - '^=': CIRCUMFLEXEQUAL, - '<<=': LEFTSHIFTEQUAL, - '>>=': RIGHTSHIFTEQUAL, - '**=': DOUBLESTAREQUAL, - '//': DOUBLESLASH, - '//=': DOUBLESLASHEQUAL, - '...': ELLIPSIS, - '->': RARROW, - '@': AT, - '@=': ATEQUAL, -} - class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): def __repr__(self): annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) @@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""') String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') -# Because of leftmost-then-longest match semantics, be sure to put the -# longest operators first (e.g., if = came before ==, == would get -# recognized as two instances of =). -Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=", - r"//=?", r"->", - r"[+\-*/%&@|^=<>]=?", - r"~") - -Bracket = '[][(){}]' -Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]') -Funny = group(Operator, Bracket, Special) +# Sorting in reverse order puts the long operators before their prefixes. +# Otherwise if = came before ==, == would get recognized as two instances +# of =. 
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True))) +Funny = group(r'\r?\n', Special) PlainToken = group(Number, Funny, String, Name) Token = Ignore + PlainToken diff --git a/Makefile.pre.in b/Makefile.pre.in index 518602b..04312e1 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -302,6 +302,7 @@ POBJS= \ Parser/metagrammar.o \ Parser/firstsets.o \ Parser/grammar.o \ + Parser/token.o \ Parser/pgen.o PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o @@ -559,7 +560,7 @@ coverage-lcov: @echo # Force regeneration of parser and importlib -coverage-report: regen-grammar regen-importlib +coverage-report: regen-grammar regen-token regen-importlib @ # build with coverage info $(MAKE) coverage @ # run tests, ignore failures @@ -741,7 +742,7 @@ regen-importlib: Programs/_freeze_importlib # Regenerate all generated files regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \ - regen-ast regen-importlib clinic + regen-token regen-symbol regen-ast regen-importlib clinic ############################################################################ # Special rules for object files @@ -849,6 +850,37 @@ regen-opcode: $(srcdir)/Include/opcode.h.new $(UPDATE_FILE) $(srcdir)/Include/opcode.h $(srcdir)/Include/opcode.h.new +.PHONY: regen-token +regen-token: + # Regenerate Doc/library/token-list.inc from Grammar/Tokens + # using Tools/scripts/generate_token.py + $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \ + $(srcdir)/Grammar/Tokens \ + $(srcdir)/Doc/library/token-list.inc + # Regenerate Include/token.h from Grammar/Tokens + # using Tools/scripts/generate_token.py + $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \ + $(srcdir)/Grammar/Tokens \ + $(srcdir)/Include/token.h + # Regenerate Parser/token.c from Grammar/Tokens + # using Tools/scripts/generate_token.py + $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \ + $(srcdir)/Grammar/Tokens \ + $(srcdir)/Parser/token.c + # Regenerate Lib/token.py from Grammar/Tokens + # using Tools/scripts/generate_token.py + $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py py \ + $(srcdir)/Grammar/Tokens \ + $(srcdir)/Lib/token.py + +.PHONY: regen-symbol +regen-symbol: $(srcdir)/Include/graminit.h + # Regenerate Lib/symbol.py from Include/graminit.h + # using Tools/scripts/generate_symbol_py.py + $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_symbol_py.py \ + $(srcdir)/Include/graminit.h \ + $(srcdir)/Lib/symbol.py + Python/compile.o Python/symtable.o Python/ast_unparse.o Python/ast.o: $(srcdir)/Include/graminit.h $(srcdir)/Include/Python-ast.h Python/getplatform.o: $(srcdir)/Python/getplatform.c diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst b/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst new file mode 100644 index 0000000..2118252 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-04-14-11-02-57.bpo-30455.ANRwjo.rst @@ -0,0 +1,2 @@ +The C and Python code and the documentation related to tokens are now generated +from a single source file :file:`Grammar/Tokens`. 
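The "Lib/tokenize.py" hunk above derives its operator pattern from EXACT_TOKEN_TYPES instead of a hand-written regex; sorting the strings in reverse order keeps each long operator ahead of its prefixes, which matters because regex alternation takes the first alternative that matches. A small standalone illustration of that ordering (group() here is a local stand-in for tokenize's helper of the same name):

```python
import re

def group(*choices):
    # stand-in for tokenize.group(): alternation of the given choices
    return '(' + '|'.join(choices) + ')'

ops = ['*', '**', '**=']
naive = re.compile(group(*map(re.escape, sorted(ops))))                 # '*' listed first
fixed = re.compile(group(*map(re.escape, sorted(ops, reverse=True))))   # '**=' listed first

print(naive.match('**=').group())   # '*'   -- the short prefix wins
print(fixed.match('**=').group())   # '**=' -- the longest operator wins
```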
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 78ec9a1..ddf7f49 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -367,6 +367,7 @@ <ClCompile Include="..\Parser\parser.c" /> <ClCompile Include="..\Parser\parsetok.c" /> <ClCompile Include="..\Parser\tokenizer.c" /> + <ClCompile Include="..\Parser\token.c" /> <ClCompile Include="..\PC\invalid_parameter_handler.c" /> <ClCompile Include="..\PC\winreg.c" /> <ClCompile Include="..\PC\config.c" /> diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 5a43a99..77b018f 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -866,6 +866,9 @@ <ClCompile Include="..\Parser\tokenizer.c"> <Filter>Parser</Filter> </ClCompile> + <ClCompile Include="..\Parser\token.c"> + <Filter>Parser</Filter> + </ClCompile> <ClCompile Include="..\PC\winreg.c"> <Filter>PC</Filter> </ClCompile> diff --git a/Parser/token.c b/Parser/token.c new file mode 100644 index 0000000..35519aa --- /dev/null +++ b/Parser/token.c @@ -0,0 +1,233 @@ +/* Auto-generated by Tools/scripts/generate_token.py */ + +#include "Python.h" +#include "token.h" + +/* Token names */ + +const char * const _PyParser_TokenNames[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "LBRACE", + "RBRACE", + "EQEQUAL", + "NOTEQUAL", + "LESSEQUAL", + "GREATEREQUAL", + "TILDE", + "CIRCUMFLEX", + "LEFTSHIFT", + "RIGHTSHIFT", + "DOUBLESTAR", + "PLUSEQUAL", + "MINEQUAL", + "STAREQUAL", + "SLASHEQUAL", + "PERCENTEQUAL", + "AMPEREQUAL", + "VBAREQUAL", + "CIRCUMFLEXEQUAL", + "LEFTSHIFTEQUAL", + "RIGHTSHIFTEQUAL", + "DOUBLESTAREQUAL", + "DOUBLESLASH", + "DOUBLESLASHEQUAL", + "AT", + "ATEQUAL", + "RARROW", + "ELLIPSIS", + "OP", + "<ERRORTOKEN>", + "<COMMENT>", + "<NL>", + "<ENCODING>", + "<N_TOKENS>", +}; + +/* Return the token corresponding to a single character */ + +int +PyToken_OneChar(int c1) +{ + switch (c1) { + case '%': return PERCENT; + case '&': return AMPER; + case '(': return LPAR; + case ')': return RPAR; + case '*': return STAR; + case '+': return PLUS; + case ',': return COMMA; + case '-': return MINUS; + case '.': return DOT; + case '/': return SLASH; + case ':': return COLON; + case ';': return SEMI; + case '<': return LESS; + case '=': return EQUAL; + case '>': return GREATER; + case '@': return AT; + case '[': return LSQB; + case ']': return RSQB; + case '^': return CIRCUMFLEX; + case '{': return LBRACE; + case '|': return VBAR; + case '}': return RBRACE; + case '~': return TILDE; + } + return OP; +} + +int +PyToken_TwoChars(int c1, int c2) +{ + switch (c1) { + case '!': + switch (c2) { + case '=': return NOTEQUAL; + } + break; + case '%': + switch (c2) { + case '=': return PERCENTEQUAL; + } + break; + case '&': + switch (c2) { + case '=': return AMPEREQUAL; + } + break; + case '*': + switch (c2) { + case '*': return DOUBLESTAR; + case '=': return STAREQUAL; + } + break; + case '+': + switch (c2) { + case '=': return PLUSEQUAL; + } + break; + case '-': + switch (c2) { + case '=': return MINEQUAL; + case '>': return RARROW; + } + break; + case '/': + switch (c2) { + case '/': return DOUBLESLASH; + case '=': return SLASHEQUAL; + } + break; + case '<': + switch (c2) { + case '<': return LEFTSHIFT; + case '=': return LESSEQUAL; + case '>': return 
NOTEQUAL; + } + break; + case '=': + switch (c2) { + case '=': return EQEQUAL; + } + break; + case '>': + switch (c2) { + case '=': return GREATEREQUAL; + case '>': return RIGHTSHIFT; + } + break; + case '@': + switch (c2) { + case '=': return ATEQUAL; + } + break; + case '^': + switch (c2) { + case '=': return CIRCUMFLEXEQUAL; + } + break; + case '|': + switch (c2) { + case '=': return VBAREQUAL; + } + break; + } + return OP; +} + +int +PyToken_ThreeChars(int c1, int c2, int c3) +{ + switch (c1) { + case '*': + switch (c2) { + case '*': + switch (c3) { + case '=': return DOUBLESTAREQUAL; + } + break; + } + break; + case '.': + switch (c2) { + case '.': + switch (c3) { + case '.': return ELLIPSIS; + } + break; + } + break; + case '/': + switch (c2) { + case '/': + switch (c3) { + case '=': return DOUBLESLASHEQUAL; + } + break; + } + break; + case '<': + switch (c2) { + case '<': + switch (c3) { + case '=': return LEFTSHIFTEQUAL; + } + break; + } + break; + case '>': + switch (c2) { + case '>': + switch (c3) { + case '=': return RIGHTSHIFTEQUAL; + } + break; + } + break; + } + return OP; +} diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index c246ee2..0e6c1a8 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -48,72 +48,6 @@ static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); -/* Token names */ - -const char *_PyParser_TokenNames[] = { - "ENDMARKER", - "NAME", - "NUMBER", - "STRING", - "NEWLINE", - "INDENT", - "DEDENT", - "LPAR", - "RPAR", - "LSQB", - "RSQB", - "COLON", - "COMMA", - "SEMI", - "PLUS", - "MINUS", - "STAR", - "SLASH", - "VBAR", - "AMPER", - "LESS", - "GREATER", - "EQUAL", - "DOT", - "PERCENT", - "LBRACE", - "RBRACE", - "EQEQUAL", - "NOTEQUAL", - "LESSEQUAL", - "GREATEREQUAL", - "TILDE", - "CIRCUMFLEX", - "LEFTSHIFT", - "RIGHTSHIFT", - "DOUBLESTAR", - "PLUSEQUAL", - "MINEQUAL", - "STAREQUAL", - "SLASHEQUAL", - "PERCENTEQUAL", - "AMPEREQUAL", - "VBAREQUAL", - "CIRCUMFLEXEQUAL", - "LEFTSHIFTEQUAL", - "RIGHTSHIFTEQUAL", - "DOUBLESTAREQUAL", - "DOUBLESLASH", - "DOUBLESLASHEQUAL", - "AT", - "ATEQUAL", - "RARROW", - "ELLIPSIS", - /* This table must match the #defines in token.h! 
*/ - "OP", - "<ERRORTOKEN>", - "COMMENT", - "NL", - "ENCODING", - "<N_TOKENS>" -}; - - /* Create and initialize a new tok_state structure */ static struct tok_state * @@ -1114,177 +1048,6 @@ tok_backup(struct tok_state *tok, int c) } -/* Return the token corresponding to a single character */ - -int -PyToken_OneChar(int c) -{ - switch (c) { - case '(': return LPAR; - case ')': return RPAR; - case '[': return LSQB; - case ']': return RSQB; - case ':': return COLON; - case ',': return COMMA; - case ';': return SEMI; - case '+': return PLUS; - case '-': return MINUS; - case '*': return STAR; - case '/': return SLASH; - case '|': return VBAR; - case '&': return AMPER; - case '<': return LESS; - case '>': return GREATER; - case '=': return EQUAL; - case '.': return DOT; - case '%': return PERCENT; - case '{': return LBRACE; - case '}': return RBRACE; - case '^': return CIRCUMFLEX; - case '~': return TILDE; - case '@': return AT; - default: return OP; - } -} - - -int -PyToken_TwoChars(int c1, int c2) -{ - switch (c1) { - case '=': - switch (c2) { - case '=': return EQEQUAL; - } - break; - case '!': - switch (c2) { - case '=': return NOTEQUAL; - } - break; - case '<': - switch (c2) { - case '>': return NOTEQUAL; - case '=': return LESSEQUAL; - case '<': return LEFTSHIFT; - } - break; - case '>': - switch (c2) { - case '=': return GREATEREQUAL; - case '>': return RIGHTSHIFT; - } - break; - case '+': - switch (c2) { - case '=': return PLUSEQUAL; - } - break; - case '-': - switch (c2) { - case '=': return MINEQUAL; - case '>': return RARROW; - } - break; - case '*': - switch (c2) { - case '*': return DOUBLESTAR; - case '=': return STAREQUAL; - } - break; - case '/': - switch (c2) { - case '/': return DOUBLESLASH; - case '=': return SLASHEQUAL; - } - break; - case '|': - switch (c2) { - case '=': return VBAREQUAL; - } - break; - case '%': - switch (c2) { - case '=': return PERCENTEQUAL; - } - break; - case '&': - switch (c2) { - case '=': return AMPEREQUAL; - } - break; - case '^': - switch (c2) { - case '=': return CIRCUMFLEXEQUAL; - } - break; - case '@': - switch (c2) { - case '=': return ATEQUAL; - } - break; - } - return OP; -} - -int -PyToken_ThreeChars(int c1, int c2, int c3) -{ - switch (c1) { - case '<': - switch (c2) { - case '<': - switch (c3) { - case '=': - return LEFTSHIFTEQUAL; - } - break; - } - break; - case '>': - switch (c2) { - case '>': - switch (c3) { - case '=': - return RIGHTSHIFTEQUAL; - } - break; - } - break; - case '*': - switch (c2) { - case '*': - switch (c3) { - case '=': - return DOUBLESTAREQUAL; - } - break; - } - break; - case '/': - switch (c2) { - case '/': - switch (c3) { - case '=': - return DOUBLESLASHEQUAL; - } - break; - } - break; - case '.': - switch (c2) { - case '.': - switch (c3) { - case '.': - return ELLIPSIS; - } - break; - } - break; - } - return OP; -} - static int syntaxerror(struct tok_state *tok, const char *format, ...) { diff --git a/Tools/scripts/generate_symbol_py.py b/Tools/scripts/generate_symbol_py.py new file mode 100755 index 0000000..9219b09 --- /dev/null +++ b/Tools/scripts/generate_symbol_py.py @@ -0,0 +1,53 @@ +#! /usr/bin/env python3 +# This script generates the symbol.py source file. 
+ +import sys +import re + +def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"): + try: + fp = open(inFileName) + except OSError as err: + sys.stderr.write("I/O error: %s\n" % str(err)) + sys.exit(1) + with fp: + lines = fp.read().split("\n") + prog = re.compile( + "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)", + re.IGNORECASE) + tokens = {} + for line in lines: + match = prog.match(line) + if match: + name, val = match.group(1, 2) + val = int(val) + tokens[val] = name # reverse so we can sort them... + keys = sorted(tokens.keys()) + # load the output skeleton from the target: + try: + fp = open(outFileName) + except OSError as err: + sys.stderr.write("I/O error: %s\n" % str(err)) + sys.exit(2) + with fp: + format = fp.read().split("\n") + try: + start = format.index("#--start constants--") + 1 + end = format.index("#--end constants--") + except ValueError: + sys.stderr.write("target does not contain format markers") + sys.exit(3) + lines = [] + for val in keys: + lines.append("%s = %d" % (tokens[val], val)) + format[start:end] = lines + try: + fp = open(outFileName, 'w') + except OSError as err: + sys.stderr.write("I/O error: %s\n" % str(err)) + sys.exit(4) + with fp: + fp.write("\n".join(format)) + +if __name__ == '__main__': + main(*sys.argv[1:]) diff --git a/Tools/scripts/generate_token.py b/Tools/scripts/generate_token.py new file mode 100644 index 0000000..f2745e8 --- /dev/null +++ b/Tools/scripts/generate_token.py @@ -0,0 +1,268 @@ +#! /usr/bin/env python3 +# This script generates token related files from Grammar/Tokens: +# +# Doc/library/token-list.inc +# Include/token.h +# Parser/token.c +# Lib/token.py + + +NT_OFFSET = 256 + +def load_tokens(path): + tok_names = [] + string_to_tok = {} + ERRORTOKEN = None + with open(path) as fp: + for line in fp: + line = line.strip() + # strip comments + i = line.find('#') + if i >= 0: + line = line[:i].strip() + if not line: + continue + fields = line.split() + name = fields[0] + value = len(tok_names) + if name == 'ERRORTOKEN': + ERRORTOKEN = value + string = fields[1] if len(fields) > 1 else None + if string: + string = eval(string) + string_to_tok[string] = value + tok_names.append(name) + return tok_names, ERRORTOKEN, string_to_tok + + +def update_file(file, content): + try: + with open(file, 'r') as fobj: + if fobj.read() == content: + return False + except (OSError, ValueError): + pass + with open(file, 'w') as fobj: + fobj.write(content) + return True + + +token_h_template = """\ +/* Auto-generated by Tools/scripts/generate_token.py */ + +/* Token types */ +#ifndef Py_LIMITED_API +#ifndef Py_TOKEN_H +#define Py_TOKEN_H +#ifdef __cplusplus +extern "C" { +#endif + +#undef TILDE /* Prevent clash of our definition with system macro. 
Ex AIX, ioctl.h */ + +%s\ +#define N_TOKENS %d +#define NT_OFFSET %d + +/* Special definitions for cooperation with parser */ + +#define ISTERMINAL(x) ((x) < NT_OFFSET) +#define ISNONTERMINAL(x) ((x) >= NT_OFFSET) +#define ISEOF(x) ((x) == ENDMARKER) + + +PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */ +PyAPI_FUNC(int) PyToken_OneChar(int); +PyAPI_FUNC(int) PyToken_TwoChars(int, int); +PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_TOKEN_H */ +#endif /* Py_LIMITED_API */ +""" + +def make_h(infile, outfile='Include/token.h'): + tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile) + + defines = [] + for value, name in enumerate(tok_names[:ERRORTOKEN + 1]): + defines.append("#define %-15s %d\n" % (name, value)) + + if update_file(outfile, token_h_template % ( + ''.join(defines), + len(tok_names), + NT_OFFSET + )): + print("%s regenerated from %s" % (outfile, infile)) + + +token_c_template = """\ +/* Auto-generated by Tools/scripts/generate_token.py */ + +#include "Python.h" +#include "token.h" + +/* Token names */ + +const char * const _PyParser_TokenNames[] = { +%s\ +}; + +/* Return the token corresponding to a single character */ + +int +PyToken_OneChar(int c1) +{ +%s\ + return OP; +} + +int +PyToken_TwoChars(int c1, int c2) +{ +%s\ + return OP; +} + +int +PyToken_ThreeChars(int c1, int c2, int c3) +{ +%s\ + return OP; +} +""" + +def generate_chars_to_token(mapping, n=1): + result = [] + write = result.append + indent = ' ' * n + write(indent) + write('switch (c%d) {\n' % (n,)) + for c in sorted(mapping): + write(indent) + value = mapping[c] + if isinstance(value, dict): + write("case '%s':\n" % (c,)) + write(generate_chars_to_token(value, n + 1)) + write(indent) + write(' break;\n') + else: + write("case '%s': return %s;\n" % (c, value)) + write(indent) + write('}\n') + return ''.join(result) + +def make_c(infile, outfile='Parser/token.c'): + tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile) + string_to_tok['<>'] = string_to_tok['!='] + chars_to_token = {} + for string, value in string_to_tok.items(): + assert 1 <= len(string) <= 3 + name = tok_names[value] + m = chars_to_token.setdefault(len(string), {}) + for c in string[:-1]: + m = m.setdefault(c, {}) + m[string[-1]] = name + + names = [] + for value, name in enumerate(tok_names): + if value >= ERRORTOKEN: + name = '<%s>' % name + names.append(' "%s",\n' % name) + names.append(' "<N_TOKENS>",\n') + + if update_file(outfile, token_c_template % ( + ''.join(names), + generate_chars_to_token(chars_to_token[1]), + generate_chars_to_token(chars_to_token[2]), + generate_chars_to_token(chars_to_token[3]) + )): + print("%s regenerated from %s" % (outfile, infile)) + + +token_inc_template = """\ +.. Auto-generated by Tools/scripts/generate_token.py +%s +.. data:: N_TOKENS + +.. data:: NT_OFFSET +""" + +def make_rst(infile, outfile='Doc/library/token-list.inc'): + tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile) + tok_to_string = {value: s for s, value in string_to_tok.items()} + + names = [] + for value, name in enumerate(tok_names[:ERRORTOKEN + 1]): + names.append('.. data:: %s' % (name,)) + if value in tok_to_string: + names.append('') + names.append(' Token value for ``"%s"``.' 
% tok_to_string[value]) + names.append('') + + if update_file(outfile, token_inc_template % '\n'.join(names)): + print("%s regenerated from %s" % (outfile, infile)) + + +token_py_template = '''\ +"""Token constants.""" +# Auto-generated by Tools/scripts/generate_token.py + +__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF'] + +%s +N_TOKENS = %d +# Special definitions for cooperation with parser +NT_OFFSET = %d + +tok_name = {value: name + for name, value in globals().items() + if isinstance(value, int) and not name.startswith('_')} +__all__.extend(tok_name.values()) + +EXACT_TOKEN_TYPES = { +%s +} + +def ISTERMINAL(x): + return x < NT_OFFSET + +def ISNONTERMINAL(x): + return x >= NT_OFFSET + +def ISEOF(x): + return x == ENDMARKER +''' + +def make_py(infile, outfile='Lib/token.py'): + tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile) + + constants = [] + for value, name in enumerate(tok_names): + constants.append('%s = %d' % (name, value)) + constants.insert(ERRORTOKEN, + "# These aren't used by the C tokenizer but are needed for tokenize.py") + + token_types = [] + for s, value in sorted(string_to_tok.items()): + token_types.append(' %r: %s,' % (s, tok_names[value])) + + if update_file(outfile, token_py_template % ( + '\n'.join(constants), + len(tok_names), + NT_OFFSET, + '\n'.join(token_types), + )): + print("%s regenerated from %s" % (outfile, infile)) + + +def main(op, infile='Grammar/Tokens', *args): + make = globals()['make_' + op] + make(infile, *args) + + +if __name__ == '__main__': + import sys + main(*sys.argv[1:]) |
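For completeness, the generated table is also what tokenize uses to report exact operator types at runtime; a quick check one could run against a build containing this change (token values as generated by this commit):

```python
import io
import tokenize

# Operators are still reported as OP; exact_type resolves them through
# EXACT_TOKEN_TYPES, which now lives in Lib/token.py.
tok = next(t for t in tokenize.generate_tokens(io.StringIO('a **= 2').readline)
           if t.string == '**=')
print(tok.type, tok.exact_type)   # 53 46  (OP, DOUBLESTAREQUAL)
```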