author     Serhiy Storchaka <storchaka@gmail.com>  2018-12-22 09:18:40 (GMT)
committer  GitHub <noreply@github.com>             2018-12-22 09:18:40 (GMT)
commit     8ac658114dec4964479baecfbc439fceb40eaa79 (patch)
tree       e66c4c3beda293a6fdf01763306697d15d0af157 /Lib
parent     c1b4b0f6160e1919394586f44b12538505fed300 (diff)
bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)
"Include/token.h", "Lib/token.py" (containing now some data moved from "Lib/tokenize.py") and new files "Parser/token.c" (containing the code moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by "Tools/scripts/generate_token.py". The script overwrites files only if needed and can be used on the read-only sources tree. "Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py" instead of been executable itself. Added new make targets "regen-token" and "regen-symbol" which are now dependencies of "regen-all". The documentation contains now strings for operators and punctuation tokens.
Diffstat (limited to 'Lib')
-rw-r--r--  (was -rwxr-xr-x)  Lib/symbol.py               20
-rw-r--r--                    Lib/test/test_symbol.py      5
-rw-r--r--                    Lib/test/test_tokenize.py    2
-rw-r--r--                    Lib/token.py               134
-rw-r--r--                    Lib/tokenize.py             66
5 files changed, 70 insertions(+), 157 deletions(-)
diff --git a/Lib/symbol.py b/Lib/symbol.py
index dc7dcba..40d0ed1 100755..100644
--- a/Lib/symbol.py
+++ b/Lib/symbol.py
@@ -1,5 +1,3 @@
-#! /usr/bin/env python3
-
"""Non-terminal symbols of Python grammar (from "graminit.h")."""
# This file is automatically generated; please don't muck it up!
@@ -7,7 +5,11 @@
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
-# ./python Lib/symbol.py
+# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
+#
+# or just
+#
+# make regen-symbol
#--start constants--
single_input = 256
@@ -103,14 +105,4 @@ sym_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
sym_name[_value] = _name
-
-
-def _main():
- import sys
- import token
- if len(sys.argv) == 1:
- sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
- token._main()
-
-if __name__ == "__main__":
- _main()
+del _name, _value
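
A quick illustrative check (assuming an interpreter where the symbol module is still available) of the sym_name table the generated module builds in the hunk above:

    import symbol

    # sym_name maps grammar symbol numbers back to their names.
    assert symbol.single_input == 256
    assert symbol.sym_name[symbol.single_input] == 'single_input'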
diff --git a/Lib/test/test_symbol.py b/Lib/test/test_symbol.py
index c1306f5..ed86aec 100644
--- a/Lib/test/test_symbol.py
+++ b/Lib/test/test_symbol.py
@@ -6,6 +6,9 @@ import subprocess
SYMBOL_FILE = support.findfile('symbol.py')
+GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
+ '..', '..', 'Tools', 'scripts',
+ 'generate_symbol_py.py')
GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Include', 'graminit.h')
TEST_PY_FILE = 'symbol_test.py'
@@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase):
def _generate_symbols(self, grammar_file, target_symbol_py_file):
proc = subprocess.Popen([sys.executable,
- SYMBOL_FILE,
+ GEN_SYMBOL_FILE,
grammar_file,
target_symbol_py_file], stderr=subprocess.PIPE)
stderr = proc.communicate()[1]
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ff14479..04a1254 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase):
testfiles = random.sample(testfiles, 10)
for testfile in testfiles:
+ if support.verbose >= 2:
+ print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)
diff --git a/Lib/token.py b/Lib/token.py
index ba13205..5af7e6b 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -1,15 +1,8 @@
-"""Token constants (from "token.h")."""
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
-# This file is automatically generated; please don't muck it up!
-#
-# To update the symbols in this file, 'cd' to the top directory of
-# the python source tree after building the interpreter and run:
-#
-# ./python Lib/token.py
-
-#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
@@ -63,23 +56,70 @@ AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
-# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
OP = 53
-ERRORTOKEN = 54
# These aren't used by the C tokenizer but are needed for tokenize.py
+ERRORTOKEN = 54
COMMENT = 55
NL = 56
ENCODING = 57
N_TOKENS = 58
# Special definitions for cooperation with parser
NT_OFFSET = 256
-#--end constants--
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
+EXACT_TOKEN_TYPES = {
+ '!=': NOTEQUAL,
+ '%': PERCENT,
+ '%=': PERCENTEQUAL,
+ '&': AMPER,
+ '&=': AMPEREQUAL,
+ '(': LPAR,
+ ')': RPAR,
+ '*': STAR,
+ '**': DOUBLESTAR,
+ '**=': DOUBLESTAREQUAL,
+ '*=': STAREQUAL,
+ '+': PLUS,
+ '+=': PLUSEQUAL,
+ ',': COMMA,
+ '-': MINUS,
+ '-=': MINEQUAL,
+ '->': RARROW,
+ '.': DOT,
+ '...': ELLIPSIS,
+ '/': SLASH,
+ '//': DOUBLESLASH,
+ '//=': DOUBLESLASHEQUAL,
+ '/=': SLASHEQUAL,
+ ':': COLON,
+ ';': SEMI,
+ '<': LESS,
+ '<<': LEFTSHIFT,
+ '<<=': LEFTSHIFTEQUAL,
+ '<=': LESSEQUAL,
+ '=': EQUAL,
+ '==': EQEQUAL,
+ '>': GREATER,
+ '>=': GREATEREQUAL,
+ '>>': RIGHTSHIFT,
+ '>>=': RIGHTSHIFTEQUAL,
+ '@': AT,
+ '@=': ATEQUAL,
+ '[': LSQB,
+ ']': RSQB,
+ '^': CIRCUMFLEX,
+ '^=': CIRCUMFLEXEQUAL,
+ '{': LBRACE,
+ '|': VBAR,
+ '|=': VBAREQUAL,
+ '}': RBRACE,
+ '~': TILDE,
+}
+
def ISTERMINAL(x):
return x < NT_OFFSET
@@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
def ISEOF(x):
return x == ENDMARKER
-
-
-def _main():
- import re
- import sys
- args = sys.argv[1:]
- inFileName = args and args[0] or "Include/token.h"
- outFileName = "Lib/token.py"
- if len(args) > 1:
- outFileName = args[1]
- try:
- fp = open(inFileName)
- except OSError as err:
- sys.stdout.write("I/O error: %s\n" % str(err))
- sys.exit(1)
- with fp:
- lines = fp.read().split("\n")
- prog = re.compile(
- r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
- re.IGNORECASE)
- comment_regex = re.compile(
- r"^\s*/\*\s*(.+?)\s*\*/\s*$",
- re.IGNORECASE)
-
- tokens = {}
- prev_val = None
- for line in lines:
- match = prog.match(line)
- if match:
- name, val = match.group(1, 2)
- val = int(val)
- tokens[val] = {'token': name} # reverse so we can sort them...
- prev_val = val
- else:
- comment_match = comment_regex.match(line)
- if comment_match and prev_val is not None:
- comment = comment_match.group(1)
- tokens[prev_val]['comment'] = comment
- keys = sorted(tokens.keys())
- # load the output skeleton from the target:
- try:
- fp = open(outFileName)
- except OSError as err:
- sys.stderr.write("I/O error: %s\n" % str(err))
- sys.exit(2)
- with fp:
- format = fp.read().split("\n")
- try:
- start = format.index("#--start constants--") + 1
- end = format.index("#--end constants--")
- except ValueError:
- sys.stderr.write("target does not contain format markers")
- sys.exit(3)
- lines = []
- for key in keys:
- lines.append("%s = %d" % (tokens[key]["token"], key))
- if "comment" in tokens[key]:
- lines.append("# %s" % tokens[key]["comment"])
- format[start:end] = lines
- try:
- fp = open(outFileName, 'w')
- except OSError as err:
- sys.stderr.write("I/O error: %s\n" % str(err))
- sys.exit(4)
- with fp:
- fp.write("\n".join(format))
-
-
-if __name__ == "__main__":
- _main()
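
The EXACT_TOKEN_TYPES table added to token.py above is what lets tokenize report a specific operator type for generic OP tokens. A short usage sketch (tokenize.TokenInfo.exact_type is the existing stdlib property that consults this table):

    import io
    import token
    import tokenize

    for tok in tokenize.generate_tokens(io.StringIO('x **= 2\n').readline):
        if tok.type == token.OP:
            # exact_type narrows a generic OP token to its specific operator
            print(tok.string, token.tok_name[tok.exact_type])  # **= DOUBLESTAREQUAL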
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index fce010b..cf1ecc9 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
+from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
-EXACT_TOKEN_TYPES = {
- '(': LPAR,
- ')': RPAR,
- '[': LSQB,
- ']': RSQB,
- ':': COLON,
- ',': COMMA,
- ';': SEMI,
- '+': PLUS,
- '-': MINUS,
- '*': STAR,
- '/': SLASH,
- '|': VBAR,
- '&': AMPER,
- '<': LESS,
- '>': GREATER,
- '=': EQUAL,
- '.': DOT,
- '%': PERCENT,
- '{': LBRACE,
- '}': RBRACE,
- '==': EQEQUAL,
- '!=': NOTEQUAL,
- '<=': LESSEQUAL,
- '>=': GREATEREQUAL,
- '~': TILDE,
- '^': CIRCUMFLEX,
- '<<': LEFTSHIFT,
- '>>': RIGHTSHIFT,
- '**': DOUBLESTAR,
- '+=': PLUSEQUAL,
- '-=': MINEQUAL,
- '*=': STAREQUAL,
- '/=': SLASHEQUAL,
- '%=': PERCENTEQUAL,
- '&=': AMPEREQUAL,
- '|=': VBAREQUAL,
- '^=': CIRCUMFLEXEQUAL,
- '<<=': LEFTSHIFTEQUAL,
- '>>=': RIGHTSHIFTEQUAL,
- '**=': DOUBLESTAREQUAL,
- '//': DOUBLESLASH,
- '//=': DOUBLESLASHEQUAL,
- '...': ELLIPSIS,
- '->': RARROW,
- '@': AT,
- '@=': ATEQUAL,
-}
-
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
- r"//=?", r"->",
- r"[+\-*/%&@|^=<>]=?",
- r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
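
To see why the reverse sort in the new Special pattern keeps longer operators ahead of their prefixes, here is a standalone sketch; the group() helper below is assumed to mirror the '(…|…)' grouping tokenize uses:

    import re
    from token import EXACT_TOKEN_TYPES

    def group(*choices):
        # Same shape as tokenize's alternation helper: (a|b|...)
        return '(' + '|'.join(choices) + ')'

    # Reverse lexicographic order places every operator before any of its
    # prefixes (e.g. '**=' before '**' before '*').
    special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
    print(re.match(special, '**= 1').group())  # '**=', not just '*'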