summary | refs | log | tree | commit | diff | stats
path: root/Tools
diff options
context:
space:
mode:
Diffstat (limited to 'Tools')
-rw-r--r--  Tools/peg_generator/Makefile | 11
-rwxr-xr-x  Tools/peg_generator/pegen/__main__.py | 127
-rw-r--r--  Tools/peg_generator/pegen/build.py | 101
-rw-r--r--  Tools/peg_generator/pegen/c_generator.py | 37
-rw-r--r--  Tools/peg_generator/pegen/testutil.py | 10
-rwxr-xr-x  Tools/peg_generator/scripts/test_parse_directory.py | 17
6 files changed, 215 insertions, 88 deletions
diff --git a/Tools/peg_generator/Makefile b/Tools/peg_generator/Makefile
index fb67a21..a37cbfc 100644
--- a/Tools/peg_generator/Makefile
+++ b/Tools/peg_generator/Makefile
@@ -10,6 +10,7 @@ CPYTHON ?= ../../Lib
MYPY ?= mypy
GRAMMAR = ../../Grammar/python.gram
+TOKENS = ../../Grammar/Tokens
TESTFILE = data/cprog.py
TIMEFILE = data/xxl.py
TESTDIR = .
@@ -20,8 +21,8 @@ data/xxl.py:
build: peg_extension/parse.c
-peg_extension/parse.c: $(GRAMMAR) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen/pegen.c ../../Parser/pegen/parse_string.c ../../Parser/pegen/*.h pegen/grammar_parser.py
- $(PYTHON) -m pegen -q -c $(GRAMMAR) -o peg_extension/parse.c --compile-extension
+peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen/pegen.c ../../Parser/pegen/parse_string.c ../../Parser/pegen/*.h pegen/grammar_parser.py
+ $(PYTHON) -m pegen -q c $(GRAMMAR) $(TOKENS) -o peg_extension/parse.c --compile-extension
clean:
-rm -f peg_extension/*.o peg_extension/*.so peg_extension/parse.c
@@ -79,7 +80,8 @@ time_stdlib_parse: data/xxl.py
test_local:
$(PYTHON) scripts/test_parse_directory.py \
- -g $(GRAMMAR) \
+ --grammar-file $(GRAMMAR) \
+ --tokens-file $(TOKENS) \
-d $(TESTDIR) \
$(TESTFLAGS) \
--exclude "*/failset/*" \
@@ -88,7 +90,8 @@ test_local:
test_global: $(CPYTHON)
$(PYTHON) scripts/test_parse_directory.py \
- -g $(GRAMMAR) \
+ --grammar-file $(GRAMMAR) \
+ --tokens-file $(TOKENS) \
-d $(CPYTHON) \
$(TESTFLAGS) \
--exclude "*/test2to3/*" \
diff --git a/Tools/peg_generator/pegen/__main__.py b/Tools/peg_generator/pegen/__main__.py
index 6696d13..1dcbaad 100755
--- a/Tools/peg_generator/pegen/__main__.py
+++ b/Tools/peg_generator/pegen/__main__.py
@@ -11,6 +11,64 @@ import time
import token
import traceback
+from typing import Tuple
+
+from pegen.build import Grammar, Parser, Tokenizer, ParserGenerator
+
+
+def generate_c_code(
+ args: argparse.Namespace,
+) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
+ from pegen.build import build_c_parser_and_generator
+
+ verbose = args.verbose
+ verbose_tokenizer = verbose >= 3
+ verbose_parser = verbose == 2 or verbose >= 4
+ try:
+ grammar, parser, tokenizer, gen = build_c_parser_and_generator(
+ args.grammar_filename,
+ args.tokens_filename,
+ args.output,
+ args.compile_extension,
+ verbose_tokenizer,
+ verbose_parser,
+ args.verbose,
+ keep_asserts_in_extension=False if args.optimized else True,
+ skip_actions=args.skip_actions,
+ )
+ return grammar, parser, tokenizer, gen
+ except Exception as err:
+ if args.verbose:
+ raise # Show traceback
+ traceback.print_exception(err.__class__, err, None)
+ sys.stderr.write("For full traceback, use -v\n")
+ sys.exit(1)
+
+
+def generate_python_code(
+ args: argparse.Namespace,
+) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
+ from pegen.build import build_python_parser_and_generator
+
+ verbose = args.verbose
+ verbose_tokenizer = verbose >= 3
+ verbose_parser = verbose == 2 or verbose >= 4
+ try:
+ grammar, parser, tokenizer, gen = build_python_parser_and_generator(
+ args.grammar_filename,
+ args.output,
+ verbose_tokenizer,
+ verbose_parser,
+ skip_actions=args.skip_actions,
+ )
+ return grammar, parser, tokenizer, gen
+ except Exception as err:
+ if args.verbose:
+ raise # Show traceback
+ traceback.print_exception(err.__class__, err, None)
+ sys.stderr.write("For full traceback, use -v\n")
+ sys.exit(1)
+
argparser = argparse.ArgumentParser(
prog="pegen", description="Experimental PEG-like parser generator"
@@ -23,63 +81,52 @@ argparser.add_argument(
default=0,
help="Print timing stats; repeat for more debug output",
)
-argparser.add_argument(
- "-c", "--cpython", action="store_true", help="Generate C code for inclusion into CPython"
+subparsers = argparser.add_subparsers(help="target language for the generated code")
+
+c_parser = subparsers.add_parser("c", help="Generate C code for inclusion into CPython")
+c_parser.set_defaults(func=generate_c_code)
+c_parser.add_argument("grammar_filename", help="Grammar description")
+c_parser.add_argument("tokens_filename", help="Tokens description")
+c_parser.add_argument(
+ "-o", "--output", metavar="OUT", default="parse.c", help="Where to write the generated parser"
)
-argparser.add_argument(
+c_parser.add_argument(
"--compile-extension",
action="store_true",
help="Compile generated C code into an extension module",
)
-argparser.add_argument(
+c_parser.add_argument(
+ "--optimized", action="store_true", help="Compile the extension in optimized mode"
+)
+c_parser.add_argument(
+ "--skip-actions", action="store_true", help="Suppress code emission for rule actions",
+)
+
+python_parser = subparsers.add_parser("python", help="Generate Python code")
+python_parser.set_defaults(func=generate_python_code)
+python_parser.add_argument("grammar_filename", help="Grammar description")
+python_parser.add_argument(
"-o",
"--output",
metavar="OUT",
- help="Where to write the generated parser (default parse.py or parse.c)",
+ default="parse.py",
+ help="Where to write the generated parser",
)
-argparser.add_argument("filename", help="Grammar description")
-argparser.add_argument(
- "--optimized", action="store_true", help="Compile the extension in optimized mode"
-)
-argparser.add_argument(
+python_parser.add_argument(
"--skip-actions", action="store_true", help="Suppress code emission for rule actions",
)
def main() -> None:
- from pegen.build import build_parser_and_generator
from pegen.testutil import print_memstats
args = argparser.parse_args()
- verbose = args.verbose
- verbose_tokenizer = verbose >= 3
- verbose_parser = verbose == 2 or verbose >= 4
- t0 = time.time()
-
- output_file = args.output
- if not output_file:
- if args.cpython:
- output_file = "parse.c"
- else:
- output_file = "parse.py"
+ if "func" not in args:
+ argparser.error("Must specify the target language mode ('c' or 'python')")
- try:
- grammar, parser, tokenizer, gen = build_parser_and_generator(
- args.filename,
- output_file,
- args.compile_extension,
- verbose_tokenizer,
- verbose_parser,
- args.verbose,
- keep_asserts_in_extension=False if args.optimized else True,
- skip_actions=args.skip_actions,
- )
- except Exception as err:
- if args.verbose:
- raise # Show traceback
- traceback.print_exception(err.__class__, err, None)
- sys.stderr.write("For full traceback, use -v\n")
- sys.exit(1)
+ t0 = time.time()
+ grammar, parser, tokenizer, gen = args.func(args)
+ t1 = time.time()
if not args.quiet:
if args.verbose:
@@ -110,8 +157,6 @@ def main() -> None:
else:
print()
- t1 = time.time()
-
if args.verbose:
dt = t1 - t0
diag = tokenizer.diagnose()
diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py
index 0f5d73e..94248ff 100644
--- a/Tools/peg_generator/pegen/build.py
+++ b/Tools/peg_generator/pegen/build.py
@@ -3,8 +3,9 @@ import shutil
import tokenize
import sys
import sysconfig
+import itertools
-from typing import Optional, Tuple
+from typing import Optional, Tuple, List, IO, Iterator, Set, Dict
from pegen.c_generator import CParserGenerator
from pegen.grammar import Grammar
@@ -17,12 +18,12 @@ from pegen.tokenizer import Tokenizer
MOD_DIR = pathlib.Path(__file__).parent
-def get_extra_flags(compiler_flags, compiler_py_flags_nodist):
+def get_extra_flags(compiler_flags: str, compiler_py_flags_nodist: str) -> List[str]:
flags = sysconfig.get_config_var(compiler_flags)
py_flags_nodist = sysconfig.get_config_var(compiler_py_flags_nodist)
if flags is None or py_flags_nodist is None:
return []
- return f'{flags} {py_flags_nodist}'.split()
+ return f"{flags} {py_flags_nodist}".split()
def compile_c_extension(
@@ -45,15 +46,15 @@ def compile_c_extension(
from distutils.core import Distribution, Extension
from distutils.command.clean import clean # type: ignore
from distutils.command.build_ext import build_ext # type: ignore
- from distutils.tests.support import fixup_build_ext
+ from distutils.tests.support import fixup_build_ext # type: ignore
if verbose:
distutils.log.set_verbosity(distutils.log.DEBUG)
source_file_path = pathlib.Path(generated_source_path)
extension_name = source_file_path.stem
- extra_compile_args = get_extra_flags('CFLAGS', 'PY_CFLAGS_NODIST')
- extra_link_args = get_extra_flags('LDFLAGS', 'PY_LDFLAGS_NODIST')
+ extra_compile_args = get_extra_flags("CFLAGS", "PY_CFLAGS_NODIST")
+ extra_link_args = get_extra_flags("LDFLAGS", "PY_LDFLAGS_NODIST")
if keep_asserts:
extra_compile_args.append("-UNDEBUG")
extension = [
@@ -111,39 +112,69 @@ def build_parser(
return grammar, parser, tokenizer
-def build_generator(
- tokenizer: Tokenizer,
+def generate_token_definitions(tokens: IO[str]) -> Tuple[Dict[str, int], Set[str]]:
+ exact_tokens = {}
+ non_exact_tokens = set()
+ numbers = itertools.count(0)
+
+ for line in tokens:
+ line = line.strip()
+
+ if not line or line.startswith("#"):
+ continue
+
+ pieces = line.split()
+ index = next(numbers)
+
+ if len(pieces) == 1:
+ (token,) = pieces
+ non_exact_tokens.add(token)
+ elif len(pieces) == 2:
+ _, op = pieces
+ exact_tokens[op.strip("'")] = index
+ else:
+ raise ValueError(f"Unexpected line found in Tokens file: {line}")
+
+ return exact_tokens, non_exact_tokens
+
+
+def build_c_generator(
grammar: Grammar,
grammar_file: str,
+ tokens_file: str,
output_file: str,
compile_extension: bool = False,
verbose_c_extension: bool = False,
keep_asserts_in_extension: bool = True,
skip_actions: bool = False,
) -> ParserGenerator:
- # TODO: Allow other extensions; pass the output type as an argument.
- if not output_file.endswith((".c", ".py")):
- raise RuntimeError("Your output file must either be a .c or .py file")
+ with open(tokens_file, "r") as tok_file:
+ exact_tok, non_exact_tok = generate_token_definitions(tok_file)
with open(output_file, "w") as file:
- gen: ParserGenerator
- if output_file.endswith(".c"):
- gen = CParserGenerator(grammar, file, skip_actions=skip_actions)
- elif output_file.endswith(".py"):
- gen = PythonParserGenerator(grammar, file) # TODO: skip_actions
- else:
- assert False # Should have been checked above
+ gen: ParserGenerator = CParserGenerator(
+ grammar, exact_tok, non_exact_tok, file, skip_actions=skip_actions
+ )
gen.generate(grammar_file)
- if compile_extension and output_file.endswith(".c"):
+ if compile_extension:
compile_c_extension(
output_file, verbose=verbose_c_extension, keep_asserts=keep_asserts_in_extension
)
+ return gen
+
+def build_python_generator(
+ grammar: Grammar, grammar_file: str, output_file: str, skip_actions: bool = False,
+) -> ParserGenerator:
+ with open(output_file, "w") as file:
+ gen: ParserGenerator = PythonParserGenerator(grammar, file) # TODO: skip_actions
+ gen.generate(grammar_file)
return gen
-def build_parser_and_generator(
+def build_c_parser_and_generator(
grammar_file: str,
+ tokens_file: str,
output_file: str,
compile_extension: bool = False,
verbose_tokenizer: bool = False,
@@ -152,10 +183,11 @@ def build_parser_and_generator(
keep_asserts_in_extension: bool = True,
skip_actions: bool = False,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
- """Generate rules, parser, tokenizer, parser generator for a given grammar
+ """Generate rules, C parser, tokenizer, parser generator for a given grammar
Args:
grammar_file (string): Path for the grammar file
+ tokens_file (string): Path for the tokens file
output_file (string): Path for the output file
compile_extension (bool, optional): Whether to compile the C extension.
Defaults to False.
@@ -170,10 +202,10 @@ def build_parser_and_generator(
skip_actions (bool, optional): Whether to pretend no rule has any actions.
"""
grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
- gen = build_generator(
- tokenizer,
+ gen = build_c_generator(
grammar,
grammar_file,
+ tokens_file,
output_file,
compile_extension,
verbose_c_extension,
@@ -182,3 +214,26 @@ def build_parser_and_generator(
)
return grammar, parser, tokenizer, gen
+
+
+def build_python_parser_and_generator(
+ grammar_file: str,
+ output_file: str,
+ verbose_tokenizer: bool = False,
+ verbose_parser: bool = False,
+ skip_actions: bool = False,
+) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
+ """Generate rules, python parser, tokenizer, parser generator for a given grammar
+
+ Args:
+ grammar_file (string): Path for the grammar file
+ output_file (string): Path for the output file
+ verbose_tokenizer (bool, optional): Whether to display additional output
+ when generating the tokenizer. Defaults to False.
+ verbose_parser (bool, optional): Whether to display additional output
+ when generating the parser. Defaults to False.
+ skip_actions (bool, optional): Whether to pretend no rule has any actions.
+ """
+ grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
+ gen = build_python_generator(grammar, grammar_file, output_file, skip_actions=skip_actions,)
+ return grammar, parser, tokenizer, gen
diff --git a/Tools/peg_generator/pegen/c_generator.py b/Tools/peg_generator/pegen/c_generator.py
index 6c4b8f1..a01c309 100644
--- a/Tools/peg_generator/pegen/c_generator.py
+++ b/Tools/peg_generator/pegen/c_generator.py
@@ -1,6 +1,6 @@
import ast
import re
-from typing import Any, cast, Dict, IO, Optional, List, Text, Tuple
+from typing import Any, cast, Dict, IO, Optional, List, Text, Tuple, Set
from pegen.grammar import (
Cut,
@@ -22,7 +22,6 @@ from pegen.grammar import (
)
from pegen import grammar
from pegen.parser_generator import dedupe, ParserGenerator
-from pegen.tokenizer import exact_token_types
EXTENSION_PREFIX = """\
#include "pegen.h"
@@ -43,8 +42,15 @@ _PyPegen_parse(Parser *p)
class CCallMakerVisitor(GrammarVisitor):
- def __init__(self, parser_generator: ParserGenerator):
+ def __init__(
+ self,
+ parser_generator: ParserGenerator,
+ exact_tokens: Dict[str, int],
+ non_exact_tokens: Set[str],
+ ):
self.gen = parser_generator
+ self.exact_tokens = exact_tokens
+ self.non_exact_tokens = non_exact_tokens
self.cache: Dict[Any, Any] = {}
self.keyword_cache: Dict[str, int] = {}
@@ -55,10 +61,7 @@ class CCallMakerVisitor(GrammarVisitor):
def visit_NameLeaf(self, node: NameLeaf) -> Tuple[str, str]:
name = node.value
- if name in ("NAME", "NUMBER", "STRING"):
- name = name.lower()
- return f"{name}_var", f"_PyPegen_{name}_token(p)"
- if name in ("NEWLINE", "DEDENT", "INDENT", "ENDMARKER", "ASYNC", "AWAIT"):
+ if name in self.non_exact_tokens:
name = name.lower()
return f"{name}_var", f"_PyPegen_{name}_token(p)"
return f"{name}_var", f"{name}_rule(p)"
@@ -68,12 +71,12 @@ class CCallMakerVisitor(GrammarVisitor):
if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword
return self.keyword_helper(val)
else:
- assert val in exact_token_types, f"{node.value} is not a known literal"
- type = exact_token_types[val]
+ assert val in self.exact_tokens, f"{node.value} is not a known literal"
+ type = self.exact_tokens[val]
return "literal", f"_PyPegen_expect_token(p, {type})"
def visit_Rhs(self, node: Rhs) -> Tuple[Optional[str], str]:
- def can_we_inline(node):
+ def can_we_inline(node: Rhs) -> int:
if len(node.alts) != 1 or len(node.alts[0].items) != 1:
return False
# If the alternative has an action we cannot inline
@@ -152,12 +155,16 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
def __init__(
self,
grammar: grammar.Grammar,
+ exact_tokens: Dict[str, int],
+ non_exact_tokens: Set[str],
file: Optional[IO[Text]],
debug: bool = False,
skip_actions: bool = False,
):
super().__init__(grammar, file)
- self.callmakervisitor: CCallMakerVisitor = CCallMakerVisitor(self)
+ self.callmakervisitor: CCallMakerVisitor = CCallMakerVisitor(
+ self, exact_tokens, non_exact_tokens
+ )
self._varname_counter = 0
self.debug = debug
self.skip_actions = skip_actions
@@ -184,7 +191,11 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
self.print(f"}}")
def out_of_memory_return(
- self, expr: str, returnval: str, message: str = "Parser out of memory", cleanup_code=None
+ self,
+ expr: str,
+ returnval: str,
+ message: str = "Parser out of memory",
+ cleanup_code: Optional[str] = None,
) -> None:
self.print(f"if ({expr}) {{")
with self.indent():
@@ -465,7 +476,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
self.visit(item, names=names)
self.print(")")
- def emit_action(self, node: Alt, cleanup_code=None) -> None:
+ def emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None:
self.print(f"res = {node.action};")
self.print("if (res == NULL && PyErr_Occurred()) {")
diff --git a/Tools/peg_generator/pegen/testutil.py b/Tools/peg_generator/pegen/testutil.py
index 5a91862..1f79d8f 100644
--- a/Tools/peg_generator/pegen/testutil.py
+++ b/Tools/peg_generator/pegen/testutil.py
@@ -5,6 +5,7 @@ import pathlib
import sys
import textwrap
import tokenize
+import token
from typing import Any, cast, Dict, IO, Type, Final
@@ -16,6 +17,11 @@ from pegen.parser import Parser
from pegen.python_generator import PythonParserGenerator
from pegen.tokenizer import Tokenizer
+EXACT_TOKENS = token.EXACT_TOKEN_TYPES # type: ignore
+NON_EXACT_TOKENS = {
+ name for index, name in token.tok_name.items() if index not in EXACT_TOKENS.values()
+}
+
def generate_parser(grammar: Grammar) -> Type[Parser]:
# Generate a parser.
@@ -70,7 +76,7 @@ def import_file(full_name: str, path: str) -> Any:
def generate_c_parser_source(grammar: Grammar) -> str:
out = io.StringIO()
- genr = CParserGenerator(grammar, out)
+ genr = CParserGenerator(grammar, EXACT_TOKENS, NON_EXACT_TOKENS, out)
genr.generate("<string>")
return out.getvalue()
@@ -90,7 +96,7 @@ def generate_parser_c_extension(
assert not os.listdir(path)
source = path / "parse.c"
with open(source, "w") as file:
- genr = CParserGenerator(grammar, file, debug=debug)
+ genr = CParserGenerator(grammar, EXACT_TOKENS, NON_EXACT_TOKENS, file, debug=debug)
genr.generate("parse.c")
compile_c_extension(str(source), build_dir=str(path))
diff --git a/Tools/peg_generator/scripts/test_parse_directory.py b/Tools/peg_generator/scripts/test_parse_directory.py
index 06a38fc..6511a2d 100755
--- a/Tools/peg_generator/scripts/test_parse_directory.py
+++ b/Tools/peg_generator/scripts/test_parse_directory.py
@@ -13,7 +13,7 @@ from pathlib import PurePath
from typing import List, Optional, Any
sys.path.insert(0, os.getcwd())
-from pegen.build import build_parser_and_generator
+from pegen.build import build_c_parser_and_generator
from pegen.testutil import print_memstats
from scripts import show_parse
@@ -26,7 +26,8 @@ argparser = argparse.ArgumentParser(
description="Helper program to test directories or files for pegen",
)
argparser.add_argument("-d", "--directory", help="Directory path containing files to test")
-argparser.add_argument("-g", "--grammar-file", help="Grammar file path")
+argparser.add_argument("--grammar-file", help="Grammar file path")
+argparser.add_argument("--tokens-file", help="Tokens file path")
argparser.add_argument(
"-e", "--exclude", action="append", default=[], help="Glob(s) for matching files to exclude"
)
@@ -114,6 +115,7 @@ def compare_trees(
def parse_directory(
directory: str,
grammar_file: str,
+ tokens_file: str,
verbose: bool,
excluded_files: List[str],
skip_actions: bool,
@@ -131,15 +133,16 @@ def parse_directory(
print("You must specify a directory of files to test.", file=sys.stderr)
return 1
- if grammar_file:
+ if grammar_file and tokens_file:
if not os.path.exists(grammar_file):
print(f"The specified grammar file, {grammar_file}, does not exist.", file=sys.stderr)
return 1
try:
if not extension and parser == "pegen":
- build_parser_and_generator(
+ build_c_parser_and_generator(
grammar_file,
+ tokens_file,
"peg_extension/parse.c",
compile_extension=True,
skip_actions=skip_actions,
@@ -154,7 +157,9 @@ def parse_directory(
return 1
else:
- print("A grammar file was not provided - attempting to use existing file...\n")
+ print(
+ "A grammar file or a tokens file was not provided - attempting to use existing parser from stdlib...\n"
+ )
if parser == "pegen":
try:
@@ -264,6 +269,7 @@ def main() -> None:
args = argparser.parse_args()
directory = args.directory
grammar_file = args.grammar_file
+ tokens_file = args.tokens_file
verbose = args.verbose
excluded_files = args.exclude
skip_actions = args.skip_actions
@@ -273,6 +279,7 @@ def main() -> None:
parse_directory(
directory,
grammar_file,
+ tokens_file,
verbose,
excluded_files,
skip_actions,