diff options
author | Pablo Galindo <Pablogsal@gmail.com> | 2020-05-10 04:34:50 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-05-10 04:34:50 (GMT) |
commit | ac7a92cc0a821699df48bc2e30a02c25d6338f78 (patch) | |
tree | d7f301b66d79b6d3ce497669a99a5f9ce47ba3b5 /Tools | |
parent | 2c3d508c5fabe40dac848fb9ae558069f0576879 (diff) | |
download | cpython-ac7a92cc0a821699df48bc2e30a02c25d6338f78.zip cpython-ac7a92cc0a821699df48bc2e30a02c25d6338f78.tar.gz cpython-ac7a92cc0a821699df48bc2e30a02c25d6338f78.tar.bz2 |
bpo-40334: Avoid collisions between parser variables and grammar variables (GH-19987)
This is for the C generator:
- Disallow rule and variable names starting with `_`
- Rename most local variable names generated by the parser to start with `_`
Exceptions:
- Renaming `p` to `_p` will be a separate PR
- There are still some names that might clash, e.g.:
- anything starting with `Py`
- C reserved words (`if` etc.)
- Macros like `EXTRA` and `CHECK`
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/peg_generator/pegen/c_generator.py | 160
-rw-r--r-- | Tools/peg_generator/pegen/parser_generator.py | 11 |
2 files changed, 93 insertions, 78 deletions
diff --git a/Tools/peg_generator/pegen/c_generator.py b/Tools/peg_generator/pegen/c_generator.py index b7a9942..6c9aa3f 100644 --- a/Tools/peg_generator/pegen/c_generator.py +++ b/Tools/peg_generator/pegen/c_generator.py @@ -1,5 +1,5 @@ import ast -from dataclasses import dataclass, field +from dataclasses import dataclass import re from typing import Any, Dict, IO, Optional, List, Text, Tuple, Set from enum import Enum @@ -101,7 +101,7 @@ class CCallMakerVisitor(GrammarVisitor): if keyword not in self.keyword_cache: self.keyword_cache[keyword] = self.gen.keyword_type() return FunctionCall( - assigned_variable="keyword", + assigned_variable="_keyword", function="_PyPegen_expect_token", arguments=["p", self.keyword_cache[keyword]], return_type="Token *", @@ -140,7 +140,7 @@ class CCallMakerVisitor(GrammarVisitor): function=f"{name}_rule", arguments=["p"], return_type=type, - comment=f"{node}" + comment=f"{node}", ) def visit_StringLeaf(self, node: StringLeaf) -> FunctionCall: @@ -151,7 +151,7 @@ class CCallMakerVisitor(GrammarVisitor): assert val in self.exact_tokens, f"{node.value} is not a known literal" type = self.exact_tokens[val] return FunctionCall( - assigned_variable="literal", + assigned_variable="_literal", function=f"_PyPegen_expect_token", arguments=["p", type], nodetype=NodeTypes.GENERIC_TOKEN, @@ -175,8 +175,10 @@ class CCallMakerVisitor(GrammarVisitor): else: name = self.gen.name_node(node) self.cache[node] = FunctionCall( - assigned_variable=f"{name}_var", function=f"{name}_rule", arguments=["p"], - comment=f"{node}" + assigned_variable=f"{name}_var", + function=f"{name}_rule", + arguments=["p"], + comment=f"{node}", ) return self.cache[node] @@ -217,11 +219,11 @@ class CCallMakerVisitor(GrammarVisitor): def visit_Opt(self, node: Opt) -> FunctionCall: call = self.visit(node.node) return FunctionCall( - assigned_variable="opt_var", + assigned_variable="_opt_var", function=call.function, arguments=call.arguments, force_true=True, - comment=f"{node}" + 
comment=f"{node}", ) def visit_Repeat0(self, node: Repeat0) -> FunctionCall: @@ -268,7 +270,7 @@ class CCallMakerVisitor(GrammarVisitor): def visit_Cut(self, node: Cut) -> FunctionCall: return FunctionCall( - assigned_variable="cut_var", + assigned_variable="_cut_var", return_type="int", function="1", nodetype=NodeTypes.CUT_OPERATOR, @@ -418,46 +420,46 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.print("p->error_indicator = 1;") self.print("return NULL;") self.print("}") - self.print("int start_lineno = p->tokens[mark]->lineno;") - self.print("UNUSED(start_lineno); // Only used by EXTRA macro") - self.print("int start_col_offset = p->tokens[mark]->col_offset;") - self.print("UNUSED(start_col_offset); // Only used by EXTRA macro") + self.print("int _start_lineno = p->tokens[_mark]->lineno;") + self.print("UNUSED(_start_lineno); // Only used by EXTRA macro") + self.print("int _start_col_offset = p->tokens[_mark]->col_offset;") + self.print("UNUSED(_start_col_offset); // Only used by EXTRA macro") def _set_up_token_end_metadata_extraction(self) -> None: - self.print("Token *token = _PyPegen_get_last_nonnwhitespace_token(p);") - self.print("if (token == NULL) {") + self.print("Token *_token = _PyPegen_get_last_nonnwhitespace_token(p);") + self.print("if (_token == NULL) {") with self.indent(): self.print("return NULL;") self.print("}") - self.print(f"int end_lineno = token->end_lineno;") - self.print("UNUSED(end_lineno); // Only used by EXTRA macro") - self.print(f"int end_col_offset = token->end_col_offset;") - self.print("UNUSED(end_col_offset); // Only used by EXTRA macro") + self.print("int _end_lineno = _token->end_lineno;") + self.print("UNUSED(_end_lineno); // Only used by EXTRA macro") + self.print("int _end_col_offset = _token->end_col_offset;") + self.print("UNUSED(_end_col_offset); // Only used by EXTRA macro") def _set_up_rule_memoization(self, node: Rule, result_type: str) -> None: self.print("{") with self.indent(): - 
self.print(f"{result_type} res = NULL;") - self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &res))") + self.print(f"{result_type} _res = NULL;") + self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res))") with self.indent(): - self.print("return res;") - self.print("int mark = p->mark;") - self.print("int resmark = p->mark;") + self.print("return _res;") + self.print("int _mark = p->mark;") + self.print("int _resmark = p->mark;") self.print("while (1) {") with self.indent(): self.call_with_errorcheck_return( - f"_PyPegen_update_memo(p, mark, {node.name}_type, res)", "res" + f"_PyPegen_update_memo(p, _mark, {node.name}_type, _res)", "_res" ) - self.print("p->mark = mark;") - self.print(f"void *raw = {node.name}_raw(p);") - self.print("if (raw == NULL || p->mark <= resmark)") + self.print("p->mark = _mark;") + self.print(f"void *_raw = {node.name}_raw(p);") + self.print("if (_raw == NULL || p->mark <= _resmark)") with self.indent(): self.print("break;") - self.print("resmark = p->mark;") - self.print("res = raw;") + self.print(f"_resmark = p->mark;") + self.print("_res = _raw;") self.print("}") - self.print("p->mark = resmark;") - self.print("return res;") + self.print(f"p->mark = _resmark;") + self.print("return _res;") self.print("}") self.print(f"static {result_type}") self.print(f"{node.name}_raw(Parser *p)") @@ -473,12 +475,12 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): with self.indent(): self.print("return NULL;") self.print("}") - self.print(f"{result_type} res = NULL;") + self.print(f"{result_type} _res = NULL;") if memoize: - self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &res))") + self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res))") with self.indent(): - self.print("return res;") - self.print("int mark = p->mark;") + self.print("return _res;") + self.print("int _mark = p->mark;") if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): self._set_up_token_start_metadata_extraction() 
self.visit( @@ -488,13 +490,13 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): rulename=node.name if memoize else None, ) if self.debug: - self.print(f'fprintf(stderr, "Fail at %d: {node.name}\\n", p->mark);') - self.print("res = NULL;") + self.print('fprintf(stderr, "Fail at %d: {node.name}\\n", p->mark);') + self.print("_res = NULL;") self.print(" done:") with self.indent(): if memoize: - self.print(f"_PyPegen_insert_memo(p, mark, {node.name}_type, res);") - self.print("return res;") + self.print(f"_PyPegen_insert_memo(p, _mark, {node.name}_type, _res);") + self.print("return _res;") def _handle_loop_rule_body(self, node: Rule, rhs: Rhs) -> None: memoize = self._should_memoize(node) @@ -505,17 +507,17 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): with self.indent(): self.print("return NULL;") self.print("}") - self.print(f"void *res = NULL;") + self.print("void *_res = NULL;") if memoize: - self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &res))") + self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res))") with self.indent(): - self.print("return res;") - self.print("int mark = p->mark;") - self.print("int start_mark = p->mark;") - self.print("void **children = PyMem_Malloc(sizeof(void *));") - self.out_of_memory_return(f"!children", "NULL") - self.print("ssize_t children_capacity = 1;") - self.print("ssize_t n = 0;") + self.print("return _res;") + self.print("int _mark = p->mark;") + self.print("int _start_mark = p->mark;") + self.print("void **_children = PyMem_Malloc(sizeof(void *));") + self.out_of_memory_return(f"!_children", "NULL") + self.print("ssize_t _children_capacity = 1;") + self.print("ssize_t _n = 0;") if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): self._set_up_token_start_metadata_extraction() self.visit( @@ -525,23 +527,23 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): rulename=node.name if memoize else None, ) if is_repeat1: - self.print("if (n == 0 || 
p->error_indicator) {") + self.print("if (_n == 0 || p->error_indicator) {") with self.indent(): - self.print("PyMem_Free(children);") + self.print("PyMem_Free(_children);") self.print("return NULL;") self.print("}") - self.print("asdl_seq *seq = _Py_asdl_seq_new(n, p->arena);") + self.print("asdl_seq *_seq = _Py_asdl_seq_new(_n, p->arena);") self.out_of_memory_return( - f"!seq", + "!_seq", "NULL", message=f"asdl_seq_new {node.name}", - cleanup_code="PyMem_Free(children);", + cleanup_code="PyMem_Free(_children);", ) - self.print("for (int i = 0; i < n; i++) asdl_seq_SET(seq, i, children[i]);") - self.print("PyMem_Free(children);") + self.print("for (int i = 0; i < _n; i++) asdl_seq_SET(_seq, i, _children[i]);") + self.print("PyMem_Free(_children);") if node.name: - self.print(f"_PyPegen_insert_memo(p, start_mark, {node.name}_type, seq);") - self.print("return seq;") + self.print(f"_PyPegen_insert_memo(p, _start_mark, {node.name}_type, _seq);") + self.print("return _seq;") def visit_Rule(self, node: Rule) -> None: is_loop = node.is_loop() @@ -599,9 +601,9 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.print(")") def emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None: - self.print(f"res = {node.action};") + self.print(f"_res = {node.action};") - self.print("if (res == NULL && PyErr_Occurred()) {") + self.print("if (_res == NULL && PyErr_Occurred()) {") with self.indent(): self.print("p->error_indicator = 1;") if cleanup_code: @@ -611,7 +613,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): if self.debug: self.print( - f'fprintf(stderr, "Hit with action [%d-%d]: %s\\n", mark, p->mark, "{node}");' + f'fprintf(stderr, "Hit with action [%d-%d]: %s\\n", _mark, p->mark, "{node}");' ) def emit_default_action(self, is_gather: bool, node: Alt) -> None: @@ -619,7 +621,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): if is_gather: assert len(self.local_variable_names) == 2 self.print( - f"res = 
_PyPegen_seq_insert_in_front(p, " + f"_res = _PyPegen_seq_insert_in_front(p, " f"{self.local_variable_names[0]}, {self.local_variable_names[1]});" ) else: @@ -628,17 +630,17 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): f'fprintf(stderr, "Hit without action [%d:%d]: %s\\n", mark, p->mark, "{node}");' ) self.print( - f"res = _PyPegen_dummy_name(p, {', '.join(self.local_variable_names)});" + f"_res = _PyPegen_dummy_name(p, {', '.join(self.local_variable_names)});" ) else: if self.debug: self.print( f'fprintf(stderr, "Hit with default action [%d:%d]: %s\\n", mark, p->mark, "{node}");' ) - self.print(f"res = {self.local_variable_names[0]};") + self.print(f"_res = {self.local_variable_names[0]};") def emit_dummy_action(self) -> None: - self.print(f"res = _PyPegen_dummy_name(p);") + self.print("_res = _PyPegen_dummy_name(p);") def handle_alt_normal(self, node: Alt, is_gather: bool) -> None: self.join_conditions(keyword="if", node=node) @@ -671,20 +673,22 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): if self.skip_actions: self.emit_dummy_action() elif node.action: - self.emit_action(node, cleanup_code="PyMem_Free(children);") + self.emit_action(node, cleanup_code="PyMem_Free(_children);") else: self.emit_default_action(is_gather, node) # Add the result of rule to the temporary buffer of children. This buffer # will populate later an asdl_seq with all elements to return. 
- self.print("if (n == children_capacity) {") + self.print("if (_n == _children_capacity) {") with self.indent(): - self.print("children_capacity *= 2;") - self.print("children = PyMem_Realloc(children, children_capacity*sizeof(void *));") - self.out_of_memory_return(f"!children", "NULL", message=f"realloc {rulename}") + self.print("_children_capacity *= 2;") + self.print( + "_children = PyMem_Realloc(_children, _children_capacity*sizeof(void *));" + ) + self.out_of_memory_return(f"!_children", "NULL", message=f"realloc {rulename}") self.print("}") - self.print(f"children[n++] = res;") - self.print("mark = p->mark;") + self.print("_children[_n++] = _res;") + self.print("_mark = p->mark;") self.print("}") def visit_Alt( @@ -699,11 +703,11 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): var_type = "void *" else: var_type += " " - if v == "cut_var": + if v == "_cut_var": v += " = 0" # cut_var must be initialized self.print(f"{var_type}{v};") - if v == "opt_var": - self.print("UNUSED(opt_var); // Silence compiler warnings") + if v == "_opt_var": + self.print("UNUSED(_opt_var); // Silence compiler warnings") with self.local_variable_context(): if is_loop: @@ -711,9 +715,9 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): else: self.handle_alt_normal(node, is_gather) - self.print("p->mark = mark;") - if "cut_var" in vars: - self.print("if (cut_var) return NULL;") + self.print("p->mark = _mark;") + if "_cut_var" in vars: + self.print("if (_cut_var) return NULL;") self.print("}") def collect_vars(self, node: Alt) -> Dict[Optional[str], Optional[str]]: diff --git a/Tools/peg_generator/pegen/parser_generator.py b/Tools/peg_generator/pegen/parser_generator.py index 0345251..689022b 100644 --- a/Tools/peg_generator/pegen/parser_generator.py +++ b/Tools/peg_generator/pegen/parser_generator.py @@ -27,6 +27,11 @@ class RuleCheckingVisitor(GrammarVisitor): # TODO: Add line/col info to (leaf) nodes raise GrammarError(f"Dangling reference to rule 
{node.value!r}") + def visit_NamedItem(self, node: NameLeaf) -> None: + if node.name and node.name.startswith("_"): + raise GrammarError(f"Variable names cannot start with underscore: '{node.name}'") + self.visit(node.item) + class ParserGenerator: @@ -36,6 +41,7 @@ class ParserGenerator: self.grammar = grammar self.tokens = tokens self.rules = grammar.rules + self.validate_rule_names() if "trailer" not in grammar.metas and "start" not in self.rules: raise GrammarError("Grammar without a trailer must have a 'start' rule") checker = RuleCheckingVisitor(self.rules, self.tokens) @@ -51,6 +57,11 @@ class ParserGenerator: self.all_rules: Dict[str, Rule] = {} # Rules + temporal rules self._local_variable_stack: List[List[str]] = [] + def validate_rule_names(self): + for rule in self.rules: + if rule.startswith("_"): + raise GrammarError(f"Rule names cannot start with underscore: '{rule}'") + @contextlib.contextmanager def local_variable_context(self) -> Iterator[None]: self._local_variable_stack.append([]) |