From dd1c638b92ae1552207451c82ed95aa2c1f07201 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D)" Date: Thu, 8 Sep 2016 00:40:07 +0000 Subject: lib2to3.pgen3.driver.load_grammar() now creates a stable cache file between runs given the same Grammar.txt input regardless of the hash randomization setting. --- Lib/lib2to3/pgen2/driver.py | 15 +++++---- Lib/lib2to3/pgen2/grammar.py | 28 ++++++++++++++-- Lib/lib2to3/pgen2/pgen.py | 8 ++--- Lib/lib2to3/tests/support.py | 6 ++-- Lib/lib2to3/tests/test_parser.py | 72 ++++++++++++++++++++++++++++++++++++++-- Misc/NEWS | 4 +++ 6 files changed, 115 insertions(+), 18 deletions(-) diff --git a/Lib/lib2to3/pgen2/driver.py b/Lib/lib2to3/pgen2/driver.py index 3ccc69d..a27b9cb 100644 --- a/Lib/lib2to3/pgen2/driver.py +++ b/Lib/lib2to3/pgen2/driver.py @@ -106,16 +106,19 @@ class Driver(object): return self.parse_tokens(tokens, debug) +def _generate_pickle_name(gt): + head, tail = os.path.splitext(gt) + if tail == ".txt": + tail = "" + return head + tail + ".".join(map(str, sys.version_info)) + ".pickle" + + def load_grammar(gt="Grammar.txt", gp=None, save=True, force=False, logger=None): """Load the grammar (maybe from a pickle).""" if logger is None: logger = logging.getLogger() - if gp is None: - head, tail = os.path.splitext(gt) - if tail == ".txt": - tail = "" - gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle" + gp = _generate_pickle_name(gt) if gp is None else gp if force or not _newer(gp, gt): logger.info("Generating grammar tables from %s", gt) g = pgen.generate_grammar(gt) @@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None, try: g.dump(gp) except OSError as e: - logger.info("Writing failed:"+str(e)) + logger.info("Writing failed: %s", e) else: g = grammar.Grammar() g.load(gp) diff --git a/Lib/lib2to3/pgen2/grammar.py b/Lib/lib2to3/pgen2/grammar.py index b4481d1..52cdbc0 100644 --- a/Lib/lib2to3/pgen2/grammar.py +++ b/Lib/lib2to3/pgen2/grammar.py @@ -13,6 +13,7 @@ 
fallback token code OP, but the parser needs the actual token code. """ # Python imports +import collections import pickle # Local imports @@ -85,9 +86,21 @@ class Grammar(object): self.start = 256 def dump(self, filename): - """Dump the grammar tables to a pickle file.""" + """Dump the grammar tables to a pickle file. + + dump() recursively changes all dict to OrderedDict, so the pickled file + is not exactly the same as what was passed in to dump(). load() uses the + pickled file to create the tables, but only changes OrderedDict to dict + at the top level; it does not recursively change OrderedDict to dict. + So, the loaded tables are different from the original tables that were + passed to dump() in that some of the OrderedDict (from the pickled file) + are not changed back to dict. For parsing, this has no effect on + performance because OrderedDict uses dict's __getitem__ with nothing in + between. + """ with open(filename, "wb") as f: - pickle.dump(self.__dict__, f, 2) + d = _make_deterministic(self.__dict__) + pickle.dump(d, f, 2) def load(self, filename): """Load the grammar tables from a pickle file.""" @@ -124,6 +137,17 @@ class Grammar(object): print("start", self.start) +def _make_deterministic(top): + if isinstance(top, dict): + return collections.OrderedDict( + sorted(((k, _make_deterministic(v)) for k, v in top.items()))) + if isinstance(top, list): + return [_make_deterministic(e) for e in top] + if isinstance(top, tuple): + return tuple(_make_deterministic(e) for e in top) + return top + + # Map from operator to number (since tokenize doesn't do this) opmap_raw = """ diff --git a/Lib/lib2to3/pgen2/pgen.py b/Lib/lib2to3/pgen2/pgen.py index 2c51eef..b0cbd16 100644 --- a/Lib/lib2to3/pgen2/pgen.py +++ b/Lib/lib2to3/pgen2/pgen.py @@ -39,7 +39,7 @@ class ParserGenerator(object): states = [] for state in dfa: arcs = [] - for label, next in state.arcs.items(): + for label, next in sorted(state.arcs.items()): arcs.append((self.make_label(c, label), 
dfa.index(next))) if state.isfinal: arcs.append((0, dfa.index(state))) @@ -52,7 +52,7 @@ class ParserGenerator(object): def make_first(self, c, name): rawfirst = self.first[name] first = {} - for label in rawfirst: + for label in sorted(rawfirst): ilabel = self.make_label(c, label) ##assert ilabel not in first # XXX failed on <> ... != first[ilabel] = 1 @@ -192,7 +192,7 @@ class ParserGenerator(object): for label, next in nfastate.arcs: if label is not None: addclosure(next, arcs.setdefault(label, {})) - for label, nfaset in arcs.items(): + for label, nfaset in sorted(arcs.items()): for st in states: if st.nfaset == nfaset: break @@ -222,7 +222,7 @@ class ParserGenerator(object): print("Dump of DFA for", name) for i, state in enumerate(dfa): print(" State", i, state.isfinal and "(final)" or "") - for label, next in state.arcs.items(): + for label, next in sorted(state.arcs.items()): print(" %s -> %d" % (label, dfa.index(next))) def simplify_dfa(self, dfa): diff --git a/Lib/lib2to3/tests/support.py b/Lib/lib2to3/tests/support.py index 6f2d214..0897177 100644 --- a/Lib/lib2to3/tests/support.py +++ b/Lib/lib2to3/tests/support.py @@ -11,13 +11,13 @@ from textwrap import dedent # Local imports from lib2to3 import pytree, refactor -from lib2to3.pgen2 import driver +from lib2to3.pgen2 import driver as pgen2_driver test_dir = os.path.dirname(__file__) proj_dir = os.path.normpath(os.path.join(test_dir, "..")) grammar_path = os.path.join(test_dir, "..", "Grammar.txt") -grammar = driver.load_grammar(grammar_path) -driver = driver.Driver(grammar, convert=pytree.convert) +grammar = pgen2_driver.load_grammar(grammar_path) +driver = pgen2_driver.Driver(grammar, convert=pytree.convert) def parse_string(string): return driver.parse_string(reformat(string), debug=True) diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py index b533c01..0b2ca8b 100644 --- a/Lib/lib2to3/tests/test_parser.py +++ b/Lib/lib2to3/tests/test_parser.py @@ -6,8 +6,6 @@ parts of the 
grammar we've changed, we also make sure we can parse the test_grammar.py files from both Python 2 and Python 3. """ -from __future__ import with_statement - # Testing imports from . import support from .support import driver, test_dir @@ -15,12 +13,15 @@ from test.support import verbose # Python imports import os +import shutil +import subprocess import sys +import tempfile import unittest import warnings -import subprocess # Local imports +from lib2to3.pgen2 import driver as pgen2_driver from lib2to3.pgen2 import tokenize from ..pgen2.parse import ParseError from lib2to3.pygram import python_symbols as syms @@ -35,6 +36,71 @@ class TestDriver(support.TestCase): self.assertEqual(t.children[1].children[0].type, syms.print_stmt) +class TestPgen2Caching(support.TestCase): + def test_load_grammar_from_txt_file(self): + pgen2_driver.load_grammar(support.grammar_path, save=False, force=True) + + def test_load_grammar_from_pickle(self): + # Make a copy of the grammar file in a temp directory we are + # guaranteed to be able to write to. + tmpdir = tempfile.mkdtemp() + try: + grammar_copy = os.path.join( + tmpdir, os.path.basename(support.grammar_path)) + shutil.copy(support.grammar_path, grammar_copy) + pickle_name = pgen2_driver._generate_pickle_name(grammar_copy) + + pgen2_driver.load_grammar(grammar_copy, save=True, force=True) + self.assertTrue(os.path.exists(pickle_name)) + + os.unlink(grammar_copy) # Only the pickle remains... 
+ pgen2_driver.load_grammar(grammar_copy, save=False, force=False) + finally: + shutil.rmtree(tmpdir) + + @unittest.skipIf(sys.executable is None, 'sys.executable required') + def test_load_grammar_from_subprocess(self): + tmpdir = tempfile.mkdtemp() + tmpsubdir = os.path.join(tmpdir, 'subdir') + try: + os.mkdir(tmpsubdir) + grammar_base = os.path.basename(support.grammar_path) + grammar_copy = os.path.join(tmpdir, grammar_base) + grammar_sub_copy = os.path.join(tmpsubdir, grammar_base) + shutil.copy(support.grammar_path, grammar_copy) + shutil.copy(support.grammar_path, grammar_sub_copy) + pickle_name = pgen2_driver._generate_pickle_name(grammar_copy) + pickle_sub_name = pgen2_driver._generate_pickle_name( + grammar_sub_copy) + self.assertNotEqual(pickle_name, pickle_sub_name) + + # Generate a pickle file from this process. + pgen2_driver.load_grammar(grammar_copy, save=True, force=True) + self.assertTrue(os.path.exists(pickle_name)) + + # Generate a new pickle file in a subprocess with a most likely + # different hash randomization seed. 
+ sub_env = dict(os.environ) + sub_env['PYTHONHASHSEED'] = 'random' + subprocess.check_call( + [sys.executable, '-c', """ +from lib2to3.pgen2 import driver as pgen2_driver +pgen2_driver.load_grammar(%r, save=True, force=True) + """ % (grammar_sub_copy,)], + env=sub_env) + self.assertTrue(os.path.exists(pickle_sub_name)) + + with open(pickle_name, 'rb') as pickle_f_1, \ + open(pickle_sub_name, 'rb') as pickle_f_2: + self.assertEqual( + pickle_f_1.read(), pickle_f_2.read(), + msg='Grammar caches generated using different hash seeds' + ' were not identical.') + finally: + shutil.rmtree(tmpdir) + + + class GrammarTest(support.TestCase): def validate(self, code): support.parse_string(code) diff --git a/Misc/NEWS b/Misc/NEWS index 9398b8d..e4991c9 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -67,6 +67,10 @@ Core and Builtins Library ------- +- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file + between runs given the same Grammar.txt input regardless of the hash + randomization setting. + - Issue #27570: Avoid zero-length memcpy() etc calls with null source pointers in the "ctypes" and "array" modules. -- cgit v0.12