diff options
author | Benjamin Peterson <benjamin@python.org> | 2017-12-09 18:26:52 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-09 18:26:52 (GMT) |
commit | 42aa93b8ff2f7879282b06efc73a31ec7785e602 (patch) | |
tree | 92ee301e1f487a7f5aa8ec78a36ebc50d21d6ec9 /Lib/importlib/_bootstrap_external.py | |
parent | 28d8d14013ade0657fed4673f5fa3c08eb2b1944 (diff) | |
download | cpython-42aa93b8ff2f7879282b06efc73a31ec7785e602.zip cpython-42aa93b8ff2f7879282b06efc73a31ec7785e602.tar.gz cpython-42aa93b8ff2f7879282b06efc73a31ec7785e602.tar.bz2 |
closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)
Python now supports checking whether a bytecode cache file is up to date by
comparing a hash of the source contents rather than volatile source metadata
(modification time and size). See the PEP for details.
While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:
- The core changes to importlib to understand how to read, validate, and
regenerate hash-based pycs.
- Support for generating hash-based pycs in py_compile and compileall.
- Modifications to our siphash implementation to support passing a custom
key. We then expose it to importlib through _imp.
- Updates to all places in the interpreter, standard library, and tests that
manually generate or parse pyc files to grok the new format.
- Support in the interpreter command line code for long options like
--check-hash-based-pycs.
- Tests and documentation for all of the above.
Diffstat (limited to 'Lib/importlib/_bootstrap_external.py')
-rw-r--r-- | Lib/importlib/_bootstrap_external.py | 197 |
1 file changed, 142 insertions, 55 deletions
diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 41de8a7..e808507 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -242,6 +242,7 @@ _code_type = type(_write_atomic.__code__) # Python 3.6rc1 3379 (more thorough __class__ validation #23722) # Python 3.7a0 3390 (add LOAD_METHOD and CALL_METHOD opcodes) # Python 3.7a0 3391 (update GET_AITER #31709) +# Python 3.7a0 3392 (PEP 552: Deterministic pycs) # # MAGIC must change whenever the bytecode emitted by the compiler may no # longer be understood by older implementations of the eval loop (usually @@ -250,7 +251,7 @@ _code_type = type(_write_atomic.__code__) # Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array # in PC/launcher.c must also be updated. -MAGIC_NUMBER = (3391).to_bytes(2, 'little') + b'\r\n' +MAGIC_NUMBER = (3392).to_bytes(2, 'little') + b'\r\n' _RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c _PYCACHE = '__pycache__' @@ -429,63 +430,93 @@ def _find_module_shim(self, fullname): return loader -def _validate_bytecode_header(data, source_stats=None, name=None, path=None): - """Validate the header of the passed-in bytecode against source_stats (if - given) and returning the bytecode that can be compiled by compile(). +def _classify_pyc(data, name, exc_details): + """Perform basic validity checking of a pyc header and return the flags field, + which determines how the pyc should be further validated against the source. - All other arguments are used to enhance error reporting. + *data* is the contents of the pyc file. (Only the first 16 bytes are + required, though.) - ImportError is raised when the magic number is incorrect or the bytecode is - found to be stale. EOFError is raised when the data is found to be - truncated. + *name* is the name of the module being imported. It is used for logging. 
+ + *exc_details* is a dictionary passed to ImportError if it raised for + improved debugging. + + ImportError is raised when the magic number is incorrect or when the flags + field is invalid. EOFError is raised when the data is found to be truncated. """ - exc_details = {} - if name is not None: - exc_details['name'] = name - else: - # To prevent having to make all messages have a conditional name. - name = '<bytecode>' - if path is not None: - exc_details['path'] = path magic = data[:4] - raw_timestamp = data[4:8] - raw_size = data[8:12] if magic != MAGIC_NUMBER: - message = 'bad magic number in {!r}: {!r}'.format(name, magic) + message = f'bad magic number in {name!r}: {magic!r}' _bootstrap._verbose_message('{}', message) raise ImportError(message, **exc_details) - elif len(raw_timestamp) != 4: - message = 'reached EOF while reading timestamp in {!r}'.format(name) + if len(data) < 16: + message = f'reached EOF while reading pyc header of {name!r}' _bootstrap._verbose_message('{}', message) raise EOFError(message) - elif len(raw_size) != 4: - message = 'reached EOF while reading size of source in {!r}'.format(name) + flags = _r_long(data[4:8]) + # Only the first two flags are defined. + if flags & ~0b11: + message = f'invalid flags {flags!r} in {name!r}' + raise ImportError(message, **exc_details) + return flags + + +def _validate_timestamp_pyc(data, source_mtime, source_size, name, + exc_details): + """Validate a pyc against the source last-modified time. + + *data* is the contents of the pyc file. (Only the first 16 bytes are + required.) + + *source_mtime* is the last modified timestamp of the source file. + + *source_size* is None or the size of the source file in bytes. + + *name* is the name of the module being imported. It is used for logging. + + *exc_details* is a dictionary passed to ImportError if it raised for + improved debugging. + + An ImportError is raised if the bytecode is stale. 
+ + """ + if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF): + message = f'bytecode is stale for {name!r}' _bootstrap._verbose_message('{}', message) - raise EOFError(message) - if source_stats is not None: - try: - source_mtime = int(source_stats['mtime']) - except KeyError: - pass - else: - if _r_long(raw_timestamp) != source_mtime: - message = 'bytecode is stale for {!r}'.format(name) - _bootstrap._verbose_message('{}', message) - raise ImportError(message, **exc_details) - try: - source_size = source_stats['size'] & 0xFFFFFFFF - except KeyError: - pass - else: - if _r_long(raw_size) != source_size: - raise ImportError('bytecode is stale for {!r}'.format(name), - **exc_details) - return data[12:] + raise ImportError(message, **exc_details) + if (source_size is not None and + _r_long(data[12:16]) != (source_size & 0xFFFFFFFF)): + raise ImportError(f'bytecode is stale for {name!r}', **exc_details) + + +def _validate_hash_pyc(data, source_hash, name, exc_details): + """Validate a hash-based pyc by checking the real source hash against the one in + the pyc header. + + *data* is the contents of the pyc file. (Only the first 16 bytes are + required.) + + *source_hash* is the importlib.util.source_hash() of the source file. + + *name* is the name of the module being imported. It is used for logging. + + *exc_details* is a dictionary passed to ImportError if it raised for + improved debugging. + + An ImportError is raised if the bytecode is stale. 
+ + """ + if data[8:16] != source_hash: + raise ImportError( + f'hash in bytecode doesn\'t match hash of source {name!r}', + **exc_details, + ) def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None): - """Compile bytecode as returned by _validate_bytecode_header().""" + """Compile bytecode as found in a pyc.""" code = marshal.loads(data) if isinstance(code, _code_type): _bootstrap._verbose_message('code object from {!r}', bytecode_path) @@ -496,16 +527,28 @@ def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None): raise ImportError('Non-code object in {!r}'.format(bytecode_path), name=name, path=bytecode_path) -def _code_to_bytecode(code, mtime=0, source_size=0): - """Compile a code object into bytecode for writing out to a byte-compiled - file.""" + +def _code_to_timestamp_pyc(code, mtime=0, source_size=0): + "Produce the data for a timestamp-based pyc." data = bytearray(MAGIC_NUMBER) + data.extend(_w_long(0)) data.extend(_w_long(mtime)) data.extend(_w_long(source_size)) data.extend(marshal.dumps(code)) return data +def _code_to_hash_pyc(code, source_hash, checked=True): + "Produce the data for a hash-based pyc." + data = bytearray(MAGIC_NUMBER) + flags = 0b1 | checked << 1 + data.extend(_w_long(flags)) + assert len(source_hash) == 8 + data.extend(source_hash) + data.extend(marshal.dumps(code)) + return data + + def decode_source(source_bytes): """Decode bytes representing source code and return the string. 
@@ -751,6 +794,10 @@ class SourceLoader(_LoaderBasics): """ source_path = self.get_filename(fullname) source_mtime = None + source_bytes = None + source_hash = None + hash_based = False + check_source = True try: bytecode_path = cache_from_source(source_path) except NotImplementedError: @@ -767,10 +814,34 @@ class SourceLoader(_LoaderBasics): except OSError: pass else: + exc_details = { + 'name': fullname, + 'path': bytecode_path, + } try: - bytes_data = _validate_bytecode_header(data, - source_stats=st, name=fullname, - path=bytecode_path) + flags = _classify_pyc(data, fullname, exc_details) + bytes_data = memoryview(data)[16:] + hash_based = flags & 0b1 != 0 + if hash_based: + check_source = flags & 0b10 != 0 + if (_imp.check_hash_based_pycs != 'never' and + (check_source or + _imp.check_hash_based_pycs == 'always')): + source_bytes = self.get_data(source_path) + source_hash = _imp.source_hash( + _RAW_MAGIC_NUMBER, + source_bytes, + ) + _validate_hash_pyc(data, source_hash, fullname, + exc_details) + else: + _validate_timestamp_pyc( + data, + source_mtime, + st['size'], + fullname, + exc_details, + ) except (ImportError, EOFError): pass else: @@ -779,13 +850,19 @@ class SourceLoader(_LoaderBasics): return _compile_bytecode(bytes_data, name=fullname, bytecode_path=bytecode_path, source_path=source_path) - source_bytes = self.get_data(source_path) + if source_bytes is None: + source_bytes = self.get_data(source_path) code_object = self.source_to_code(source_bytes, source_path) _bootstrap._verbose_message('code object from {}', source_path) if (not sys.dont_write_bytecode and bytecode_path is not None and source_mtime is not None): - data = _code_to_bytecode(code_object, source_mtime, - len(source_bytes)) + if hash_based: + if source_hash is None: + source_hash = _imp.source_hash(source_bytes) + data = _code_to_hash_pyc(code_object, source_hash, check_source) + else: + data = _code_to_timestamp_pyc(code_object, source_mtime, + len(source_bytes)) try: 
self._cache_bytecode(source_path, bytecode_path, data) _bootstrap._verbose_message('wrote {!r}', bytecode_path) @@ -887,8 +964,18 @@ class SourcelessFileLoader(FileLoader, _LoaderBasics): def get_code(self, fullname): path = self.get_filename(fullname) data = self.get_data(path) - bytes_data = _validate_bytecode_header(data, name=fullname, path=path) - return _compile_bytecode(bytes_data, name=fullname, bytecode_path=path) + # Call _classify_pyc to do basic validation of the pyc but ignore the + # result. There's no source to check against. + exc_details = { + 'name': fullname, + 'path': path, + } + _classify_pyc(data, fullname, exc_details) + return _compile_bytecode( + memoryview(data)[16:], + name=fullname, + bytecode_path=path, + ) def get_source(self, fullname): """Return None as there is no source code.""" |