summaryrefslogtreecommitdiffstats
path: root/Lib/importlib/_bootstrap_external.py
diff options
context:
space:
mode:
author: Benjamin Peterson <benjamin@python.org> 2017-12-09 18:26:52 (GMT)
committer: GitHub <noreply@github.com> 2017-12-09 18:26:52 (GMT)
commit: 42aa93b8ff2f7879282b06efc73a31ec7785e602 (patch)
tree: 92ee301e1f487a7f5aa8ec78a36ebc50d21d6ec9 /Lib/importlib/_bootstrap_external.py
parent: 28d8d14013ade0657fed4673f5fa3c08eb2b1944 (diff)
download: cpython-42aa93b8ff2f7879282b06efc73a31ec7785e602.zip
cpython-42aa93b8ff2f7879282b06efc73a31ec7785e602.tar.gz
cpython-42aa93b8ff2f7879282b06efc73a31ec7785e602.tar.bz2
closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)
Python now supports checking bytecode cache up-to-dateness with a hash of the source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due to the pervasiveness of pyc implementation details in the codebase. Changes in this commit include:

- The core changes to importlib to understand how to read, validate, and regenerate hash-based pycs.
- Support for generating hash-based pycs in py_compile and compileall.
- Modifications to our siphash implementation to support passing a custom key. We then expose it to importlib through _imp.
- Updates to all places in the interpreter, standard library, and tests that manually generate or parse pyc files to grok the new format.
- Support in the interpreter command line code for long options like --check-hash-based-pycs.
- Tests and documentation for all of the above.
Diffstat (limited to 'Lib/importlib/_bootstrap_external.py')
-rw-r--r-- Lib/importlib/_bootstrap_external.py | 197
1 files changed, 142 insertions, 55 deletions
diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py
index 41de8a7..e808507 100644
--- a/Lib/importlib/_bootstrap_external.py
+++ b/Lib/importlib/_bootstrap_external.py
@@ -242,6 +242,7 @@ _code_type = type(_write_atomic.__code__)
# Python 3.6rc1 3379 (more thorough __class__ validation #23722)
# Python 3.7a0 3390 (add LOAD_METHOD and CALL_METHOD opcodes)
# Python 3.7a0 3391 (update GET_AITER #31709)
+# Python 3.7a0 3392 (PEP 552: Deterministic pycs)
#
# MAGIC must change whenever the bytecode emitted by the compiler may no
# longer be understood by older implementations of the eval loop (usually
@@ -250,7 +251,7 @@ _code_type = type(_write_atomic.__code__)
# Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
# in PC/launcher.c must also be updated.
-MAGIC_NUMBER = (3391).to_bytes(2, 'little') + b'\r\n'
+MAGIC_NUMBER = (3392).to_bytes(2, 'little') + b'\r\n'
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c
_PYCACHE = '__pycache__'
@@ -429,63 +430,93 @@ def _find_module_shim(self, fullname):
return loader
-def _validate_bytecode_header(data, source_stats=None, name=None, path=None):
- """Validate the header of the passed-in bytecode against source_stats (if
- given) and returning the bytecode that can be compiled by compile().
+def _classify_pyc(data, name, exc_details):
+ """Perform basic validity checking of a pyc header and return the flags field,
+ which determines how the pyc should be further validated against the source.
- All other arguments are used to enhance error reporting.
+ *data* is the contents of the pyc file. (Only the first 16 bytes are
+ required, though.)
- ImportError is raised when the magic number is incorrect or the bytecode is
- found to be stale. EOFError is raised when the data is found to be
- truncated.
+ *name* is the name of the module being imported. It is used for logging.
+
+ *exc_details* is a dictionary passed to ImportError, if it is raised, for
+ improved debugging.
+
+ ImportError is raised when the magic number is incorrect or when the flags
+ field is invalid. EOFError is raised when the data is found to be truncated.
"""
- exc_details = {}
- if name is not None:
- exc_details['name'] = name
- else:
- # To prevent having to make all messages have a conditional name.
- name = '<bytecode>'
- if path is not None:
- exc_details['path'] = path
magic = data[:4]
- raw_timestamp = data[4:8]
- raw_size = data[8:12]
if magic != MAGIC_NUMBER:
- message = 'bad magic number in {!r}: {!r}'.format(name, magic)
+ message = f'bad magic number in {name!r}: {magic!r}'
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
- elif len(raw_timestamp) != 4:
- message = 'reached EOF while reading timestamp in {!r}'.format(name)
+ if len(data) < 16:
+ message = f'reached EOF while reading pyc header of {name!r}'
_bootstrap._verbose_message('{}', message)
raise EOFError(message)
- elif len(raw_size) != 4:
- message = 'reached EOF while reading size of source in {!r}'.format(name)
+ flags = _r_long(data[4:8])
+ # Only the first two flags are defined.
+ if flags & ~0b11:
+ message = f'invalid flags {flags!r} in {name!r}'
+ raise ImportError(message, **exc_details)
+ return flags
+
+
+def _validate_timestamp_pyc(data, source_mtime, source_size, name,
+ exc_details):
+ """Validate a pyc against the source last-modified time.
+
+ *data* is the contents of the pyc file. (Only the first 16 bytes are
+ required.)
+
+ *source_mtime* is the last modified timestamp of the source file.
+
+ *source_size* is None or the size of the source file in bytes.
+
+ *name* is the name of the module being imported. It is used for logging.
+
+ *exc_details* is a dictionary passed to ImportError, if it is raised, for
+ improved debugging.
+
+ An ImportError is raised if the bytecode is stale.
+
+ """
+ if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF):
+ message = f'bytecode is stale for {name!r}'
_bootstrap._verbose_message('{}', message)
- raise EOFError(message)
- if source_stats is not None:
- try:
- source_mtime = int(source_stats['mtime'])
- except KeyError:
- pass
- else:
- if _r_long(raw_timestamp) != source_mtime:
- message = 'bytecode is stale for {!r}'.format(name)
- _bootstrap._verbose_message('{}', message)
- raise ImportError(message, **exc_details)
- try:
- source_size = source_stats['size'] & 0xFFFFFFFF
- except KeyError:
- pass
- else:
- if _r_long(raw_size) != source_size:
- raise ImportError('bytecode is stale for {!r}'.format(name),
- **exc_details)
- return data[12:]
+ raise ImportError(message, **exc_details)
+ if (source_size is not None and
+ _r_long(data[12:16]) != (source_size & 0xFFFFFFFF)):
+ raise ImportError(f'bytecode is stale for {name!r}', **exc_details)
+
+
+def _validate_hash_pyc(data, source_hash, name, exc_details):
+ """Validate a hash-based pyc by checking the real source hash against the one in
+ the pyc header.
+
+ *data* is the contents of the pyc file. (Only the first 16 bytes are
+ required.)
+
+ *source_hash* is the importlib.util.source_hash() of the source file.
+
+ *name* is the name of the module being imported. It is used for logging.
+
+ *exc_details* is a dictionary passed to ImportError, if it is raised, for
+ improved debugging.
+
+ An ImportError is raised if the bytecode is stale.
+
+ """
+ if data[8:16] != source_hash:
+ raise ImportError(
+ f'hash in bytecode doesn\'t match hash of source {name!r}',
+ **exc_details,
+ )
def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
- """Compile bytecode as returned by _validate_bytecode_header()."""
+ """Compile bytecode as found in a pyc."""
code = marshal.loads(data)
if isinstance(code, _code_type):
_bootstrap._verbose_message('code object from {!r}', bytecode_path)
@@ -496,16 +527,28 @@ def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
raise ImportError('Non-code object in {!r}'.format(bytecode_path),
name=name, path=bytecode_path)
-def _code_to_bytecode(code, mtime=0, source_size=0):
- """Compile a code object into bytecode for writing out to a byte-compiled
- file."""
+
+def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
+ "Produce the data for a timestamp-based pyc."
data = bytearray(MAGIC_NUMBER)
+ data.extend(_w_long(0))
data.extend(_w_long(mtime))
data.extend(_w_long(source_size))
data.extend(marshal.dumps(code))
return data
+def _code_to_hash_pyc(code, source_hash, checked=True):
+ "Produce the data for a hash-based pyc."
+ data = bytearray(MAGIC_NUMBER)
+ flags = 0b1 | checked << 1
+ data.extend(_w_long(flags))
+ assert len(source_hash) == 8
+ data.extend(source_hash)
+ data.extend(marshal.dumps(code))
+ return data
+
+
def decode_source(source_bytes):
"""Decode bytes representing source code and return the string.
@@ -751,6 +794,10 @@ class SourceLoader(_LoaderBasics):
"""
source_path = self.get_filename(fullname)
source_mtime = None
+ source_bytes = None
+ source_hash = None
+ hash_based = False
+ check_source = True
try:
bytecode_path = cache_from_source(source_path)
except NotImplementedError:
@@ -767,10 +814,34 @@ class SourceLoader(_LoaderBasics):
except OSError:
pass
else:
+ exc_details = {
+ 'name': fullname,
+ 'path': bytecode_path,
+ }
try:
- bytes_data = _validate_bytecode_header(data,
- source_stats=st, name=fullname,
- path=bytecode_path)
+ flags = _classify_pyc(data, fullname, exc_details)
+ bytes_data = memoryview(data)[16:]
+ hash_based = flags & 0b1 != 0
+ if hash_based:
+ check_source = flags & 0b10 != 0
+ if (_imp.check_hash_based_pycs != 'never' and
+ (check_source or
+ _imp.check_hash_based_pycs == 'always')):
+ source_bytes = self.get_data(source_path)
+ source_hash = _imp.source_hash(
+ _RAW_MAGIC_NUMBER,
+ source_bytes,
+ )
+ _validate_hash_pyc(data, source_hash, fullname,
+ exc_details)
+ else:
+ _validate_timestamp_pyc(
+ data,
+ source_mtime,
+ st['size'],
+ fullname,
+ exc_details,
+ )
except (ImportError, EOFError):
pass
else:
@@ -779,13 +850,19 @@ class SourceLoader(_LoaderBasics):
return _compile_bytecode(bytes_data, name=fullname,
bytecode_path=bytecode_path,
source_path=source_path)
- source_bytes = self.get_data(source_path)
+ if source_bytes is None:
+ source_bytes = self.get_data(source_path)
code_object = self.source_to_code(source_bytes, source_path)
_bootstrap._verbose_message('code object from {}', source_path)
if (not sys.dont_write_bytecode and bytecode_path is not None and
source_mtime is not None):
- data = _code_to_bytecode(code_object, source_mtime,
- len(source_bytes))
+ if hash_based:
+ if source_hash is None:
+ source_hash = _imp.source_hash(source_bytes)
+ data = _code_to_hash_pyc(code_object, source_hash, check_source)
+ else:
+ data = _code_to_timestamp_pyc(code_object, source_mtime,
+ len(source_bytes))
try:
self._cache_bytecode(source_path, bytecode_path, data)
_bootstrap._verbose_message('wrote {!r}', bytecode_path)
@@ -887,8 +964,18 @@ class SourcelessFileLoader(FileLoader, _LoaderBasics):
def get_code(self, fullname):
path = self.get_filename(fullname)
data = self.get_data(path)
- bytes_data = _validate_bytecode_header(data, name=fullname, path=path)
- return _compile_bytecode(bytes_data, name=fullname, bytecode_path=path)
+ # Call _classify_pyc to do basic validation of the pyc but ignore the
+ # result. There's no source to check against.
+ exc_details = {
+ 'name': fullname,
+ 'path': path,
+ }
+ _classify_pyc(data, fullname, exc_details)
+ return _compile_bytecode(
+ memoryview(data)[16:],
+ name=fullname,
+ bytecode_path=path,
+ )
def get_source(self, fullname):
"""Return None as there is no source code."""