From 4f97d64c831c94660ceb01f34d51fa236ad968b0 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Tue, 22 Mar 2022 11:37:00 +0200 Subject: bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930) --- Doc/library/hashlib.rst | 43 +++++++++++++++++ Lib/hashlib.py | 48 ++++++++++++++++++- Lib/test/test_hashlib.py | 54 ++++++++++++++++++++++ .../2022-03-16-11-52-52.bpo-45150.kYbIME.rst | 1 + 4 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst index aa24131..da97b0e 100644 --- a/Doc/library/hashlib.rst +++ b/Doc/library/hashlib.rst @@ -228,6 +228,49 @@ by the SHAKE algorithm. exchange the value safely in email or other non-binary environments. +File hashing +------------ + +The hashlib module provides a helper function for efficient hashing of +a file or file-like object. + +.. function:: file_digest(fileobj, digest, /) + + Return a digest object that has been updated with contents of file object. + + *fileobj* must be a file-like object opened for reading in binary mode. + It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO` + instances, SocketIO objects from :meth:`socket.socket.makefile`, and + similar. The function may bypass Python's I/O and use the file descriptor + from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be + in an unknown state after this function returns or raises. It is up to + the caller to close *fileobj*. + + *digest* must either be a hash algorithm name as a *str*, a hash + constructor, or a callable that returns a hash object. + + Example: + + >>> import io, hashlib, hmac + >>> with open(hashlib.__file__, "rb") as f: + ... digest = hashlib.file_digest(f, "sha256") + ... + >>> digest.hexdigest() # doctest: +ELLIPSIS + '...' + + >>> buf = io.BytesIO(b"somedata") + >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512) + >>> digest = hashlib.file_digest(buf, lambda: mac1) + + >>> digest is mac1 + True + >>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512) + >>> mac1.digest() == mac2.digest() + True + + .. versionadded:: 3.11 + + Key derivation -------------- diff --git a/Lib/hashlib.py b/Lib/hashlib.py index 5625018..b546a3f 100644 --- a/Lib/hashlib.py +++ b/Lib/hashlib.py @@ -65,7 +65,7 @@ algorithms_guaranteed = set(__always_supported) algorithms_available = set(__always_supported) __all__ = __always_supported + ('new', 'algorithms_guaranteed', - 'algorithms_available', 'pbkdf2_hmac') + 'algorithms_available', 'pbkdf2_hmac', 'file_digest') __builtin_constructor_cache = {} @@ -254,6 +254,52 @@ except ImportError: pass +def file_digest(fileobj, digest, /, *, _bufsize=2**18): + """Hash the contents of a file-like object. Returns a digest object. + + *fileobj* must be a file-like object opened for reading in binary mode. + It accepts file objects from open(), io.BytesIO(), and SocketIO objects. + The function may bypass Python's I/O and use the file descriptor *fileno* + directly. + + *digest* must either be a hash algorithm name as a *str*, a hash + constructor, or a callable that returns a hash object. + """ + # On Linux we could use AF_ALG sockets and sendfile() to archive zero-copy + # hashing with hardware acceleration. + if isinstance(digest, str): + digestobj = new(digest) + else: + digestobj = digest() + + if hasattr(fileobj, "getbuffer"): + # io.BytesIO object, use zero-copy buffer + digestobj.update(fileobj.getbuffer()) + return digestobj + + # Only binary files implement readinto(). + if not ( + hasattr(fileobj, "readinto") + and hasattr(fileobj, "readable") + and fileobj.readable() + ): + raise ValueError( + f"'{fileobj!r}' is not a file-like object in binary reading mode." + ) + + # binary file, socket.SocketIO object + # Note: socket I/O uses different syscalls than file I/O. + buf = bytearray(_bufsize) # Reusable buffer to reduce allocations. + view = memoryview(buf) + while True: + size = fileobj.readinto(buf) + if size == 0: + break # EOF + digestobj.update(view[:size]) + + return digestobj + + for __func_name in __always_supported: # try them all, some may not work due to the OpenSSL # version not supporting that algorithm. diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py index ea31f8b..daf6e38 100644 --- a/Lib/test/test_hashlib.py +++ b/Lib/test/test_hashlib.py @@ -10,6 +10,7 @@ import array from binascii import unhexlify import hashlib import importlib +import io import itertools import os import sys @@ -20,6 +21,7 @@ import warnings from test import support from test.support import _4G, bigmemtest from test.support.import_helper import import_fresh_module +from test.support import os_helper from test.support import threading_helper from test.support import warnings_helper from http.client import HTTPException @@ -371,6 +373,31 @@ class HashLibTestCase(unittest.TestCase): if not shake: self.assertEqual(len(digest), m.digest_size) + if not shake and kwargs.get("key") is None: + # skip shake and blake2 extended parameter tests + self.check_file_digest(name, data, hexdigest) + + def check_file_digest(self, name, data, hexdigest): + hexdigest = hexdigest.lower() + digests = [name] + digests.extend(self.constructors_to_test[name]) + + with open(os_helper.TESTFN, "wb") as f: + f.write(data) + + try: + for digest in digests: + buf = io.BytesIO(data) + buf.seek(0) + self.assertEqual( + hashlib.file_digest(buf, digest).hexdigest(), hexdigest + ) + with open(os_helper.TESTFN, "rb") as f: + digestobj = hashlib.file_digest(f, digest) + self.assertEqual(digestobj.hexdigest(), hexdigest) + finally: + os.unlink(os_helper.TESTFN) + def check_no_unicode(self, algorithm_name): # Unicode objects are not allowed as input. constructors = self.constructors_to_test[algorithm_name] @@ -1117,6 +1144,33 @@ class KDFTests(unittest.TestCase): self.assertNotIn("blake2b512", hashlib.algorithms_available) self.assertNotIn("sha3-512", hashlib.algorithms_available) + def test_file_digest(self): + data = b'a' * 65536 + d1 = hashlib.sha256() + self.addCleanup(os.unlink, os_helper.TESTFN) + with open(os_helper.TESTFN, "wb") as f: + for _ in range(10): + d1.update(data) + f.write(data) + + with open(os_helper.TESTFN, "rb") as f: + d2 = hashlib.file_digest(f, hashlib.sha256) + + self.assertEqual(d1.hexdigest(), d2.hexdigest()) + self.assertEqual(d1.name, d2.name) + self.assertIs(type(d1), type(d2)) + + with self.assertRaises(ValueError): + hashlib.file_digest(None, "sha256") + + with self.assertRaises(ValueError): + with open(os_helper.TESTFN, "r") as f: + hashlib.file_digest(f, "sha256") + + with self.assertRaises(ValueError): + with open(os_helper.TESTFN, "wb") as f: + hashlib.file_digest(f, "sha256") + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst new file mode 100644 index 0000000..1c6ea5a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst @@ -0,0 +1 @@ +Add :func:`hashlib.file_digest` helper for efficient hashing of file object. -- cgit v0.12