summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/library/hashlib.rst43
-rw-r--r--Lib/hashlib.py48
-rw-r--r--Lib/test/test_hashlib.py54
-rw-r--r--Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst1
4 files changed, 145 insertions, 1 deletions
diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst
index aa24131..da97b0e 100644
--- a/Doc/library/hashlib.rst
+++ b/Doc/library/hashlib.rst
@@ -228,6 +228,49 @@ by the SHAKE algorithm.
exchange the value safely in email or other non-binary environments.
+File hashing
+------------
+
+The hashlib module provides a helper function for efficient hashing of
+a file or file-like object.
+
+.. function:: file_digest(fileobj, digest, /)
+
+ Return a digest object that has been updated with the contents of the file object.
+
+ *fileobj* must be a file-like object opened for reading in binary mode.
+ It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
+ instances, SocketIO objects from :meth:`socket.socket.makefile`, and
+ similar. The function may bypass Python's I/O and use the file descriptor
+ from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
+ in an unknown state after this function returns or raises. It is up to
+ the caller to close *fileobj*.
+
+ *digest* must either be a hash algorithm name as a *str*, a hash
+ constructor, or a callable that returns a hash object.
+
+ Example:
+
+ >>> import io, hashlib, hmac
+ >>> with open(hashlib.__file__, "rb") as f:
+ ... digest = hashlib.file_digest(f, "sha256")
+ ...
+ >>> digest.hexdigest() # doctest: +ELLIPSIS
+ '...'
+
+ >>> buf = io.BytesIO(b"somedata")
+ >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
+ >>> digest = hashlib.file_digest(buf, lambda: mac1)
+
+ >>> digest is mac1
+ True
+ >>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
+ >>> mac1.digest() == mac2.digest()
+ True
+
+ .. versionadded:: 3.11
+
+
Key derivation
--------------
diff --git a/Lib/hashlib.py b/Lib/hashlib.py
index 5625018..b546a3f 100644
--- a/Lib/hashlib.py
+++ b/Lib/hashlib.py
@@ -65,7 +65,7 @@ algorithms_guaranteed = set(__always_supported)
algorithms_available = set(__always_supported)
__all__ = __always_supported + ('new', 'algorithms_guaranteed',
- 'algorithms_available', 'pbkdf2_hmac')
+ 'algorithms_available', 'pbkdf2_hmac', 'file_digest')
__builtin_constructor_cache = {}
@@ -254,6 +254,52 @@ except ImportError:
pass
+def file_digest(fileobj, digest, /, *, _bufsize=2**18):
+    """Hash the contents of a file-like object. Returns a digest object.
+
+    *fileobj* must be a file-like object opened for reading in binary mode.
+    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
+    The function may bypass Python's I/O and use the file descriptor *fileno*
+    directly.
+
+    *digest* must either be a hash algorithm name as a *str*, a hash
+    constructor, or a callable that returns a hash object.
+
+    Raises ValueError if *fileobj* is not a readable binary file-like
+    object, and BlockingIOError if a non-blocking *fileobj* reports that no
+    data is currently available.
+    """
+    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
+    # hashing with hardware acceleration.
+    if isinstance(digest, str):
+        digestobj = new(digest)
+    else:
+        digestobj = digest()
+
+    if hasattr(fileobj, "getbuffer"):
+        # io.BytesIO object, use zero-copy buffer.  getbuffer() exposes the
+        # whole underlying buffer, so the current stream position is ignored.
+        digestobj.update(fileobj.getbuffer())
+        return digestobj
+
+    # Only binary files implement readinto().
+    if not (
+        hasattr(fileobj, "readinto")
+        and hasattr(fileobj, "readable")
+        and fileobj.readable()
+    ):
+        raise ValueError(
+            f"'{fileobj!r}' is not a file-like object in binary reading mode."
+        )
+
+    # binary file, socket.SocketIO object
+    # Note: socket I/O uses different syscalls than file I/O.
+    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
+    view = memoryview(buf)
+    while True:
+        size = fileobj.readinto(buf)
+        if size is None:
+            # Non-blocking stream with no data ready.  Slicing with None
+            # would hash the entire stale buffer and loop forever, so fail
+            # loudly instead of returning a bogus digest.
+            raise BlockingIOError("I/O operation would block.")
+        if size == 0:
+            break  # EOF
+        digestobj.update(view[:size])
+
+    return digestobj
+
+
for __func_name in __always_supported:
# try them all, some may not work due to the OpenSSL
# version not supporting that algorithm.
diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py
index ea31f8b..daf6e38 100644
--- a/Lib/test/test_hashlib.py
+++ b/Lib/test/test_hashlib.py
@@ -10,6 +10,7 @@ import array
from binascii import unhexlify
import hashlib
import importlib
+import io
import itertools
import os
import sys
@@ -20,6 +21,7 @@ import warnings
from test import support
from test.support import _4G, bigmemtest
from test.support.import_helper import import_fresh_module
+from test.support import os_helper
from test.support import threading_helper
from test.support import warnings_helper
from http.client import HTTPException
@@ -371,6 +373,31 @@ class HashLibTestCase(unittest.TestCase):
if not shake:
self.assertEqual(len(digest), m.digest_size)
+ if not shake and kwargs.get("key") is None:
+ # skip shake and blake2 extended parameter tests
+ self.check_file_digest(name, data, hexdigest)
+
+    def check_file_digest(self, name, data, hexdigest):
+        """Check hashlib.file_digest() against a known digest of *data*.
+
+        The algorithm is selected both by *name* and through each entry of
+        ``self.constructors_to_test[name]``; every variant is fed *data*
+        once from an io.BytesIO and once from a temporary file on disk, and
+        the hex digest must equal *hexdigest* (compared in lower case).
+        """
+        expected = hexdigest.lower()
+        candidates = [name, *self.constructors_to_test[name]]
+
+        with open(os_helper.TESTFN, "wb") as out:
+            out.write(data)
+
+        try:
+            for candidate in candidates:
+                # In-memory stream.
+                stream = io.BytesIO(data)
+                stream.seek(0)
+                from_memory = hashlib.file_digest(stream, candidate)
+                self.assertEqual(from_memory.hexdigest(), expected)
+                # Real file, reopened for every candidate.
+                with open(os_helper.TESTFN, "rb") as src:
+                    from_disk = hashlib.file_digest(src, candidate)
+                self.assertEqual(from_disk.hexdigest(), expected)
+        finally:
+            # Remove the scratch file even when an assertion fails.
+            os.unlink(os_helper.TESTFN)
+
def check_no_unicode(self, algorithm_name):
# Unicode objects are not allowed as input.
constructors = self.constructors_to_test[algorithm_name]
@@ -1117,6 +1144,33 @@ class KDFTests(unittest.TestCase):
self.assertNotIn("blake2b512", hashlib.algorithms_available)
self.assertNotIn("sha3-512", hashlib.algorithms_available)
+    def test_file_digest(self):
+        """file_digest() matches incremental hashing and rejects bad inputs."""
+        chunk = b'a' * 65536
+        reference = hashlib.sha256()
+        self.addCleanup(os.unlink, os_helper.TESTFN)
+        # Write ten chunks to disk while hashing the same bytes by hand.
+        with open(os_helper.TESTFN, "wb") as out:
+            for _ in range(10):
+                reference.update(chunk)
+                out.write(chunk)
+
+        with open(os_helper.TESTFN, "rb") as src:
+            from_file = hashlib.file_digest(src, hashlib.sha256)
+
+        self.assertEqual(reference.hexdigest(), from_file.hexdigest())
+        self.assertEqual(reference.name, from_file.name)
+        self.assertIs(type(reference), type(from_file))
+
+        # Not a file object at all.
+        with self.assertRaises(ValueError):
+            hashlib.file_digest(None, "sha256")
+
+        # Text-mode and write-only files are rejected as well.
+        for mode in ("r", "wb"):
+            with self.assertRaises(ValueError):
+                with open(os_helper.TESTFN, mode) as f:
+                    hashlib.file_digest(f, "sha256")
+
if __name__ == "__main__":
unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst
new file mode 100644
index 0000000..1c6ea5a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst
@@ -0,0 +1 @@
+Add :func:`hashlib.file_digest` helper for efficient hashing of a file object.