diff options
author | Cody Maloney <cmaloney@users.noreply.github.com> | 2025-02-07 11:06:11 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-07 11:06:11 (GMT) |
commit | a3d5aab9a89e311cded9c724ce7d5a873e4d680d (patch) | |
tree | a9414f49045a51f457a3ea93e4841d4a65e8212f | |
parent | ae132edc296d27c6ed04fe4d400c67e3cfb622e8 (diff) | |
download | cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.zip cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.gz cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.bz2 |
gh-129005: Align FileIO.readall between _pyio and _io (#129705)
Utilize `bytearray.resize()` and `os.readinto()` to reduce copies
and match behavior of `_io.FileIO.readall()`.
There is still one extra copy, which means `_pyio` needs twice the memory
that `_io.FileIO` does, because there is currently no zero-copy path from
`bytearray` to `bytes`.
On my system, reading a 2 GB file with
`./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read -v`
goes from ~2.7 seconds to ~2.2 seconds.
Co-authored-by: Victor Stinner <vstinner@python.org>
-rw-r--r-- | Lib/_pyio.py | 37 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst | 2 |
2 files changed, 25 insertions, 14 deletions
diff --git a/Lib/_pyio.py b/Lib/_pyio.py index b3a8f37..f7370df 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1454,6 +1454,17 @@ class BufferedRandom(BufferedWriter, BufferedReader): return BufferedWriter.write(self, b) +def _new_buffersize(bytes_read): + # Parallels _io/fileio.c new_buffersize + if bytes_read > 65536: + addend = bytes_read >> 3 + else: + addend = 256 + bytes_read + if addend < DEFAULT_BUFFER_SIZE: + addend = DEFAULT_BUFFER_SIZE + return bytes_read + addend + + class FileIO(RawIOBase): _fd = -1 _created = False @@ -1672,22 +1683,20 @@ class FileIO(RawIOBase): except OSError: pass - result = bytearray() - while True: - if len(result) >= bufsize: - bufsize = len(result) - bufsize += max(bufsize, DEFAULT_BUFFER_SIZE) - n = bufsize - len(result) - try: - chunk = os.read(self._fd, n) - except BlockingIOError: - if result: - break + result = bytearray(bufsize) + bytes_read = 0 + try: + while n := os.readinto(self._fd, memoryview(result)[bytes_read:]): + bytes_read += n + if bytes_read >= len(result): + result.resize(_new_buffersize(bytes_read)) + except BlockingIOError: + if not bytes_read: return None - if not chunk: # reached the end of the file - break - result += chunk + assert len(result) - bytes_read >= 1, \ + "os.readinto buffer size 0 will result in erroneous EOF / returns 0" + result.resize(bytes_read) return bytes(result) def readinto(self, buffer): diff --git a/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst new file mode 100644 index 0000000..236d776 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst @@ -0,0 +1,2 @@ +``_pyio.FileIO.readall()`` now allocates, resizes, and fills a data buffer +using the same algorithm ``_io.FileIO.readall()`` uses. |