summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Cody Maloney <cmaloney@users.noreply.github.com> 2025-02-07 11:06:11 (GMT)
committer GitHub <noreply@github.com> 2025-02-07 11:06:11 (GMT)
commit a3d5aab9a89e311cded9c724ce7d5a873e4d680d (patch)
tree a9414f49045a51f457a3ea93e4841d4a65e8212f
parent ae132edc296d27c6ed04fe4d400c67e3cfb622e8 (diff)
download cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.zip
cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.gz
cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.bz2
gh-129005: Align FileIO.readall between _pyio and _io (#129705)
Utilize `bytearray.resize()` and `os.readinto()` to reduce copies and match the behavior of `_io.FileIO.readall()`. There is still an extra copy, which means twice the memory is required compared to `_io.FileIO`, because there is currently no zero-copy path from `bytearray` to `bytes`. On my system, reading a 2 GB file with `./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read -v` goes from ~2.7 seconds to ~2.2 seconds. Co-authored-by: Victor Stinner <vstinner@python.org>
-rw-r--r-- Lib/_pyio.py 37
-rw-r--r-- Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst 2
2 files changed, 25 insertions, 14 deletions
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index b3a8f37..f7370df 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -1454,6 +1454,17 @@ class BufferedRandom(BufferedWriter, BufferedReader):
return BufferedWriter.write(self, b)
+def _new_buffersize(bytes_read):
+ # Parallels _io/fileio.c new_buffersize
+ if bytes_read > 65536:
+ addend = bytes_read >> 3
+ else:
+ addend = 256 + bytes_read
+ if addend < DEFAULT_BUFFER_SIZE:
+ addend = DEFAULT_BUFFER_SIZE
+ return bytes_read + addend
+
+
class FileIO(RawIOBase):
_fd = -1
_created = False
@@ -1672,22 +1683,20 @@ class FileIO(RawIOBase):
except OSError:
pass
- result = bytearray()
- while True:
- if len(result) >= bufsize:
- bufsize = len(result)
- bufsize += max(bufsize, DEFAULT_BUFFER_SIZE)
- n = bufsize - len(result)
- try:
- chunk = os.read(self._fd, n)
- except BlockingIOError:
- if result:
- break
+ result = bytearray(bufsize)
+ bytes_read = 0
+ try:
+ while n := os.readinto(self._fd, memoryview(result)[bytes_read:]):
+ bytes_read += n
+ if bytes_read >= len(result):
+ result.resize(_new_buffersize(bytes_read))
+ except BlockingIOError:
+ if not bytes_read:
return None
- if not chunk: # reached the end of the file
- break
- result += chunk
+ assert len(result) - bytes_read >= 1, \
+ "os.readinto buffer size 0 will result in erroneous EOF / returns 0"
+ result.resize(bytes_read)
return bytes(result)
def readinto(self, buffer):
diff --git a/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst
new file mode 100644
index 0000000..236d776
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst
@@ -0,0 +1,2 @@
+``_pyio.FileIO.readall()`` now allocates, resizes, and fills a data buffer
+using the same algorithm ``_io.FileIO.readall()`` uses.