gh-129005: Align FileIO.readall between _pyio and _io (#129705)

Utilize `bytearray.resize()` and `os.readinto()` to reduce copies and match the behavior of `_io.FileIO.readall()`. There is still an extra copy, which means twice the memory is required compared to FileIO, because there is currently no zero-copy path from `bytearray` to `bytes`. On my system, reading a 2 GB file with `./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read -v` goes from ~2.7 seconds to ~2.2 seconds.

Co-authored-by: Victor Stinner <vstinner@python.org>
author: Cody Maloney <cmaloney@users.noreply.github.com> 2025-02-07 11:06:11 (GMT)
committer: GitHub <noreply@github.com> 2025-02-07 11:06:11 (GMT)
commit: a3d5aab9a89e311cded9c724ce7d5a873e4d680d (patch)
tree: a9414f49045a51f457a3ea93e4841d4a65e8212f
parent: ae132edc296d27c6ed04fe4d400c67e3cfb622e8 (diff)
download: cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.zip
cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.gz
cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.bz2
2 files changed, 25 insertions, 14 deletions
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index b3a8f37..f7370df 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -1454,6 +1454,17 @@ class BufferedRandom(BufferedWriter, BufferedReader):
         return BufferedWriter.write(self, b)
 
 
+def _new_buffersize(bytes_read):
+    # Parallels _io/fileio.c new_buffersize
+    if bytes_read > 65536:
+        addend = bytes_read >> 3
+    else:
+        addend = 256 + bytes_read
+    if addend < DEFAULT_BUFFER_SIZE:
+        addend = DEFAULT_BUFFER_SIZE
+    return bytes_read + addend
+
+
 class FileIO(RawIOBase):
     _fd = -1
     _created = False
@@ -1672,22 +1683,20 @@ class FileIO(RawIOBase):
                 except OSError:
                     pass
 
-        result = bytearray()
-        while True:
-            if len(result) >= bufsize:
-                bufsize = len(result)
-                bufsize += max(bufsize, DEFAULT_BUFFER_SIZE)
-            n = bufsize - len(result)
-            try:
-                chunk = os.read(self._fd, n)
-            except BlockingIOError:
-                if result:
-                    break
+        result = bytearray(bufsize)
+        bytes_read = 0
+        try:
+            while n := os.readinto(self._fd, memoryview(result)[bytes_read:]):
+                bytes_read += n
+                if bytes_read >= len(result):
+                    result.resize(_new_buffersize(bytes_read))
+        except BlockingIOError:
+            if not bytes_read:
                 return None
-            if not chunk: # reached the end of the file
-                break
-            result += chunk
 
+        assert len(result) - bytes_read >= 1, \
+            "os.readinto buffer size 0 will result in erroneous EOF / returns 0"
+        result.resize(bytes_read)
         return bytes(result)
 
     def readinto(self, buffer):
diff --git a/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst
new file mode 100644
index 0000000..236d776
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst
@@ -0,0 +1,2 @@
+``_pyio.FileIO.readall()`` now allocates, resizes, and fills a data buffer
+using the same algorithm ``_io.FileIO.readall()`` uses.
author	Cody Maloney <cmaloney@users.noreply.github.com>	2025-02-07 11:06:11 (GMT)
committer	GitHub <noreply@github.com>	2025-02-07 11:06:11 (GMT)
commit	a3d5aab9a89e311cded9c724ce7d5a873e4d680d (patch)
tree	a9414f49045a51f457a3ea93e4841d4a65e8212f
parent	ae132edc296d27c6ed04fe4d400c67e3cfb622e8 (diff)
download	cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.zip cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.gz cpython-a3d5aab9a89e311cded9c724ce7d5a873e4d680d.tar.bz2