summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- Lib/_pyio.py 22
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2024-06-19-19-54-35.gh-issue-120754.uF29sj.rst 1
-rw-r--r-- Modules/_io/fileio.c 70
3 files changed, 60 insertions, 33 deletions
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index 7d298e1..75b5ad1 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -1577,6 +1577,7 @@ class FileIO(RawIOBase):
self._blksize = getattr(fdfstat, 'st_blksize', 0)
if self._blksize <= 1:
self._blksize = DEFAULT_BUFFER_SIZE
+ self._estimated_size = fdfstat.st_size
if _setmode:
# don't translate newlines (\r\n <=> \n)
@@ -1654,14 +1655,18 @@ class FileIO(RawIOBase):
"""
self._checkClosed()
self._checkReadable()
- bufsize = DEFAULT_BUFFER_SIZE
- try:
- pos = os.lseek(self._fd, 0, SEEK_CUR)
- end = os.fstat(self._fd).st_size
- if end >= pos:
- bufsize = end - pos + 1
- except OSError:
- pass
+ if self._estimated_size <= 0:
+ bufsize = DEFAULT_BUFFER_SIZE
+ else:
+ bufsize = self._estimated_size + 1
+
+ if self._estimated_size > 65536:
+ try:
+ pos = os.lseek(self._fd, 0, SEEK_CUR)
+ if self._estimated_size >= pos:
+ bufsize = self._estimated_size - pos + 1
+ except OSError:
+ pass
result = bytearray()
while True:
@@ -1737,6 +1742,7 @@ class FileIO(RawIOBase):
if size is None:
size = self.tell()
os.ftruncate(self._fd, size)
+ self._estimated_size = size
return size
def close(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-19-19-54-35.gh-issue-120754.uF29sj.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-19-19-54-35.gh-issue-120754.uF29sj.rst
new file mode 100644
index 0000000..46481d8
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-19-19-54-35.gh-issue-120754.uF29sj.rst
@@ -0,0 +1 @@
+Reduce the number of system calls invoked when reading a whole file (e.g. ``open('a.txt').read()``). For a sample program that reads the contents of the 400+ ``.rst`` files in the CPython repository ``Doc`` folder, the system call count is reduced by over 10%.
diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c
index b5129ff..d5bf328 100644
--- a/Modules/_io/fileio.c
+++ b/Modules/_io/fileio.c
@@ -54,6 +54,9 @@
# define SMALLCHUNK BUFSIZ
#endif
+/* Size at which a buffer is considered "large" and behavior should change to
+ avoid excessive memory allocation */
+#define LARGE_BUFFER_CUTOFF_SIZE 65536
/*[clinic input]
module _io
@@ -72,6 +75,7 @@ typedef struct {
unsigned int closefd : 1;
char finalizing;
unsigned int blksize;
+ Py_off_t estimated_size;
PyObject *weakreflist;
PyObject *dict;
} fileio;
@@ -196,6 +200,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
self->appending = 0;
self->seekable = -1;
self->blksize = 0;
+ self->estimated_size = -1;
self->closefd = 1;
self->weakreflist = NULL;
}
@@ -482,6 +487,9 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
if (fdfstat.st_blksize > 1)
self->blksize = fdfstat.st_blksize;
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
+ if (fdfstat.st_size < PY_SSIZE_T_MAX) {
+ self->estimated_size = (Py_off_t)fdfstat.st_size;
+ }
}
#if defined(MS_WINDOWS) || defined(__CYGWIN__)
@@ -684,7 +692,7 @@ new_buffersize(fileio *self, size_t currentsize)
giving us amortized linear-time behavior. For bigger sizes, use a
less-than-double growth factor to avoid excessive allocation. */
assert(currentsize <= PY_SSIZE_T_MAX);
- if (currentsize > 65536)
+ if (currentsize > LARGE_BUFFER_CUTOFF_SIZE)
addend = currentsize >> 3;
else
addend = 256 + currentsize;
@@ -707,43 +715,56 @@ static PyObject *
_io_FileIO_readall_impl(fileio *self)
/*[clinic end generated code: output=faa0292b213b4022 input=dbdc137f55602834]*/
{
- struct _Py_stat_struct status;
Py_off_t pos, end;
PyObject *result;
Py_ssize_t bytes_read = 0;
Py_ssize_t n;
size_t bufsize;
- int fstat_result;
- if (self->fd < 0)
+ if (self->fd < 0) {
return err_closed();
+ }
- Py_BEGIN_ALLOW_THREADS
- _Py_BEGIN_SUPPRESS_IPH
-#ifdef MS_WINDOWS
- pos = _lseeki64(self->fd, 0L, SEEK_CUR);
-#else
- pos = lseek(self->fd, 0L, SEEK_CUR);
-#endif
- _Py_END_SUPPRESS_IPH
- fstat_result = _Py_fstat_noraise(self->fd, &status);
- Py_END_ALLOW_THREADS
-
- if (fstat_result == 0)
- end = status.st_size;
- else
- end = (Py_off_t)-1;
-
- if (end > 0 && end >= pos && pos >= 0 && end - pos < PY_SSIZE_T_MAX) {
+ end = self->estimated_size;
+ if (end <= 0) {
+ /* Use a default size and resize as needed. */
+ bufsize = SMALLCHUNK;
+ }
+ else {
/* This is probably a real file, so we try to allocate a
buffer one byte larger than the rest of the file. If the
calculation is right then we should get EOF without having
to enlarge the buffer. */
- bufsize = (size_t)(end - pos + 1);
- } else {
- bufsize = SMALLCHUNK;
+ if (end > _PY_READ_MAX - 1) {
+ bufsize = _PY_READ_MAX;
+ }
+ else {
+ bufsize = (size_t)end + 1;
+ }
+
+ /* While a lot of code does open().read() to get the whole contents
+ of a file, it is possible that a caller seeks/reads partway into the
+ file and then calls readall() to get the rest, which would result in
+ allocating more than required. Guard against that for larger files, where
+ we expect the I/O time to dominate anyway, while keeping small files fast. */
+ if (bufsize > LARGE_BUFFER_CUTOFF_SIZE) {
+ Py_BEGIN_ALLOW_THREADS
+ _Py_BEGIN_SUPPRESS_IPH
+#ifdef MS_WINDOWS
+ pos = _lseeki64(self->fd, 0L, SEEK_CUR);
+#else
+ pos = lseek(self->fd, 0L, SEEK_CUR);
+#endif
+ _Py_END_SUPPRESS_IPH
+ Py_END_ALLOW_THREADS
+
+ if (end >= pos && pos >= 0 && (end - pos) < (_PY_READ_MAX - 1)) {
+ bufsize = (size_t)(end - pos) + 1;
+ }
+ }
}
+
result = PyBytes_FromStringAndSize(NULL, bufsize);
if (result == NULL)
return NULL;
@@ -783,7 +804,6 @@ _io_FileIO_readall_impl(fileio *self)
return NULL;
}
bytes_read += n;
- pos += n;
}
if (PyBytes_GET_SIZE(result) > bytes_read) {