summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGiampaolo Rodola <g.rodola@gmail.com>2018-11-12 14:18:15 (GMT)
committerGitHub <noreply@github.com>2018-11-12 14:18:15 (GMT)
commit19c46a4c96553b2a8390bf8a0e138f2b23e28ed6 (patch)
treef8c59fab93db82769dc938e1c5f28961c165292c
parentcd449806fac1246cb7b4d392026fe6986ec01fb7 (diff)
downloadcpython-19c46a4c96553b2a8390bf8a0e138f2b23e28ed6.zip
cpython-19c46a4c96553b2a8390bf8a0e138f2b23e28ed6.tar.gz
cpython-19c46a4c96553b2a8390bf8a0e138f2b23e28ed6.tar.bz2
bpo-33695 shutil.copytree() + os.scandir() cache (#7874)
-rw-r--r--Doc/whatsnew/3.8.rst8
-rw-r--r--Lib/shutil.py137
-rw-r--r--Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst7
3 files changed, 96 insertions, 56 deletions
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 91e0d5b..e5e6d4a5 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -277,6 +277,14 @@ Optimizations
See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
(Contributed by Giampaolo Rodola' in :issue:`25427`.)
+* :func:`shutil.copytree` uses the :func:`os.scandir` function, and all copy
+ functions depending on it use cached :func:`os.stat` values. The speedup
+ for copying a directory with 8000 files is around +9% on Linux, +20% on
+ Windows and +30% on a Windows SMB share. Also, the number of :func:`os.stat`
+ syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster
+ on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
+
+
* The default protocol in the :mod:`pickle` module is now Protocol 4,
first introduced in Python 3.4. It offers better performance and smaller
size compared to Protocol 3 available since Python 3.0.
diff --git a/Lib/shutil.py b/Lib/shutil.py
index b7a7df3..74348ba 100644
--- a/Lib/shutil.py
+++ b/Lib/shutil.py
@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
def _samefile(src, dst):
# Macintosh, Unix.
+ if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
+ try:
+ return os.path.samestat(src.stat(), os.stat(dst))
+ except OSError:
+ return False
+
if hasattr(os.path, 'samefile'):
try:
return os.path.samefile(src, dst)
@@ -210,6 +216,12 @@ def _samefile(src, dst):
return (os.path.normcase(os.path.abspath(src)) ==
os.path.normcase(os.path.abspath(dst)))
+def _stat(fn):
+ return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
+
+def _islink(fn):
+ return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
+
def copyfile(src, dst, *, follow_symlinks=True):
"""Copy data from src to dst in the most efficient way possible.
@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
file_size = 0
for i, fn in enumerate([src, dst]):
try:
- st = os.stat(fn)
+ st = _stat(fn)
except OSError:
# File most likely does not exist
pass
else:
# XXX What about other special files? (sockets, devices...)
if stat.S_ISFIFO(st.st_mode):
+ fn = fn.path if isinstance(fn, os.DirEntry) else fn
raise SpecialFileError("`%s` is a named pipe" % fn)
if _WINDOWS and i == 0:
file_size = st.st_size
- if not follow_symlinks and os.path.islink(src):
+ if not follow_symlinks and _islink(src):
os.symlink(os.readlink(src), dst)
else:
with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
(e.g. Linux) this method does nothing.
"""
- if not follow_symlinks and os.path.islink(src) and os.path.islink(dst):
+ if not follow_symlinks and _islink(src) and os.path.islink(dst):
if hasattr(os, 'lchmod'):
stat_func, chmod_func = os.lstat, os.lchmod
else:
return
elif hasattr(os, 'chmod'):
- stat_func, chmod_func = os.stat, os.chmod
+ stat_func, chmod_func = _stat, os.chmod
else:
return
@@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True):
pass
# follow symlinks (aka don't not follow symlinks)
- follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst))
+ follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
if follow:
# use the real function if it exists
def lookup(name):
@@ -339,7 +352,10 @@ def copystat(src, dst, *, follow_symlinks=True):
return fn
return _nop
- st = lookup("stat")(src, follow_symlinks=follow)
+ if isinstance(src, os.DirEntry):
+ st = src.stat(follow_symlinks=follow)
+ else:
+ st = lookup("stat")(src, follow_symlinks=follow)
mode = stat.S_IMODE(st.st_mode)
lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
follow_symlinks=follow)
@@ -415,79 +431,47 @@ def ignore_patterns(*patterns):
return set(ignored_names)
return _ignore_patterns
-def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
- ignore_dangling_symlinks=False):
- """Recursively copy a directory tree.
-
- The destination directory must not already exist.
- If exception(s) occur, an Error is raised with a list of reasons.
-
- If the optional symlinks flag is true, symbolic links in the
- source tree result in symbolic links in the destination tree; if
- it is false, the contents of the files pointed to by symbolic
- links are copied. If the file pointed by the symlink doesn't
- exist, an exception will be added in the list of errors raised in
- an Error exception at the end of the copy process.
-
- You can set the optional ignore_dangling_symlinks flag to true if you
- want to silence this exception. Notice that this has no effect on
- platforms that don't support os.symlink.
-
- The optional ignore argument is a callable. If given, it
- is called with the `src` parameter, which is the directory
- being visited by copytree(), and `names` which is the list of
- `src` contents, as returned by os.listdir():
-
- callable(src, names) -> ignored_names
-
- Since copytree() is called recursively, the callable will be
- called once for each directory that is copied. It returns a
- list of names relative to the `src` directory that should
- not be copied.
-
- The optional copy_function argument is a callable that will be used
- to copy each file. It will be called with the source path and the
- destination path as arguments. By default, copy2() is used, but any
- function that supports the same signature (like copy()) can be used.
-
- """
- names = os.listdir(src)
+def _copytree(entries, src, dst, symlinks, ignore, copy_function,
+ ignore_dangling_symlinks):
if ignore is not None:
- ignored_names = ignore(src, names)
+ ignored_names = ignore(src, set(os.listdir(src)))
else:
ignored_names = set()
os.makedirs(dst)
errors = []
- for name in names:
- if name in ignored_names:
+ use_srcentry = copy_function is copy2 or copy_function is copy
+
+ for srcentry in entries:
+ if srcentry.name in ignored_names:
continue
- srcname = os.path.join(src, name)
- dstname = os.path.join(dst, name)
+ srcname = os.path.join(src, srcentry.name)
+ dstname = os.path.join(dst, srcentry.name)
+ srcobj = srcentry if use_srcentry else srcname
try:
- if os.path.islink(srcname):
+ if srcentry.is_symlink():
linkto = os.readlink(srcname)
if symlinks:
# We can't just leave it to `copy_function` because legacy
# code with a custom `copy_function` may rely on copytree
# doing the right thing.
os.symlink(linkto, dstname)
- copystat(srcname, dstname, follow_symlinks=not symlinks)
+ copystat(srcobj, dstname, follow_symlinks=not symlinks)
else:
# ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue
# otherwise let the copy occurs. copy2 will raise an error
- if os.path.isdir(srcname):
- copytree(srcname, dstname, symlinks, ignore,
+ if srcentry.is_dir():
+ copytree(srcobj, dstname, symlinks, ignore,
copy_function)
else:
- copy_function(srcname, dstname)
- elif os.path.isdir(srcname):
- copytree(srcname, dstname, symlinks, ignore, copy_function)
+ copy_function(srcobj, dstname)
+ elif srcentry.is_dir():
+ copytree(srcobj, dstname, symlinks, ignore, copy_function)
else:
# Will raise a SpecialFileError for unsupported file types
- copy_function(srcname, dstname)
+ copy_function(srcentry, dstname)
# catch the Error from the recursive copytree so that we can
# continue with other files
except Error as err:
@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
raise Error(errors)
return dst
+def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
+ ignore_dangling_symlinks=False):
+ """Recursively copy a directory tree.
+
+ The destination directory must not already exist.
+ If exception(s) occur, an Error is raised with a list of reasons.
+
+ If the optional symlinks flag is true, symbolic links in the
+ source tree result in symbolic links in the destination tree; if
+ it is false, the contents of the files pointed to by symbolic
+ links are copied. If the file pointed by the symlink doesn't
+ exist, an exception will be added in the list of errors raised in
+ an Error exception at the end of the copy process.
+
+ You can set the optional ignore_dangling_symlinks flag to true if you
+ want to silence this exception. Notice that this has no effect on
+ platforms that don't support os.symlink.
+
+ The optional ignore argument is a callable. If given, it
+ is called with the `src` parameter, which is the directory
+ being visited by copytree(), and `names` which is the list of
+ `src` contents, as returned by os.listdir():
+
+ callable(src, names) -> ignored_names
+
+ Since copytree() is called recursively, the callable will be
+ called once for each directory that is copied. It returns a
+ list of names relative to the `src` directory that should
+ not be copied.
+
+ The optional copy_function argument is a callable that will be used
+ to copy each file. It will be called with the source path and the
+ destination path as arguments. By default, copy2() is used, but any
+ function that supports the same signature (like copy()) can be used.
+
+ """
+ with os.scandir(src) as entries:
+ return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
+ ignore=ignore, copy_function=copy_function,
+ ignore_dangling_symlinks=ignore_dangling_symlinks)
+
# version vulnerable to race conditions
def _rmtree_unsafe(path, onerror):
try:
diff --git a/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
new file mode 100644
index 0000000..2195045
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
@@ -0,0 +1,7 @@
+:func:`shutil.copytree` uses the :func:`os.scandir` function, and all copy
+functions depending on it use cached :func:`os.stat` values. The speedup
+for copying a directory with 8000 files is around +9% on Linux, +20% on
+Windows and +30% on a Windows SMB share. Also, the number of :func:`os.stat`
+syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster
+on network filesystems.
+(Contributed by Giampaolo Rodola' in :issue:`33695`.)