From 19c46a4c96553b2a8390bf8a0e138f2b23e28ed6 Mon Sep 17 00:00:00 2001 From: Giampaolo Rodola Date: Mon, 12 Nov 2018 06:18:15 -0800 Subject: bpo-33695 shutil.copytree() + os.scandir() cache (#7874) --- Doc/whatsnew/3.8.rst | 8 ++ Lib/shutil.py | 137 ++++++++++++--------- .../2018-06-23-12-47-37.bpo-33695.seRTxh.rst | 7 ++ 3 files changed, 96 insertions(+), 56 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 91e0d5b..e5e6d4a5 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -277,6 +277,14 @@ Optimizations See :ref:`shutil-platform-dependent-efficient-copy-operations` section. (Contributed by Giampaolo Rodola' in :issue:`25427`.) +* :func:`shutil.copytree` uses the :func:`os.scandir` function and all copy + functions depending on it use cached :func:`os.stat` values. The speedup + for copying a directory with 8000 files is around +9% on Linux, +20% on + Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat` + syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster + on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.) + + * The default protocol in the :mod:`pickle` module is now Protocol 4, first introduced in Python 3.4. It offers better performance and smaller size compared to Protocol 3 available since Python 3.0. diff --git a/Lib/shutil.py b/Lib/shutil.py index b7a7df3..74348ba 100644 --- a/Lib/shutil.py +++ b/Lib/shutil.py @@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE): def _samefile(src, dst): # Macintosh, Unix. 
+ if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'): + try: + return os.path.samestat(src.stat(), os.stat(dst)) + except OSError: + return False + if hasattr(os.path, 'samefile'): try: return os.path.samefile(src, dst) @@ -210,6 +216,12 @@ def _samefile(src, dst): return (os.path.normcase(os.path.abspath(src)) == os.path.normcase(os.path.abspath(dst))) +def _stat(fn): + return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn) + +def _islink(fn): + return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn) + def copyfile(src, dst, *, follow_symlinks=True): """Copy data from src to dst in the most efficient way possible. @@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True): file_size = 0 for i, fn in enumerate([src, dst]): try: - st = os.stat(fn) + st = _stat(fn) except OSError: # File most likely does not exist pass else: # XXX What about other special files? (sockets, devices...) if stat.S_ISFIFO(st.st_mode): + fn = fn.path if isinstance(fn, os.DirEntry) else fn raise SpecialFileError("`%s` is a named pipe" % fn) if _WINDOWS and i == 0: file_size = st.st_size - if not follow_symlinks and os.path.islink(src): + if not follow_symlinks and _islink(src): os.symlink(os.readlink(src), dst) else: with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst: @@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True): (e.g. Linux) this method does nothing. 
""" - if not follow_symlinks and os.path.islink(src) and os.path.islink(dst): + if not follow_symlinks and _islink(src) and os.path.islink(dst): if hasattr(os, 'lchmod'): stat_func, chmod_func = os.lstat, os.lchmod else: return elif hasattr(os, 'chmod'): - stat_func, chmod_func = os.stat, os.chmod + stat_func, chmod_func = _stat, os.chmod else: return @@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True): pass # follow symlinks (aka don't not follow symlinks) - follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst)) + follow = follow_symlinks or not (_islink(src) and os.path.islink(dst)) if follow: # use the real function if it exists def lookup(name): @@ -339,7 +352,10 @@ def copystat(src, dst, *, follow_symlinks=True): return fn return _nop - st = lookup("stat")(src, follow_symlinks=follow) + if isinstance(src, os.DirEntry): + st = src.stat(follow_symlinks=follow) + else: + st = lookup("stat")(src, follow_symlinks=follow) mode = stat.S_IMODE(st.st_mode) lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns), follow_symlinks=follow) @@ -415,79 +431,47 @@ def ignore_patterns(*patterns): return set(ignored_names) return _ignore_patterns -def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2, - ignore_dangling_symlinks=False): - """Recursively copy a directory tree. - - The destination directory must not already exist. - If exception(s) occur, an Error is raised with a list of reasons. - - If the optional symlinks flag is true, symbolic links in the - source tree result in symbolic links in the destination tree; if - it is false, the contents of the files pointed to by symbolic - links are copied. If the file pointed by the symlink doesn't - exist, an exception will be added in the list of errors raised in - an Error exception at the end of the copy process. - - You can set the optional ignore_dangling_symlinks flag to true if you - want to silence this exception. 
Notice that this has no effect on - platforms that don't support os.symlink. - - The optional ignore argument is a callable. If given, it - is called with the `src` parameter, which is the directory - being visited by copytree(), and `names` which is the list of - `src` contents, as returned by os.listdir(): - - callable(src, names) -> ignored_names - - Since copytree() is called recursively, the callable will be - called once for each directory that is copied. It returns a - list of names relative to the `src` directory that should - not be copied. - - The optional copy_function argument is a callable that will be used - to copy each file. It will be called with the source path and the - destination path as arguments. By default, copy2() is used, but any - function that supports the same signature (like copy()) can be used. - - """ - names = os.listdir(src) +def _copytree(entries, src, dst, symlinks, ignore, copy_function, + ignore_dangling_symlinks): if ignore is not None: - ignored_names = ignore(src, names) + ignored_names = ignore(src, set(os.listdir(src))) else: ignored_names = set() os.makedirs(dst) errors = [] - for name in names: - if name in ignored_names: + use_srcentry = copy_function is copy2 or copy_function is copy + + for srcentry in entries: + if srcentry.name in ignored_names: continue - srcname = os.path.join(src, name) - dstname = os.path.join(dst, name) + srcname = os.path.join(src, srcentry.name) + dstname = os.path.join(dst, srcentry.name) + srcobj = srcentry if use_srcentry else srcname try: - if os.path.islink(srcname): + if srcentry.is_symlink(): linkto = os.readlink(srcname) if symlinks: # We can't just leave it to `copy_function` because legacy # code with a custom `copy_function` may rely on copytree # doing the right thing. 
os.symlink(linkto, dstname) - copystat(srcname, dstname, follow_symlinks=not symlinks) + copystat(srcobj, dstname, follow_symlinks=not symlinks) else: # ignore dangling symlink if the flag is on if not os.path.exists(linkto) and ignore_dangling_symlinks: continue # otherwise let the copy occurs. copy2 will raise an error - if os.path.isdir(srcname): - copytree(srcname, dstname, symlinks, ignore, + if srcentry.is_dir(): + copytree(srcobj, dstname, symlinks, ignore, copy_function) else: - copy_function(srcname, dstname) - elif os.path.isdir(srcname): - copytree(srcname, dstname, symlinks, ignore, copy_function) + copy_function(srcobj, dstname) + elif srcentry.is_dir(): + copytree(srcobj, dstname, symlinks, ignore, copy_function) else: # Will raise a SpecialFileError for unsupported file types - copy_function(srcname, dstname) + copy_function(srcentry, dstname) # catch the Error from the recursive copytree so that we can # continue with other files except Error as err: @@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2, raise Error(errors) return dst +def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2, + ignore_dangling_symlinks=False): + """Recursively copy a directory tree. + + The destination directory must not already exist. + If exception(s) occur, an Error is raised with a list of reasons. + + If the optional symlinks flag is true, symbolic links in the + source tree result in symbolic links in the destination tree; if + it is false, the contents of the files pointed to by symbolic + links are copied. If the file pointed by the symlink doesn't + exist, an exception will be added in the list of errors raised in + an Error exception at the end of the copy process. + + You can set the optional ignore_dangling_symlinks flag to true if you + want to silence this exception. Notice that this has no effect on + platforms that don't support os.symlink. + + The optional ignore argument is a callable. 
If given, it + is called with the `src` parameter, which is the directory + being visited by copytree(), and `names` which is the list of + `src` contents, as returned by os.listdir(): + + callable(src, names) -> ignored_names + + Since copytree() is called recursively, the callable will be + called once for each directory that is copied. It returns a + list of names relative to the `src` directory that should + not be copied. + + The optional copy_function argument is a callable that will be used + to copy each file. It will be called with the source path and the + destination path as arguments. By default, copy2() is used, but any + function that supports the same signature (like copy()) can be used. + + """ + with os.scandir(src) as entries: + return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks, + ignore=ignore, copy_function=copy_function, + ignore_dangling_symlinks=ignore_dangling_symlinks) + # version vulnerable to race conditions def _rmtree_unsafe(path, onerror): try: diff --git a/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst new file mode 100644 index 0000000..2195045 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst @@ -0,0 +1,7 @@ +:func:`shutil.copytree` uses the :func:`os.scandir` function and all copy +functions depending on it use cached :func:`os.stat` values. The speedup +for copying a directory with 8000 files is around +9% on Linux, +20% on +Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat` +syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster +on network filesystems. +(Contributed by Giampaolo Rodola' in :issue:`33695`.) -- cgit v0.12