diff options
author | Barney Gale <barney.gale@gmail.com> | 2024-04-11 00:26:53 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-11 00:26:53 (GMT) |
commit | 0cc71bde001950d3634c235e2b0d24cda6ce7dce (patch) | |
tree | b7cdb33ccd2c1d0f793a8c3b60f590d9af8c0e0e | |
parent | 6258844c27e3b5a43816e7c559089a5fe0a47123 (diff) | |
download | cpython-0cc71bde001950d3634c235e2b0d24cda6ce7dce.zip cpython-0cc71bde001950d3634c235e2b0d24cda6ce7dce.tar.gz cpython-0cc71bde001950d3634c235e2b0d24cda6ce7dce.tar.bz2 |
GH-117586: Speed up `pathlib.Path.walk()` by working with strings (#117726)

Move `pathlib.Path.walk()` implementation into `glob._Globber`. The new
`glob._Globber.walk()` classmethod works with strings internally, which is
a little faster than generating `Path` objects and keeping them normalized.
The `pathlib.Path.walk()` method converts the strings back to path objects.

In the private pathlib ABCs, our existing subclass of `_Globber` ensures
that `PathBase` instances are used throughout.

Follow-up to #117589.

-rw-r--r-- | Lib/glob.py | 37 | ||||
-rw-r--r-- | Lib/pathlib/__init__.py | 20 | ||||
-rw-r--r-- | Lib/pathlib/_abc.py | 65 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2024-04-10-21-08-32.gh-issue-117586.UCL__1.rst | 1 |
4 files changed, 52 insertions, 71 deletions
diff --git a/Lib/glob.py b/Lib/glob.py index 62cf039..b1d2681 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -498,3 +498,40 @@ class _Globber: yield path except OSError: pass + + @classmethod + def walk(cls, root, top_down, on_error, follow_symlinks): + """Walk the directory tree from the given root, similar to os.walk(). + """ + paths = [root] + while paths: + path = paths.pop() + if isinstance(path, tuple): + yield path + continue + try: + with cls.scandir(path) as scandir_it: + dirnames = [] + filenames = [] + if not top_down: + paths.append((path, dirnames, filenames)) + for entry in scandir_it: + name = entry.name + try: + if entry.is_dir(follow_symlinks=follow_symlinks): + if not top_down: + paths.append(cls.parse_entry(entry)) + dirnames.append(name) + else: + filenames.append(name) + except OSError: + filenames.append(name) + except OSError as error: + if on_error is not None: + on_error(error) + else: + if top_down: + yield path, dirnames, filenames + if dirnames: + prefix = cls.add_slash(path) + paths += [cls.concat_path(prefix, d) for d in reversed(dirnames)] diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 88e3286..746cbcd 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -586,18 +586,6 @@ class Path(_abc.PathBase, PurePath): """ return (self._make_child_relpath(name) for name in os.listdir(self)) - def _scandir(self): - return os.scandir(self) - - def _make_child_direntry(self, entry): - # Transform an entry yielded from _scandir() into a path object. - path_str = entry.name if str(self) == '.' 
else entry.path - path = self.with_segments(path_str) - path._str = path_str - path._drv = self.drive - path._root = self.root - path._tail_cached = self._tail + [entry.name] - return path def _make_child_relpath(self, name): if not name: @@ -663,8 +651,12 @@ class Path(_abc.PathBase, PurePath): def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" sys.audit("pathlib.Path.walk", self, on_error, follow_symlinks) - return _abc.PathBase.walk( - self, top_down=top_down, on_error=on_error, follow_symlinks=follow_symlinks) + root_dir = str(self) + results = self._globber.walk(root_dir, top_down, on_error, follow_symlinks) + for path_str, dirnames, filenames in results: + if root_dir == '.': + path_str = path_str[2:] + yield self._from_parsed_string(path_str), dirnames, filenames def absolute(self): """Return an absolute version of this path diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 553f797..b6cab0d 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -45,10 +45,16 @@ def _is_case_sensitive(parser): class Globber(glob._Globber): lstat = operator.methodcaller('lstat') - scandir = operator.methodcaller('_scandir') add_slash = operator.methodcaller('joinpath', '') @staticmethod + def scandir(path): + # Emulate os.scandir(), which returns an object that can be used as a + # context manager. This method is called by walk() and glob(). + from contextlib import nullcontext + return nullcontext(path.iterdir()) + + @staticmethod def concat_path(path, text): """Appends text to the given path. """ @@ -677,20 +683,6 @@ class PathBase(PurePathBase): """ raise UnsupportedOperation(self._unsupported_msg('iterdir()')) - def _scandir(self): - # Emulate os.scandir(), which returns an object that can be used as a - # context manager. This method is called by walk() and glob(). 
- from contextlib import nullcontext - return nullcontext(self.iterdir()) - - def _make_child_direntry(self, entry): - # Transform an entry yielded from _scandir() into a path object. - # PathBase._scandir() yields PathBase objects, so this is a no-op. - return entry - - def _make_child_relpath(self, name): - return self.joinpath(name) - def _glob_selector(self, parts, case_sensitive, recurse_symlinks): if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) @@ -724,48 +716,7 @@ class PathBase(PurePathBase): def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" - paths = [self] - - while paths: - path = paths.pop() - if isinstance(path, tuple): - yield path - continue - - # We may not have read permission for self, in which case we can't - # get a list of the files the directory contains. os.walk() - # always suppressed the exception in that instance, rather than - # blow up for a minor reason when (say) a thousand readable - # directories are still left to visit. That logic is copied here. - try: - scandir_obj = path._scandir() - except OSError as error: - if on_error is not None: - on_error(error) - continue - - with scandir_obj as scandir_it: - dirnames = [] - filenames = [] - if not top_down: - paths.append((path, dirnames, filenames)) - for entry in scandir_it: - try: - is_dir = entry.is_dir(follow_symlinks=follow_symlinks) - except OSError: - # Carried over from os.path.isdir(). 
- is_dir = False - - if is_dir: - if not top_down: - paths.append(path._make_child_direntry(entry)) - dirnames.append(entry.name) - else: - filenames.append(entry.name) - - if top_down: - yield path, dirnames, filenames - paths += [path._make_child_relpath(d) for d in reversed(dirnames)] + return self._globber.walk(self, top_down, on_error, follow_symlinks) def absolute(self): """Return an absolute version of this path diff --git a/Misc/NEWS.d/next/Library/2024-04-10-21-08-32.gh-issue-117586.UCL__1.rst b/Misc/NEWS.d/next/Library/2024-04-10-21-08-32.gh-issue-117586.UCL__1.rst new file mode 100644 index 0000000..aefac85 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-04-10-21-08-32.gh-issue-117586.UCL__1.rst @@ -0,0 +1 @@ +Speed up :meth:`pathlib.Path.walk` by working with strings internally. |