diff options
author | Barney Gale <barney.gale@gmail.com> | 2023-05-07 21:12:50 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-07 21:12:50 (GMT) |
commit | c0ece3dc9791694e960952ba74070efaaa79a676 (patch) | |
tree | 17d6baebf2f6a39e129b7e74da2ab56a029e8d16 /Lib/pathlib.py | |
parent | 8d95012c95988dc517db6e09348aab996868699c (diff) | |
download | cpython-c0ece3dc9791694e960952ba74070efaaa79a676.zip cpython-c0ece3dc9791694e960952ba74070efaaa79a676.tar.gz cpython-c0ece3dc9791694e960952ba74070efaaa79a676.tar.bz2 |
GH-102613: Improve performance of `pathlib.Path.rglob()` (GH-104244)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new
`_DoubleRecursiveWildcardSelector` class is introduced which performs
de-duplication, but this is used _only_ for patterns with multiple
non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding
the use of a set, `PurePath.__hash__()` is not called, and so paths do not
need to be stringified and case-normalised.
Also merge adjacent '**' segments in patterns.
Diffstat (limited to 'Lib/pathlib.py')
-rw-r--r-- | Lib/pathlib.py | 54 |
1 files changed, 37 insertions, 17 deletions
diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 68255aa..20ec1ce 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,17 +64,25 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] - child_parts = pattern_parts[1:] if not pat: return _TerminatingSelector() if pat == '**': - cls = _RecursiveWildcardSelector - elif pat == '..': - cls = _ParentSelector - elif '**' in pat: - raise ValueError("Invalid pattern: '**' can only be an entire path component") + child_parts_idx = 1 + while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**': + child_parts_idx += 1 + child_parts = pattern_parts[child_parts_idx:] + if '**' in child_parts: + cls = _DoubleRecursiveWildcardSelector + else: + cls = _RecursiveWildcardSelector else: - cls = _WildcardSelector + child_parts = pattern_parts[1:] + if pat == '..': + cls = _ParentSelector + elif '**' in pat: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + cls = _WildcardSelector return cls(pat, child_parts, flavour, case_sensitive) @@ -183,20 +191,32 @@ class _RecursiveWildcardSelector(_Selector): def _select_from(self, parent_path, scandir): try: - yielded = set() - try: - successor_select = self.successor._select_from - for starting_point in self._iterate_directories(parent_path, scandir): - for p in successor_select(starting_point, scandir): - if p not in yielded: - yield p - yielded.add(p) - finally: - yielded.clear() + successor_select = self.successor._select_from + for starting_point in self._iterate_directories(parent_path, scandir): + for p in successor_select(starting_point, scandir): + yield p except PermissionError: return +class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector): + """ + Like _RecursiveWildcardSelector, but also de-duplicates results from + successive selectors. This is necessary if the pattern contains + multiple non-adjacent '**' segments. + """ + + def _select_from(self, parent_path, scandir): + yielded = set() + try: + for p in super()._select_from(parent_path, scandir): + if p not in yielded: + yield p + yielded.add(p) + finally: + yielded.clear() + + # # Public API # |