summary | refs | log | tree | commit | diff | stats
path: root/Lib/pathlib.py
diff options
context:
space:
mode:
author: Barney Gale <barney.gale@gmail.com> 2023-05-07 21:12:50 (GMT)
committer: GitHub <noreply@github.com> 2023-05-07 21:12:50 (GMT)
commit: c0ece3dc9791694e960952ba74070efaaa79a676 (patch)
tree: 17d6baebf2f6a39e129b7e74da2ab56a029e8d16 /Lib/pathlib.py
parent: 8d95012c95988dc517db6e09348aab996868699c (diff)
download: cpython-c0ece3dc9791694e960952ba74070efaaa79a676.zip
cpython-c0ece3dc9791694e960952ba74070efaaa79a676.tar.gz
cpython-c0ece3dc9791694e960952ba74070efaaa79a676.tar.bz2
GH-102613: Improve performance of `pathlib.Path.rglob()` (GH-104244)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new `_DoubleRecursiveWildcardSelector` class is introduced which performs de-duplication, but this is used _only_ for patterns with multiple non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding the use of a set, `PurePath.__hash__()` is not called, and so paths do not need to be stringified and case-normalised. Also merge adjacent '**' segments in patterns.
Diffstat (limited to 'Lib/pathlib.py')
-rw-r--r-- Lib/pathlib.py 54
1 file changed, 37 insertions, 17 deletions
diff --git a/Lib/pathlib.py b/Lib/pathlib.py
index 68255aa..20ec1ce 100644
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
@functools.lru_cache()
def _make_selector(pattern_parts, flavour, case_sensitive):
pat = pattern_parts[0]
- child_parts = pattern_parts[1:]
if not pat:
return _TerminatingSelector()
if pat == '**':
- cls = _RecursiveWildcardSelector
- elif pat == '..':
- cls = _ParentSelector
- elif '**' in pat:
- raise ValueError("Invalid pattern: '**' can only be an entire path component")
+ child_parts_idx = 1
+ while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
+ child_parts_idx += 1
+ child_parts = pattern_parts[child_parts_idx:]
+ if '**' in child_parts:
+ cls = _DoubleRecursiveWildcardSelector
+ else:
+ cls = _RecursiveWildcardSelector
else:
- cls = _WildcardSelector
+ child_parts = pattern_parts[1:]
+ if pat == '..':
+ cls = _ParentSelector
+ elif '**' in pat:
+ raise ValueError("Invalid pattern: '**' can only be an entire path component")
+ else:
+ cls = _WildcardSelector
return cls(pat, child_parts, flavour, case_sensitive)
@@ -183,20 +191,32 @@ class _RecursiveWildcardSelector(_Selector):
def _select_from(self, parent_path, scandir):
try:
- yielded = set()
- try:
- successor_select = self.successor._select_from
- for starting_point in self._iterate_directories(parent_path, scandir):
- for p in successor_select(starting_point, scandir):
- if p not in yielded:
- yield p
- yielded.add(p)
- finally:
- yielded.clear()
+ successor_select = self.successor._select_from
+ for starting_point in self._iterate_directories(parent_path, scandir):
+ for p in successor_select(starting_point, scandir):
+ yield p
except PermissionError:
return
+class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
+ """
+ Like _RecursiveWildcardSelector, but also de-duplicates results from
+ successive selectors. This is necessary if the pattern contains
+ multiple non-adjacent '**' segments.
+ """
+
+ def _select_from(self, parent_path, scandir):
+ yielded = set()
+ try:
+ for p in super()._select_from(parent_path, scandir):
+ if p not in yielded:
+ yield p
+ yielded.add(p)
+ finally:
+ yielded.clear()
+
+
#
# Public API
#