summary | refs | log | tree | commit | diff | stats
path: root/Lib/pathlib
diff options
context:
space:
mode:
author: Barney Gale <barney.gale@gmail.com>, 2024-01-20 02:10:25 (GMT)
committer: GitHub <noreply@github.com>, 2024-01-20 02:10:25 (GMT)
commit: 6313cdde58f34648a430d2830357c9d2a5b67b87 (patch)
tree: ab329b153fb95322934d83eaf62b9a6749d0f09c /Lib/pathlib
parent: 681e9e85a2c1f72576ddfbd766506e2d6db34862 (diff)
download: cpython-6313cdde58f34648a430d2830357c9d2a5b67b87.zip
cpython-6313cdde58f34648a430d2830357c9d2a5b67b87.tar.gz
cpython-6313cdde58f34648a430d2830357c9d2a5b67b87.tar.bz2
GH-79634: Accept path-like objects as pathlib glob patterns. (#114017)
Allow `os.PathLike` objects to be passed as patterns to `pathlib.Path.glob()` and `rglob()`. (It's already possible to use them in `PurePath.match()`.)

While we're in the area:

- Allow empty glob patterns in `PathBase` (but not `Path`).
- Speed up globbing in `PathBase` by generating paths with trailing slashes only as a final step, rather than for every intermediate directory.
- Simplify and speed up handling of rare patterns involving both `**` and `..` segments.
Diffstat (limited to 'Lib/pathlib')
-rw-r--r--  Lib/pathlib/__init__.py  49
-rw-r--r--  Lib/pathlib/_abc.py      98
2 files changed, 78 insertions, 69 deletions
diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py
index f14d35b..b043aed 100644
--- a/Lib/pathlib/__init__.py
+++ b/Lib/pathlib/__init__.py
@@ -467,6 +467,29 @@ class PurePath(_abc.PurePathBase):
from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))
+ @property
+ def _pattern_stack(self):
+ """Stack of path components, to be used with patterns in glob()."""
+ parts = self._tail.copy()
+ pattern = self._raw_path
+ if self.anchor:
+ raise NotImplementedError("Non-relative patterns are unsupported")
+ elif not parts:
+ raise ValueError("Unacceptable pattern: {!r}".format(pattern))
+ elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
+ # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
+ parts.append('')
+ elif parts[-1] == '**':
+ # GH-70303: '**' only matches directories. Add trailing slash.
+ warnings.warn(
+ "Pattern ending '**' will match files and directories in a "
+ "future Python release. Add a trailing slash to match only "
+ "directories and remove this warning.",
+ FutureWarning, 4)
+ parts.append('')
+ parts.reverse()
+ return parts
+
# Subclassing os.PathLike makes isinstance() checks slower,
# which in turn makes Path construction slower. Register instead!
@@ -580,7 +603,7 @@ class Path(_abc.PathBase, PurePath):
def _scandir(self):
return os.scandir(self)
- def _make_child_entry(self, entry, is_dir=False):
+ def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
@@ -591,6 +614,8 @@ class Path(_abc.PathBase, PurePath):
return path
def _make_child_relpath(self, name):
+ if not name:
+ return self
path_str = str(self)
tail = self._tail
if tail:
@@ -611,14 +636,8 @@ class Path(_abc.PathBase, PurePath):
kind, including directories) matching the given relative pattern.
"""
sys.audit("pathlib.Path.glob", self, pattern)
- if pattern.endswith('**'):
- # GH-70303: '**' only matches directories. Add trailing slash.
- warnings.warn(
- "Pattern ending '**' will match files and directories in a "
- "future Python release. Add a trailing slash to match only "
- "directories and remove this warning.",
- FutureWarning, 2)
- pattern = f'{pattern}/'
+ if not isinstance(pattern, PurePath):
+ pattern = self.with_segments(pattern)
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
@@ -628,15 +647,9 @@ class Path(_abc.PathBase, PurePath):
this subtree.
"""
sys.audit("pathlib.Path.rglob", self, pattern)
- if pattern.endswith('**'):
- # GH-70303: '**' only matches directories. Add trailing slash.
- warnings.warn(
- "Pattern ending '**' will match files and directories in a "
- "future Python release. Add a trailing slash to match only "
- "directories and remove this warning.",
- FutureWarning, 2)
- pattern = f'{pattern}/'
- pattern = f'**/{pattern}'
+ if not isinstance(pattern, PurePath):
+ pattern = self.with_segments(pattern)
+ pattern = '**' / pattern
return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py
index 48a6c21..e5eeb4a 100644
--- a/Lib/pathlib/_abc.py
+++ b/Lib/pathlib/_abc.py
@@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
return re.compile(regex, flags=flags).match
+def _select_special(paths, part):
+ """Yield special literal children of the given paths."""
+ for path in paths:
+ yield path._make_child_relpath(part)
+
+
def _select_children(parent_paths, dir_only, follow_symlinks, match):
"""Yield direct children of given paths, filtering by name and type."""
if follow_symlinks is None:
@@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
if match(entry.name):
- yield parent_path._make_child_entry(entry, dir_only)
+ yield parent_path._make_child_entry(entry)
def _select_recursive(parent_paths, dir_only, follow_symlinks):
@@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
- paths.append(path._make_child_entry(entry, dir_only))
+ paths.append(path._make_child_entry(entry))
continue
except OSError:
pass
@@ -427,6 +433,14 @@ class PurePathBase:
a drive)."""
return self.pathmod.isabs(self._raw_path)
+ @property
+ def _pattern_stack(self):
+ """Stack of path components, to be used with patterns in glob()."""
+ anchor, parts = self._stack
+ if anchor:
+ raise NotImplementedError("Non-relative patterns are unsupported")
+ return parts
+
def match(self, path_pattern, *, case_sensitive=None):
"""
Return True if this path matches the given pattern.
@@ -436,11 +450,10 @@ class PurePathBase:
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.pathmod)
sep = path_pattern.pathmod.sep
- pattern_str = str(path_pattern)
if path_pattern.anchor:
- pass
+ pattern_str = str(path_pattern)
elif path_pattern.parts:
- pattern_str = f'**{sep}{pattern_str}'
+ pattern_str = str('**' / path_pattern)
else:
raise ValueError("empty pattern")
match = _compile_pattern(pattern_str, sep, case_sensitive)
@@ -714,10 +727,8 @@ class PathBase(PurePathBase):
from contextlib import nullcontext
return nullcontext(self.iterdir())
- def _make_child_entry(self, entry, is_dir=False):
+ def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
- if is_dir:
- return entry.joinpath('')
return entry
def _make_child_relpath(self, name):
@@ -727,57 +738,35 @@ class PathBase(PurePathBase):
"""Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
"""
- path_pattern = self.with_segments(pattern)
- if path_pattern.anchor:
- raise NotImplementedError("Non-relative patterns are unsupported")
- elif not path_pattern.parts:
- raise ValueError("Unacceptable pattern: {!r}".format(pattern))
-
- pattern_parts = list(path_pattern.parts)
- if not self.pathmod.split(pattern)[1]:
- # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
- pattern_parts.append('')
-
+ if not isinstance(pattern, PurePathBase):
+ pattern = self.with_segments(pattern)
if case_sensitive is None:
# TODO: evaluate case-sensitivity of each directory in _select_children().
case_sensitive = _is_case_sensitive(self.pathmod)
- # If symlinks are handled consistently, and the pattern does not
- # contain '..' components, then we can use a 'walk-and-match' strategy
- # when expanding '**' wildcards. When a '**' wildcard is encountered,
- # all following pattern parts are immediately consumed and used to
- # build a `re.Pattern` object. This pattern is used to filter the
- # recursive walk. As a result, pattern parts following a '**' wildcard
- # do not perform any filesystem access, which can be much faster!
- filter_paths = follow_symlinks is not None and '..' not in pattern_parts
+ stack = pattern._pattern_stack
+ specials = ('', '.', '..')
+ filter_paths = False
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self.joinpath('')] if self.is_dir() else [])
- part_idx = 0
- while part_idx < len(pattern_parts):
- part = pattern_parts[part_idx]
- part_idx += 1
- if part == '':
- # Trailing slash.
- pass
- elif part == '..':
- paths = (path._make_child_relpath('..') for path in paths)
+ while stack:
+ part = stack.pop()
+ if part in specials:
+ paths = _select_special(paths, part)
elif part == '**':
# Consume adjacent '**' components.
- while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
- part_idx += 1
-
- if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
- dir_only = pattern_parts[-1] == ''
- paths = _select_recursive(paths, dir_only, follow_symlinks)
+ while stack and stack[-1] == '**':
+ stack.pop()
- # Filter out paths that don't match pattern.
- prefix_len = len(str(self._make_child_relpath('_'))) - 1
- match = _compile_pattern(str(path_pattern), sep, case_sensitive)
- paths = (path for path in paths if match(str(path), prefix_len))
- return paths
+ # Consume adjacent non-special components and enable post-walk
+ # regex filtering, provided we're treating symlinks consistently.
+ if follow_symlinks is not None:
+ while stack and stack[-1] not in specials:
+ filter_paths = True
+ stack.pop()
- dir_only = part_idx < len(pattern_parts)
+ dir_only = bool(stack)
paths = _select_recursive(paths, dir_only, follow_symlinks)
if deduplicate_paths:
# De-duplicate if we've already seen a '**' component.
@@ -786,9 +775,14 @@ class PathBase(PurePathBase):
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
- dir_only = part_idx < len(pattern_parts)
+ dir_only = bool(stack)
match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match)
+ if filter_paths:
+ # Filter out paths that don't match pattern.
+ prefix_len = len(str(self._make_child_relpath('_'))) - 1
+ match = _compile_pattern(str(pattern), sep, case_sensitive)
+ paths = (path for path in paths if match(str(path), prefix_len))
return paths
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -796,8 +790,10 @@ class PathBase(PurePathBase):
directories) matching the given relative pattern, anywhere in
this subtree.
"""
- return self.glob(
- f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
+ if not isinstance(pattern, PurePathBase):
+ pattern = self.with_segments(pattern)
+ pattern = '**' / pattern
+ return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk()."""