summaryrefslogtreecommitdiffstats
path: root/Lib/glob.py
diff options
context:
space:
mode:
author: Barney Gale <barney.gale@gmail.com> 2023-11-13 17:15:56 (GMT)
committer: GitHub <noreply@github.com> 2023-11-13 17:15:56 (GMT)
commit: cf67ebfb315ce36175f3d425249d7c6560f6d0d5 (patch)
tree: 3007eaa7164eba027714b9752aecea60627e6de6 /Lib/glob.py
parent: babb787047e0f7807c8238d3b1a3128dac30bd5c (diff)
download: cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.zip
cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.gz
cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.bz2
GH-72904: Add `glob.translate()` function (#106703)
Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`.

This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment.

In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code.

Co-authored-by: Jason R. Coombs <jaraco@jaraco.com>
Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Diffstat (limited to 'Lib/glob.py')
-rw-r--r-- Lib/glob.py 60
1 files changed, 60 insertions, 0 deletions
diff --git a/Lib/glob.py b/Lib/glob.py
index a725642..4a335a1 100644
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -249,3 +249,63 @@ def escape(pathname):
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
+
+
def translate(pat, *, recursive=False, include_hidden=False, seps=None):
    """Translate a pathname with shell wildcards to a regular expression.

    The pattern is split into segments on the path separators and each
    segment is translated on its own, so the '*' and '?' wildcards never
    match a path separator. A segment consisting solely of '*' matches
    exactly one non-empty path segment.

    If `recursive` is true, the pattern segment '**' will match any number of
    path segments; if '**' appears outside its own segment, ValueError will be
    raised. If `recursive` is false, '**' receives no special treatment.

    If `include_hidden` is true, wildcards can match path segments beginning
    with a dot ('.').

    If a sequence of separator characters is given to `seps`, they will be
    used to split the pattern into segments and match path separators. If not
    given, os.path.sep and os.path.altsep (where available) are used.

    The result is returned as a string: the translated expression wrapped in
    a non-capturing group with the DOTALL flag set, anchored at the end of
    the input.
    """
    if not seps:
        if os.path.altsep:
            seps = (os.path.sep, os.path.altsep)
        else:
            seps = os.path.sep
    escaped_seps = ''.join(map(re.escape, seps))
    # A lone separator is matched literally; several need a character class.
    any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
    not_sep = f'[^{escaped_seps}]'
    if include_hidden:
        one_last_segment = f'{not_sep}+'
        one_segment = f'{one_last_segment}{any_sep}'
        any_segments = f'(?:.+{any_sep})?'
        any_last_segments = '.*'
    else:
        # The first character of every wildcard-matched segment must be
        # neither a separator nor a dot, which keeps hidden names out.
        one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
        one_segment = f'{one_last_segment}{any_sep}'
        any_segments = f'(?:{one_segment})*'
        any_last_segments = f'{any_segments}(?:{one_last_segment})?'

    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        is_last = idx == last_part_idx
        if part == '*':
            # `one_segment` already carries its trailing separator.
            results.append(one_last_segment if is_last else one_segment)
        elif recursive and part == '**':
            if is_last:
                results.append(any_last_segments)
            elif parts[idx + 1] != '**':
                # In a run of consecutive '**' segments only the final one
                # emits anything; the earlier ones are redundant.
                results.append(any_segments)
        elif recursive and '**' in part:
            raise ValueError("Invalid pattern: '**' can only be an entire path component")
        else:
            if part:
                if not include_hidden and part[0] in '*?':
                    # A leading wildcard must not match a leading dot.
                    results.append(r'(?!\.)')
                results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
            if not is_last:
                results.append(any_sep)
    res = ''.join(results)
    return fr'(?s:{res})\Z'