diff options
author | Barney Gale <barney.gale@gmail.com> | 2023-11-13 17:15:56 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-13 17:15:56 (GMT) |
commit | cf67ebfb315ce36175f3d425249d7c6560f6d0d5 (patch) | |
tree | 3007eaa7164eba027714b9752aecea60627e6de6 /Lib/glob.py | |
parent | babb787047e0f7807c8238d3b1a3128dac30bd5c (diff) | |
download | cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.zip cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.gz cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.bz2 |
GH-72904: Add `glob.translate()` function (#106703)
Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`.
This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment.
In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code.
Co-authored-by: Jason R. Coombs <jaraco@jaraco.com>
Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Diffstat (limited to 'Lib/glob.py')
-rw-r--r-- | Lib/glob.py | 60 |
1 file changed, 60 insertions, 0 deletions
# Added by the diff of Lib/glob.py (index a725642..4a335a1, hunk @@ -249,3 +249,63 @@).
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)


def translate(pat, *, recursive=False, include_hidden=False, seps=None):
    """Translate a pathname with shell wildcards to a regular expression.

    If `recursive` is true, the pattern segment '**' will match any number of
    path segments; if '**' appears outside its own segment, ValueError will be
    raised.

    If `include_hidden` is true, wildcards can match path segments beginning
    with a dot ('.').

    If a sequence of separator characters is given to `seps`, they will be
    used to split the pattern into segments and match path separators. If not
    given, os.path.sep and os.path.altsep (where available) are used.

    The result is returned as a string: the translated segments joined
    together, wrapped in a non-capturing group with the DOTALL flag set, and
    anchored at the end of the input.
    """
    if not seps:
        if os.path.altsep:
            seps = (os.path.sep, os.path.altsep)
        else:
            seps = os.path.sep
    escaped_seps = ''.join(map(re.escape, seps))
    # A character class is only needed when there is more than one separator.
    any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
    not_sep = f'[^{escaped_seps}]'
    if include_hidden:
        one_last_segment = f'{not_sep}+'
        one_segment = f'{one_last_segment}{any_sep}'
        any_segments = f'(?:.+{any_sep})?'
        any_last_segments = '.*'
    else:
        # The first character of a wildcard-matched segment may not be a dot.
        one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
        one_segment = f'{one_last_segment}{any_sep}'
        any_segments = f'(?:{one_segment})*'
        any_last_segments = f'{any_segments}(?:{one_last_segment})?'
    # Replacement for '*' inside a segment; it is the same for every segment,
    # so build it once instead of on each loop iteration.
    star_in_segment = f'{not_sep}*'

    results = []
    parts = re.split(any_sep, pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        is_last = idx == last_part_idx
        if part == '*':
            # The non-final form already includes the trailing separator.
            results.append(one_last_segment if is_last else one_segment)
            continue
        if recursive:
            if part == '**':
                if is_last:
                    results.append(any_last_segments)
                elif parts[idx + 1] != '**':
                    # In a run of consecutive '**' segments only the last one
                    # emits anything.
                    results.append(any_segments)
                continue
            if '**' in part:
                raise ValueError("Invalid pattern: '**' can only be an entire path component")
        if part:
            if not include_hidden and part[0] in '*?':
                # Negative lookahead: a leading wildcard must not match a dot.
                results.append(r'(?!\.)')
            # NOTE(review): fnmatch._translate is a private helper; it is
            # passed the replacements for '*' and '?' and is presumably
            # expected to return an iterable of regex fragments -- confirm
            # against the fnmatch module shipped alongside this file.
            results.extend(fnmatch._translate(part, star_in_segment, not_sep))
        if not is_last:
            results.append(any_sep)
    res = ''.join(results)
    return fr'(?s:{res})\Z'