GH-72904: Add `glob.translate()` function (#106703)

Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`. This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment. In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code. Co-authored-by: Jason R. Coombs <jaraco@jaraco.com> Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
author: Barney Gale <barney.gale@gmail.com> 2023-11-13 17:15:56 (GMT)
committer: GitHub <noreply@github.com> 2023-11-13 17:15:56 (GMT)
commit: cf67ebfb315ce36175f3d425249d7c6560f6d0d5 (patch)
tree: 3007eaa7164eba027714b9752aecea60627e6de6 /Lib/pathlib.py
parent: babb787047e0f7807c8238d3b1a3128dac30bd5c (diff)
download: cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.zip
cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.gz
cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.bz2
1 files changed, 21 insertions, 104 deletions
diff --git a/Lib/pathlib.py b/Lib/pathlib.py
index 47a043c..c06ea5c 100644
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -6,8 +6,8 @@ operating systems.
 """
 
 import contextlib
-import fnmatch
 import functools
+import glob
 import io
 import ntpath
 import os
@@ -76,78 +76,16 @@ def _is_case_sensitive(pathmod):
 #
 
 
-# fnmatch.translate() returns a regular expression that includes a prefix and
-# a suffix, which enable matching newlines and ensure the end of the string is
-# matched, respectively. These features are undesirable for our implementation
-# of PurePatch.match(), which represents path separators as newlines and joins
-# pattern segments together. As a workaround, we define a slice object that
-# can remove the prefix and suffix from any translate() result. See the
-# _compile_pattern_lines() function for more details.
-_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
-_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
-_SWAP_SEP_AND_NEWLINE = {
-    '/': str.maketrans({'/': '\n', '\n': '/'}),
-    '\\': str.maketrans({'\\': '\n', '\n': '\\'}),
-}
-
-
 @functools.lru_cache(maxsize=256)
-def _compile_pattern(pat, case_sensitive):
+def _compile_pattern(pat, sep, case_sensitive):
     """Compile given glob pattern to a re.Pattern object (observing case
-    sensitivity), or None if the pattern should match everything."""
-    if pat == '*':
-        return None
+    sensitivity)."""
     flags = re.NOFLAG if case_sensitive else re.IGNORECASE
-    return re.compile(fnmatch.translate(pat), flags).match
-
-
-@functools.lru_cache()
-def _compile_pattern_lines(pattern_lines, case_sensitive):
-    """Compile the given pattern lines to an `re.Pattern` object.
-
-    The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
-    its path separators and newlines swapped (e.g. '**\n*.py`). By using
-    newlines to separate path components, and not setting `re.DOTALL`, we
-    ensure that the `*` wildcard cannot match path separators.
-
-    The returned `re.Pattern` object may have its `match()` method called to
-    match a complete pattern, or `search()` to match from the right. The
-    argument supplied to these methods must also have its path separators and
-    newlines swapped.
-    """
-
-    # Match the start of the path, or just after a path separator
-    parts = ['^']
-    for part in pattern_lines.splitlines(keepends=True):
-        if part == '*\n':
-            part = r'.+\n'
-        elif part == '*':
-            part = r'.+'
-        elif part == '**\n':
-            # '**/' component: we use '(?s:.)' rather than '.' so that path
-            # separators (i.e. newlines) are matched. The trailing '^' ensures
-            # we terminate after a path separator (i.e. on a new line).
-            part = r'(?s:.)*^'
-        elif part == '**':
-            # '**' component.
-            part = r'(?s:.)*'
-        elif '**' in part:
-            raise ValueError("Invalid pattern: '**' can only be an entire path component")
-        else:
-            # Any other component: pass to fnmatch.translate(). We slice off
-            # the common prefix and suffix added by translate() to ensure that
-            # re.DOTALL is not set, and the end of the string not matched,
-            # respectively. With DOTALL not set, '*' wildcards will not match
-            # path separators, because the '.' characters in the pattern will
-            # not match newlines.
-            part = fnmatch.translate(part)[_FNMATCH_SLICE]
-        parts.append(part)
-    # Match the end of the path, always.
-    parts.append(r'\Z')
-    flags = re.MULTILINE
-    if not case_sensitive:
-        flags |= re.IGNORECASE
-    return re.compile(''.join(parts), flags=flags)
+    regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep)
+    # The string representation of an empty path is a single dot ('.'). Empty
+    # paths shouldn't match wildcards, so we consume it with an atomic group.
+    regex = r'(\.\Z)?+' + regex
+    return re.compile(regex, flags).match
 
 
 def _select_children(parent_paths, dir_only, follow_symlinks, match):
@@ -171,7 +109,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
                     except OSError:
                         continue
                 name = entry.name
-                if match is None or match(name):
+                if match(name):
                     yield parent_path._make_child_relpath(name)
 
 
@@ -297,10 +235,6 @@ class PurePath:
         # to implement comparison methods like `__lt__()`.
         '_parts_normcase_cached',
 
-        # The `_lines_cached` slot stores the string path with path separators
-        # and newlines swapped. This is used to implement `match()`.
-        '_lines_cached',
-
         # The `_hash` slot stores the hash of the case-normalized string
         # path. It's set when `__hash__()` is called for the first time.
         '_hash',
@@ -475,20 +409,6 @@ class PurePath:
             self._parts_normcase_cached = self._str_normcase.split(self.pathmod.sep)
             return self._parts_normcase_cached
 
-    @property
-    def _lines(self):
-        # Path with separators and newlines swapped, for pattern matching.
-        try:
-            return self._lines_cached
-        except AttributeError:
-            path_str = str(self)
-            if path_str == '.':
-                self._lines_cached = ''
-            else:
-                trans = _SWAP_SEP_AND_NEWLINE[self.pathmod.sep]
-                self._lines_cached = path_str.translate(trans)
-            return self._lines_cached
-
     def __eq__(self, other):
         if not isinstance(other, PurePath):
             return NotImplemented
@@ -763,13 +683,16 @@ class PurePath:
             path_pattern = self.with_segments(path_pattern)
         if case_sensitive is None:
             case_sensitive = _is_case_sensitive(self.pathmod)
-        pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
+        sep = path_pattern.pathmod.sep
+        pattern_str = str(path_pattern)
         if path_pattern.drive or path_pattern.root:
-            return pattern.match(self._lines) is not None
+            pass
         elif path_pattern._tail:
-            return pattern.search(self._lines) is not None
+            pattern_str = f'**{sep}{pattern_str}'
         else:
             raise ValueError("empty pattern")
+        match = _compile_pattern(pattern_str, sep, case_sensitive)
+        return match(str(self)) is not None
 
 
 # Subclassing os.PathLike makes isinstance() checks slower,
@@ -1069,26 +992,19 @@ class _PathBase(PurePath):
         return contextlib.nullcontext(self.iterdir())
 
     def _make_child_relpath(self, name):
-        sep = self.pathmod.sep
-        lines_name = name.replace('\n', sep)
-        lines_str = self._lines
         path_str = str(self)
         tail = self._tail
         if tail:
-            path_str = f'{path_str}{sep}{name}'
-            lines_str = f'{lines_str}\n{lines_name}'
+            path_str = f'{path_str}{self.pathmod.sep}{name}'
         elif path_str != '.':
             path_str = f'{path_str}{name}'
-            lines_str = f'{lines_str}{lines_name}'
         else:
             path_str = name
-            lines_str = lines_name
         path = self.with_segments(path_str)
         path._str = path_str
         path._drv = self.drive
         path._root = self.root
         path._tail_cached = tail + [name]
-        path._lines_cached = lines_str
         return path
 
     def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -1139,6 +1055,7 @@ class _PathBase(PurePath):
         # do not perform any filesystem access, which can be much faster!
         filter_paths = follow_symlinks is not None and '..' not in pattern_parts
         deduplicate_paths = False
+        sep = self.pathmod.sep
         paths = iter([self] if self.is_dir() else [])
         part_idx = 0
         while part_idx < len(pattern_parts):
@@ -1159,9 +1076,9 @@ class _PathBase(PurePath):
                     paths = _select_recursive(paths, dir_only, follow_symlinks)
 
                     # Filter out paths that don't match pattern.
-                    prefix_len = len(self._make_child_relpath('_')._lines) - 1
-                    match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
-                    paths = (path for path in paths if match(path._lines[prefix_len:]))
+                    prefix_len = len(str(self._make_child_relpath('_'))) - 1
+                    match = _compile_pattern(str(path_pattern), sep, case_sensitive)
+                    paths = (path for path in paths if match(str(path), prefix_len))
                     return paths
 
                 dir_only = part_idx < len(pattern_parts)
@@ -1174,7 +1091,7 @@ class _PathBase(PurePath):
                 raise ValueError("Invalid pattern: '**' can only be an entire path component")
             else:
                 dir_only = part_idx < len(pattern_parts)
-                match = _compile_pattern(part, case_sensitive)
+                match = _compile_pattern(part, sep, case_sensitive)
                 paths = _select_children(paths, dir_only, follow_symlinks, match)
         return paths
author	Barney Gale <barney.gale@gmail.com>	2023-11-13 17:15:56 (GMT)
committer	GitHub <noreply@github.com>	2023-11-13 17:15:56 (GMT)
commit	cf67ebfb315ce36175f3d425249d7c6560f6d0d5 (patch)
tree	3007eaa7164eba027714b9752aecea60627e6de6 /Lib/pathlib.py
parent	babb787047e0f7807c8238d3b1a3128dac30bd5c (diff)
download	cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.zip cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.gz cpython-cf67ebfb315ce36175f3d425249d7c6560f6d0d5.tar.bz2