diff options
author | Barney Gale <barney.gale@gmail.com> | 2023-05-31 20:37:37 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-31 20:37:37 (GMT) |
commit | e7cb216339e8aa8d00f7f55f9e37864b1720ca0f (patch) | |
tree | 51cf3d735a04925c12f5d276c52153477345965c /Lib/pathlib.py | |
parent | 076f3cda140a45b08c2c9518bc19624aae50d3a3 (diff) | |
download | cpython-e7cb216339e8aa8d00f7f55f9e37864b1720ca0f.zip cpython-e7cb216339e8aa8d00f7f55f9e37864b1720ca0f.tar.gz cpython-e7cb216339e8aa8d00f7f55f9e37864b1720ca0f.tar.bz2 |
[3.12] GH-105113: Improve performance of `pathlib.PurePath.match()` (GH-105114)
We now compile a `re.Pattern` object for the entire pattern. This is made
more difficult by `fnmatch` not treating directory separators as special
when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts
onto separate *lines* in a string, and ensure we don't set `re.DOTALL`.
Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com>
Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Diffstat (limited to 'Lib/pathlib.py')
-rw-r--r-- | Lib/pathlib.py | 86 |
1 files changed, 72 insertions, 14 deletions
diff --git a/Lib/pathlib.py b/Lib/pathlib.py index a42085e..29517e4 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.cache def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -61,6 +62,22 @@ def _is_case_sensitive(flavour): # Globbing helpers # + +# fnmatch.translate() returns a regular expression that includes a prefix and +# a suffix, which enable matching newlines and ensure the end of the string is +# matched, respectively. These features are undesirable for our implementation +# of PurePatch.match(), which represents path separators as newlines and joins +# pattern segments together. As a workaround, we define a slice object that +# can remove the prefix and suffix from any translate() result. See the +# _compile_pattern_lines() function for more details. +_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) +_SWAP_SEP_AND_NEWLINE = { + '/': str.maketrans({'/': '\n', '\n': '/'}), + '\\': str.maketrans({'\\': '\n', '\n': '\\'}), +} + + @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] @@ -92,6 +109,38 @@ def _compile_pattern(pat, case_sensitive): return re.compile(fnmatch.translate(pat), flags).match +@functools.lru_cache() +def _compile_pattern_lines(pattern_lines, case_sensitive): + """Compile the given pattern lines to an `re.Pattern` object. + + The *pattern_lines* argument is a glob-style pattern (e.g. '*/*.py') with + its path separators and newlines swapped (e.g. '*\n*.py`). By using + newlines to separate path components, and not setting `re.DOTALL`, we + ensure that the `*` wildcard cannot match path separators. + + The returned `re.Pattern` object may have its `match()` method called to + match a complete pattern, or `search()` to match from the right. The + argument supplied to these methods must also have its path separators and + newlines swapped. + """ + + # Match the start of the path, or just after a path separator + parts = ['^'] + for part in pattern_lines.splitlines(keepends=True): + # We slice off the common prefix and suffix added by translate() to + # ensure that re.DOTALL is not set, and the end of the string not + # matched, respectively. With DOTALL not set, '*' wildcards will not + # match path separators, because the '.' characters in the pattern + # will not match newlines. + parts.append(fnmatch.translate(part)[_FNMATCH_SLICE]) + # Match the end of the path, always. + parts.append(r'\Z') + flags = re.MULTILINE + if not case_sensitive: + flags |= re.IGNORECASE + return re.compile(''.join(parts), flags=flags) + + class _Selector: """A selector matches a specific glob pattern part against the children of a given path.""" @@ -274,6 +323,10 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', + # The `_lines_cached` slot stores the string path with path separators + # and newlines swapped. This is used to implement `match()`. + '_lines_cached', + # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -439,6 +492,16 @@ class PurePath(object): self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached + @property + def _lines(self): + # Path with separators and newlines swapped, for pattern matching. + try: + return self._lines_cached + except AttributeError: + trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] + self._lines_cached = str(self).translate(trans) + return self._lines_cached + def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -695,23 +758,18 @@ class PurePath(object): """ Return True if this path matches the given pattern. """ + if not isinstance(path_pattern, PurePath): + path_pattern = self.with_segments(path_pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self._flavour) - pat = self.with_segments(path_pattern) - if not pat.parts: + pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) + if path_pattern.drive or path_pattern.root: + return pattern.match(self._lines) is not None + elif path_pattern._tail: + return pattern.search(self._lines) is not None + else: raise ValueError("empty pattern") - pat_parts = pat.parts - parts = self.parts - if pat.drive or pat.root: - if len(pat_parts) != len(parts): - return False - elif len(pat_parts) > len(parts): - return False - for part, pat in zip(reversed(parts), reversed(pat_parts)): - match = _compile_pattern(pat, case_sensitive) - if not match(part): - return False - return True + # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath.__slots__. |