summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Barney Gale <barney.gale@gmail.com> 2024-04-12 21:19:21 (GMT)
committer: GitHub <noreply@github.com> 2024-04-12 21:19:21 (GMT)
commit: 0eb52f5f266d9e0a662f28a4d2dfef8c746cf96e (patch)
tree: 2cccc9ca7a407f88e57b1bf1b7406a1c73005ce2
parent: 069de14cb948f56b37e507f367b99c5563d3685e (diff)
download: cpython-0eb52f5f266d9e0a662f28a4d2dfef8c746cf96e.zip
cpython-0eb52f5f266d9e0a662f28a4d2dfef8c746cf96e.tar.gz
cpython-0eb52f5f266d9e0a662f28a4d2dfef8c746cf96e.tar.bz2
GH-115060: Speed up `pathlib.Path.glob()` by not scanning literal parts (#117732)
Don't bother calling `os.scandir()` to scan for literal pattern segments, like `foo` in `foo/*.py`. Instead, append the segment(s) as-is and call through to the next selector with `exists=False`, which signals that the path might not exist. Subsequent selectors will call `os.scandir()` or `os.lstat()` to filter out missing paths as needed.
-rw-r--r-- Lib/glob.py 22
-rw-r--r-- Lib/pathlib/_abc.py 8
-rw-r--r-- Lib/test/test_pathlib/test_pathlib_abc.py 21
-rw-r--r-- Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst 2
4 files changed, 42 insertions, 11 deletions
diff --git a/Lib/glob.py b/Lib/glob.py
index b1d2681..72cf222 100644
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -331,9 +331,10 @@ class _Globber:
"""Class providing shell-style pattern matching and globbing.
"""
- def __init__(self, sep, case_sensitive, recursive=False):
+ def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False):
self.sep = sep
self.case_sensitive = case_sensitive
+ self.case_pedantic = case_pedantic
self.recursive = recursive
# Low-level methods
@@ -373,6 +374,8 @@ class _Globber:
selector = self.recursive_selector
elif part in _special_parts:
selector = self.special_selector
+ elif not self.case_pedantic and magic_check.search(part) is None:
+ selector = self.literal_selector
else:
selector = self.wildcard_selector
return selector(part, parts)
@@ -387,6 +390,23 @@ class _Globber:
return select_next(path, exists)
return select_special
+ def literal_selector(self, part, parts):
+ """Returns a function that selects a literal descendant of a path.
+ """
+
+ # Optimization: consume and join any subsequent literal parts here,
+ # rather than leaving them for the next selector. This reduces the
+ # number of string concatenation operations and calls to add_slash().
+ while parts and magic_check.search(parts[-1]) is None:
+ part += self.sep + parts.pop()
+
+ select_next = self.selector(parts)
+
+ def select_literal(path, exists=False):
+ path = self.concat_path(self.add_slash(path), part)
+ return select_next(path, exists=False)
+ return select_literal
+
def wildcard_selector(self, part, parts):
"""Returns a function that selects direct children of a given path,
filtering by pattern.
diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py
index b6cab0d..b51ad6f 100644
--- a/Lib/pathlib/_abc.py
+++ b/Lib/pathlib/_abc.py
@@ -686,8 +686,14 @@ class PathBase(PurePathBase):
def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
+ case_pedantic = False
+ else:
+ # The user has expressed a case sensitivity choice, but we don't
+ # know the case sensitivity of the underlying filesystem, so we
+ # must use scandir() for everything, including non-wildcard parts.
+ case_pedantic = True
recursive = True if recurse_symlinks else glob._no_recurse_symlinks
- globber = self._globber(self.parser.sep, case_sensitive, recursive)
+ globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive)
return globber.selector(parts)
def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py
index 336115c..6656b03 100644
--- a/Lib/test/test_pathlib/test_pathlib_abc.py
+++ b/Lib/test/test_pathlib/test_pathlib_abc.py
@@ -1429,10 +1429,10 @@ class DummyPath(PathBase):
return "{}({!r})".format(self.__class__.__name__, self.as_posix())
def stat(self, *, follow_symlinks=True):
- if follow_symlinks:
- path = str(self.resolve())
+ if follow_symlinks or self.name in ('', '.', '..'):
+ path = str(self.resolve(strict=True))
else:
- path = str(self.parent.resolve() / self.name)
+ path = str(self.parent.resolve(strict=True) / self.name)
if path in self._files:
st_mode = stat.S_IFREG
elif path in self._directories:
@@ -1741,8 +1741,9 @@ class DummyPathTest(DummyPurePathTest):
def test_glob_posix(self):
P = self.cls
p = P(self.base)
+ q = p / "FILEa"
given = set(p.glob("FILEa"))
- expect = set()
+ expect = {q} if q.exists() else set()
self.assertEqual(given, expect)
self.assertEqual(set(p.glob("FILEa*")), set())
@@ -1753,8 +1754,6 @@ class DummyPathTest(DummyPurePathTest):
self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") })
self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") })
self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") })
- self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"})
- self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"})
def test_glob_empty_pattern(self):
P = self.cls
@@ -1857,8 +1856,9 @@ class DummyPathTest(DummyPurePathTest):
def test_rglob_posix(self):
P = self.cls
p = P(self.base, "dirC")
+ q = p / "dirD" / "FILEd"
given = set(p.rglob("FILEd"))
- expect = set()
+ expect = {q} if q.exists() else set()
self.assertEqual(given, expect)
self.assertEqual(set(p.rglob("FILEd*")), set())
@@ -1868,7 +1868,6 @@ class DummyPathTest(DummyPurePathTest):
p = P(self.base, "dirC")
self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") })
self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") })
- self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"})
@needs_symlinks
def test_rglob_recurse_symlinks_common(self):
@@ -1931,7 +1930,11 @@ class DummyPathTest(DummyPurePathTest):
self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") })
self.assertEqual(set(p.glob("dirA/../file*/..")), set())
self.assertEqual(set(p.glob("../xyzzy")), set())
- self.assertEqual(set(p.glob("xyzzy/..")), set())
+ if self.cls.parser is posixpath:
+ self.assertEqual(set(p.glob("xyzzy/..")), set())
+ else:
+ # ".." segments are normalized first on Windows, so this path is stat()able.
+ self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") })
self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)})
@needs_symlinks
diff --git a/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst b/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst
new file mode 100644
index 0000000..b5084a0
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst
@@ -0,0 +1,2 @@
+Speed up :meth:`pathlib.Path.glob` by not scanning directories for
+non-wildcard pattern segments.