From 836f5433f7ee79f208186ca0d10594b22cf7e05b Mon Sep 17 00:00:00 2001 From: Johannes Gijsbers Date: Sat, 8 Jan 2005 13:13:19 +0000 Subject: Patch #943206: `glob.glob()` currently calls itself recursively to build a list of matches of the dirname part of the pattern and then filters by the basename part. This is effectively BFS. ``glob.glob('*/*/*/*/*/foo')`` will build a huge list of all directories 5 levels deep even if only a handful of them contain a ``foo`` entry. A generator-based recusion would never have to store these list at once by implementing DFS. This patch converts the `glob` function to an `iglob` recursive generator . `glob()` now just returns ``list(iglob(pattern))``. I also cleaned up the code a bit (reduced duplicate `has_magic()` checks and created a second `glob0` helper func so that the main loop need not be duplicated). Thanks to Cherniavsky Beni for the patch! --- Doc/lib/libglob.tex | 8 ++++++- Lib/glob.py | 62 +++++++++++++++++++++++++++++++++------------------ Lib/test/test_glob.py | 4 +++- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/Doc/lib/libglob.tex b/Doc/lib/libglob.tex index 0d0d712..f3f4fb7 100644 --- a/Doc/lib/libglob.tex +++ b/Doc/lib/libglob.tex @@ -16,7 +16,7 @@ and \function{os.path.expandvars()}.) \index{filenames!pathname expansion} \begin{funcdesc}{glob}{pathname} -Returns a possibly-empty list of path names that match \var{pathname}, +Return a possibly-empty list of path names that match \var{pathname}, which must be a string containing a path specification. \var{pathname} can be either absolute (like \file{/usr/src/Python-1.5/Makefile}) or relative (like @@ -24,6 +24,12 @@ which must be a string containing a path specification. Broken symlinks are included in the results (as in the shell). \end{funcdesc} +\begin{funcdesc}{iglob}{pathname} +Return an iterator which yields the same values as \function{glob()} +without actually storing them all simultaneously. +\versionadded{2.5} +\end{funcdesc} + For example, consider a directory containing only the following files: \file{1.gif}, \file{2.txt}, and \file{card.gif}. \function{glob()} will produce the following results. Notice how any leading components diff --git a/Lib/glob.py b/Lib/glob.py index 4ba4138..ecc6d25 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -4,7 +4,7 @@ import os import fnmatch import re -__all__ = ["glob"] +__all__ = ["glob", "iglob"] def glob(pathname): """Return a list of paths matching a pathname pattern. @@ -12,35 +12,42 @@ def glob(pathname): The pattern may contain simple shell-style wildcards a la fnmatch. """ + return list(iglob(pathname)) + +def iglob(pathname): + """Return a list of paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la fnmatch. + + """ if not has_magic(pathname): if os.path.lexists(pathname): - return [pathname] - else: - return [] + yield pathname + return dirname, basename = os.path.split(pathname) if not dirname: - return glob1(os.curdir, basename) - elif has_magic(dirname): - list = glob(dirname) + for name in glob1(os.curdir, basename): + yield name + return + if has_magic(dirname): + dirs = iglob(dirname) else: - list = [dirname] - if not has_magic(basename): - result = [] - for dirname in list: - if basename or os.path.isdir(dirname): - name = os.path.join(dirname, basename) - if os.path.lexists(name): - result.append(name) + dirs = [dirname] + if has_magic(basename): + glob_in_dir = glob1 else: - result = [] - for dirname in list: - sublist = glob1(dirname, basename) - for name in sublist: - result.append(os.path.join(dirname, name)) - return result + glob_in_dir = glob0 + for dirname in dirs: + for name in glob_in_dir(dirname, basename): + yield os.path.join(dirname, name) + +# These 2 helper functions non-recursively glob inside a literal directory. +# They return a list of basenames. `glob1` accepts a pattern while `glob0` +# takes a literal basename (so it only has to check for its existence). def glob1(dirname, pattern): - if not dirname: dirname = os.curdir + if not dirname: + dirname = os.curdir try: names = os.listdir(dirname) except os.error: @@ -49,6 +56,17 @@ def glob1(dirname, pattern): names=filter(lambda x: x[0]!='.',names) return fnmatch.filter(names,pattern) +def glob0(dirname, basename): + if basename == '': + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if os.isdir(dirname): + return [basename] + else: + if os.path.lexists(os.path.join(dirname, basename)): + return [basename] + return [] + magic_check = re.compile('[*?[]') diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index af81dab..f23dcf1 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -61,7 +61,9 @@ class GlobTests(unittest.TestCase): else: pattern = os.path.join(*parts) p = os.path.join(self.tempdir, pattern) - return glob.glob(p) + res = glob.glob(p) + self.assertEqual(list(glob.iglob(p)), res) + return res def assertSequencesEqual_noorder(self, l1, l2): self.assertEqual(set(l1), set(l2)) -- cgit v0.12