diff options
author | Johannes Gijsbers <jlg@dds.nl> | 2005-01-08 13:13:19 (GMT) |
---|---|---|
committer | Johannes Gijsbers <jlg@dds.nl> | 2005-01-08 13:13:19 (GMT) |
commit | 836f5433f7ee79f208186ca0d10594b22cf7e05b (patch) | |
tree | d5909575b732a17a75dc5e0e76180d61a5d037c6 | |
parent | e4172eadf3fb9c1de591305ad4ca4ce3e252abd3 (diff) | |
download | cpython-836f5433f7ee79f208186ca0d10594b22cf7e05b.zip cpython-836f5433f7ee79f208186ca0d10594b22cf7e05b.tar.gz cpython-836f5433f7ee79f208186ca0d10594b22cf7e05b.tar.bz2 |
Patch #943206:
`glob.glob()` currently calls itself recursively to build a list of matches of
the dirname part of the pattern and then filters by the basename part. This is
effectively BFS. ``glob.glob('*/*/*/*/*/foo')`` will build a huge list of all
directories 5 levels deep even if only a handful of them contain a ``foo``
entry. A generator-based recusion would never have to store these list at once
by implementing DFS. This patch converts the `glob` function to an `iglob`
recursive generator . `glob()` now just returns ``list(iglob(pattern))``.
I also cleaned up the code a bit (reduced duplicate `has_magic()` checks and
created a second `glob0` helper func so that the main loop need not be
duplicated).
Thanks to Cherniavsky Beni for the patch!
-rw-r--r-- | Doc/lib/libglob.tex | 8 | ||||
-rw-r--r-- | Lib/glob.py | 62 | ||||
-rw-r--r-- | Lib/test/test_glob.py | 4 |
3 files changed, 50 insertions, 24 deletions
diff --git a/Doc/lib/libglob.tex b/Doc/lib/libglob.tex index 0d0d712..f3f4fb7 100644 --- a/Doc/lib/libglob.tex +++ b/Doc/lib/libglob.tex @@ -16,7 +16,7 @@ and \function{os.path.expandvars()}.) \index{filenames!pathname expansion} \begin{funcdesc}{glob}{pathname} -Returns a possibly-empty list of path names that match \var{pathname}, +Return a possibly-empty list of path names that match \var{pathname}, which must be a string containing a path specification. \var{pathname} can be either absolute (like \file{/usr/src/Python-1.5/Makefile}) or relative (like @@ -24,6 +24,12 @@ which must be a string containing a path specification. Broken symlinks are included in the results (as in the shell). \end{funcdesc} +\begin{funcdesc}{iglob}{pathname} +Return an iterator which yields the same values as \function{glob()} +without actually storing them all simultaneously. +\versionadded{2.5} +\end{funcdesc} + For example, consider a directory containing only the following files: \file{1.gif}, \file{2.txt}, and \file{card.gif}. \function{glob()} will produce the following results. Notice how any leading components diff --git a/Lib/glob.py b/Lib/glob.py index 4ba4138..ecc6d25 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -4,7 +4,7 @@ import os import fnmatch import re -__all__ = ["glob"] +__all__ = ["glob", "iglob"] def glob(pathname): """Return a list of paths matching a pathname pattern. @@ -12,35 +12,42 @@ def glob(pathname): The pattern may contain simple shell-style wildcards a la fnmatch. """ + return list(iglob(pathname)) + +def iglob(pathname): + """Return a list of paths matching a pathname pattern. + + The pattern may contain simple shell-style wildcards a la fnmatch. + + """ if not has_magic(pathname): if os.path.lexists(pathname): - return [pathname] - else: - return [] + yield pathname + return dirname, basename = os.path.split(pathname) if not dirname: - return glob1(os.curdir, basename) - elif has_magic(dirname): - list = glob(dirname) + for name in glob1(os.curdir, basename): + yield name + return + if has_magic(dirname): + dirs = iglob(dirname) else: - list = [dirname] - if not has_magic(basename): - result = [] - for dirname in list: - if basename or os.path.isdir(dirname): - name = os.path.join(dirname, basename) - if os.path.lexists(name): - result.append(name) + dirs = [dirname] + if has_magic(basename): + glob_in_dir = glob1 else: - result = [] - for dirname in list: - sublist = glob1(dirname, basename) - for name in sublist: - result.append(os.path.join(dirname, name)) - return result + glob_in_dir = glob0 + for dirname in dirs: + for name in glob_in_dir(dirname, basename): + yield os.path.join(dirname, name) + +# These 2 helper functions non-recursively glob inside a literal directory. +# They return a list of basenames. `glob1` accepts a pattern while `glob0` +# takes a literal basename (so it only has to check for its existence). def glob1(dirname, pattern): - if not dirname: dirname = os.curdir + if not dirname: + dirname = os.curdir try: names = os.listdir(dirname) except os.error: @@ -49,6 +56,17 @@ def glob1(dirname, pattern): names=filter(lambda x: x[0]!='.',names) return fnmatch.filter(names,pattern) +def glob0(dirname, basename): + if basename == '': + # `os.path.split()` returns an empty basename for paths ending with a + # directory separator. 'q*x/' should match only directories. + if os.isdir(dirname): + return [basename] + else: + if os.path.lexists(os.path.join(dirname, basename)): + return [basename] + return [] + magic_check = re.compile('[*?[]') diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index af81dab..f23dcf1 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -61,7 +61,9 @@ class GlobTests(unittest.TestCase): else: pattern = os.path.join(*parts) p = os.path.join(self.tempdir, pattern) - return glob.glob(p) + res = glob.glob(p) + self.assertEqual(list(glob.iglob(p)), res) + return res def assertSequencesEqual_noorder(self, l1, l2): self.assertEqual(set(l1), set(l2)) |