From 8a9c284437652826a3da83ec38c4536fa111eb40 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 22 Sep 2001 21:30:22 +0000 Subject: Make difflib.ndiff() and difflib.Differ.compare() generators. This restores the 2.1 ability of Tools/scripts/ndiff.py to start producing output before the entire comparison is complete. --- Doc/lib/libdifflib.tex | 26 +++++++------ Lib/difflib.py | 100 +++++++++++++++++++++++-------------------------- Misc/NEWS | 23 +++++++++++- Tools/scripts/ndiff.py | 5 +-- 4 files changed, 84 insertions(+), 70 deletions(-) diff --git a/Doc/lib/libdifflib.tex b/Doc/lib/libdifflib.tex index cc9a776..a669435 100644 --- a/Doc/lib/libdifflib.tex +++ b/Doc/lib/libdifflib.tex @@ -32,7 +32,7 @@ \begin{classdesc*}{Differ} This is a class for comparing sequences of lines of text, and - producing human-readable differences or deltas. Differ uses + producing human-readable differences or deltas. Differ uses \class{SequenceMatcher} both to compare sequences of lines, and to compare sequences of characters within similar (near-matching) lines. @@ -85,7 +85,7 @@ \begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{, charjunk}}} Compare \var{a} and \var{b} (lists of strings); return a - \class{Differ}-style delta. + \class{Differ}-style delta (a generator generating the delta lines). Optional keyword parameters \var{linejunk} and \var{charjunk} are for filter functions (or \code{None}): @@ -109,12 +109,12 @@ ... 'ore\ntree\nemu\n'.splitlines(1))) >>> print ''.join(diff), - one -? ^ +? ^ + ore -? ^ +? ^ - two - three -? - +? - + tree + emu \end{verbatim} @@ -132,6 +132,7 @@ \begin{verbatim} >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), ... 'ore\ntree\nemu\n'.splitlines(1)) +>>> diff = list(diff) # materialize the generated delta into a list >>> print ''.join(restore(diff, 1)), one two @@ -226,7 +227,7 @@ of the other sequences. 
If \var{isjunk} was omitted or \code{None}, \method{get_longest_match()} returns \code{(\var{i}, \var{j}, \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal - to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where + to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}. For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those @@ -303,7 +304,7 @@ of the other sequences. deleted. Note that \code{\var{j1} == \var{j2}} in this case.} \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be - inserted at \code{\var{a}[\var{i1}:\var{i1}]}. + inserted at \code{\var{a}[\var{i1}:\var{i1}]}. Note that \code{\var{i1} == \var{i2}} in this case.} \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] == @@ -459,13 +460,14 @@ The \class{Differ} class has this constructor: method: \begin{methoddesc}{compare}{a, b} - Compare two sequences of lines; return the resulting delta (list). + Compare two sequences of lines, and generate the delta (a sequence + of lines). Each sequence must contain individual single-line strings ending with newlines. Such sequences can be obtained from the - \method{readlines()} method of file-like objects. The list returned - is also made up of newline-terminated strings, and ready to be used - with the \method{writelines()} method of a file-like object. + \method{readlines()} method of file-like objects. The delta generated + also consists of newline-terminated strings, ready to be printed as-is + via the \method{writelines()} method of a file-like object. 
\end{methoddesc} @@ -506,7 +508,7 @@ functions to filter out line and character ``junk.'' See the Finally, we compare the two: \begin{verbatim} ->>> result = d.compare(text1, text2) +>>> result = list(d.compare(text1, text2)) \end{verbatim} \code{result} is a list of strings, so let's pretty-print it: diff --git a/Lib/difflib.py b/Lib/difflib.py index a41d4d5..8493503 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -1,5 +1,7 @@ #! /usr/bin/env python +from __future__ import generators + """ Module difflib -- helpers for computing deltas between objects. @@ -22,8 +24,6 @@ Class Differ: __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 'Differ'] -TRACE = 0 - class SequenceMatcher: """ @@ -406,9 +406,6 @@ class SequenceMatcher: a[besti+bestsize] == b[bestj+bestsize]: bestsize = bestsize + 1 - if TRACE: - print "get_matching_blocks", alo, ahi, blo, bhi - print " returns", besti, bestj, bestsize return besti, bestj, bestsize def get_matching_blocks(self): @@ -432,8 +429,6 @@ class SequenceMatcher: la, lb = len(self.a), len(self.b) self.__helper(0, la, 0, lb, self.matching_blocks) self.matching_blocks.append( (la, lb, 0) ) - if TRACE: - print '*** matching blocks', self.matching_blocks return self.matching_blocks # builds list of matching blocks covering a[alo:ahi] and @@ -694,7 +689,7 @@ class Differ: Finally, we compare the two: - >>> result = d.compare(text1, text2) + >>> result = list(d.compare(text1, text2)) 'result' is a list of strings, so let's pretty-print it: @@ -731,7 +726,7 @@ class Differ: Construct a text differencer, with optional filters. compare(a, b) - Compare two sequences of lines; return the resulting delta (list). + Compare two sequences of lines; generate the resulting delta. 
""" def __init__(self, linejunk=None, charjunk=None): @@ -753,16 +748,15 @@ class Differ: self.linejunk = linejunk self.charjunk = charjunk - self.results = [] def compare(self, a, b): r""" - Compare two sequences of lines; return the resulting delta (list). + Compare two sequences of lines; generate the resulting delta. Each sequence must contain individual single-line strings ending with newlines. Such sequences can be obtained from the `readlines()` method - of file-like objects. The list returned is also made up of - newline-terminated strings, ready to be used with the `writelines()` + of file-like objects. The delta generated also consists of newline- + terminated strings, ready to be printed as-is via the writeline() method of a file-like object. Example: @@ -783,34 +777,38 @@ class Differ: cruncher = SequenceMatcher(self.linejunk, a, b) for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): if tag == 'replace': - self._fancy_replace(a, alo, ahi, b, blo, bhi) + g = self._fancy_replace(a, alo, ahi, b, blo, bhi) elif tag == 'delete': - self._dump('-', a, alo, ahi) + g = self._dump('-', a, alo, ahi) elif tag == 'insert': - self._dump('+', b, blo, bhi) + g = self._dump('+', b, blo, bhi) elif tag == 'equal': - self._dump(' ', a, alo, ahi) + g = self._dump(' ', a, alo, ahi) else: raise ValueError, 'unknown tag ' + `tag` - results = self.results - self.results = [] - return results + + for line in g: + yield line def _dump(self, tag, x, lo, hi): - """Store comparison results for a same-tagged range.""" + """Generate comparison results for a same-tagged range.""" for i in xrange(lo, hi): - self.results.append('%s %s' % (tag, x[i])) + yield '%s %s' % (tag, x[i]) def _plain_replace(self, a, alo, ahi, b, blo, bhi): assert alo < ahi and blo < bhi # dump the shorter block first -- reduces the burden on short-term # memory if the blocks are of very different sizes if bhi - blo < ahi - alo: - self._dump('+', b, blo, bhi) - self._dump('-', a, alo, ahi) + first = 
self._dump('+', b, blo, bhi) + second = self._dump('-', a, alo, ahi) else: - self._dump('-', a, alo, ahi) - self._dump('+', b, blo, bhi) + first = self._dump('-', a, alo, ahi) + second = self._dump('+', b, blo, bhi) + + for g in first, second: + for line in g: + yield line def _fancy_replace(self, a, alo, ahi, b, blo, bhi): r""" @@ -830,12 +828,6 @@ class Differ: ? ^ ^ ^ """ - if TRACE: - self.results.append('*** _fancy_replace %s %s %s %s\n' - % (alo, ahi, blo, bhi)) - self._dump('>', a, alo, ahi) - self._dump('<', b, blo, bhi) - # don't synch up unless the lines have a similarity score of at # least cutoff; best_ratio tracks the best score seen so far best_ratio, cutoff = 0.74, 0.75 @@ -869,7 +861,8 @@ class Differ: # no non-identical "pretty close" pair if eqi is None: # no identical pair either -- treat it as a straight replace - self._plain_replace(a, alo, ahi, b, blo, bhi) + for line in self._plain_replace(a, alo, ahi, b, blo, bhi): + yield line return # no close pair, but an identical pair -- synch up on that best_i, best_j, best_ratio = eqi, eqj, 1.0 @@ -879,14 +872,10 @@ class Differ: # a[best_i] very similar to b[best_j]; eqi is None iff they're not # identical - if TRACE: - self.results.append('*** best_ratio %s %s %s %s\n' - % (best_ratio, best_i, best_j)) - self._dump('>', a, best_i, best_i+1) - self._dump('<', b, best_j, best_j+1) # pump out diffs from before the synch point - self._fancy_helper(a, alo, best_i, b, blo, best_j) + for line in self._fancy_helper(a, alo, best_i, b, blo, best_j): + yield line # do intraline marking on the synch pair aelt, belt = a[best_i], b[best_j] @@ -908,22 +897,28 @@ class Differ: btags += ' ' * lb else: raise ValueError, 'unknown tag ' + `tag` - self._qformat(aelt, belt, atags, btags) + for line in self._qformat(aelt, belt, atags, btags): + yield line else: # the synch pair is identical - self.results.append(' ' + aelt) + yield ' ' + aelt # pump out diffs from after the synch point - self._fancy_helper(a, best_i+1, 
ahi, b, best_j+1, bhi) + for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi): + yield line def _fancy_helper(self, a, alo, ahi, b, blo, bhi): + g = [] if alo < ahi: if blo < bhi: - self._fancy_replace(a, alo, ahi, b, blo, bhi) + g = self._fancy_replace(a, alo, ahi, b, blo, bhi) else: - self._dump('-', a, alo, ahi) + g = self._dump('-', a, alo, ahi) elif blo < bhi: - self._dump('+', b, blo, bhi) + g = self._dump('+', b, blo, bhi) + + for line in g: + yield line def _qformat(self, aline, bline, atags, btags): r""" @@ -949,13 +944,13 @@ class Differ: atags = atags[common:].rstrip() btags = btags[common:].rstrip() - self.results.append("- " + aline) + yield "- " + aline if atags: - self.results.append("? %s%s\n" % ("\t" * common, atags)) + yield "? %s%s\n" % ("\t" * common, atags) - self.results.append("+ " + bline) + yield "+ " + bline if btags: - self.results.append("? %s%s\n" % ("\t" * common, btags)) + yield "? %s%s\n" % ("\t" * common, btags) # With respect to junk, an earlier version of ndiff simply refused to # *start* a match with a junk element. The result was cases like this: @@ -1050,7 +1045,7 @@ def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK): def restore(delta, which): r""" - Return one of the two sequences that generated a delta. + Generate one of the two sequences that generated a delta. Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract lines originating from file 1 or 2 (parameter `which`), stripping off line @@ -1060,6 +1055,7 @@ def restore(delta, which): >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), ... 
'ore\ntree\nemu\n'.splitlines(1)) + >>> diff = list(diff) >>> print ''.join(restore(diff, 1)), one two @@ -1075,11 +1071,9 @@ def restore(delta, which): raise ValueError, ('unknown delta choice (must be 1 or 2): %r' % which) prefixes = (" ", tag) - results = [] for line in delta: if line[:2] in prefixes: - results.append(line[2:]) - return results + yield line[2:] def _test(): import doctest, difflib diff --git a/Misc/NEWS b/Misc/NEWS index 713eb17..cf8e3fc 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -30,7 +30,7 @@ Core - In 2.2a3, __new__ would only see sequential arguments passed to the type in a constructor call; __init__ would see both sequential and - positional arguments. This made no sense whatsoever any more, so + keyword arguments. This made no sense whatsoever any more, so now both __new__ and __init__ see all arguments. - In 2.2a3, hash() applied to an instance of a subclass of str or unicode @@ -54,6 +54,10 @@ Core Library +- difflib.ndiff() and difflib.Differ.compare() are generators now. This + restores the ability of Tools/scripts/ndiff.py to start producing output + before the entire comparison is complete. + - StringIO.StringIO instances and cStringIO.StringIO instances support iteration just like file objects (i.e. their .readline() method is called for each iteration until it returns an empty string). @@ -124,10 +128,25 @@ New platforms Tests +- The "classic" standard tests, which work by comparing stdout to + an expected-output file under Lib/test/output/, no longer stop at + the first mismatch. Instead the test is run to completion, and a + variant of ndiff-style comparison is used to report all differences. + This is much easier to understand than the previous style of reporting. + +- The unittest-based standard tests now use regrtest's test_main() + convention, instead of running as a side-effect of merely being + imported. This allows these tests to be run in more natural and + flexible ways as unittests, outside the regrtest framework. 
+ +- regrtest.py is much better integrated with unittest and doctest now, + especially in regard to reporting errors. + Windows - Large file support now also works for files > 4GB, on filesystems - that support it (NTFS under Windows 2000). + that support it (NTFS under Windows 2000). See "What's New in + Python 2.2a3" for more detail. What's New in Python 2.2a3? diff --git a/Tools/scripts/ndiff.py b/Tools/scripts/ndiff.py index a5468f6..6f0f9a9 100755 --- a/Tools/scripts/ndiff.py +++ b/Tools/scripts/ndiff.py @@ -73,9 +73,8 @@ def fcompare(f1name, f2name): a = f1.readlines(); f1.close() b = f2.readlines(); f2.close() - - diff = difflib.ndiff(a, b) - sys.stdout.writelines(diff) + for line in difflib.ndiff(a, b): + print line, return 1 -- cgit v0.12