diff options
-rw-r--r-- | Doc/lib/libdifflib.tex | 32 | ||||
-rw-r--r-- | Lib/difflib.py | 73 | ||||
-rw-r--r-- | Misc/NEWS | 13 |
3 files changed, 84 insertions, 34 deletions
diff --git a/Doc/lib/libdifflib.tex b/Doc/lib/libdifflib.tex index 3bc103b..37e401e 100644 --- a/Doc/lib/libdifflib.tex +++ b/Doc/lib/libdifflib.tex @@ -90,13 +90,19 @@ Optional keyword parameters \var{linejunk} and \var{charjunk} are for filter functions (or \code{None}): - \var{linejunk}: A function that should accept a single string - argument, and return true if the string is junk (or false if it is - not). The default is module-level function + \var{linejunk}: A function that accepts a single string + argument, and returns true if the string is junk, or false if not. + The default is (\code{None}), starting with Python 2.3. Before then, + the default was the module-level function \function{IS_LINE_JUNK()}, which filters out lines without visible characters, except for at most one pound character (\character{\#}). + As of Python 2.3, the underlying \class{SequenceMatcher} class + does a dynamic analysis of which lines are so frequent as to + constitute noise, and this usually works better than the pre-2.3 + default. - \var{charjunk}: A function that should accept a string of length 1. + \var{charjunk}: A function that accepts a character (a string of + length 1), and returns if the character is junk, or false if not. The default is module-level function \function{IS_CHARACTER_JUNK()}, which filters out whitespace characters (a blank or tab; note: bad idea to include newline in this!). @@ -150,7 +156,7 @@ emu Return true for ignorable lines. The line \var{line} is ignorable if \var{line} is blank or contains a single \character{\#}, otherwise it is not ignorable. Used as a default for parameter - \var{linejunk} in \function{ndiff()}. + \var{linejunk} in \function{ndiff()} before Python 2.3. \end{funcdesc} @@ -443,16 +449,14 @@ The \class{Differ} class has this constructor: Optional keyword parameters \var{linejunk} and \var{charjunk} are for filter functions (or \code{None}): - \var{linejunk}: A function that should accept a single string - argument, and return true if the string is junk. The default is - module-level function \function{IS_LINE_JUNK()}, which filters out - lines without visible characters, except for at most one pound - character (\character{\#}). + \var{linejunk}: A function that accepts a single string + argument, and returns true if the string is junk. The default is + \code{None}, meaning that no line is considered junk. - \var{charjunk}: A function that should accept a string of length 1. - The default is module-level function \function{IS_CHARACTER_JUNK()}, - which filters out whitespace characters (a blank or tab; note: bad - idea to include newline in this!). + \var{charjunk}: A function that accepts a single character argument + (a string of length 1), and returns true if the character is junk. + The default is \code{None}, meaning that no character is + considered junk. \end{classdesc} \class{Differ} objects are used (deltas generated) via a single diff --git a/Lib/difflib.py b/Lib/difflib.py index 69002f9..8fd2561 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -161,8 +161,6 @@ class SequenceMatcher: # b2j # for x in b, b2j[x] is a list of the indices (into b) # at which x appears; junk elements do not appear - # b2jhas - # b2j.has_key # fullbcount # for x in b, fullbcount[x] == the number of times x # appears in b; only materialized if really needed (used @@ -188,6 +186,10 @@ class SequenceMatcher: # for x in b, isbjunk(x) == isjunk(x) but much faster; # it's really the has_key method of a hidden dict. # DOES NOT WORK for x in a! + # isbpopular + # for x in b, isbpopular(x) is true iff b is reasonably long + # (at least 200 elements) and x accounts for more than 1% of + # its elements. DOES NOT WORK for x in a! self.isjunk = isjunk self.a = self.b = None @@ -266,6 +268,12 @@ class SequenceMatcher: # map at all, which stops the central find_longest_match method # from starting any matching block at a junk element ... # also creates the fast isbjunk function ... + # b2j also does not contain entries for "popular" elements, meaning + # elements that account for more than 1% of the total elements, and + # when the sequence is reasonably large (>= 200 elements); this can + # be viewed as an adaptive notion of semi-junk, and yields an enormous + # speedup when, e.g., comparing program files with hundreds of + # instances of "return NULL;" ... # note that this is only called when b changes; so for cross-product # kinds of matches, it's best to call set_seq2 once, then set_seq1 # repeatedly @@ -282,25 +290,36 @@ class SequenceMatcher: # out the junk later is much cheaper than building b2j "right" # from the start. b = self.b + n = len(b) self.b2j = b2j = {} - self.b2jhas = b2jhas = b2j.has_key - for i in xrange(len(b)): - elt = b[i] - if b2jhas(elt): - b2j[elt].append(i) + populardict = {} + for i, elt in enumerate(b): + if elt in b2j: + indices = b2j[elt] + if n >= 200 and len(indices) * 100 > n: + populardict[elt] = 1 + del indices[:] + else: + indices.append(i) else: b2j[elt] = [i] + # Purge leftover indices for popular elements. + for elt in populardict: + del b2j[elt] + # Now b2j.keys() contains elements uniquely, and especially when # the sequence is a string, that's usually a good deal smaller # than len(string). The difference is the number of isjunk calls # saved. - isjunk, junkdict = self.isjunk, {} + isjunk = self.isjunk + junkdict = {} if isjunk: - for elt in b2j.keys(): - if isjunk(elt): - junkdict[elt] = 1 # value irrelevant; it's a set - del b2j[elt] + for d in populardict, b2j: + for elt in d.keys(): + if isjunk(elt): + junkdict[elt] = 1 + del d[elt] # Now for x in b, isjunk(x) == junkdict.has_key(x), but the # latter is much faster. Note too that while there may be a @@ -308,6 +327,7 @@ class SequenceMatcher: # elements is probably small. So the memory burden of keeping # this dict alive is likely trivial compared to the size of b2j. self.isbjunk = junkdict.has_key + self.isbpopular = populardict.has_key def find_longest_match(self, alo, ahi, blo, bhi): """Find longest matching block in a[alo:ahi] and b[blo:bhi]. @@ -388,6 +408,19 @@ class SequenceMatcher: besti, bestj, bestsize = i-k+1, j-k+1, k j2len = newj2len + # Extend the best by non-junk elements on each end. In particular, + # "popular" non-junk elements aren't in b2j, which greatly speeds + # the inner loop above, but also means "the best" match so far + # doesn't contain any junk *or* popular non-junk elements. + while besti > alo and bestj > blo and \ + not isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + not isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize += 1 + # Now that we have a wholly interesting match (albeit possibly # empty!), we may as well suck up the matching junk on each # side of it too. Can't think of a good reason not to, and it @@ -736,12 +769,16 @@ class Differ: - `linejunk`: A function that should accept a single string argument, and return true iff the string is junk. The module-level function `IS_LINE_JUNK` may be used to filter out lines without visible - characters, except for at most one splat ('#'). + characters, except for at most one splat ('#'). It is recommended + to leave linejunk None; as of Python 2.3, the underlying + SequenceMatcher class has grown an adaptive notion of "noise" lines + that's better than any static definition the author has ever been + able to craft. - `charjunk`: A function that should accept a string of length 1. The module-level function `IS_CHARACTER_JUNK` may be used to filter out whitespace characters (a blank or tab; **note**: bad idea to include - newline in this!). + newline in this!). Use of IS_CHARACTER_JUNK is recommended. """ self.linejunk = linejunk @@ -1005,7 +1042,7 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"): del re -def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK): +def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): r""" Compare `a` and `b` (lists of strings); return a `Differ`-style delta. @@ -1013,9 +1050,9 @@ def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK): functions (or None): - linejunk: A function that should accept a single string argument, and - return true iff the string is junk. The default is module-level function - IS_LINE_JUNK, which filters out lines without visible characters, except - for at most one splat ('#'). + return true iff the string is junk. The default is None, and is + recommended; as of Python 2.3, an adaptive notion of "noise" lines is + used that does a good job on its own. - charjunk: A function that should accept a string of length 1. The default is module-level function IS_CHARACTER_JUNK, which filters out @@ -72,9 +72,9 @@ Core and builtins Extension modules -- The bsddb.*open functions can now take 'None' as a filename. +- The bsddb.*open functions can now take 'None' as a filename. This will create a temporary in-memory bsddb that won't be - written to disk. + written to disk. - posix.mknod was added. @@ -99,6 +99,15 @@ Extension modules Library +- difflib's SequenceMatcher class now does a dynamic analysis of + which elements are so frequent as to constitute noise. For + comparing files as sequences of lines, this generally works better + than the IS_LINE_JUNK function, and function ndiff's linejunk + argument defaults to None now as a result. A happy benefit is + that SequenceMatcher may run much faster now when applied + to large files with many duplicate lines (for example, C program + text with lots of repeated "}" and "return NULL;" lines). + - New Text.dump() method in Tkinter module. - New distutils commands for building packagers were added to |