3 files changed, 84 insertions, 34 deletions
diff --git a/Doc/lib/libdifflib.tex b/Doc/lib/libdifflib.tex
index 3bc103b..37e401e 100644
--- a/Doc/lib/libdifflib.tex
+++ b/Doc/lib/libdifflib.tex
@@ -90,13 +90,19 @@
   Optional keyword parameters \var{linejunk} and \var{charjunk} are
   for filter functions (or \code{None}):
 
-  \var{linejunk}: A function that should accept a single string
-  argument, and return true if the string is junk (or false if it is
-  not). The default is module-level function
+  \var{linejunk}: A function that accepts a single string
+  argument, and returns true if the string is junk, or false if not.
+  The default is (\code{None}), starting with Python 2.3.  Before then,
+  the default was the module-level function
   \function{IS_LINE_JUNK()}, which filters out lines without visible
   characters, except for at most one pound character (\character{\#}).
+  As of Python 2.3, the underlying \class{SequenceMatcher} class
+  does a dynamic analysis of which lines are so frequent as to
+  constitute noise, and this usually works better than the pre-2.3
+  default.
 
-  \var{charjunk}: A function that should accept a string of length 1.
+  \var{charjunk}: A function that accepts a character (a string of
+  length 1), and returns if the character is junk, or false if not.
   The default is module-level function \function{IS_CHARACTER_JUNK()},
   which filters out whitespace characters (a blank or tab; note: bad
   idea to include newline in this!).
@@ -150,7 +156,7 @@ emu
   Return true for ignorable lines.  The line \var{line} is ignorable
   if \var{line} is blank or contains a single \character{\#},
   otherwise it is not ignorable.  Used as a default for parameter
-  \var{linejunk} in \function{ndiff()}.
+  \var{linejunk} in \function{ndiff()} before Python 2.3.
 \end{funcdesc}
 
 
@@ -443,16 +449,14 @@ The \class{Differ} class has this constructor:
   Optional keyword parameters \var{linejunk} and \var{charjunk} are
   for filter functions (or \code{None}):
 
-  \var{linejunk}: A function that should accept a single string
-  argument, and return true if the string is junk.  The default is
-  module-level function \function{IS_LINE_JUNK()}, which filters out
-  lines without visible characters, except for at most one pound
-  character (\character{\#}).
+  \var{linejunk}: A function that accepts a single string
+  argument, and returns true if the string is junk.  The default is
+  \code{None}, meaning that no line is considered junk.
 
-  \var{charjunk}: A function that should accept a string of length 1.
-  The default is module-level function \function{IS_CHARACTER_JUNK()},
-  which filters out whitespace characters (a blank or tab; note: bad
-  idea to include newline in this!).
+  \var{charjunk}: A function that accepts a single character argument
+  (a string of length 1), and returns true if the character is junk.
+  The default is \code{None}, meaning that no character is
+  considered junk.
 \end{classdesc}
 
 \class{Differ} objects are used (deltas generated) via a single
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 69002f9..8fd2561 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -161,8 +161,6 @@ class SequenceMatcher:
         # b2j
         #      for x in b, b2j[x] is a list of the indices (into b)
         #      at which x appears; junk elements do not appear
-        # b2jhas
-        #      b2j.has_key
         # fullbcount
         #      for x in b, fullbcount[x] == the number of times x
         #      appears in b; only materialized if really needed (used
@@ -188,6 +186,10 @@ class SequenceMatcher:
         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
         #      it's really the has_key method of a hidden dict.
         #      DOES NOT WORK for x in a!
+        # isbpopular
+        #      for x in b, isbpopular(x) is true iff b is reasonably long
+        #      (at least 200 elements) and x accounts for more than 1% of
+        #      its elements.  DOES NOT WORK for x in a!
 
         self.isjunk = isjunk
         self.a = self.b = None
@@ -266,6 +268,12 @@ class SequenceMatcher:
     # map at all, which stops the central find_longest_match method
     # from starting any matching block at a junk element ...
     # also creates the fast isbjunk function ...
+    # b2j also does not contain entries for "popular" elements, meaning
+    # elements that account for more than 1% of the total elements, and
+    # when the sequence is reasonably large (>= 200 elements); this can
+    # be viewed as an adaptive notion of semi-junk, and yields an enormous
+    # speedup when, e.g., comparing program files with hundreds of
+    # instances of "return NULL;" ...
     # note that this is only called when b changes; so for cross-product
     # kinds of matches, it's best to call set_seq2 once, then set_seq1
     # repeatedly
@@ -282,25 +290,36 @@ class SequenceMatcher:
         # out the junk later is much cheaper than building b2j "right"
         # from the start.
         b = self.b
+        n = len(b)
         self.b2j = b2j = {}
-        self.b2jhas = b2jhas = b2j.has_key
-        for i in xrange(len(b)):
-            elt = b[i]
-            if b2jhas(elt):
-                b2j[elt].append(i)
+        populardict = {}
+        for i, elt in enumerate(b):
+            if elt in b2j:
+                indices = b2j[elt]
+                if n >= 200 and len(indices) * 100 > n:
+                    populardict[elt] = 1
+                    del indices[:]
+                else:
+                    indices.append(i)
             else:
                 b2j[elt] = [i]
 
+        # Purge leftover indices for popular elements.
+        for elt in populardict:
+            del b2j[elt]
+
         # Now b2j.keys() contains elements uniquely, and especially when
         # the sequence is a string, that's usually a good deal smaller
         # than len(string).  The difference is the number of isjunk calls
         # saved.
-        isjunk, junkdict = self.isjunk, {}
+        isjunk = self.isjunk
+        junkdict = {}
         if isjunk:
-            for elt in b2j.keys():
-                if isjunk(elt):
-                    junkdict[elt] = 1   # value irrelevant; it's a set
-                    del b2j[elt]
+            for d in populardict, b2j:
+                for elt in d.keys():
+                    if isjunk(elt):
+                        junkdict[elt] = 1
+                        del d[elt]
 
         # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
         # latter is much faster.  Note too that while there may be a
@@ -308,6 +327,7 @@ class SequenceMatcher:
         # elements is probably small.  So the memory burden of keeping
         # this dict alive is likely trivial compared to the size of b2j.
         self.isbjunk = junkdict.has_key
+        self.isbpopular = populardict.has_key
 
     def find_longest_match(self, alo, ahi, blo, bhi):
         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -388,6 +408,19 @@ class SequenceMatcher:
                     besti, bestj, bestsize = i-k+1, j-k+1, k
             j2len = newj2len
 
+        # Extend the best by non-junk elements on each end.  In particular,
+        # "popular" non-junk elements aren't in b2j, which greatly speeds
+        # the inner loop above, but also means "the best" match so far
+        # doesn't contain any junk *or* popular non-junk elements.
+        while besti > alo and bestj > blo and \
+              not isbjunk(b[bestj-1]) and \
+              a[besti-1] == b[bestj-1]:
+            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+        while besti+bestsize < ahi and bestj+bestsize < bhi and \
+              not isbjunk(b[bestj+bestsize]) and \
+              a[besti+bestsize] == b[bestj+bestsize]:
+            bestsize += 1
+
         # Now that we have a wholly interesting match (albeit possibly
         # empty!), we may as well suck up the matching junk on each
         # side of it too.  Can't think of a good reason not to, and it
@@ -736,12 +769,16 @@ class Differ:
         - `linejunk`: A function that should accept a single string argument,
           and return true iff the string is junk. The module-level function
           `IS_LINE_JUNK` may be used to filter out lines without visible
-          characters, except for at most one splat ('#').
+          characters, except for at most one splat ('#').  It is recommended
+          to leave linejunk None; as of Python 2.3, the underlying
+          SequenceMatcher class has grown an adaptive notion of "noise" lines
+          that's better than any static definition the author has ever been
+          able to craft.
 
         - `charjunk`: A function that should accept a string of length 1. The
           module-level function `IS_CHARACTER_JUNK` may be used to filter out
           whitespace characters (a blank or tab; **note**: bad idea to include
-          newline in this!).
+          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
         """
 
         self.linejunk = linejunk
@@ -1005,7 +1042,7 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"):
 
 del re
 
-def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
+def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
     r"""
     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
 
@@ -1013,9 +1050,9 @@ def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
     functions (or None):
 
     - linejunk: A function that should accept a single string argument, and
-      return true iff the string is junk. The default is module-level function
-      IS_LINE_JUNK, which filters out lines without visible characters, except
-      for at most one splat ('#').
+      return true iff the string is junk.  The default is None, and is
+      recommended; as of Python 2.3, an adaptive notion of "noise" lines is
+      used that does a good job on its own.
 
     - charjunk: A function that should accept a string of length 1. The
       default is module-level function IS_CHARACTER_JUNK, which filters out
diff --git a/Misc/NEWS b/Misc/NEWS
index 5f7e942..903e5b0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -72,9 +72,9 @@ Core and builtins
 
 Extension modules
 
-- The bsddb.*open functions can now take 'None' as a filename. 
+- The bsddb.*open functions can now take 'None' as a filename.
   This will create a temporary in-memory bsddb that won't be
-  written to disk. 
+  written to disk.
 
 - posix.mknod was added.
 
@@ -99,6 +99,15 @@ Extension modules
 
 Library
 
+- difflib's SequenceMatcher class now does a dynamic analysis of
+  which elements are so frequent as to constitute noise.  For
+  comparing files as sequences of lines, this generally works better
+  than the IS_LINE_JUNK function, and function ndiff's linejunk
+  argument defaults to None now as a result.  A happy benefit is
+  that SequenceMatcher may run much faster now when applied
+  to large files with many duplicate lines (for example, C program
+  text with lots of repeated "}" and "return NULL;" lines).
+
 - New Text.dump() method in Tkinter module.
 
 - New distutils commands for building packagers were added to