summaryrefslogtreecommitdiffstats
path: root/Doc
diff options
context:
space:
mode:
authorFred Drake <fdrake@acm.org>2001-08-13 19:31:59 (GMT)
committerFred Drake <fdrake@acm.org>2001-08-13 19:31:59 (GMT)
commit6943a29cbf2cc5a743cb55857b623f7ab3168190 (patch)
tree0425f0ca27cc4f40c29a76f66ef54d4bf1b932f5 /Doc
parent97dbec97bc18b88e0380f683009f6827f836c630 (diff)
downloadcpython-6943a29cbf2cc5a743cb55857b623f7ab3168190.zip
cpython-6943a29cbf2cc5a743cb55857b623f7ab3168190.tar.gz
cpython-6943a29cbf2cc5a743cb55857b623f7ab3168190.tar.bz2
David Goodger <dgoodger@atsautomation.com>:
Documentation for difflib/ndiff refactoring: more of the ndiff functionality has been moved to the underlying library (difflib). This closes SF patch #445413.
Diffstat (limited to 'Doc')
-rw-r--r--Doc/lib/libdifflib.tex267
1 files changed, 240 insertions, 27 deletions
diff --git a/Doc/lib/libdifflib.tex b/Doc/lib/libdifflib.tex
index ce191c0..cc9a776 100644
--- a/Doc/lib/libdifflib.tex
+++ b/Doc/lib/libdifflib.tex
@@ -10,6 +10,48 @@
\versionadded{2.1}
+\begin{classdesc*}{SequenceMatcher}
+ This is a flexible class for comparing pairs of sequences of any
+ type, so long as the sequence elements are hashable. The basic
+ algorithm predates, and is a little fancier than, an algorithm
+ published in the late 1980's by Ratcliff and Obershelp under the
+ hyperbolic name ``gestalt pattern matching.'' The idea is to find
+ the longest contiguous matching subsequence that contains no
+ ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
+ address junk). The same idea is then applied recursively to the
+ pieces of the sequences to the left and to the right of the matching
+ subsequence. This does not yield minimal edit sequences, but does
+ tend to yield matches that ``look right'' to people.
+
+ \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
+ time in the worst case and quadratic time in the expected case.
+ \class{SequenceMatcher} is quadratic time for the worst case and has
+ expected-case behavior dependent in a complicated way on how many
+ elements the sequences have in common; best case time is linear.
+\end{classdesc*}
+
+\begin{classdesc*}{Differ}
+ This is a class for comparing sequences of lines of text, and
+ producing human-readable differences or deltas. Differ uses
+ \class{SequenceMatcher} both to compare sequences of lines, and to
+ compare sequences of characters within similar (near-matching)
+ lines.
+
+ Each line of a \class{Differ} delta begins with a two-letter code:
+
+\begin{tableii}{l|l}{code}{Code}{Meaning}
+ \lineii{'- '}{line unique to sequence 1}
+ \lineii{'+ '}{line unique to sequence 2}
+ \lineii{' '}{line common to both sequences}
+ \lineii{'? '}{line not present in either input sequence}
+\end{tableii}
+
+ Lines beginning with `\code{?~}' attempt to guide the eye to
+ intraline differences, and were not present in either input
+ sequence. These lines can be confusing if the sequences contain tab
+ characters.
+\end{classdesc*}
+
\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
n\optional{, cutoff}}}
Return a list of the best ``good enough'' matches. \var{word} is a
@@ -40,25 +82,85 @@
\end{verbatim}
\end{funcdesc}
-\begin{classdesc*}{SequenceMatcher}
- This is a flexible class for comparing pairs of sequences of any
- type, so long as the sequence elements are hashable. The basic
- algorithm predates, and is a little fancier than, an algorithm
- published in the late 1980's by Ratcliff and Obershelp under the
- hyperbolic name ``gestalt pattern matching.'' The idea is to find
- the longest contiguous matching subsequence that contains no
- ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
- address junk). The same idea is then applied recursively to the
- pieces of the sequences to the left and to the right of the matching
- subsequence. This does not yield minimal edit sequences, but does
- tend to yield matches that ``look right'' to people.
+\begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{,
+ charjunk}}}
+ Compare \var{a} and \var{b} (lists of strings); return a
+ \class{Differ}-style delta.
- \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
- time in the worst case and quadratic time in the expected case.
- \class{SequenceMatcher} is quadratic time for the worst case and has
- expected-case behavior dependent in a complicated way on how many
- elements the sequences have in common; best case time is linear.
-\end{classdesc*}
+ Optional keyword parameters \var{linejunk} and \var{charjunk} are
+ for filter functions (or \code{None}):
+
+ \var{linejunk}: A function that should accept a single string
+ argument, and return true if the string is junk (or false if it is
+ not). The default is module-level function
+ \function{IS_LINE_JUNK()}, which filters out lines without visible
+ characters, except for at most one pound character (\character{\#}).
+
+ \var{charjunk}: A function that should accept a string of length 1.
+ The default is module-level function \function{IS_CHARACTER_JUNK()},
+ which filters out whitespace characters (a blank or tab; note: bad
+ idea to include newline in this!).
+
+ \file{Tools/scripts/ndiff.py} is a command-line front-end to this
+ function.
+
+\begin{verbatim}
+>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
+... 'ore\ntree\nemu\n'.splitlines(1)))
+>>> print ''.join(diff),
+- one
+? ^
++ ore
+? ^
+- two
+- three
+? -
++ tree
++ emu
+\end{verbatim}
+\end{funcdesc}
+
+\begin{funcdesc}{restore}{sequence, which}
+ Return one of the two sequences that generated a delta.
+
+ Given a \var{sequence} produced by \method{Differ.compare()} or
+ \function{ndiff()}, extract lines originating from file 1 or 2
+ (parameter \var{which}), stripping off line prefixes.
+
+ Example:
+
+\begin{verbatim}
+>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
+... 'ore\ntree\nemu\n'.splitlines(1))
+>>> print ''.join(restore(diff, 1)),
+one
+two
+three
+>>> print ''.join(restore(diff, 2)),
+ore
+tree
+emu
+\end{verbatim}
+
+\end{funcdesc}
+
+
+\begin{funcdesc}{IS_LINE_JUNK}{line}:
+
+ Return 1 for ignorable line: iff \var{line} is blank or contains a
+ single \character{\#}. Used as a default for parameter
+ \var{linejunk} in \function{ndiff()}.
+
+\end{funcdesc}
+
+
+\begin{funcdesc}{IS_CHARACTER_JUNK}{ch}:
+
+ Return 1 for ignorable character: iff \var{ch} is a space or tab.
+ Used as a default for parameter \var{charjunk} in
+ \function{ndiff()}.
+
+\end{funcdesc}
\begin{seealso}
@@ -231,9 +333,9 @@ replace a[3:4] (x) b[2:3] (y)
range [0, 1].
Where T is the total number of elements in both sequences, and M is
- the number of matches, this is 2.0*M / T. Note that this is \code{1.}
- if the sequences are identical, and \code{0.} if they have nothing in
- common.
+ the number of matches, this is 2.0*M / T. Note that this is
+ \code{1.0} if the sequences are identical, and \code{0.0} if they
+ have nothing in common.
This is expensive to compute if \method{get_matching_blocks()} or
\method{get_opcodes()} hasn't already been called, in which case you
@@ -272,7 +374,7 @@ at least as large as \method{ratio()}:
\end{verbatim}
-\subsection{Examples \label{difflib-examples}}
+\subsection{SequenceMatcher Examples \label{sequencematcher-examples}}
This example compares two strings, considering blanks to be ``junk:''
@@ -321,11 +423,122 @@ insert a[8:8] b[8:17]
equal a[14:29] b[23:38]
\end{verbatim}
-See \file{Tools/scripts/ndiff.py} from the Python source distribution
-for a fancy human-friendly file differencer, which uses
-\class{SequenceMatcher} both to view files as sequences of lines, and
-lines as sequences of characters.
-
See also the function \function{get_close_matches()} in this module,
which shows how simple code building on \class{SequenceMatcher} can be
used to do useful work.
+
+
+\subsection{Differ Objects \label{differ-objects}}
+
+Note that \class{Differ}-generated deltas make no claim to be
+\strong{minimal} diffs. To the contrary, minimal diffs are often
+counter-intuitive, because they synch up anywhere possible, sometimes
+accidental matches 100 pages apart. Restricting synch points to
+contiguous matches preserves some notion of locality, at the
+occasional cost of producing a longer diff.
+
+The \class{Differ} class has this constructor:
+
+\begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
+ Optional keyword parameters \var{linejunk} and \var{charjunk} are
+ for filter functions (or \code{None}):
+
+ \var{linejunk}: A function that should accept a single string
+ argument, and return true iff the string is junk. The default is
+ module-level function \function{IS_LINE_JUNK()}, which filters out
+ lines without visible characters, except for at most one pound
+ character (\character{\#}).
+
+ \var{charjunk}: A function that should accept a string of length 1.
+ The default is module-level function \function{IS_CHARACTER_JUNK()},
+ which filters out whitespace characters (a blank or tab; note: bad
+ idea to include newline in this!).
+\end{classdesc}
+
+\class{Differ} objects are used (deltas generated) via a single
+method:
+
+\begin{methoddesc}{compare}{a, b}
+ Compare two sequences of lines; return the resulting delta (list).
+
+ Each sequence must contain individual single-line strings ending
+ with newlines. Such sequences can be obtained from the
+ \method{readlines()} method of file-like objects. The list returned
+ is also made up of newline-terminated strings, and ready to be used
+ with the \method{writelines()} method of a file-like object.
+\end{methoddesc}
+
+
+\subsection{Differ Example \label{differ-examples}}
+
+This example compares two texts. First we set up the texts, sequences
+of individual single-line strings ending with newlines (such sequences
+can also be obtained from the \method{readlines()} method of file-like
+objects):
+
+\begin{verbatim}
+>>> text1 = ''' 1. Beautiful is better than ugly.
+... 2. Explicit is better than implicit.
+... 3. Simple is better than complex.
+... 4. Complex is better than complicated.
+... '''.splitlines(1)
+>>> len(text1)
+4
+>>> text1[0][-1]
+'\n'
+>>> text2 = ''' 1. Beautiful is better than ugly.
+... 3. Simple is better than complex.
+... 4. Complicated is better than complex.
+... 5. Flat is better than nested.
+... '''.splitlines(1)
+\end{verbatim}
+
+Next we instantiate a Differ object:
+
+\begin{verbatim}
+>>> d = Differ()
+\end{verbatim}
+
+Note that when instantiating a \class{Differ} object we may pass
+functions to filter out line and character ``junk.'' See the
+\method{Differ()} constructor for details.
+
+Finally, we compare the two:
+
+\begin{verbatim}
+>>> result = d.compare(text1, text2)
+\end{verbatim}
+
+\code{result} is a list of strings, so let's pretty-print it:
+
+\begin{verbatim}
+>>> from pprint import pprint
+>>> pprint(result)
+[' 1. Beautiful is better than ugly.\n',
+ '- 2. Explicit is better than implicit.\n',
+ '- 3. Simple is better than complex.\n',
+ '+ 3. Simple is better than complex.\n',
+ '? ++ \n',
+ '- 4. Complex is better than complicated.\n',
+ '? ^ ---- ^ \n',
+ '+ 4. Complicated is better than complex.\n',
+ '? ++++ ^ ^ \n',
+ '+ 5. Flat is better than nested.\n']
+\end{verbatim}
+
+As a single multi-line string it looks like this:
+
+\begin{verbatim}
+>>> import sys
+>>> sys.stdout.writelines(result)
+ 1. Beautiful is better than ugly.
+- 2. Explicit is better than implicit.
+- 3. Simple is better than complex.
++ 3. Simple is better than complex.
+? ++
+- 4. Complex is better than complicated.
+? ^ ---- ^
++ 4. Complicated is better than complex.
+? ++++ ^ ^
++ 5. Flat is better than nested.
+\end{verbatim}