summary | refs | log | tree | commit | diff | stats
path: root/Lib/difflib.py
diff options
context:
space:
mode:
author: Greg Ward <greg@gerg.ca>, 2015-04-21 00:21:21 (GMT)
committer: Greg Ward <greg@gerg.ca>, 2015-04-21 00:21:21 (GMT)
commit: 4d9d2563f51edad448a960d9490a6f56ac733735 (patch)
tree: 732ccd1c3bded6c2b25b942c7f55c5cc685dab1e /Lib/difflib.py
parent: d19458ac51633cac979b7c7b9439ea89f179c8c8 (diff)
download:
  cpython-4d9d2563f51edad448a960d9490a6f56ac733735.zip
  cpython-4d9d2563f51edad448a960d9490a6f56ac733735.tar.gz
  cpython-4d9d2563f51edad448a960d9490a6f56ac733735.tar.bz2
#17445: difflib: add diff_bytes(), to compare bytes rather than str
Some applications (e.g. traditional Unix diff, version control systems) neither know nor care about the encodings of the files they are comparing. They are textual, but to the diff utility they are just bytes. This worked fine under Python 2, because all of the hardcoded strings in difflib.py are ASCII, so they could safely be combined with old-style u'' strings. But it stopped working in 3.x.

The solution is to use surrogate escapes for a lossless bytes->str->bytes roundtrip. That means {unified,context}_diff() can continue to just handle strings without worrying about bytes. Callers who have to deal with bytes will need to change to using diff_bytes().

Use case: Mercurial's test runner uses difflib to compare current hg output with known good output. But Mercurial's output is just bytes, since it can contain:

* file contents (arbitrary unknown encoding)
* filenames (arbitrary unknown encoding)
* usernames and commit messages (usually UTF-8, but not guaranteed because old versions of Mercurial did not enforce it)
* user messages (locale encoding)

Since the output of any given hg command can include text in multiple encodings, it is hopeless to try to treat it as decodable Unicode text. It's just bytes, all the way down.

This is an elaboration of a patch by Terry Reedy.
Diffstat (limited to 'Lib/difflib.py')
-rw-r--r--  Lib/difflib.py  51
1 files changed, 50 insertions, 1 deletions
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 758f1aa..96fd9ab 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -28,7 +28,7 @@ Class HtmlDiff:
__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
- 'unified_diff', 'HtmlDiff', 'Match']
+ 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
from heapq import nlargest as _nlargest
from collections import namedtuple as _namedtuple
@@ -1174,6 +1174,7 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
four
"""
+ _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
started = False
for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
if not started:
@@ -1261,6 +1262,7 @@ def context_diff(a, b, fromfile='', tofile='',
four
"""
+ _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ')
started = False
for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
@@ -1292,6 +1294,53 @@ def context_diff(a, b, fromfile='', tofile='',
for line in b[j1:j2]:
yield prefix[tag] + line
+def _check_types(a, b, *args):
+ # Checking types is weird, but the alternative is garbled output when
+ # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
+ # without this check, passing filenames as bytes results in output like
+ # --- b'oldfile.txt'
+ # +++ b'newfile.txt'
+ # because of how str.format() incorporates bytes objects.
+ if a and not isinstance(a[0], str):
+ raise TypeError('lines to compare must be str, not %s (%r)' %
+ (type(a[0]).__name__, a[0]))
+ if b and not isinstance(b[0], str):
+ raise TypeError('lines to compare must be str, not %s (%r)' %
+ (type(b[0]).__name__, b[0]))
+ for arg in args:
+ if not isinstance(arg, str):
+ raise TypeError('all arguments must be str, not: %r' % (arg,))
+
+def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
+ fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
+ r"""
+ Compare `a` and `b`, two sequences of lines represented as bytes rather
+ than str. This is a wrapper for `dfunc`, which is typically either
+ unified_diff() or context_diff(). Inputs are losslessly converted to
+ strings so that `dfunc` only has to worry about strings, and encoded
+ back to bytes on return. This is necessary to compare files with
+ unknown or inconsistent encoding. All other inputs (except `n`) must be
+ bytes rather than str.
+ """
+ def decode(s):
+ try:
+ return s.decode('ascii', 'surrogateescape')
+ except AttributeError as err:
+ msg = ('all arguments must be bytes, not %s (%r)' %
+ (type(s).__name__, s))
+ raise TypeError(msg) from err
+ a = list(map(decode, a))
+ b = list(map(decode, b))
+ fromfile = decode(fromfile)
+ tofile = decode(tofile)
+ fromfiledate = decode(fromfiledate)
+ tofiledate = decode(tofiledate)
+ lineterm = decode(lineterm)
+
+ lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
+ for line in lines:
+ yield line.encode('ascii', 'surrogateescape')
+
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
r"""
Compare `a` and `b` (lists of strings); return a `Differ`-style delta.