#17445: difflib: add diff_bytes(), to compare bytes rather than str

Some applications (e.g. traditional Unix diff, version control
systems) neither know nor care about the encodings of the files they
are comparing. They are textual, but to the diff utility they are just
bytes. This worked fine under Python 2, because all of the hardcoded
strings in difflib.py are ASCII, so they could safely be combined with
old-style u'' strings. But it stopped working in 3.x.

The solution is to use surrogate escapes for a lossless
bytes->str->bytes roundtrip. That means {unified,context}_diff() can
continue to just handle strings without worrying about bytes. Callers
who have to deal with bytes will need to change to using diff_bytes().

Use case: Mercurial's test runner uses difflib to compare current hg
output with known good output. But Mercurial's output is just bytes,
since it can contain:

  * file contents (arbitrary unknown encoding)
  * filenames (arbitrary unknown encoding)
  * usernames and commit messages (usually UTF-8, but not guaranteed
    because old versions of Mercurial did not enforce it)
  * user messages (locale encoding)

Since the output of any given hg command can include text in multiple
encodings, it is hopeless to try to treat it as decodable Unicode
text. It's just bytes, all the way down.

This is an elaboration of a patch by Terry Reedy.
author: Greg Ward <greg@gerg.ca> 2015-04-21 00:21:21 (GMT)
committer: Greg Ward <greg@gerg.ca> 2015-04-21 00:21:21 (GMT)
commit: 4d9d2563f51edad448a960d9490a6f56ac733735 (patch)
tree: 732ccd1c3bded6c2b25b942c7f55c5cc685dab1e
parent: d19458ac51633cac979b7c7b9439ea89f179c8c8 (diff)
download: cpython-4d9d2563f51edad448a960d9490a6f56ac733735.zip
cpython-4d9d2563f51edad448a960d9490a6f56ac733735.tar.gz
cpython-4d9d2563f51edad448a960d9490a6f56ac733735.tar.bz2
5 files changed, 218 insertions, 2 deletions
diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst
index 4427065..efaac7a 100644
--- a/Doc/library/difflib.rst
+++ b/Doc/library/difflib.rst
@@ -315,6 +315,21 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
 
    See :ref:`difflib-interface` for a more detailed example.
 
+.. function:: diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'', fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\\n')
+
+   Compare *a* and *b* (lists of bytes objects) using *dfunc*; yield a
+   sequence of delta lines (also bytes) in the format returned by *dfunc*.
+   *dfunc* must be a callable, typically either :func:`unified_diff` or
+   :func:`context_diff`.
+
+   Allows you to compare data with unknown or inconsistent encoding. All
+   inputs except *n* must be bytes objects, not str. Works by losslessly
+   converting all inputs (except *n*) to str, and calling ``dfunc(a, b,
+   fromfile, tofile, fromfiledate, tofiledate, n, lineterm)``. The output of
+   *dfunc* is then converted back to bytes, so the delta lines that you
+   receive have the same unknown/inconsistent encodings as *a* and *b*.
+
+   .. versionadded:: 3.5
 
 .. function:: IS_LINE_JUNK(line)
 
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
index 0492ef9..f364317 100644
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -302,6 +302,9 @@ difflib
   charset of HTML document changed from ``'ISO-8859-1'`` to ``'utf-8'``.
   (Contributed by Berker Peksag in :issue:`2052`.)
 
+* It's now possible to compare lists of byte strings with
+  :func:`difflib.diff_bytes` (fixes a regression from Python 2).
+
 distutils
 ---------
 
diff --git a/Lib/difflib.py b/Lib/difflib.py
index 758f1aa..96fd9ab 100644
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@@ -28,7 +28,7 @@ Class HtmlDiff:
 
 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
            'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
-           'unified_diff', 'HtmlDiff', 'Match']
+           'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
 
 from heapq import nlargest as _nlargest
 from collections import namedtuple as _namedtuple
@@ -1174,6 +1174,7 @@ def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
      four
     """
 
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
     started = False
     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
         if not started:
@@ -1261,6 +1262,7 @@ def context_diff(a, b, fromfile='', tofile='',
       four
     """
 
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
     prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
     started = False
     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
@@ -1292,6 +1294,53 @@ def context_diff(a, b, fromfile='', tofile='',
                     for line in b[j1:j2]:
                         yield prefix[tag] + line
 
+def _check_types(a, b, *args):
+    # Checking types is weird, but the alternative is garbled output when
+    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
+    # without this check, passing filenames as bytes results in output like
+    #   --- b'oldfile.txt'
+    #   +++ b'newfile.txt'
+    # because of how str.format() incorporates bytes objects.
+    if a and not isinstance(a[0], str):
+        raise TypeError('lines to compare must be str, not %s (%r)' %
+                        (type(a[0]).__name__, a[0]))
+    if b and not isinstance(b[0], str):
+        raise TypeError('lines to compare must be str, not %s (%r)' %
+                        (type(b[0]).__name__, b[0]))
+    for arg in args:
+        if not isinstance(arg, str):
+            raise TypeError('all arguments must be str, not: %r' % (arg,))
+
+def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
+               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
+    r"""
+    Compare `a` and `b`, two sequences of lines represented as bytes rather
+    than str. This is a wrapper for `dfunc`, which is typically either
+    unified_diff() or context_diff(). Inputs are losslessly converted to
+    strings so that `dfunc` only has to worry about strings, and encoded
+    back to bytes on return. This is necessary to compare files with
+    unknown or inconsistent encoding. All other inputs (except `n`) must be
+    bytes rather than str.
+    """
+    def decode(s):
+        try:
+            return s.decode('ascii', 'surrogateescape')
+        except AttributeError as err:
+            msg = ('all arguments must be bytes, not %s (%r)' %
+                   (type(s).__name__, s))
+            raise TypeError(msg) from err
+    a = list(map(decode, a))
+    b = list(map(decode, b))
+    fromfile = decode(fromfile)
+    tofile = decode(tofile)
+    fromfiledate = decode(fromfiledate)
+    tofiledate = decode(tofiledate)
+    lineterm = decode(lineterm)
+
+    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
+    for line in lines:
+        yield line.encode('ascii', 'surrogateescape')
+
 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
     r"""
     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index a078e71..ab9debf 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -322,12 +322,157 @@ class TestOutputFormat(unittest.TestCase):
         self.assertEqual(fmt(0,0), '0')
 
 
+class TestBytes(unittest.TestCase):
+    # don't really care about the content of the output, just the fact
+    # that it's bytes and we don't crash
+    def check(self, diff):
+        diff = list(diff)   # trigger exceptions first
+        for line in diff:
+            self.assertIsInstance(
+                line, bytes,
+                "all lines of diff should be bytes, but got: %r" % line)
+
+    def test_byte_content(self):
+        # if we receive byte strings, we return byte strings
+        a = [b'hello', b'andr\xe9']     # iso-8859-1 bytes
+        b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes
+
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+
+        check = self.check
+        check(difflib.diff_bytes(unified, a, a))
+        check(difflib.diff_bytes(unified, a, b))
+
+        # now with filenames (content and filenames are all bytes!)
+        check(difflib.diff_bytes(unified, a, a, b'a', b'a'))
+        check(difflib.diff_bytes(unified, a, b, b'a', b'b'))
+
+        # and with filenames and dates
+        check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013'))
+        check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013'))
+
+        # same all over again, with context diff
+        check(difflib.diff_bytes(context, a, a))
+        check(difflib.diff_bytes(context, a, b))
+        check(difflib.diff_bytes(context, a, a, b'a', b'a'))
+        check(difflib.diff_bytes(context, a, b, b'a', b'b'))
+        check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
+        check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))
+
+    def test_byte_filenames(self):
+        # somebody renamed a file from ISO-8859-2 to UTF-8
+        fna = b'\xb3odz.txt'    # "łodz.txt"
+        fnb = b'\xc5\x82odz.txt'
+
+        # they transcoded the content at the same time
+        a = [b'\xa3odz is a city in Poland.']
+        b = [b'\xc5\x81odz is a city in Poland.']
+
+        check = self.check
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+        check(difflib.diff_bytes(unified, a, b, fna, fnb))
+        check(difflib.diff_bytes(context, a, b, fna, fnb))
+
+        def assertDiff(expect, actual):
+            # do not compare expect and actual as lists, because unittest
+            # uses difflib to report difference between lists
+            actual = list(actual)
+            self.assertEqual(len(expect), len(actual))
+            for e, a in zip(expect, actual):
+                self.assertEqual(e, a)
+
+        expect = [
+            b'--- \xb3odz.txt',
+            b'+++ \xc5\x82odz.txt',
+            b'@@ -1 +1 @@',
+            b'-\xa3odz is a city in Poland.',
+            b'+\xc5\x81odz is a city in Poland.',
+        ]
+        actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'')
+        assertDiff(expect, actual)
+
+        # with dates (plain ASCII)
+        datea = b'2005-03-18'
+        dateb = b'2005-03-19'
+        check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb))
+        check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb))
+
+        expect = [
+            # note the mixed encodings here: this is deeply wrong by every
+            # tenet of Unicode, but it doesn't crash, it's parseable by
+            # patch, and it's how UNIX(tm) diff behaves
+            b'--- \xb3odz.txt\t2005-03-18',
+            b'+++ \xc5\x82odz.txt\t2005-03-19',
+            b'@@ -1 +1 @@',
+            b'-\xa3odz is a city in Poland.',
+            b'+\xc5\x81odz is a city in Poland.',
+        ]
+        actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb,
+                                    lineterm=b'')
+        assertDiff(expect, actual)
+
+    def test_mixed_types_content(self):
+        # type of input content must be consistent: all str or all bytes
+        a = [b'hello']
+        b = ['hello']
+
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+
+        expect = "lines to compare must be str, not bytes (b'hello')"
+        self._assert_type_error(expect, unified, a, b)
+        self._assert_type_error(expect, unified, b, a)
+        self._assert_type_error(expect, context, a, b)
+        self._assert_type_error(expect, context, b, a)
+
+        expect = "all arguments must be bytes, not str ('hello')"
+        self._assert_type_error(expect, difflib.diff_bytes, unified, a, b)
+        self._assert_type_error(expect, difflib.diff_bytes, unified, b, a)
+        self._assert_type_error(expect, difflib.diff_bytes, context, a, b)
+        self._assert_type_error(expect, difflib.diff_bytes, context, b, a)
+
+    def test_mixed_types_filenames(self):
+        # cannot pass filenames as bytes if content is str (this may not be
+        # the right behaviour, but at least the test demonstrates how
+        # things work)
+        a = ['hello\n']
+        b = ['ohell\n']
+        fna = b'ol\xe9.txt'     # filename transcoded from ISO-8859-1
+        fnb = b'ol\xc3a9.txt'   # to UTF-8
+        self._assert_type_error(
+            "all arguments must be str, not: b'ol\\xe9.txt'",
+            difflib.unified_diff, a, b, fna, fnb)
+
+    def test_mixed_types_dates(self):
+        # type of dates must be consistent with type of contents
+        a = [b'foo\n']
+        b = [b'bar\n']
+        datea = '1 fév'
+        dateb = '3 fév'
+        self._assert_type_error(
+            "all arguments must be bytes, not str ('1 fév')",
+            difflib.diff_bytes, difflib.unified_diff,
+            a, b, b'a', b'b', datea, dateb)
+
+        # if input is str, non-ASCII dates are fine
+        a = ['foo\n']
+        b = ['bar\n']
+        list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb))
+
+    def _assert_type_error(self, msg, generator, *args):
+        with self.assertRaises(TypeError) as ctx:
+            list(generator(*args))
+        self.assertEqual(msg, str(ctx.exception))
+
+
 def test_main():
     difflib.HtmlDiff._default_prefix = 0
     Doctests = doctest.DocTestSuite(difflib)
     run_unittest(
         TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
-        TestOutputFormat, Doctests)
+        TestOutputFormat, TestBytes, Doctests)
 
 if __name__ == '__main__':
     test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 5e31bd0..7277617 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -195,6 +195,10 @@ Library
 - Issue #23310: Fix MagicMock's initializer to work with __methods__, just
   like configure_mock().  Patch by Kasia Jachim.
 
+- Issue #17445: add difflib.diff_bytes() to support comparison of
+  byte strings (fixes a regression from Python 2).
+
+
 Build
 -----
author	Greg Ward <greg@gerg.ca>	2015-04-21 00:21:21 (GMT)
committer	Greg Ward <greg@gerg.ca>	2015-04-21 00:21:21 (GMT)
commit	4d9d2563f51edad448a960d9490a6f56ac733735 (patch)
tree	732ccd1c3bded6c2b25b942c7f55c5cc685dab1e
parent	d19458ac51633cac979b7c7b9439ea89f179c8c8 (diff)
download	cpython-4d9d2563f51edad448a960d9490a6f56ac733735.zip cpython-4d9d2563f51edad448a960d9490a6f56ac733735.tar.gz cpython-4d9d2563f51edad448a960d9490a6f56ac733735.tar.bz2