From 102029dfd645123244437dee6fc8c0f72125433b Mon Sep 17 00:00:00 2001 From: Berker Peksag Date: Sun, 15 Mar 2015 01:18:47 +0200 Subject: Issue #2052: Add charset parameter to HtmlDiff.make_file(). --- Doc/library/difflib.rst | 7 ++++++- Doc/whatsnew/3.5.rst | 8 ++++++++ Lib/difflib.py | 19 +++++++++++-------- Lib/test/test_difflib.py | 35 +++++++++++++++++++++++++++++++++++ Lib/test/test_difflib_expect.html | 2 +- Misc/NEWS | 2 ++ 6 files changed, 63 insertions(+), 10 deletions(-) diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst index 329bde0..4427065 100644 --- a/Doc/library/difflib.rst +++ b/Doc/library/difflib.rst @@ -104,7 +104,8 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module. The following methods are public: - .. method:: make_file(fromlines, tolines, fromdesc='', todesc='', context=False, numlines=5) + .. method:: make_file(fromlines, tolines, fromdesc='', todesc='', context=False, \ + numlines=5, *, charset='utf-8') Compares *fromlines* and *tolines* (lists of strings) and returns a string which is a complete HTML file containing a table showing line by line differences with @@ -123,6 +124,10 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module. the next difference highlight at the top of the browser without any leading context). + .. versionchanged:: 3.5 + The *charset* keyword-only argument was added. The default charset of + the HTML document changed from ``'ISO-8859-1'`` to ``'utf-8'``. + .. method:: make_table(fromlines, tolines, fromdesc='', todesc='', context=False, numlines=5) Compares *fromlines* and *tolines* (lists of strings) and returns a string which diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst index 2f79848..21fafd0 100644 --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -225,6 +225,14 @@ contextlib don't provide any options to redirect it. (Contributed by Berker Peksag in :issue:`22389`.)
+difflib +------- + +* The charset of the HTML document generated by :meth:`difflib.HtmlDiff.make_file` + can now be customized by using the *charset* keyword-only parameter. The default + charset of the HTML document changed from ``'ISO-8859-1'`` to ``'utf-8'``. + (Contributed by Berker Peksag in :issue:`2052`.) + distutils --------- diff --git a/Lib/difflib.py b/Lib/difflib.py index ae3479d..758f1aa 100644 --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -1598,7 +1598,7 @@ _file_template = """ <head> <meta http-equiv="Content-Type" - content="text/html; charset=ISO-8859-1" /> + content="text/html; charset=%(charset)s" /> <title></title> <style type="text/css">%(styles)s </style> @@ -1685,8 +1685,8 @@ class HtmlDiff(object): self._linejunk = linejunk self._charjunk = charjunk - def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False, - numlines=5): + def make_file(self, fromlines, tolines, fromdesc='', todesc='', + context=False, numlines=5, *, charset='utf-8'): """Returns HTML file of side by side comparison with change highlights Arguments: @@ -1701,13 +1701,16 @@ class HtmlDiff(object): When context is False, controls the number of lines to place the "next" link anchors before the next change (so click of "next" link jumps to just before the change). + charset -- charset of the HTML document """ - return self._file_template % dict( - styles = self._styles, - legend = self._legend, - table = self.make_table(fromlines,tolines,fromdesc,todesc, - context=context,numlines=numlines)) + return (self._file_template % dict( + styles=self._styles, + legend=self._legend, + table=self.make_table(fromlines, tolines, fromdesc, todesc, + context=context, numlines=numlines), + charset=charset + )).encode(charset, 'xmlcharrefreplace').decode(charset) def _tab_newline_replace(self,fromlines,tolines): """Returns from/to line lists with tabs expanded and newlines removed. diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py index 0ba8f0e..a078e71 100644 --- a/Lib/test/test_difflib.py +++ b/Lib/test/test_difflib.py @@ -107,6 +107,20 @@ patch914575_to1 = """ 5. Flat is better than nested. 
""" +patch914575_nonascii_from1 = """ + 1. Beautiful is beTTer than ugly. + 2. Explicit is better than ımplıcıt. + 3. Simple is better than complex. + 4. Complex is better than complicated. +""" + +patch914575_nonascii_to1 = """ + 1. Beautiful is better than ügly. + 3. Sımple is better than complex. + 4. Complicated is better than cömplex. + 5. Flat is better than nested. +""" + patch914575_from2 = """ \t\tLine 1: preceeded by from:[tt] to:[ssss] \t\tLine 2: preceeded by from:[sstt] to:[sssst] @@ -223,6 +237,27 @@ class TestSFpatches(unittest.TestCase): new = [(i%2 and "K:%d" or "V:B:%d") % i for i in range(limit*2)] difflib.SequenceMatcher(None, old, new).get_opcodes() + def test_make_file_default_charset(self): + html_diff = difflib.HtmlDiff() + output = html_diff.make_file(patch914575_from1.splitlines(), + patch914575_to1.splitlines()) + self.assertIn('content="text/html; charset=utf-8"', output) + + def test_make_file_iso88591_charset(self): + html_diff = difflib.HtmlDiff() + output = html_diff.make_file(patch914575_from1.splitlines(), + patch914575_to1.splitlines(), + charset='iso-8859-1') + self.assertIn('content="text/html; charset=iso-8859-1"', output) + + def test_make_file_usascii_charset_with_nonascii_input(self): + html_diff = difflib.HtmlDiff() + output = html_diff.make_file(patch914575_nonascii_from1.splitlines(), + patch914575_nonascii_to1.splitlines(), + charset='us-ascii') + self.assertIn('content="text/html; charset=us-ascii"', output) + self.assertIn('&#305;mpl&#305;c&#305;t', output) + class TestOutputFormat(unittest.TestCase): def test_tab_delimiter(self): diff --git a/Lib/test/test_difflib_expect.html b/Lib/test/test_difflib_expect.html index 71b6d7a..ea7a24e 100644 --- a/Lib/test/test_difflib_expect.html +++ b/Lib/test/test_difflib_expect.html @@ -6,7 +6,7 @@ <head> <meta http-equiv="Content-Type" - content="text/html; charset=ISO-8859-1" /> + content="text/html; charset=utf-8" />