1 files changed, 181 insertions, 1 deletions
diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
index 0ba8f0e05b..ab9debf8e2 100644
--- a/Lib/test/test_difflib.py
+++ b/Lib/test/test_difflib.py
@@ -107,6 +107,20 @@ patch914575_to1 = """
    5. Flat is better than nested.
 """
 
+patch914575_nonascii_from1 = """
+   1. Beautiful is beTTer than ugly.
+   2. Explicit is better than ımplıcıt.
+   3. Simple is better than complex.
+   4. Complex is better than complicated.
+"""
+
+patch914575_nonascii_to1 = """
+   1. Beautiful is better than ügly.
+   3.   Sımple is better than complex.
+   4. Complicated is better than cömplex.
+   5. Flat is better than nested.
+"""
+
 patch914575_from2 = """
 \t\tLine 1: preceeded by from:[tt] to:[ssss]
   \t\tLine 2: preceeded by from:[sstt] to:[sssst]
@@ -223,6 +237,27 @@ class TestSFpatches(unittest.TestCase):
         new = [(i%2 and "K:%d" or "V:B:%d") % i for i in range(limit*2)]
         difflib.SequenceMatcher(None, old, new).get_opcodes()
 
+    def test_make_file_default_charset(self):
+        html_diff = difflib.HtmlDiff()
+        output = html_diff.make_file(patch914575_from1.splitlines(),
+                                     patch914575_to1.splitlines())
+        self.assertIn('content="text/html; charset=utf-8"', output)
+
+    def test_make_file_iso88591_charset(self):
+        html_diff = difflib.HtmlDiff()
+        output = html_diff.make_file(patch914575_from1.splitlines(),
+                                     patch914575_to1.splitlines(),
+                                     charset='iso-8859-1')
+        self.assertIn('content="text/html; charset=iso-8859-1"', output)
+
+    def test_make_file_usascii_charset_with_nonascii_input(self):
+        html_diff = difflib.HtmlDiff()
+        output = html_diff.make_file(patch914575_nonascii_from1.splitlines(),
+                                     patch914575_nonascii_to1.splitlines(),
+                                     charset='us-ascii')
+        self.assertIn('content="text/html; charset=us-ascii"', output)
+        self.assertIn('&#305;mpl&#305;c&#305;t', output)
+
 
 class TestOutputFormat(unittest.TestCase):
     def test_tab_delimiter(self):
@@ -287,12 +322,157 @@ class TestOutputFormat(unittest.TestCase):
         self.assertEqual(fmt(0,0), '0')
 
 
+class TestBytes(unittest.TestCase):
+    # don't really care about the content of the output, just the fact
+    # that it's bytes and we don't crash
+    def check(self, diff):
+        diff = list(diff)   # trigger exceptions first
+        for line in diff:
+            self.assertIsInstance(
+                line, bytes,
+                "all lines of diff should be bytes, but got: %r" % line)
+
+    def test_byte_content(self):
+        # if we receive byte strings, we return byte strings
+        a = [b'hello', b'andr\xe9']     # iso-8859-1 bytes
+        b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes
+
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+
+        check = self.check
+        check(difflib.diff_bytes(unified, a, a))
+        check(difflib.diff_bytes(unified, a, b))
+
+        # now with filenames (content and filenames are all bytes!)
+        check(difflib.diff_bytes(unified, a, a, b'a', b'a'))
+        check(difflib.diff_bytes(unified, a, b, b'a', b'b'))
+
+        # and with filenames and dates
+        check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013'))
+        check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013'))
+
+        # same all over again, with context diff
+        check(difflib.diff_bytes(context, a, a))
+        check(difflib.diff_bytes(context, a, b))
+        check(difflib.diff_bytes(context, a, a, b'a', b'a'))
+        check(difflib.diff_bytes(context, a, b, b'a', b'b'))
+        check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
+        check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))
+
+    def test_byte_filenames(self):
+        # somebody renamed a file from ISO-8859-2 to UTF-8
+        fna = b'\xb3odz.txt'    # "łodz.txt"
+        fnb = b'\xc5\x82odz.txt'
+
+        # they transcoded the content at the same time
+        a = [b'\xa3odz is a city in Poland.']
+        b = [b'\xc5\x81odz is a city in Poland.']
+
+        check = self.check
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+        check(difflib.diff_bytes(unified, a, b, fna, fnb))
+        check(difflib.diff_bytes(context, a, b, fna, fnb))
+
+        def assertDiff(expect, actual):
+            # do not compare expect and equal as lists, because unittest
+            # uses difflib to report difference between lists
+            actual = list(actual)
+            self.assertEqual(len(expect), len(actual))
+            for e, a in zip(expect, actual):
+                self.assertEqual(e, a)
+
+        expect = [
+            b'--- \xb3odz.txt',
+            b'+++ \xc5\x82odz.txt',
+            b'@@ -1 +1 @@',
+            b'-\xa3odz is a city in Poland.',
+            b'+\xc5\x81odz is a city in Poland.',
+        ]
+        actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'')
+        assertDiff(expect, actual)
+
+        # with dates (plain ASCII)
+        datea = b'2005-03-18'
+        dateb = b'2005-03-19'
+        check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb))
+        check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb))
+
+        expect = [
+            # note the mixed encodings here: this is deeply wrong by every
+            # tenet of Unicode, but it doesn't crash, it's parseable by
+            # patch, and it's how UNIX(tm) diff behaves
+            b'--- \xb3odz.txt\t2005-03-18',
+            b'+++ \xc5\x82odz.txt\t2005-03-19',
+            b'@@ -1 +1 @@',
+            b'-\xa3odz is a city in Poland.',
+            b'+\xc5\x81odz is a city in Poland.',
+        ]
+        actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb,
+                                    lineterm=b'')
+        assertDiff(expect, actual)
+
+    def test_mixed_types_content(self):
+        # type of input content must be consistent: all str or all bytes
+        a = [b'hello']
+        b = ['hello']
+
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+
+        expect = "lines to compare must be str, not bytes (b'hello')"
+        self._assert_type_error(expect, unified, a, b)
+        self._assert_type_error(expect, unified, b, a)
+        self._assert_type_error(expect, context, a, b)
+        self._assert_type_error(expect, context, b, a)
+
+        expect = "all arguments must be bytes, not str ('hello')"
+        self._assert_type_error(expect, difflib.diff_bytes, unified, a, b)
+        self._assert_type_error(expect, difflib.diff_bytes, unified, b, a)
+        self._assert_type_error(expect, difflib.diff_bytes, context, a, b)
+        self._assert_type_error(expect, difflib.diff_bytes, context, b, a)
+
+    def test_mixed_types_filenames(self):
+        # cannot pass filenames as bytes if content is str (this may not be
+        # the right behaviour, but at least the test demonstrates how
+        # things work)
+        a = ['hello\n']
+        b = ['ohell\n']
+        fna = b'ol\xe9.txt'     # filename transcoded from ISO-8859-1
+        fnb = b'ol\xc3a9.txt'   # to UTF-8
+        self._assert_type_error(
+            "all arguments must be str, not: b'ol\\xe9.txt'",
+            difflib.unified_diff, a, b, fna, fnb)
+
+    def test_mixed_types_dates(self):
+        # type of dates must be consistent with type of contents
+        a = [b'foo\n']
+        b = [b'bar\n']
+        datea = '1 fév'
+        dateb = '3 fév'
+        self._assert_type_error(
+            "all arguments must be bytes, not str ('1 fév')",
+            difflib.diff_bytes, difflib.unified_diff,
+            a, b, b'a', b'b', datea, dateb)
+
+        # if input is str, non-ASCII dates are fine
+        a = ['foo\n']
+        b = ['bar\n']
+        list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb))
+
+    def _assert_type_error(self, msg, generator, *args):
+        with self.assertRaises(TypeError) as ctx:
+            list(generator(*args))
+        self.assertEqual(msg, str(ctx.exception))
+
+
 def test_main():
     difflib.HtmlDiff._default_prefix = 0
     Doctests = doctest.DocTestSuite(difflib)
     run_unittest(
         TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
-        TestOutputFormat, Doctests)
+        TestOutputFormat, TestBytes, Doctests)
 
 if __name__ == '__main__':
     test_main()