1import difflib
2from test.support import run_unittest, findfile
3import unittest
4import doctest
5import sys
6
7
class TestWithAscii(unittest.TestCase):
    """SequenceMatcher checks on short, plain-ASCII strings."""

    def test_one_insert(self):
        # One extra character in the second sequence must be reported as a
        # single 'insert' opcode, and no element may be flagged as popular.
        run_of_b = 'b' * 100

        # Extra character at the very beginning.
        matcher = difflib.SequenceMatcher(None, run_of_b, 'a' + run_of_b)
        self.assertAlmostEqual(matcher.ratio(), 0.995, places=3)
        opcodes = list(matcher.get_opcodes())
        self.assertEqual(opcodes, [
            ('insert', 0, 0, 0, 1),
            ('equal', 0, 100, 1, 101),
        ])
        self.assertEqual(matcher.bpopular, set())

        # Extra character in the middle.
        half = 'b' * 50
        matcher = difflib.SequenceMatcher(None, run_of_b, half + 'a' + half)
        self.assertAlmostEqual(matcher.ratio(), 0.995, places=3)
        opcodes = list(matcher.get_opcodes())
        self.assertEqual(opcodes, [
            ('equal', 0, 50, 0, 50),
            ('insert', 50, 50, 50, 51),
            ('equal', 50, 100, 51, 101),
        ])
        self.assertEqual(matcher.bpopular, set())

    def test_one_delete(self):
        # Dropping the lone 'c' between the two runs must be reported as a
        # single 'delete' opcode.
        head = 'a' * 40
        tail = 'b' * 40
        matcher = difflib.SequenceMatcher(None, head + 'c' + tail, head + tail)
        self.assertAlmostEqual(matcher.ratio(), 0.994, places=3)
        opcodes = list(matcher.get_opcodes())
        self.assertEqual(opcodes, [
            ('equal', 0, 40, 0, 40),
            ('delete', 40, 41, 40, 40),
            ('equal', 41, 81, 40, 80),
        ])

    def test_bjunk(self):
        # ``bjunk`` must hold exactly those elements of ``b`` for which the
        # ``isjunk`` predicate answers true.
        first = 'a' * 40 + 'b' * 40
        second = 'a' * 44 + 'b' * 40
        second_padded = second + ' ' * 20

        def is_space(char):
            return char == ' '

        def is_space_or_b(char):
            return char in (' ', 'b')

        # No junk character occurs in ``b`` at all.
        matcher = difflib.SequenceMatcher(isjunk=is_space, a=first, b=second)
        self.assertEqual(matcher.bjunk, set())

        # Trailing blanks in ``b`` are picked up as junk.
        matcher = difflib.SequenceMatcher(isjunk=is_space, a=first,
                                          b=second_padded)
        self.assertEqual(matcher.bjunk, {' '})

        # A wider predicate marks both the blanks and the 'b' characters.
        matcher = difflib.SequenceMatcher(isjunk=is_space_or_b, a=first,
                                          b=second_padded)
        self.assertEqual(matcher.bjunk, {' ', 'b'})
44
45
class TestAutojunk(unittest.TestCase):
    """Exercise the ``autojunk`` parameter of SequenceMatcher (new in 2.7)."""

    def test_one_insert_homogenous_sequence(self):
        repeated = 'b' * 200
        with_prefix = 'a' + repeated

        # ``autojunk`` defaults to True, and for a sequence of 200+ elements
        # the heuristic kicks in: 'b' is treated as popular and the ratio
        # drops to zero.
        default_matcher = difflib.SequenceMatcher(None, repeated, with_prefix)
        self.assertAlmostEqual(default_matcher.ratio(), 0, places=3)
        self.assertEqual(default_matcher.bpopular, {'b'})

        # With the heuristic switched off nothing is popular and the two
        # sequences are recognised as nearly identical.
        plain_matcher = difflib.SequenceMatcher(None, repeated, with_prefix,
                                                autojunk=False)
        self.assertAlmostEqual(plain_matcher.ratio(), 0.9975, places=3)
        self.assertEqual(plain_matcher.bpopular, set())
62
63
64class TestSFbugs(unittest.TestCase):
65    def test_ratio_for_null_seqn(self):
66        # Check clearing of SF bug 763023
67        s = difflib.SequenceMatcher(None, [], [])
68        self.assertEqual(s.ratio(), 1)
69        self.assertEqual(s.quick_ratio(), 1)
70        self.assertEqual(s.real_quick_ratio(), 1)
71
72    def test_comparing_empty_lists(self):
73        # Check fix for bug #979794
74        group_gen = difflib.SequenceMatcher(None, [], []).get_grouped_opcodes()
75        self.assertRaises(StopIteration, next, group_gen)
76        diff_gen = difflib.unified_diff([], [])
77        self.assertRaises(StopIteration, next, diff_gen)
78
79    def test_matching_blocks_cache(self):
80        # Issue #21635
81        s = difflib.SequenceMatcher(None, "abxcd", "abcd")
82        first = s.get_matching_blocks()
83        second = s.get_matching_blocks()
84        self.assertEqual(second[0].size, 2)
85        self.assertEqual(second[1].size, 2)
86        self.assertEqual(second[2].size, 0)
87
88    def test_added_tab_hint(self):
89        # Check fix for bug #1488943
90        diff = list(difflib.Differ().compare(["\tI am a buggy"],["\t\tI am a bug"]))
91        self.assertEqual("- \tI am a buggy", diff[0])
92        self.assertEqual("?            --\n", diff[1])
93        self.assertEqual("+ \t\tI am a bug", diff[2])
94        self.assertEqual("? +\n", diff[3])
95
# "From" text for the SF patch #914575 HTML-diff fixtures.  TestSFpatches
# compares it against patch914575_to1 (test_html_diff and the make_file
# charset tests).
patch914575_from1 = """
   1. Beautiful is beTTer than ugly.
   2. Explicit is better than implicit.
   3. Simple is better than complex.
   4. Complex is better than complicated.
"""

# "To" text paired with patch914575_from1: relative to it, line 1 is altered,
# line 2 is absent, lines 3 and 4 are changed and a line 5 is added.
patch914575_to1 = """
   1. Beautiful is better than ugly.
   3.   Simple is better than complex.
   4. Complicated is better than complex.
   5. Flat is better than nested.
"""
109
# Variant of the patch914575 "from" text that contains non-ASCII characters
# (dotless i).  Used by
# TestSFpatches.test_make_file_usascii_charset_with_nonascii_input.
patch914575_nonascii_from1 = """
   1. Beautiful is beTTer than ugly.
   2. Explicit is better than ımplıcıt.
   3. Simple is better than complex.
   4. Complex is better than complicated.
"""

# "To" text paired with patch914575_nonascii_from1; it also contains
# non-ASCII characters.
patch914575_nonascii_to1 = """
   1. Beautiful is better than ügly.
   3.   Sımple is better than complex.
   4. Complicated is better than cömplex.
   5. Flat is better than nested.
"""
123
# Whitespace-sensitive "from" text: TestSFpatches.test_html_diff renders it
# against patch914575_to2 with tabsize=2 and with the default tab size.
# The from:[...]/to:[...] labels in each line appear to spell out that line's
# own whitespace on either side (t = tab, s = space).
patch914575_from2 = """
\t\tLine 1: preceded by from:[tt] to:[ssss]
  \t\tLine 2: preceded by from:[sstt] to:[sssst]
  \t \tLine 3: preceded by from:[sstst] to:[ssssss]
Line 4:  \thas from:[sst] to:[sss] after :
Line 5: has from:[t] to:[ss] at end\t
"""

# "To" text paired with patch914575_from2.
patch914575_to2 = """
    Line 1: preceded by from:[tt] to:[ssss]
    \tLine 2: preceded by from:[sstt] to:[sssst]
      Line 3: preceded by from:[sstst] to:[ssssss]
Line 4:   has from:[sst] to:[sss] after :
Line 5: has from:[t] to:[ss] at end
"""
139
# "From" text for the line-wrapping cases: TestSFpatches.test_html_diff
# renders it against patch914575_to3 with HtmlDiff(wrapcolumn=14), so it
# mixes lines shorter than, close to and longer than 14 characters.
patch914575_from3 = """line 0
1234567890123456789012345689012345
line 1
line 2
line 3
line 4   changed
line 5   changed
line 6   changed
line 7
line 8  subtracted
line 9
1234567890123456789012345689012345
short line
just fits in!!
just fits in two lines yup!!
the end"""

# "To" text paired with patch914575_from3.
patch914575_to3 = """line 0
1234567890123456789012345689012345
line 1
line 2    added
line 3
line 4   chanGEd
line 5a  chanGed
line 6a  changEd
line 7
line 8
line 9
1234567890
another long line that needs to be wrapped
just fitS in!!
just fits in two lineS yup!!
the end"""
173
class TestSFpatches(unittest.TestCase):
    """Tests for difflib.HtmlDiff (SF patch 914575) and related patches."""

    def test_html_diff(self):
        # Check SF patch 914575 for generating HTML differences
        f1a = (patch914575_from1 + '123\n' * 10) * 3
        t1a = (patch914575_to1 + '123\n' * 10) * 3
        # The "b" variants put ten identical lines in front, so the first
        # difference no longer lies within the leading context lines.
        f1b = '456\n' * 10 + f1a
        t1b = '456\n' * 10 + t1a
        f1a = f1a.splitlines()
        t1a = t1a.splitlines()
        f1b = f1b.splitlines()
        t1b = t1b.splitlines()
        f2 = patch914575_from2.splitlines()
        t2 = patch914575_to2.splitlines()
        f3 = patch914575_from3
        t3 = patch914575_to3
        plain = difflib.HtmlDiff()
        narrow_tabs = difflib.HtmlDiff(tabsize=2)
        wrapped = difflib.HtmlDiff(wrapcolumn=14)

        full = plain.make_file(f1a, t1a, 'from', 'to',
                               context=False, numlines=5)
        # The headings below are part of the compared output and must stay
        # in step with the stored baseline file.
        tables = '\n'.join(
            [
             '<h2>Context (first diff within numlines=5(default))</h2>',
             plain.make_table(f1a, t1a, 'from', 'to', context=True),
             '<h2>Context (first diff after numlines=5(default))</h2>',
             plain.make_table(f1b, t1b, 'from', 'to', context=True),
             '<h2>Context (numlines=6)</h2>',
             plain.make_table(f1a, t1a, 'from', 'to', context=True,
                              numlines=6),
             '<h2>Context (numlines=0)</h2>',
             plain.make_table(f1a, t1a, 'from', 'to', context=True,
                              numlines=0),
             '<h2>Same Context</h2>',
             plain.make_table(f1a, f1a, 'from', 'to', context=True),
             '<h2>Same Full</h2>',
             plain.make_table(f1a, f1a, 'from', 'to', context=False),
             '<h2>Empty Context</h2>',
             plain.make_table([], [], 'from', 'to', context=True),
             '<h2>Empty Full</h2>',
             plain.make_table([], [], 'from', 'to', context=False),
             '<h2>tabsize=2</h2>',
             narrow_tabs.make_table(f2, t2),
             '<h2>tabsize=default</h2>',
             plain.make_table(f2, t2),
             '<h2>Context (wrapcolumn=14,numlines=0)</h2>',
             wrapped.make_table(f3.splitlines(), t3.splitlines(),
                                context=True, numlines=0),
             '<h2>wrapcolumn=14,splitlines()</h2>',
             wrapped.make_table(f3.splitlines(), t3.splitlines()),
             '<h2>wrapcolumn=14,splitlines(True)</h2>',
             wrapped.make_table(f3.splitlines(True), t3.splitlines(True)),
             ])
        actual = full.replace('</body>', '\n%s\n</body>' % tables)

        # temporarily uncomment next two lines to baseline this test
        #with open('test_difflib_expect.html', 'w', encoding='utf-8') as fp:
        #    fp.write(actual)

        # Read the baseline with an explicit encoding so the comparison does
        # not depend on the locale of the machine running the test; the
        # generated page declares charset=utf-8 by default.
        with open(findfile('test_difflib_expect.html'),
                  encoding='utf-8') as fp:
            self.assertEqual(actual, fp.read())

    def test_recursion_limit(self):
        # Check if the problem described in patch #1413711 exists.
        # Both sequences are twice as long as the recursion limit; computing
        # the opcodes must simply complete without raising.
        limit = sys.getrecursionlimit()
        old = [("K:%d" if i % 2 else "V:A:%d") % i for i in range(limit * 2)]
        new = [("K:%d" if i % 2 else "V:B:%d") % i for i in range(limit * 2)]
        difflib.SequenceMatcher(None, old, new).get_opcodes()

    def test_make_file_default_charset(self):
        # Without a ``charset`` argument the page declares utf-8.
        html_diff = difflib.HtmlDiff()
        output = html_diff.make_file(patch914575_from1.splitlines(),
                                     patch914575_to1.splitlines())
        self.assertIn('content="text/html; charset=utf-8"', output)

    def test_make_file_iso88591_charset(self):
        # An explicit ``charset`` is echoed in the page's content-type.
        html_diff = difflib.HtmlDiff()
        output = html_diff.make_file(patch914575_from1.splitlines(),
                                     patch914575_to1.splitlines(),
                                     charset='iso-8859-1')
        self.assertIn('content="text/html; charset=iso-8859-1"', output)

    def test_make_file_usascii_charset_with_nonascii_input(self):
        # Characters that do not fit the requested charset must come out as
        # numeric character references (&#305; is the dotless i).
        html_diff = difflib.HtmlDiff()
        output = html_diff.make_file(patch914575_nonascii_from1.splitlines(),
                                     patch914575_nonascii_to1.splitlines(),
                                     charset='us-ascii')
        self.assertIn('content="text/html; charset=us-ascii"', output)
        self.assertIn('&#305;mpl&#305;c&#305;t', output)
260
261
class TestOutputFormat(unittest.TestCase):
    """Header lines and range formatting of unified and context diffs."""

    def test_tab_delimiter(self):
        # File name and file date are joined by a tab in both header lines.
        diff_args = ['one', 'two', 'Original', 'Current',
                     '2005-01-26 23:30:50', '2010-04-02 10:20:52']

        unified = difflib.unified_diff(*diff_args, lineterm='')
        unified_header = list(unified)[:2]
        self.assertEqual(unified_header, [
            "--- Original\t2005-01-26 23:30:50",
            "+++ Current\t2010-04-02 10:20:52",
        ])

        context = difflib.context_diff(*diff_args, lineterm='')
        context_header = list(context)[:2]
        self.assertEqual(context_header, [
            "*** Original\t2005-01-26 23:30:50",
            "--- Current\t2010-04-02 10:20:52",
        ])

    def test_no_trailing_tab_on_empty_filedate(self):
        # When no dates are given the header lines end right after the name.
        diff_args = ['one', 'two', 'Original', 'Current']

        unified = difflib.unified_diff(*diff_args, lineterm='')
        unified_header = list(unified)[:2]
        self.assertEqual(unified_header, ["--- Original", "+++ Current"])

        context = difflib.context_diff(*diff_args, lineterm='')
        context_header = list(context)[:2]
        self.assertEqual(context_header, ["*** Original", "--- Current"])

    def test_range_format_unified(self):
        # Per the diff spec at http://www.unix.org/single_unix_specification/
        # (the text is kept for reference only; nothing below reads it)
        spec = '''\
           Each <range> field shall be of the form:
             %1d", <beginning line number>  if the range contains exactly one line,
           and:
            "%1d,%1d", <beginning line number>, <number of lines> otherwise.
           If a range is empty, its beginning line number shall be the number of
           the line just before the range, or 0 if the empty range starts the file.
        '''
        format_range = difflib._format_range_unified
        # ((start, stop), expected text)
        expectations = (
            ((3, 3), '3,0'),
            ((3, 4), '4'),
            ((3, 5), '4,2'),
            ((3, 6), '4,3'),
            ((0, 0), '0,0'),
        )
        for (start, stop), expected in expectations:
            self.assertEqual(format_range(start, stop), expected)

    def test_range_format_context(self):
        # Per the diff spec at http://www.unix.org/single_unix_specification/
        # (the text is kept for reference only; nothing below reads it)
        spec = '''\
           The range of lines in file1 shall be written in the following format
           if the range contains two or more lines:
               "*** %d,%d ****\n", <beginning line number>, <ending line number>
           and the following format otherwise:
               "*** %d ****\n", <ending line number>
           The ending line number of an empty range shall be the number of the preceding line,
           or 0 if the range is at the start of the file.

           Next, the range of lines in file2 shall be written in the following format
           if the range contains two or more lines:
               "--- %d,%d ----\n", <beginning line number>, <ending line number>
           and the following format otherwise:
               "--- %d ----\n", <ending line number>
        '''
        format_range = difflib._format_range_context
        # ((start, stop), expected text)
        expectations = (
            ((3, 3), '3'),
            ((3, 4), '4'),
            ((3, 5), '4,5'),
            ((3, 6), '4,6'),
            ((0, 0), '0'),
        )
        for (start, stop), expected in expectations:
            self.assertEqual(format_range(start, stop), expected)
323
324
class TestBytes(unittest.TestCase):
    """Tests for difflib.diff_bytes() and for str/bytes type checking."""

    # don't really care about the content of the output, just the fact
    # that it's bytes and we don't crash
    def check(self, diff):
        """Consume *diff* and assert that every line it yields is bytes."""
        diff = list(diff)   # trigger exceptions first
        for line in diff:
            self.assertIsInstance(
                line, bytes,
                "all lines of diff should be bytes, but got: %r" % line)

    def test_byte_content(self):
        # if we receive byte strings, we return byte strings
        a = [b'hello', b'andr\xe9']     # iso-8859-1 bytes
        b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes

        check = self.check
        # the same set of calls for unified and for context diffs
        for diff_func in (difflib.unified_diff, difflib.context_diff):
            # content only
            check(difflib.diff_bytes(diff_func, a, a))
            check(difflib.diff_bytes(diff_func, a, b))

            # now with filenames (content and filenames are all bytes!)
            check(difflib.diff_bytes(diff_func, a, a, b'a', b'a'))
            check(difflib.diff_bytes(diff_func, a, b, b'a', b'b'))

            # and with filenames and dates
            check(difflib.diff_bytes(diff_func, a, a, b'a', b'a',
                                     b'2005', b'2013'))
            check(difflib.diff_bytes(diff_func, a, b, b'a', b'b',
                                     b'2005', b'2013'))

    def test_byte_filenames(self):
        # somebody renamed a file from ISO-8859-2 to UTF-8
        fna = b'\xb3odz.txt'    # "łodz.txt"
        fnb = b'\xc5\x82odz.txt'

        # they transcoded the content at the same time
        a = [b'\xa3odz is a city in Poland.']
        b = [b'\xc5\x81odz is a city in Poland.']

        check = self.check
        unified = difflib.unified_diff
        context = difflib.context_diff
        check(difflib.diff_bytes(unified, a, b, fna, fnb))
        check(difflib.diff_bytes(context, a, b, fna, fnb))

        def assertDiff(expect, actual):
            # do not compare expect and actual as lists, because unittest
            # uses difflib to report difference between lists
            actual = list(actual)
            self.assertEqual(len(expect), len(actual))
            for expected_line, actual_line in zip(expect, actual):
                self.assertEqual(expected_line, actual_line)

        expect = [
            b'--- \xb3odz.txt',
            b'+++ \xc5\x82odz.txt',
            b'@@ -1 +1 @@',
            b'-\xa3odz is a city in Poland.',
            b'+\xc5\x81odz is a city in Poland.',
        ]
        actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'')
        assertDiff(expect, actual)

        # with dates (plain ASCII)
        datea = b'2005-03-18'
        dateb = b'2005-03-19'
        check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb))
        check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb))

        expect = [
            # note the mixed encodings here: this is deeply wrong by every
            # tenet of Unicode, but it doesn't crash, it's parseable by
            # patch, and it's how UNIX(tm) diff behaves
            b'--- \xb3odz.txt\t2005-03-18',
            b'+++ \xc5\x82odz.txt\t2005-03-19',
            b'@@ -1 +1 @@',
            b'-\xa3odz is a city in Poland.',
            b'+\xc5\x81odz is a city in Poland.',
        ]
        actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb,
                                    lineterm=b'')
        assertDiff(expect, actual)

    def test_mixed_types_content(self):
        # type of input content must be consistent: all str or all bytes
        a = [b'hello']
        b = ['hello']

        unified = difflib.unified_diff
        context = difflib.context_diff

        # the plain diff functions reject the bytes line, whichever side
        # it is on
        expect = "lines to compare must be str, not bytes (b'hello')"
        self._assert_type_error(expect, unified, a, b)
        self._assert_type_error(expect, unified, b, a)
        self._assert_type_error(expect, context, a, b)
        self._assert_type_error(expect, context, b, a)

        # diff_bytes() rejects the str line, whichever side it is on
        expect = "all arguments must be bytes, not str ('hello')"
        self._assert_type_error(expect, difflib.diff_bytes, unified, a, b)
        self._assert_type_error(expect, difflib.diff_bytes, unified, b, a)
        self._assert_type_error(expect, difflib.diff_bytes, context, a, b)
        self._assert_type_error(expect, difflib.diff_bytes, context, b, a)

    def test_mixed_types_filenames(self):
        # cannot pass filenames as bytes if content is str (this may not be
        # the right behaviour, but at least the test demonstrates how
        # things work)
        a = ['hello\n']
        b = ['ohell\n']
        fna = b'ol\xe9.txt'         # filename transcoded from ISO-8859-1
        fnb = b'ol\xc3\xa9.txt'     # to UTF-8
        self._assert_type_error(
            "all arguments must be str, not: b'ol\\xe9.txt'",
            difflib.unified_diff, a, b, fna, fnb)
        # the error names the first offending argument, so swapping the
        # filenames reports the UTF-8 encoded one
        self._assert_type_error(
            "all arguments must be str, not: b'ol\\xc3\\xa9.txt'",
            difflib.unified_diff, a, b, fnb, fna)

    def test_mixed_types_dates(self):
        # type of dates must be consistent with type of contents
        a = [b'foo\n']
        b = [b'bar\n']
        datea = '1 fév'
        dateb = '3 fév'
        self._assert_type_error(
            "all arguments must be bytes, not str ('1 fév')",
            difflib.diff_bytes, difflib.unified_diff,
            a, b, b'a', b'b', datea, dateb)

        # if input is str, non-ASCII dates are fine
        a = ['foo\n']
        b = ['bar\n']
        list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb))

    def _assert_type_error(self, msg, generator, *args):
        """Assert that consuming ``generator(*args)`` raises TypeError.

        The text of the raised exception must be exactly *msg*.
        """
        with self.assertRaises(TypeError) as ctx:
            list(generator(*args))
        self.assertEqual(msg, str(ctx.exception))
468
469
def test_main():
    """Run all test cases of this module plus the doctests of difflib."""
    # Reset the HtmlDiff prefix counter before anything runs -- presumably so
    # that generated HTML matches the stored baseline; confirm against
    # difflib.HtmlDiff.
    difflib.HtmlDiff._default_prefix = 0
    doctest_suite = doctest.DocTestSuite(difflib)
    targets = [
        TestWithAscii,
        TestAutojunk,
        TestSFpatches,
        TestSFbugs,
        TestOutputFormat,
        TestBytes,
        doctest_suite,
    ]
    run_unittest(*targets)
476
# Run the whole suite when this file is executed directly as a script.
if __name__ == '__main__':
    test_main()
479